fix: wait for tailscale operator recovery
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 30m34s

This commit is contained in:
2026-05-04 17:58:03 +00:00
parent 329497cf8a
commit cefc028f83
+28 -15
View File
@@ -22,22 +22,35 @@ retry() {
done done
} }
restart_tailscale_operator() { wait_for_tailscale_operator() {
local timeout_seconds="${1:-1200}"
local elapsed=0
local available
if ! "${KUBECTL[@]}" -n tailscale-system get deployment/operator >/dev/null 2>&1; then if ! "${KUBECTL[@]}" -n tailscale-system get deployment/operator >/dev/null 2>&1; then
return 0 return 0
fi fi
if "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=60s; then while [ "$elapsed" -lt "$timeout_seconds" ]; do
return 0 available="$("${KUBECTL[@]}" -n tailscale-system get deployment/operator -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || true)"
fi if [ "$available" = "True" ] && "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l app=operator --timeout=10s >/dev/null 2>&1; then
return 0
fi
echo "Restarting unhealthy Tailscale operator before smoke checks" if [ "$elapsed" -gt 0 ] && [ $((elapsed % 120)) -eq 0 ]; then
"${KUBECTL[@]}" -n tailscale-system delete pod -l app=operator --wait=false echo "Waiting for Tailscale operator to recover (${elapsed}s/${timeout_seconds}s)" >&2
if ! "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=600s; then "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
"${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=40 >&2 || true
"${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true fi
return 1
fi sleep 10
elapsed=$((elapsed + 10))
done
"${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
"${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=100 >&2 || true
"${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
return 1
} }
restart_unhealthy_tailscale_proxies() { restart_unhealthy_tailscale_proxies() {
@@ -77,10 +90,10 @@ restart_service_tailscale_proxy() {
proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)" proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
if [ -z "$proxy_pods" ]; then if [ -z "$proxy_pods" ]; then
echo "No proxy pod with app=$service_uid for $namespace/$service_name; restarting operator and waiting for proxy creation" >&2 echo "No proxy pod with app=$service_uid for $namespace/$service_name; waiting for operator and proxy creation" >&2
restart_tailscale_operator wait_for_tailscale_operator 1200
sleep 30 sleep 30
if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then if ! retry 60 10 service_proxy_ready "$namespace" "$service_name"; then
"${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
"${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
return 1 return 1
@@ -187,7 +200,7 @@ check_service() {
echo "HTTP status OK for $url" echo "HTTP status OK for $url"
} }
restart_tailscale_operator wait_for_tailscale_operator 1200
restart_unhealthy_tailscale_proxies restart_unhealthy_tailscale_proxies
check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/" check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"