fix: wait for tailscale operator recovery

2026-05-04 17:58:03 +00:00
parent 329497cf8a
commit cefc028f83
1 changed file with 28 additions and 15 deletions
@@ -22,22 +22,35 @@ retry() {
  done
 }
-restart_tailscale_operator() {
+wait_for_tailscale_operator() {
  local timeout_seconds="${1:-1200}"
  local elapsed=0
  local available
  if ! "${KUBECTL[@]}" -n tailscale-system get deployment/operator >/dev/null 2>&1; then
    return 0
  fi
-  if "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=60s; then
+  while [ "$elapsed" -lt "$timeout_seconds" ]; do
-    return 0
+    available="$("${KUBECTL[@]}" -n tailscale-system get deployment/operator -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || true)"
-  fi
+    if [ "$available" = "True" ] && "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l app=operator --timeout=10s >/dev/null 2>&1; then
      return 0
    fi
-  echo "Restarting unhealthy Tailscale operator before smoke checks"
+    if [ "$elapsed" -gt 0 ] && [ $((elapsed % 120)) -eq 0 ]; then
-  "${KUBECTL[@]}" -n tailscale-system delete pod -l app=operator --wait=false
+      echo "Waiting for Tailscale operator to recover (${elapsed}s/${timeout_seconds}s)" >&2
-  if ! "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=600s; then
+      "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
-    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
+      "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=40 >&2 || true
-    "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
+    fi
-    return 1
+
-  fi
+    sleep 10
    elapsed=$((elapsed + 10))
  done
  "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
  "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=100 >&2 || true
  "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
  return 1
 }
 restart_unhealthy_tailscale_proxies() {
@@ -77,10 +90,10 @@ restart_service_tailscale_proxy() {
  proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [ -z "$proxy_pods" ]; then
-    echo "No proxy pod with app=$service_uid for $namespace/$service_name; restarting operator and waiting for proxy creation" >&2
+    echo "No proxy pod with app=$service_uid for $namespace/$service_name; waiting for operator and proxy creation" >&2
-    restart_tailscale_operator
+    wait_for_tailscale_operator 1200
    sleep 30
-    if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
+    if ! retry 60 10 service_proxy_ready "$namespace" "$service_name"; then
      "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
      "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
      return 1
@@ -187,7 +200,7 @@ check_service() {
  echo "HTTP status OK for $url"
 }
-restart_tailscale_operator
+wait_for_tailscale_operator 1200
 restart_unhealthy_tailscale_proxies
 check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"