diff --git a/scripts/smoke-check-tailnet-services.sh b/scripts/smoke-check-tailnet-services.sh
index ec1c9e8..66e9251 100644
--- a/scripts/smoke-check-tailnet-services.sh
+++ b/scripts/smoke-check-tailnet-services.sh
@@ -22,22 +22,35 @@ retry() {
   done
 }

-restart_tailscale_operator() {
+wait_for_tailscale_operator() {
+  local timeout_seconds="${1:-1200}"
+  local elapsed=0
+  local available
+
   if ! "${KUBECTL[@]}" -n tailscale-system get deployment/operator >/dev/null 2>&1; then
     return 0
   fi

-  if "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=60s; then
-    return 0
-  fi
+  while [ "$elapsed" -lt "$timeout_seconds" ]; do
+    available="$("${KUBECTL[@]}" -n tailscale-system get deployment/operator -o jsonpath='{.status.conditions[?(@.type=="Available")].status}' 2>/dev/null || true)"
+    if [ "$available" = "True" ] && "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l app=operator --timeout=10s >/dev/null 2>&1; then
+      return 0
+    fi

-  echo "Restarting unhealthy Tailscale operator before smoke checks"
-  "${KUBECTL[@]}" -n tailscale-system delete pod -l app=operator --wait=false
-  if ! "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=600s; then
-    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
-    "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
-    return 1
-  fi
+    if [ "$elapsed" -gt 0 ] && [ $((elapsed % 120)) -eq 0 ]; then
+      echo "Waiting for Tailscale operator to recover (${elapsed}s/${timeout_seconds}s)" >&2
+      "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
+      "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=40 >&2 || true
+    fi
+
+    sleep 10
+    elapsed=$((elapsed + 10))
+  done
+
+  "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
+  "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --previous --tail=100 >&2 || true
+  "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
+  return 1
 }

 restart_unhealthy_tailscale_proxies() {
@@ -77,10 +90,10 @@ restart_service_tailscale_proxy() {
   proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"

   if [ -z "$proxy_pods" ]; then
-    echo "No proxy pod with app=$service_uid for $namespace/$service_name; restarting operator and waiting for proxy creation" >&2
-    restart_tailscale_operator
+    echo "No proxy pod with app=$service_uid for $namespace/$service_name; waiting for operator and proxy creation" >&2
+    wait_for_tailscale_operator 1200
     sleep 30
-    if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
+    if ! retry 60 10 service_proxy_ready "$namespace" "$service_name"; then
       "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
       "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
       return 1
@@ -187,7 +200,7 @@ check_service() {
   echo "HTTP status OK for $url"
 }

-restart_tailscale_operator
+wait_for_tailscale_operator 1200
 restart_unhealthy_tailscale_proxies

 check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"