From bccc17c42276f5426166c1ac68b890d8817a20c6 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Mon, 4 May 2026 00:59:49 +0000 Subject: [PATCH] fix: self-heal tailscale smoke checks --- scripts/smoke-check-tailnet-services.sh | 65 +++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 5 deletions(-) diff --git a/scripts/smoke-check-tailnet-services.sh b/scripts/smoke-check-tailnet-services.sh index 6409090..6db62d6 100644 --- a/scripts/smoke-check-tailnet-services.sh +++ b/scripts/smoke-check-tailnet-services.sh @@ -45,6 +45,37 @@ restart_unhealthy_tailscale_proxies() { "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l tailscale.com/managed=true --timeout=600s } +restart_service_tailscale_proxy() { + local namespace="$1" + local service_name="$2" + local service_uid + local proxy_pods + + service_uid="$("${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o jsonpath='{.metadata.uid}' 2>/dev/null || true)" + if [ -z "$service_uid" ]; then + echo "Cannot restart proxy for $namespace/$service_name: service UID not found" >&2 + return 1 + fi + + proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)" + if [ -z "$proxy_pods" ]; then + echo "Cannot restart proxy for $namespace/$service_name: no proxy pod with app=$service_uid" >&2 + "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true + return 1 + fi + + echo "Restarting Tailscale proxy pod for $namespace/$service_name" + while read -r pod; do + [ -n "$pod" ] || continue + "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false + done < $hostname" - retry 18 10 service_proxy_ready "$namespace" "$service_name" - retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname" - retry 18 10 dns_resolves "$hostname" - retry 18 10 tailscale_ping_succeeds "$hostname" - retry 18 10 http_status_is_expected "$url" + if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then + echo "Tailscale proxy did not become Ready for $namespace/$service_name; restarting proxy and retrying" >&2 + restart_service_tailscale_proxy "$namespace" "$service_name" + retry 18 10 service_proxy_ready "$namespace" "$service_name" + fi + + if ! retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"; then + echo "Tailscale service hostname did not match $hostname for $namespace/$service_name" >&2 + "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true + return 1 + fi + + if ! retry 18 10 dns_resolves "$hostname"; then + echo "DNS did not resolve for $hostname; restarting proxy and retrying" >&2 + restart_service_tailscale_proxy "$namespace" "$service_name" + retry 18 10 dns_resolves "$hostname" + fi + + if ! retry 18 10 tailscale_ping_succeeds "$hostname"; then + echo "Tailscale ping failed for $hostname; restarting proxy and retrying" >&2 + restart_service_tailscale_proxy "$namespace" "$service_name" + retry 18 10 tailscale_ping_succeeds "$hostname" + fi + + if ! retry 18 10 http_status_is_expected "$url"; then + echo "HTTP check failed for $url; restarting proxy and retrying" >&2 + restart_service_tailscale_proxy "$namespace" "$service_name" + retry 18 10 http_status_is_expected "$url" + fi echo "Resolved hostname: $(getent hosts "$hostname" | awk '{print $1}' | head -1)" echo "HTTP status OK for $url"