fix: self-heal tailscale smoke checks
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Successful in 22m55s

This commit is contained in:
2026-05-04 00:59:49 +00:00
parent 055cb50bdd
commit bccc17c422
+56 -1
View File
@@ -45,6 +45,37 @@ restart_unhealthy_tailscale_proxies() {
"${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l tailscale.com/managed=true --timeout=600s
}
#######################################
# Restart the Tailscale proxy pod(s) selected for a single Kubernetes Service.
#
# The Service's metadata.uid is looked up first and then used as the value of
# the `app` label selector in the tailscale-system namespace. Every matching
# pod is deleted without waiting for termination; after a fixed 30 second
# pause the function blocks (up to 600s) until pods carrying that label
# report the Ready condition.
#
# Globals:
#   KUBECTL - array holding the kubectl command prefix (read)
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
# Outputs:
#   Progress message on stdout; diagnostics on stderr.
# Returns:
#   1 when the Service UID or a matching proxy pod cannot be found;
#   otherwise the exit status of the final `kubectl wait`.
#######################################
restart_service_tailscale_proxy() {
  local namespace="$1"
  local service_name="$2"
  local service_uid
  local proxy_pods
  # Loop variable for the delete loop; kept local so a caller's own `pod`
  # variable is not overwritten by `read`.
  local pod

  # A failed lookup is deliberately collapsed to an empty string so that the
  # explicit emptiness check below produces the error message.
  service_uid="$("${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o jsonpath='{.metadata.uid}' 2>/dev/null || true)"
  if [ -z "$service_uid" ]; then
    echo "Cannot restart proxy for $namespace/$service_name: service UID not found" >&2
    return 1
  fi

  # One pod name per line; empty when nothing matches or the query fails.
  proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [ -z "$proxy_pods" ]; then
    echo "Cannot restart proxy for $namespace/$service_name: no proxy pod with app=$service_uid" >&2
    # Best-effort dump of the namespace's pods to aid debugging.
    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
    return 1
  fi

  echo "Restarting Tailscale proxy pod for $namespace/$service_name"
  while read -r pod; do
    [ -n "$pod" ] || continue
    "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false
  done <<<"$proxy_pods"

  # Pause before waiting on the label selector, since the deletes above do
  # not block. NOTE(review): presumably this gives replacement pods time to
  # be created - confirm against the operator's behaviour.
  sleep 30
  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l "app=${service_uid}" --timeout=600s
}
service_proxy_ready() {
local namespace="$1"
local service_name="$2"
@@ -98,11 +129,35 @@ check_service() {
local url="$4"
echo "Checking $namespace/$service_name -> $hostname"
if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
echo "Tailscale proxy did not become Ready for $namespace/$service_name; restarting proxy and retrying" >&2
restart_service_tailscale_proxy "$namespace" "$service_name"
retry 18 10 service_proxy_ready "$namespace" "$service_name"
retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"
fi
if ! retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"; then
echo "Tailscale service hostname did not match $hostname for $namespace/$service_name" >&2
"${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
return 1
fi
if ! retry 18 10 dns_resolves "$hostname"; then
echo "DNS did not resolve for $hostname; restarting proxy and retrying" >&2
restart_service_tailscale_proxy "$namespace" "$service_name"
retry 18 10 dns_resolves "$hostname"
fi
if ! retry 18 10 tailscale_ping_succeeds "$hostname"; then
echo "Tailscale ping failed for $hostname; restarting proxy and retrying" >&2
restart_service_tailscale_proxy "$namespace" "$service_name"
retry 18 10 tailscale_ping_succeeds "$hostname"
fi
if ! retry 18 10 http_status_is_expected "$url"; then
echo "HTTP check failed for $url; restarting proxy and retrying" >&2
restart_service_tailscale_proxy "$namespace" "$service_name"
retry 18 10 http_status_is_expected "$url"
fi
echo "Resolved hostname: $(getent hosts "$hostname" | awk '{print $1}' | head -1)"
echo "HTTP status OK for $url"