fix: self-heal tailscale smoke checks
This commit is contained in:
@@ -45,6 +45,37 @@ restart_unhealthy_tailscale_proxies() {
|
||||
"${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l tailscale.com/managed=true --timeout=600s
|
||||
}
|
||||
|
||||
# Delete the Tailscale operator proxy pod(s) backing a Service so they are
# recreated, then wait for the replacements to become Ready.
# Globals:   KUBECTL (read) - array holding the kubectl command and base args
# Arguments: $1 - namespace of the Service
#            $2 - name of the Service
# Outputs:   progress to stdout, diagnostics to stderr
# Returns:   1 when the Service UID or its proxy pods cannot be found;
#            otherwise the exit status of the final kubectl wait.
restart_service_tailscale_proxy() {
  local ns="$1"
  local svc="$2"
  local uid
  local pods

  # The operator labels proxy pods app=<service UID>; resolve the UID first.
  uid="$("${KUBECTL[@]}" -n "$ns" get svc "$svc" -o jsonpath='{.metadata.uid}' 2>/dev/null || true)"
  if [ -z "$uid" ]; then
    echo "Cannot restart proxy for $ns/$svc: service UID not found" >&2
    return 1
  fi

  pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [ -z "$pods" ]; then
    echo "Cannot restart proxy for $ns/$svc: no proxy pod with app=$uid" >&2
    # Best-effort dump of the namespace to aid debugging; never fail on it.
    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
    return 1
  fi

  echo "Restarting Tailscale proxy pod for $ns/$svc"
  local pod
  while read -r pod; do
    [ -n "$pod" ] || continue
    "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false
  done <<<"$pods"

  # Give the operator a moment to recreate the pod before polling Ready.
  sleep 30
  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l "app=${uid}" --timeout=600s
}
|
||||
|
||||
service_proxy_ready() {
|
||||
local namespace="$1"
|
||||
local service_name="$2"
|
||||
@@ -98,11 +129,35 @@ check_service() {
|
||||
local url="$4"
|
||||
|
||||
echo "Checking $namespace/$service_name -> $hostname"
|
||||
if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
|
||||
echo "Tailscale proxy did not become Ready for $namespace/$service_name; restarting proxy and retrying" >&2
|
||||
restart_service_tailscale_proxy "$namespace" "$service_name"
|
||||
retry 18 10 service_proxy_ready "$namespace" "$service_name"
|
||||
retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"
|
||||
fi
|
||||
|
||||
if ! retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"; then
|
||||
echo "Tailscale service hostname did not match $hostname for $namespace/$service_name" >&2
|
||||
"${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
|
||||
return 1
|
||||
fi
|
||||
|
||||
if ! retry 18 10 dns_resolves "$hostname"; then
|
||||
echo "DNS did not resolve for $hostname; restarting proxy and retrying" >&2
|
||||
restart_service_tailscale_proxy "$namespace" "$service_name"
|
||||
retry 18 10 dns_resolves "$hostname"
|
||||
fi
|
||||
|
||||
if ! retry 18 10 tailscale_ping_succeeds "$hostname"; then
|
||||
echo "Tailscale ping failed for $hostname; restarting proxy and retrying" >&2
|
||||
restart_service_tailscale_proxy "$namespace" "$service_name"
|
||||
retry 18 10 tailscale_ping_succeeds "$hostname"
|
||||
fi
|
||||
|
||||
if ! retry 18 10 http_status_is_expected "$url"; then
|
||||
echo "HTTP check failed for $url; restarting proxy and retrying" >&2
|
||||
restart_service_tailscale_proxy "$namespace" "$service_name"
|
||||
retry 18 10 http_status_is_expected "$url"
|
||||
fi
|
||||
|
||||
echo "Resolved hostname: $(getent hosts "$hostname" | awk '{print $1}' | head -1)"
|
||||
echo "HTTP status OK for $url"
|
||||
|
||||
Reference in New Issue
Block a user