fix: recover tailscale operator during smoke checks
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Successful in 13m59s

This commit is contained in:
2026-05-04 06:20:26 +00:00
parent 15bb1eaf06
commit 6b9a77aae7
+26 -1
View File
@@ -22,6 +22,24 @@ retry() {
done done
} }
#######################################
# Bring the Tailscale operator back to a rolled-out state before smoke checks.
# Globals:   KUBECTL (read) - array holding the kubectl command prefix
# Arguments: none
# Outputs:   progress message and rollout output on stdout; pod listing and
#            operator logs on stderr when recovery fails
# Returns:   0 if the deployment cannot be fetched, is already rolled out, or
#            rolls out after its pods are deleted; 1 if the second rollout
#            wait fails
#######################################
restart_tailscale_operator() {
  local -a operator_ctl=("${KUBECTL[@]}" -n tailscale-system)
  local workload="deployment/operator"

  # Nothing to recover when the operator deployment cannot be fetched.
  "${operator_ctl[@]}" get "$workload" >/dev/null 2>&1 || return 0

  # A rollout that completes within the short window needs no intervention.
  "${operator_ctl[@]}" rollout status "$workload" --timeout=60s && return 0

  echo "Restarting unhealthy Tailscale operator before smoke checks"
  # --wait=false: do not block on pod deletion; the rollout wait below does
  # the waiting instead.
  "${operator_ctl[@]}" delete pod -l app=operator --wait=false

  if "${operator_ctl[@]}" rollout status "$workload" --timeout=600s; then
    return 0
  fi

  # Best-effort diagnostics; a failure here must not mask the real error.
  "${operator_ctl[@]}" get pods -o wide >&2 || true
  "${operator_ctl[@]}" logs "$workload" --tail=100 >&2 || true
  return 1
}
restart_unhealthy_tailscale_proxies() { restart_unhealthy_tailscale_proxies() {
local unhealthy_pods local unhealthy_pods
unhealthy_pods="$(mktemp)" unhealthy_pods="$(mktemp)"
@@ -59,10 +77,16 @@ restart_service_tailscale_proxy() {
proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)" proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
if [ -z "$proxy_pods" ]; then if [ -z "$proxy_pods" ]; then
echo "Cannot restart proxy for $namespace/$service_name: no proxy pod with app=$service_uid" >&2 echo "No proxy pod with app=$service_uid for $namespace/$service_name; restarting operator and waiting for proxy creation" >&2
restart_tailscale_operator
sleep 30
if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
"${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
"${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
return 1 return 1
fi fi
return 0
fi
echo "Restarting Tailscale proxy pod for $namespace/$service_name" echo "Restarting Tailscale proxy pod for $namespace/$service_name"
while read -r pod; do while read -r pod; do
@@ -163,6 +187,7 @@ check_service() {
echo "HTTP status OK for $url" echo "HTTP status OK for $url"
} }
restart_tailscale_operator
restart_unhealthy_tailscale_proxies restart_unhealthy_tailscale_proxies
check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/" check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"