fix: restart tailscale proxies before smoke checks

2026-05-03 23:02:04 +00:00
parent 877dd027ea
commit 7a08f58719
1 changed files with 25 additions and 0 deletions
@@ -22,6 +22,29 @@ retry() {
  done
 }

+restart_unhealthy_tailscale_proxies() {
+  local unhealthy_pods
+  unhealthy_pods="$(mktemp)"
+
+  "${KUBECTL[@]}" -n tailscale-system get pods -l tailscale.com/managed=true --no-headers \
+    | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
+    | awk '{print $1}' >"${unhealthy_pods}" || true
+
+  if [ ! -s "${unhealthy_pods}" ]; then
+    rm -f "${unhealthy_pods}"
+    return 0
+  fi
+
+  echo "Restarting unhealthy Tailscale-managed proxy pods before smoke checks"
+  while read -r pod; do
+    "${KUBECTL[@]}" -n tailscale-system delete pod "${pod}" --wait=false
+  done <"${unhealthy_pods}"
+  rm -f "${unhealthy_pods}"
+
+  sleep 30
+  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l tailscale.com/managed=true --timeout=600s
+}
+
 service_proxy_ready() {
  local namespace="$1"
  local service_name="$2"
@@ -85,6 +108,8 @@ check_service() {
  echo "HTTP status OK for $url"
 }

+restart_unhealthy_tailscale_proxies
+
 check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"
 check_service "observability" "grafana-tailscale" "grafana.silverside-gopher.ts.net" "http://grafana.silverside-gopher.ts.net/"
 check_service "observability" "prometheus-tailscale" "prometheus.silverside-gopher.ts.net" "http://prometheus.silverside-gopher.ts.net:9090/"