Files
HetznerTerra/scripts/smoke-check-tailnet-services.sh
T
micqdf 6b9a77aae7
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Successful in 13m59s
fix: recover tailscale operator during smoke checks
2026-05-04 06:20:26 +00:00

196 lines
6.2 KiB
Bash

#!/usr/bin/env bash
# Smoke-checks Tailscale-exposed cluster services: recovers the Tailscale
# operator / proxy pods if unhealthy, then verifies hostname assignment,
# DNS resolution, tailscale ping, and HTTP reachability per service.
set -euo pipefail
# Prefer plain kubectl when it can reach the API server (/readyz probe);
# otherwise fall back to the embedded k3s kubectl, which needs sudo.
# Stored as an array so the multi-word fallback expands as separate words.
if kubectl get --raw=/readyz >/dev/null 2>&1; then
KUBECTL=(kubectl)
else
KUBECTL=(sudo k3s kubectl)
fi
# retry ATTEMPTS DELAY_SECONDS CMD [ARGS...]
# Run CMD up to ATTEMPTS times, sleeping DELAY_SECONDS between tries.
# Returns 0 on the first success, 1 once all attempts are exhausted.
retry() {
  local max_attempts="$1"
  local pause="$2"
  shift 2
  local try
  for (( try = 1; ; try++ )); do
    if "$@"; then
      return 0
    fi
    if [ "$try" -ge "$max_attempts" ]; then
      return 1
    fi
    sleep "$pause"
  done
}
# Recover the Tailscale operator deployment if its rollout is unhealthy.
# No-op when the deployment does not exist; returns 1 (with pod listing and
# operator logs on stderr) if the rollout never stabilizes after a restart.
restart_tailscale_operator() {
  # Nothing to recover when the operator is not installed.
  "${KUBECTL[@]}" -n tailscale-system get deployment/operator >/dev/null 2>&1 || return 0
  # Already healthy within a short window: leave it alone.
  if "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=60s; then
    return 0
  fi
  echo "Restarting unhealthy Tailscale operator before smoke checks"
  # Deleting the pod lets the deployment controller recreate it.
  "${KUBECTL[@]}" -n tailscale-system delete pod -l app=operator --wait=false
  if "${KUBECTL[@]}" -n tailscale-system rollout status deployment/operator --timeout=600s; then
    return 0
  fi
  # Dump diagnostics (best effort) before reporting failure.
  "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
  "${KUBECTL[@]}" -n tailscale-system logs deployment/operator --tail=100 >&2 || true
  return 1
}
# Delete any Tailscale-managed proxy pod that is not Running/Completed so the
# operator recreates it, then wait for all managed pods to become Ready.
# Returns non-zero if the readiness wait times out.
# Fix: the previous version wrote pod names to a mktemp file with no cleanup
# trap, leaking the file whenever a `delete pod` failed under `set -e`;
# collecting names into an array removes the temp file entirely.
restart_unhealthy_tailscale_proxies() {
  local unhealthy
  # Column 1 of `get pods --no-headers` is the pod name; drop lines whose
  # STATUS column is Running or Completed (matched with surrounding spaces so
  # a pod *named* "Running" would not be skipped). `|| true` keeps an empty
  # match (grep exit 1) from tripping pipefail.
  mapfile -t unhealthy < <(
    "${KUBECTL[@]}" -n tailscale-system get pods -l tailscale.com/managed=true --no-headers \
      | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
      | awk '{print $1}' || true
  )
  if [ "${#unhealthy[@]}" -eq 0 ]; then
    return 0
  fi
  echo "Restarting unhealthy Tailscale-managed proxy pods before smoke checks"
  local pod
  for pod in "${unhealthy[@]}"; do
    "${KUBECTL[@]}" -n tailscale-system delete pod "${pod}" --wait=false
  done
  # Give the operator a moment to recreate pods before waiting on readiness.
  sleep 30
  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l tailscale.com/managed=true --timeout=600s
}
# restart_service_tailscale_proxy NAMESPACE SERVICE
# Recycle the Tailscale proxy pod(s) backing a Service so the operator
# recreates them. If no proxy pod exists yet, restart the operator instead and
# wait for it to create one. Returns 1 when the Service UID cannot be read or
# the proxy never becomes Ready.
restart_service_tailscale_proxy() {
  local ns="$1"
  local svc="$2"
  local uid
  local pods
  uid="$("${KUBECTL[@]}" -n "$ns" get svc "$svc" -o jsonpath='{.metadata.uid}' 2>/dev/null || true)"
  if [ -z "$uid" ]; then
    echo "Cannot restart proxy for $ns/$svc: service UID not found" >&2
    return 1
  fi
  # The operator labels each proxy pod with app=<parent Service UID>.
  pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${uid}" -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [ -z "$pods" ]; then
    # No proxy was ever created: kick the operator and wait for one to appear.
    echo "No proxy pod with app=$uid for $ns/$svc; restarting operator and waiting for proxy creation" >&2
    restart_tailscale_operator
    sleep 30
    if retry 18 10 service_proxy_ready "$ns" "$svc"; then
      return 0
    fi
    "${KUBECTL[@]}" -n "$ns" get svc "$svc" -o yaml >&2 || true
    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
    return 1
  fi
  echo "Restarting Tailscale proxy pod for $ns/$svc"
  local pod
  while IFS= read -r pod; do
    [ -n "$pod" ] || continue
    "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false
  done <<<"$pods"
  # Let replacements get scheduled before waiting on readiness.
  sleep 30
  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod -l "app=${uid}" --timeout=600s
}
# service_proxy_ready NAMESPACE SERVICE
# True when the operator has set the Service's TailscaleProxyReady
# condition status to "True".
service_proxy_ready() {
  local ns="$1"
  local svc="$2"
  local state
  state="$("${KUBECTL[@]}" get svc "$svc" -n "$ns" \
    -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}' 2>/dev/null || true)"
  [ "$state" = "True" ]
}
# assigned_hostname_matches NAMESPACE SERVICE EXPECTED_HOSTNAME
# True when the Service's first load-balancer ingress hostname is exactly
# EXPECTED_HOSTNAME.
# Fix: the previous `grep -qx "$expected_hostname"` treated the hostname as an
# unescaped regex, so each `.` matched any character (e.g. "fooXts.net" would
# pass for "foo.ts.net"); compare as a plain string instead.
assigned_hostname_matches() {
  local namespace="$1"
  local service_name="$2"
  local expected_hostname="$3"
  local assigned
  assigned="$("${KUBECTL[@]}" get svc "$service_name" -n "$namespace" \
    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || true)"
  [ "$assigned" = "$expected_hostname" ]
}
# dns_resolves HOSTNAME
# True when the local resolver (NSS, via getent) can resolve HOSTNAME.
dns_resolves() {
  local name="$1"
  if getent hosts "$name" >/dev/null 2>&1; then
    return 0
  fi
  return 1
}
# tailscale_ping_succeeds HOSTNAME
# True when a single tailscale ping to HOSTNAME completes within 20 seconds.
tailscale_ping_succeeds() {
  local target="$1"
  # Bound the ping with timeout so a wedged tailnet cannot stall the script.
  timeout 20s tailscale ping -c 1 "$target" >/dev/null 2>&1
}
# http_status_is_expected URL
# True when URL answers with a status we consider healthy: 200, a redirect
# (301/302), or an auth challenge (401/403). Anything else — including "000"
# when curl cannot connect — is reported on stderr and fails.
http_status_is_expected() {
  local url="$1"
  local status
  # -k: proxies may terminate TLS with tailnet certs; -sS: quiet but keep
  # errors; `|| true` turns connection failures into status "000".
  status="$(curl -skS -o /dev/null -w '%{http_code}' --max-time 15 "$url" || true)"
  if [ "$status" = "200" ] || [ "$status" = "301" ] || [ "$status" = "302" ] \
    || [ "$status" = "401" ] || [ "$status" = "403" ]; then
    return 0
  fi
  echo "Unexpected HTTP status for $url: $status" >&2
  return 1
}
# check_service NAMESPACE SERVICE HOSTNAME URL
# End-to-end smoke check for one Tailscale-exposed Service, in order:
# proxy Ready -> hostname assigned -> DNS -> tailscale ping -> HTTP status.
# Steps that a proxy restart can fix get one restart-and-retry attempt.
# Fix: second-attempt failures previously left `retry` as a bare statement, so
# `set -e` aborted the whole script with no diagnostic (unlike the hostname
# branch, which logs and returns 1); every failure path now logs to stderr and
# returns 1 explicitly.
check_service() {
  local namespace="$1"
  local service_name="$2"
  local hostname="$3"
  local url="$4"
  echo "Checking $namespace/$service_name -> $hostname"
  if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
    echo "Tailscale proxy did not become Ready for $namespace/$service_name; restarting proxy and retrying" >&2
    restart_service_tailscale_proxy "$namespace" "$service_name" || return 1
    if ! retry 18 10 service_proxy_ready "$namespace" "$service_name"; then
      echo "Tailscale proxy still not Ready for $namespace/$service_name after restart" >&2
      return 1
    fi
  fi
  # A wrong hostname is a config problem a proxy restart cannot fix: hard fail.
  if ! retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"; then
    echo "Tailscale service hostname did not match $hostname for $namespace/$service_name" >&2
    "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
    return 1
  fi
  if ! retry 18 10 dns_resolves "$hostname"; then
    echo "DNS did not resolve for $hostname; restarting proxy and retrying" >&2
    restart_service_tailscale_proxy "$namespace" "$service_name" || return 1
    if ! retry 18 10 dns_resolves "$hostname"; then
      echo "DNS still did not resolve for $hostname after restart" >&2
      return 1
    fi
  fi
  if ! retry 18 10 tailscale_ping_succeeds "$hostname"; then
    echo "Tailscale ping failed for $hostname; restarting proxy and retrying" >&2
    restart_service_tailscale_proxy "$namespace" "$service_name" || return 1
    if ! retry 18 10 tailscale_ping_succeeds "$hostname"; then
      echo "Tailscale ping still failing for $hostname after restart" >&2
      return 1
    fi
  fi
  if ! retry 18 10 http_status_is_expected "$url"; then
    echo "HTTP check failed for $url; restarting proxy and retrying" >&2
    restart_service_tailscale_proxy "$namespace" "$service_name" || return 1
    if ! retry 18 10 http_status_is_expected "$url"; then
      echo "HTTP check still failing for $url after restart" >&2
      return 1
    fi
  fi
  echo "Resolved hostname: $(getent hosts "$hostname" | awk '{print $1}' | head -1)"
  echo "HTTP status OK for $url"
}
# --- Main ---
# Recover shared Tailscale infrastructure first (operator, then any broken
# managed proxy pods), then probe each exposed service end to end.
restart_tailscale_operator
restart_unhealthy_tailscale_proxies
# Args: namespace, service, expected tailnet hostname, health-check URL.
# Prometheus is checked on its native port 9090; the others via 80/443.
check_service "cattle-system" "rancher-tailscale" "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"
check_service "observability" "grafana-tailscale" "grafana.silverside-gopher.ts.net" "http://grafana.silverside-gopher.ts.net/"
check_service "observability" "prometheus-tailscale" "prometheus.silverside-gopher.ts.net" "http://prometheus.silverside-gopher.ts.net:9090/"