#!/usr/bin/env bash
#
# Smoke checks for Services exposed through Tailscale-managed proxy pods.
# Talks to the cluster through kubectl and restarts proxy pods in the
# tailscale-system namespace when they look unhealthy.
set -euo pipefail

# Use plain kubectl when it can reach the API server's readiness endpoint;
# otherwise fall back to the kubectl bundled with k3s, run through sudo.
if kubectl get --raw=/readyz >/dev/null 2>&1; then
  KUBECTL=(kubectl)
else
  KUBECTL=(sudo k3s kubectl)
fi

#######################################
# Run a command until it succeeds or the attempt budget is used up.
# Arguments:
#   $1 - maximum number of attempts
#   $2 - seconds to sleep between attempts
#   $@ - command (and its arguments) to run
# Returns:
#   0 as soon as the command succeeds, 1 once all attempts have failed.
#######################################
retry() {
  local attempts="$1"
  local delay_seconds="$2"
  shift 2

  local attempt=1
  until "$@"; do
    if [[ "$attempt" -ge "$attempts" ]]; then
      return 1
    fi
    sleep "$delay_seconds"
    attempt=$((attempt + 1))
  done
}

#######################################
# Delete every Tailscale-managed proxy pod whose STATUS column is neither
# Running nor Completed, then wait for the managed pods to become Ready.
# Globals:
#   KUBECTL (read)
# Outputs:
#   Progress message on stdout, warnings on stderr.
# Returns:
#   0 when nothing needed restarting (or the pods could not be listed),
#   otherwise the status of the final `kubectl wait`.
#######################################
restart_unhealthy_tailscale_proxies() {
  local pod_listing
  local unhealthy_pods
  local pod

  # Listing is best-effort: a failure here must not abort the smoke checks,
  # but it should be visible instead of being silently swallowed.
  if ! pod_listing="$("${KUBECTL[@]}" -n tailscale-system get pods \
    -l tailscale.com/managed=true --no-headers)"; then
    echo "Could not list Tailscale-managed proxy pods; skipping restart" >&2
    return 0
  fi

  # Columns of `get pods --no-headers` are NAME READY STATUS ...; NF skips
  # the empty line a here-string produces when the listing is empty.
  unhealthy_pods="$(awk 'NF && $3 != "Running" && $3 != "Completed" { print $1 }' \
    <<<"$pod_listing")"

  if [[ -z "$unhealthy_pods" ]]; then
    return 0
  fi

  echo "Restarting unhealthy Tailscale-managed proxy pods before smoke checks"
  while IFS= read -r pod; do
    [[ -n "$pod" ]] || continue
    # </dev/null keeps the command from consuming the loop's input.
    "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false </dev/null
  done <<<"$unhealthy_pods"

  # Give the replacement pods time to be created before waiting on them.
  sleep 30
  "${KUBECTL[@]}" -n tailscale-system wait --for=condition=Ready pod \
    -l tailscale.com/managed=true --timeout=600s
}

#######################################
# Delete the Tailscale proxy pod(s) that front a given Service. Proxy pods
# are looked up in tailscale-system by the label app=<Service UID>.
# Globals:
#   KUBECTL (read)
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
# Outputs:
#   Progress message on stdout, diagnostics on stderr.
# Returns:
#   1 when the Service UID or its proxy pods cannot be found.
#######################################
restart_service_tailscale_proxy() {
  local namespace="$1"
  local service_name="$2"
  local service_uid
  local proxy_pods
  local pod

  service_uid="$("${KUBECTL[@]}" -n "$namespace" get svc "$service_name" \
    -o jsonpath='{.metadata.uid}' 2>/dev/null || true)"
  if [[ -z "$service_uid" ]]; then
    echo "Cannot restart proxy for $namespace/$service_name: service UID not found" >&2
    return 1
  fi

  proxy_pods="$("${KUBECTL[@]}" -n tailscale-system get pods -l "app=${service_uid}" \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\n"}{end}' 2>/dev/null || true)"
  if [[ -z "$proxy_pods" ]]; then
    echo "Cannot restart proxy for $namespace/$service_name: no proxy pod with app=$service_uid" >&2
    # Dump the namespace's pods to help diagnose the label mismatch.
    "${KUBECTL[@]}" -n tailscale-system get pods -o wide >&2 || true
    return 1
  fi

  echo "Restarting Tailscale proxy pod for $namespace/$service_name"
  while IFS= read -r pod; do
    [[ -n "$pod" ]] || continue
    # </dev/null keeps the command from consuming the loop's input.
    "${KUBECTL[@]}" -n tailscale-system delete pod "$pod" --wait=false </dev/null
  done <<<"$proxy_pods"
}

#######################################
# Succeed when the Service reports its Tailscale proxy as ready.
# NOTE(review): most of this function was lost from the reviewed copy of the
# file; only the trailing `2>/dev/null | grep -qx 'True'` survived. The
# query below assumes the readiness signal is the Service status condition
# named TailscaleProxyReady - confirm against the original script.
# Globals:
#   KUBECTL (read)
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
# Returns:
#   0 when the condition status is exactly "True", non-zero otherwise.
#######################################
service_proxy_ready() {
  local namespace="$1"
  local service_name="$2"

  "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" \
    -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}' 2>/dev/null \
    | grep -qx 'True'
}
#######################################
# Succeed when the Service's first load-balancer ingress hostname equals the
# expected hostname. The comparison is a literal string match, so dots in
# the hostname are not treated as wildcards.
# Globals:
#   KUBECTL (read)
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
#   $3 - expected hostname
# Returns:
#   0 on an exact match; 1 when kubectl fails, nothing is assigned, or the
#   assigned hostname differs.
#######################################
assigned_hostname_matches() {
  local namespace="$1"
  local service_name="$2"
  local expected_hostname="$3"
  local assigned_hostname

  assigned_hostname="$("${KUBECTL[@]}" get svc "$service_name" -n "$namespace" \
    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null)" || return 1

  # Quoting the right-hand side makes [[ == ]] compare literally.
  [[ -n "$assigned_hostname" && "$assigned_hostname" == "$expected_hostname" ]]
}

#######################################
# Succeed when the hostname resolves through the system resolver (getent).
# Arguments:
#   $1 - hostname to resolve
#######################################
dns_resolves() {
  local hostname="$1"
  getent hosts "$hostname" >/dev/null 2>&1
}

#######################################
# Succeed when a single `tailscale ping` to the host works within 20 seconds.
# Arguments:
#   $1 - hostname to ping
#######################################
tailscale_ping_succeeds() {
  local hostname="$1"
  timeout 20s tailscale ping -c 1 "$hostname" >/dev/null 2>&1
}

#######################################
# Fetch a URL and accept the response when its status is one of
# 200, 301, 302, 401 or 403. TLS verification is skipped (curl -k).
# Arguments:
#   $1 - URL to request
# Outputs:
#   Writes the unexpected status to stderr on failure.
# Returns:
#   0 for an accepted status, 1 otherwise (including curl failures).
#######################################
http_status_is_expected() {
  local url="$1"
  local status

  # `|| true`: a failed request is classified by the case below through
  # whatever status text curl printed, instead of aborting under set -e.
  status="$(curl -skS -o /dev/null -w '%{http_code}' --max-time 15 "$url" || true)"
  case "$status" in
    200 | 301 | 302 | 401 | 403)
      return 0
      ;;
    *)
      echo "Unexpected HTTP status for $url: $status" >&2
      return 1
      ;;
  esac
}

#######################################
# Retry a check; if it keeps failing, restart the Service's Tailscale proxy
# once and retry the check again.
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
#   $3 - failure description used as the prefix of the stderr messages
#   $@ - check command (and its arguments)
# Returns:
#   0 when the check passes before or after the restart, 1 otherwise.
#######################################
_retry_with_proxy_restart() {
  local namespace="$1"
  local service_name="$2"
  local failure_message="$3"
  shift 3

  if retry 18 10 "$@"; then
    return 0
  fi

  echo "${failure_message}; restarting proxy and retrying" >&2
  # Explicit status checks: set -e is not in effect when a caller invokes
  # this function from a conditional context.
  if ! restart_service_tailscale_proxy "$namespace" "$service_name"; then
    return 1
  fi
  if ! retry 18 10 "$@"; then
    echo "${failure_message} even after restarting the proxy" >&2
    return 1
  fi
}

#######################################
# Run the full smoke check for one Service: proxy readiness, assigned
# hostname, DNS resolution, Tailscale ping and an HTTP request.
# Globals:
#   KUBECTL (read)
# Arguments:
#   $1 - namespace of the Service
#   $2 - name of the Service
#   $3 - hostname the Service is expected to be assigned
#   $4 - URL to request for the HTTP check
# Outputs:
#   Progress on stdout, diagnostics on stderr.
# Returns:
#   0 when every check passes, 1 on the first check that cannot be made to
#   pass.
#######################################
check_service() {
  local namespace="$1"
  local service_name="$2"
  local hostname="$3"
  local url="$4"

  echo "Checking $namespace/$service_name -> $hostname"

  _retry_with_proxy_restart "$namespace" "$service_name" \
    "Tailscale proxy did not become Ready for $namespace/$service_name" \
    service_proxy_ready "$namespace" "$service_name" || return 1

  # A hostname mismatch is not retried through a proxy restart; dump the
  # Service for diagnosis and give up.
  if ! retry 18 10 assigned_hostname_matches "$namespace" "$service_name" "$hostname"; then
    echo "Tailscale service hostname did not match $hostname for $namespace/$service_name" >&2
    "${KUBECTL[@]}" -n "$namespace" get svc "$service_name" -o yaml >&2 || true
    return 1
  fi

  _retry_with_proxy_restart "$namespace" "$service_name" \
    "DNS did not resolve for $hostname" \
    dns_resolves "$hostname" || return 1

  _retry_with_proxy_restart "$namespace" "$service_name" \
    "Tailscale ping failed for $hostname" \
    tailscale_ping_succeeds "$hostname" || return 1

  _retry_with_proxy_restart "$namespace" "$service_name" \
    "HTTP check failed for $url" \
    http_status_is_expected "$url" || return 1

  # Informational only: print the first address the resolver returns.
  echo "Resolved hostname: $(getent hosts "$hostname" | awk 'NR == 1 { print $1 }')"
  echo "HTTP status OK for $url"
}

restart_unhealthy_tailscale_proxies

check_service "cattle-system" "rancher-tailscale" \
  "rancher.silverside-gopher.ts.net" "https://rancher.silverside-gopher.ts.net/"
check_service "observability" "grafana-tailscale" \
  "grafana.silverside-gopher.ts.net" "http://grafana.silverside-gopher.ts.net/"
check_service "observability" "prometheus-tailscale" \
  "prometheus.silverside-gopher.ts.net" "http://prometheus.silverside-gopher.ts.net:9090/"