fix: harden final health checks
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 17m50s

This commit is contained in:
2026-04-26 02:14:02 +00:00
parent a4f1d179e9
commit 46b2ff7d19
3 changed files with 48 additions and 3 deletions
+13 -2
View File
@@ -779,6 +779,15 @@ jobs:
# Block until each Rancher add-on Kustomization reports Ready (300s apiece).
for rancher_ks in addon-rancher-config addon-rancher-backup addon-rancher-backup-config; do
  kubectl -n flux-system wait --for=condition=Ready "kustomization/${rancher_ks}" --timeout=300s
done
# A single epoch-seconds token is shared by every reconcile annotation below.
reconcile_at=$(date +%s)
# Stamp the kube-prometheus-stack HelmRelease with requestedAt, resetAt and
# forceAt, all carrying the same token.
# NOTE(review): per the Flux HelmRelease docs, resetAt/forceAt are honoured only
# when their value matches requestedAt — confirm against the installed Flux version.
kubectl -n flux-system annotate --overwrite helmrelease/kube-prometheus-stack \
  "reconcile.fluxcd.io/requestedAt=${reconcile_at}" \
  "reconcile.fluxcd.io/resetAt=${reconcile_at}" \
  "reconcile.fluxcd.io/forceAt=${reconcile_at}"
# Request a reconcile of the observability Kustomization with the same token.
kubectl -n flux-system annotate --overwrite kustomization/addon-observability \
  "reconcile.fluxcd.io/requestedAt=${reconcile_at}"
# Observability gets a 1200s budget; its content layer gets 300s.
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
# Final barrier: every HelmRelease in flux-system must report Ready.
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
@@ -786,13 +795,15 @@ jobs:
# Mark flash-nfs as the default StorageClass, then verify the outcome.
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
# Succeeds only when the flash-nfs row of the listing carries "(default)".
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
# local-path must NOT be flagged default as well. This is written as an
# explicit if/exit rather than `! pipeline`: bash exempts a status inverted
# with `!` from errexit, so the negated form could never fail the step.
if kubectl get storageclass | grep -E "^local-path.*\\(default\\)"; then
  echo "local-path is still marked as a default StorageClass" >&2
  exit 1
fi
# Health gate: fail when any pod outside the exclusion list below is in a
# state other than Running/Completed.
unhealthy_pods=$(mktemp)
pod_listing=$(mktemp)
# Capture the listing on its own so a kubectl failure is fatal. Piping kubectl
# straight into the `|| true` pipeline would turn an API error into an empty
# (i.e. "healthy") result.
if ! kubectl get pods -A --no-headers > "${pod_listing}"; then
  echo "kubectl get pods failed; cannot verify pod health" >&2
  rm -f -- "${pod_listing}" "${unhealthy_pods}"
  exit 1
fi
# grep -v exits 1 when every line is filtered out (the healthy case), so the
# pipeline status is deliberately ignored; the size of the tee'd file is the
# signal. tee also echoes the offending rows to the job log.
grep -Ev "[[:space:]](Running|Completed)[[:space:]]" "${pod_listing}" \
  | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
  | grep -Ev "^cattle-capi-system[[:space:]]+capi-controller-manager-" \
  | grep -Ev "^cattle-resources-system[[:space:]]+rancher-backup-patch-sa-" \
  | grep -Ev "^kube-system[[:space:]]+helm-install-" \
  | tee "${unhealthy_pods}" || true
rm -f -- "${pod_listing}"
if [ -s "${unhealthy_pods}" ]; then
  echo "unhealthy pods remain (listed above)" >&2
  rm -f -- "${unhealthy_pods}"
  exit 1
fi
rm -f -- "${unhealthy_pods}"
# Print a wide pod listing for each of these namespaces into the job log.
for pod_ns in kube-system tailscale-system external-secrets; do
  kubectl -n "${pod_ns}" get pods -o wide
done