fix: enforce post-deploy health checks
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled

This commit is contained in:
2026-04-25 02:22:16 +00:00
parent 7e3ebec95b
commit bfcf57bcc5
8 changed files with 67 additions and 34 deletions
+27 -6
View File
@@ -407,6 +407,8 @@ jobs:
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass flash-nfs
- name: Wait for Rancher and backup operator
@@ -595,12 +597,31 @@ jobs:
# Post-deploy gate: the job fails unless every check below passes.
- name: Post-deploy cluster health checks
  working-directory: ansible
  run: |
    # Strict mode on the runner side so a failing ansible call fails the step.
    set -euo pipefail
    # Every check runs as one script on the first control-plane host.
    #
    # executable=/bin/bash is required: the Ansible shell module runs /bin/sh
    # unless told otherwise, and the remote script enables pipefail, which
    # dash-style /bin/sh implementations reject before any check executes.
    #
    # Remote script, in order:
    #   1. Print nodes and the Flux sources/kustomizations/releases (log only).
    #   2. Wait up to 60s each for the named Kustomizations to report Ready.
    #   3. Wait up to 60s for every HelmRelease in flux-system to report Ready.
    #   4. Require flash-nfs to be listed as the (default) StorageClass.
    #   5. Record pods whose phase is neither Running nor Succeeded and fail
    #      if that list is non-empty.
    #   6. Print pod listings for the key namespaces (log only).
    ansible -i inventory.ini 'control_plane[0]' -m shell -a 'executable=/bin/bash
      set -euo pipefail
      kubectl get nodes -o wide
      kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
      kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=60s
      kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=60s
      kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
      kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers | tee /tmp/nonrunning-pods
      test ! -s /tmp/nonrunning-pods
      kubectl -n kube-system get pods -o wide
      kubectl -n tailscale-system get pods -o wide
      kubectl -n external-secrets get pods -o wide
    '
  env:
    # Kept as a quoted string: environment values must reach Ansible as text.
    ANSIBLE_HOST_KEY_CHECKING: "False"