fix: enforce post-deploy health checks

2026-04-25 02:22:16 +00:00
parent 7e3ebec95b
commit bfcf57bcc5
8 changed files with 67 additions and 34 deletions
@@ -407,6 +407,8 @@ jobs:
          kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
          wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
          kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
+          kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
+          kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
          kubectl get storageclass flash-nfs

      - name: Wait for Rancher and backup operator
@@ -595,12 +597,31 @@ jobs:
      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
+          set -euo pipefail
+          # Gate the deploy on cluster health. Everything inside the quoted
+          # argument below runs as one script on the first control-plane host,
+          # and the first failing command fails this step.
+          #
+          # executable=/bin/bash is an option of the Ansible shell module, not
+          # part of the remote script. The module runs commands with /bin/sh by
+          # default, and shells such as dash reject "set -o pipefail", which
+          # would abort the checks before any of them ran.
+          #
+          # Checks, in order: list nodes and Flux objects, wait for each listed
+          # Flux Kustomization and every HelmRelease in flux-system to report
+          # Ready, require flash-nfs to be the default StorageClass, and fail if
+          # any pod is in a phase other than Running or Succeeded. The phase
+          # filter does not look at container readiness.
+          ansible -i inventory.ini 'control_plane[0]' -m shell -a '
+            executable=/bin/bash
+            set -euo pipefail
+            kubectl get nodes -o wide
+            kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
+            kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=60s
+            kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=60s
+            kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
+            kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers | tee /tmp/nonrunning-pods
+            test ! -s /tmp/nonrunning-pods
+            kubectl -n kube-system get pods -o wide
+            kubectl -n tailscale-system get pods -o wide
+            kubectl -n external-secrets get pods -o wide
+          '
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"