fix: enforce post-deploy health checks
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled

This commit is contained in:
2026-04-25 02:22:16 +00:00
parent 7e3ebec95b
commit bfcf57bcc5
8 changed files with 67 additions and 34 deletions
+27 -6
View File
@@ -407,6 +407,8 @@ jobs:
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600 wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass flash-nfs kubectl get storageclass flash-nfs
- name: Wait for Rancher and backup operator - name: Wait for Rancher and backup operator
@@ -595,12 +597,31 @@ jobs:
- name: Post-deploy cluster health checks - name: Post-deploy cluster health checks
working-directory: ansible working-directory: ansible
run: | run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide" set -euo pipefail
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases" ansible -i inventory.ini 'control_plane[0]' -m shell -a '
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide" set -euo pipefail
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs" kubectl get nodes -o wide
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide" kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods" kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=60s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=60s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=60s
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers | tee /tmp/nonrunning-pods
test ! -s /tmp/nonrunning-pods
kubectl -n kube-system get pods -o wide
kubectl -n tailscale-system get pods -o wide
kubectl -n external-secrets get pods -o wide
'
env: env:
ANSIBLE_HOST_KEY_CHECKING: "False" ANSIBLE_HOST_KEY_CHECKING: "False"
@@ -5,6 +5,7 @@ metadata:
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
timeout: 15m
targetNamespace: observability targetNamespace: observability
chart: chart:
spec: spec:
@@ -32,7 +33,7 @@ spec:
serve_from_sub_path: false serve_from_sub_path: false
persistence: persistence:
enabled: true enabled: true
storageClassName: local-path storageClassName: flash-nfs
size: 5Gi size: 5Gi
service: service:
type: ClusterIP type: ClusterIP
@@ -55,7 +56,7 @@ spec:
storageSpec: storageSpec:
volumeClaimTemplate: volumeClaimTemplate:
spec: spec:
storageClassName: local-path storageClassName: flash-nfs
accessModes: accessModes:
- ReadWriteOnce - ReadWriteOnce
resources: resources:
@@ -6,13 +6,9 @@ metadata:
spec: spec:
interval: 10m interval: 10m
targetNamespace: observability targetNamespace: observability
chart: chartRef:
spec: kind: OCIRepository
chart: loki name: loki
version: 6.10.0
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system namespace: flux-system
install: install:
createNamespace: true createNamespace: true
@@ -50,7 +46,7 @@ spec:
replicas: 1 replicas: 1
persistence: persistence:
size: 10Gi size: 10Gi
storageClass: local-path storageClass: flash-nfs
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -6,13 +6,9 @@ metadata:
spec: spec:
interval: 10m interval: 10m
targetNamespace: observability targetNamespace: observability
chart: chartRef:
spec: kind: OCIRepository
chart: promtail name: promtail
version: 6.16.6
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system namespace: flux-system
install: install:
createNamespace: true createNamespace: true
@@ -1,8 +0,0 @@
# Flux HelmRepository source named "grafana" in the flux-system namespace,
# pointing at the Grafana Helm chart index over HTTPS.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: grafana
  namespace: flux-system
spec:
  # Reconciliation interval for this source.
  interval: 1h
  url: https://grafana.github.io/helm-charts
@@ -4,7 +4,8 @@ resources:
- namespace.yaml - namespace.yaml
- grafana-admin-externalsecret.yaml - grafana-admin-externalsecret.yaml
- helmrepository-prometheus-community.yaml - helmrepository-prometheus-community.yaml
- helmrepository-grafana.yaml - ocirepository-loki.yaml
- ocirepository-promtail.yaml
- helmrelease-kube-prometheus-stack.yaml - helmrelease-kube-prometheus-stack.yaml
- helmrelease-loki.yaml - helmrelease-loki.yaml
- helmrelease-promtail.yaml - helmrelease-promtail.yaml
@@ -0,0 +1,13 @@
# Flux OCIRepository source for the Loki Helm chart published on GHCR.
# Referenced by name ("loki", namespace flux-system) from a HelmRelease
# chartRef of kind OCIRepository.
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: loki
  namespace: flux-system
spec:
  # Reconciliation interval for this source.
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/loki
  ref:
    # Pinned chart version; quoted so it is always read as a string.
    tag: '6.46.0'
  # Select the Helm chart content layer of the OCI artifact by media type.
  # NOTE(review): `operation: copy` is expected to keep the chart tarball
  # as-is rather than extracting it - confirm against the Flux docs.
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy
@@ -0,0 +1,13 @@
# Flux OCIRepository source for the Promtail Helm chart published on GHCR.
# Referenced by name ("promtail", namespace flux-system) from a HelmRelease
# chartRef of kind OCIRepository.
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
  name: promtail
  namespace: flux-system
spec:
  # Reconciliation interval for this source.
  interval: 10m
  url: oci://ghcr.io/grafana/helm-charts/promtail
  ref:
    # Pinned chart version; quoted so it is always read as a string.
    tag: '6.16.6'
  # Select the Helm chart content layer of the OCI artifact by media type.
  # NOTE(review): `operation: copy` is expected to keep the chart tarball
  # as-is rather than extracting it - confirm against the Flux docs.
  layerSelector:
    mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
    operation: copy