fix: harden final health checks
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 17m50s

This commit is contained in:
2026-04-26 02:14:02 +00:00
parent a4f1d179e9
commit 46b2ff7d19
3 changed files with 48 additions and 3 deletions
+13 -2
View File
@@ -779,6 +779,15 @@ jobs:
# Block until each Rancher-related Flux Kustomization reports Ready
# (up to 300s apiece).
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=300s
# One Unix-epoch value is reused for every reconcile annotation set below.
reconcile_at=$(date +%s)
# Stamp the kube-prometheus-stack HelmRelease with the requestedAt, resetAt and
# forceAt annotations, all carrying the same token; --overwrite replaces any
# value left by an earlier run.
# NOTE(review): presumably Flux only honours resetAt/forceAt when they equal
# requestedAt, which would explain the shared value — confirm against the
# Flux HelmRelease documentation.
kubectl -n flux-system annotate helmrelease/kube-prometheus-stack \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
# Stamp the owning addon-observability Kustomization with the same
# requestedAt token.
kubectl -n flux-system annotate kustomization/addon-observability \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
--overwrite
# Wait for the observability Kustomizations (1200s / 300s), then for every
# HelmRelease in flux-system (1200s).
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
@@ -786,13 +795,15 @@ jobs:
# Mark flash-nfs as the default StorageClass and confirm kubectl lists it so.
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
# Fail when local-path is still flagged as default. An explicit exit is used
# because bash's errexit ignores the status of a pipeline negated with `!`,
# so a bare `! kubectl ... | grep ...` line can never stop the script.
if kubectl get storageclass | grep -E "^local-path.*\\(default\\)"; then
  echo "local-path is still marked as a default StorageClass" >&2
  exit 1
fi
# List every pod whose status column is neither Running nor Completed, drop
# the name prefixes excluded below, and capture what is left in a private
# temp file (mktemp) rather than a fixed /tmp path.
unhealthy_pods=$(mktemp)
# grep -v exits non-zero when it filters out every line (the healthy case);
# `|| true` keeps that from being treated as a failure.
kubectl get pods -A --no-headers \
  | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
  | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
  | grep -Ev "^cattle-capi-system[[:space:]]+capi-controller-manager-" \
  | grep -Ev "^cattle-resources-system[[:space:]]+rancher-backup-patch-sa-" \
  | grep -Ev "^kube-system[[:space:]]+helm-install-" \
  | tee "${unhealthy_pods}" || true
# The check passes only when no unhealthy pod line was captured.
test ! -s "${unhealthy_pods}"
# Print per-namespace pod listings into the job log.
kubectl -n kube-system get pods -o wide
kubectl -n tailscale-system get pods -o wide
kubectl -n external-secrets get pods -o wide
+33
View File
@@ -89,6 +89,39 @@
roles:
- k3s-server
# Play: write the kube-vip image held by the first control-plane node's
# containerd (k8s.io namespace) to a tar archive, then pull that archive back
# to the Ansible controller.
- name: Export kube-vip image from primary control plane
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Export kube-vip image for secondary control planes
      # changed_when is pinned to false, so this task never reports "changed".
      command:
        argv:
          - /usr/local/bin/ctr
          - "-n"
          - k8s.io
          - images
          - export
          - /tmp/kube-vip-bootstrap.tar
          - ghcr.io/kube-vip/kube-vip:v1.1.2
      changed_when: false

    - name: Fetch kube-vip image archive
      # flat: true stores the file at dest itself instead of under a
      # per-host directory tree.
      fetch:
        flat: true
        src: /tmp/kube-vip-bootstrap.tar
        dest: ../outputs/kube-vip-bootstrap.tar
# Play: push the kube-vip archive from ../outputs on the controller to every
# control-plane node after the first and import it into containerd's k8s.io
# namespace.
- name: Seed kube-vip image on secondary control planes
  hosts: control_plane[1:]
  become: true
  tasks:
    - name: Copy kube-vip image archive
      copy:
        mode: "0644"
        src: ../outputs/kube-vip-bootstrap.tar
        dest: /tmp/kube-vip-bootstrap.tar

    - name: Import kube-vip image into containerd
      # changed_when is pinned to false, so this task never reports "changed".
      command:
        argv:
          - /usr/local/bin/ctr
          - "-n"
          - k8s.io
          - images
          - import
          - /tmp/kube-vip-bootstrap.tar
      changed_when: false
- name: Setup workers
hosts: workers
become: true
@@ -23,6 +23,7 @@ spec:
retries: 3
values:
hostname: rancher.silverside-gopher.ts.net
systemDefaultRegistry: registry.rancher.com
replicas: 1
extraEnv:
- name: CATTLE_PROMETHEUS_METRICS
@@ -31,7 +32,7 @@ spec:
value: "managed-system-upgrade-controller=false"
webhook:
image:
# A single repository key, without a registry host.
# NOTE(review): presumably the chart prepends systemDefaultRegistry
# (registry.rancher.com) to this path — confirm against the Rancher chart.
repository: rancher/rancher-webhook
tag: v0.9.3
imagePullPolicy: IfNotPresent
resources: