fix: wait on Rancher and storage runtime objects during bootstrap
Deploy Cluster / Terraform (push) Successful in 26s
Deploy Cluster / Ansible (push) Failing after 25m19s

Flux can leave HelmRelease and Kustomization conditions stale after transient
chart fetch or image pull failures even when the underlying workloads recover.
Switch the deploy workflow to wait on the concrete runtime resources we care
about, rather than on the Flux Ready conditions for those addons: the NFS
provisioner Deployment and StorageClass, the Rancher and rancher-webhook
Deployments, the cert-manager Issuer and Certificate, and the rancher-backup
Deployment.
This commit is contained in:
2026-04-22 18:41:09 +00:00
parent 55d7b8201e
commit 098bd98876
+12 -6
View File
@@ -249,10 +249,9 @@ jobs:
reconcile.fluxcd.io/resetAt="$TS" \ reconcile.fluxcd.io/resetAt="$TS" \
reconcile.fluxcd.io/forceAt="$TS" \ reconcile.fluxcd.io/forceAt="$TS" \
--overwrite || true --overwrite || true
kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$TS" --overwrite || true
kubectl -n flux-system wait --for=condition=Ready helmrelease/nfs-subdir-external-provisioner --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
kubectl get storageclass flash-nfs
- name: Wait for Rancher and backup operator - name: Wait for Rancher and backup operator
env: env:
@@ -265,13 +264,20 @@ jobs:
reconcile.fluxcd.io/resetAt="$TS" \ reconcile.fluxcd.io/resetAt="$TS" \
reconcile.fluxcd.io/forceAt="$TS" \ reconcile.fluxcd.io/forceAt="$TS" \
--overwrite || true --overwrite || true
kubectl -n flux-system annotate helmrelease/rancher-backup \
reconcile.fluxcd.io/requestedAt="$TS" \
reconcile.fluxcd.io/resetAt="$TS" \
reconcile.fluxcd.io/forceAt="$TS" \
--overwrite || true
echo "Waiting for Rancher..." echo "Waiting for Rancher..."
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=900s kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
echo "Waiting for rancher-backup operator..." echo "Waiting for rancher-backup operator..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
- name: Restore Rancher from latest B2 backup - name: Restore Rancher from latest B2 backup
env: env: