From 098bd98876063ec6844138b692b11f33e3f165ab Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Wed, 22 Apr 2026 18:41:09 +0000 Subject: [PATCH] fix: wait on Rancher and storage runtime objects during bootstrap Flux can leave HelmRelease and Kustomization Ready conditions stale after transient chart-fetch or image-pull failures, even after the underlying workloads recover. Switch the deploy workflow to check the concrete runtime resources we care about instead: wait for the NFS provisioner deployment rollout and verify that the flash-nfs StorageClass exists; wait for the Rancher and rancher-webhook deployment rollouts and for the cert-manager Issuer and Certificate to become Ready; and wait for the rancher-backup deployment rollout. Also force a reconcile of the rancher-backup HelmRelease before waiting, and make the rancher-backup wait blocking with a 900s timeout (it was previously best-effort via '|| true' with a 600s timeout). --- .gitea/workflows/deploy.yml | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 653081e..0ecf835 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -249,10 +249,9 @@ jobs: reconcile.fluxcd.io/resetAt="$TS" \ reconcile.fluxcd.io/forceAt="$TS" \ --overwrite || true - kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$TS" --overwrite || true - kubectl -n flux-system wait --for=condition=Ready helmrelease/nfs-subdir-external-provisioner --timeout=600s - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s + kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s + kubectl get storageclass flash-nfs - name: Wait for Rancher and backup operator env: @@ -265,13 +264,20 @@ jobs: reconcile.fluxcd.io/resetAt="$TS" \ reconcile.fluxcd.io/forceAt="$TS" \ --overwrite || true + kubectl -n flux-system annotate helmrelease/rancher-backup \ + reconcile.fluxcd.io/requestedAt="$TS" \ + reconcile.fluxcd.io/resetAt="$TS" \ + reconcile.fluxcd.io/forceAt="$TS" \ + --overwrite || true echo "Waiting for Rancher..." 
- kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=900s - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s + kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s + kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s + kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s + kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s echo "Waiting for rancher-backup operator..." - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true + kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s - name: Restore Rancher from latest B2 backup env: