fix: fail fast on stalled Flux Helm releases
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 10m33s

This commit is contained in:
2026-04-25 01:40:42 +00:00
parent 5523feb563
commit 0c31c3b1d5
+74 -14
View File
@@ -244,6 +244,39 @@ jobs:
kubectl -n external-secrets get pods -o wide || true kubectl -n external-secrets get pods -o wide || true
} }
# Poll a Flux HelmRelease in the flux-system namespace until it reports Ready,
# failing fast when the release reports Stalled instead of waiting out the
# whole timeout.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed when the wait fails
#   $3 - timeout in whole seconds; a trailing "s" (e.g. "600s") is accepted
# Returns:
#   0 as soon as the Ready condition is "True".
# Exits:
#   Terminates the calling shell with status 1, after printing diagnostics,
#   when the Stalled condition is "True" or the timeout elapses. Also exits 1
#   when the timeout argument is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Tolerate the "600s" spelling used for `kubectl wait --timeout`; without
  # this the integer comparison below errors out and the loop never runs.
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local conditions
  local ready
  local stalled
  local failure

  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout '${3}' for HelmRelease ${release_name}: expected whole seconds (e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  failure="Timed out waiting for HelmRelease ${release_name} to become Ready"

  # NOTE: elapsed only counts the sleep intervals, not the time kubectl takes.
  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # One API call per poll, printed as "<Ready status>|<Stalled status>", so
    # both values come from the same snapshot. Lookup errors are deliberately
    # swallowed and treated as "not ready yet".
    conditions="$(kubectl -n flux-system get "helmrelease/${release_name}" \
      -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}|{.status.conditions[?(@.type=="Stalled")].status}' \
      2>/dev/null || true)"
    ready="${conditions%%|*}"
    stalled="${conditions#*|}"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      failure="HelmRelease ${release_name} is stalled"
      break
    fi

    sleep "${poll_interval}"
    elapsed=$((elapsed + poll_interval))
  done

  # Single failure path shared by the stalled and timed-out cases.
  echo "${failure}" >&2
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
wait_for_flux_oci_helm_release() { wait_for_flux_oci_helm_release() {
local oci_name="$1" local oci_name="$1"
local release_name="$2" local release_name="$2"
@@ -266,10 +299,7 @@ jobs:
exit 1 exit 1
fi fi
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
eso_diagnostics
exit 1
fi
} }
flux_helm_diagnostics() { flux_helm_diagnostics() {
@@ -315,10 +345,7 @@ jobs:
exit 1 exit 1
fi fi
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
fi
} }
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f - kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
@@ -349,7 +376,7 @@ jobs:
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details. # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600 wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 1800s 1800s wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900 wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
wait_for_resource "" crd/externalsecrets.external-secrets.io 900 wait_for_resource "" crd/externalsecrets.external-secrets.io 900
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
@@ -376,9 +403,9 @@ jobs:
namespace: external-secrets namespace: external-secrets
EOF EOF
# Wait for the storage layer and private access components # Wait for the storage layer and private access components
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 1200s 1800s 1800s wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 1200s 1800s 1800s wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
kubectl get storageclass flash-nfs kubectl get storageclass flash-nfs
@@ -422,12 +449,45 @@ jobs:
--overwrite --overwrite
} }
# Poll a Flux HelmRelease in the flux-system namespace until it reports Ready,
# failing fast when the release reports Stalled instead of waiting out the
# whole timeout.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed when the wait fails
#   $3 - timeout in whole seconds; a trailing "s" (e.g. "900s") is accepted
# Returns:
#   0 as soon as the Ready condition is "True".
# Exits:
#   Terminates the calling shell with status 1, after printing diagnostics,
#   when the Stalled condition is "True" or the timeout elapses. Also exits 1
#   when the timeout argument is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Tolerate the "900s" spelling used for `kubectl wait --timeout`; without
  # this the integer comparison below errors out and the loop never runs.
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local conditions
  local ready
  local stalled
  local failure

  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout '${3}' for HelmRelease ${release_name}: expected whole seconds (e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  failure="Timed out waiting for HelmRelease ${release_name} to become Ready"

  # NOTE: elapsed only counts the sleep intervals, not the time kubectl takes.
  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # One API call per poll, printed as "<Ready status>|<Stalled status>", so
    # both values come from the same snapshot. Lookup errors are deliberately
    # swallowed and treated as "not ready yet".
    conditions="$(kubectl -n flux-system get "helmrelease/${release_name}" \
      -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}|{.status.conditions[?(@.type=="Stalled")].status}' \
      2>/dev/null || true)"
    ready="${conditions%%|*}"
    stalled="${conditions#*|}"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      failure="HelmRelease ${release_name} is stalled"
      break
    fi

    sleep "${poll_interval}"
    elapsed=$((elapsed + poll_interval))
  done

  # Single failure path shared by the stalled and timed-out cases.
  echo "${failure}" >&2
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
echo "Waiting for Rancher..." echo "Waiting for Rancher..."
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600 wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600 wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
reconcile_helmrelease rancher reconcile_helmrelease rancher
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=1800s wait_for_helmrelease_ready rancher cattle-system 900
wait_for_resource "" namespace/cattle-system 600 wait_for_resource "" namespace/cattle-system 600
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
@@ -441,8 +501,8 @@ jobs:
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600 wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
reconcile_helmrelease rancher-backup-crd reconcile_helmrelease rancher-backup-crd
reconcile_helmrelease rancher-backup reconcile_helmrelease rancher-backup
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup-crd --timeout=1200s wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup --timeout=1200s wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
wait_for_resource "" namespace/cattle-resources-system 600 wait_for_resource "" namespace/cattle-resources-system 600
kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s