fix: fail fast on stalled Flux Helm releases
This commit is contained in:
+74
-14
@@ -244,6 +244,39 @@ jobs:
|
||||
kubectl -n external-secrets get pods -o wide || true
|
||||
}
|
||||
|
||||
# Poll a Flux HelmRelease in the flux-system namespace until it reports Ready.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed as diagnostics on failure
#   $3 - timeout in whole seconds; a trailing "s" (e.g. "600s", the form
#        used for kubectl --timeout) is accepted and stripped
# Returns:
#   0 as soon as the Ready condition has status "True".
# Exits (terminating the calling shell) with status 1 when:
#   - the Stalled condition has status "True",
#   - the timeout elapses before Ready becomes "True", or
#   - the timeout is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Accept "600s" as well as "600": the numeric test below would otherwise
  # error out on the suffix and the loop would never poll.
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local ready
  local stalled
  local failure

  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout '${3}' for HelmRelease ${release_name}: expected whole seconds (e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  failure="Timed out waiting for HelmRelease ${release_name} to become Ready"

  # elapsed only counts time spent sleeping, so the real wall-clock wait can
  # exceed the timeout by however long the kubectl calls take.
  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # Query errors are swallowed on purpose: a failed lookup yields an empty
    # status and is treated as "not ready yet".
    ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      # Fail fast instead of waiting out the remaining timeout.
      failure="HelmRelease ${release_name} is stalled"
      break
    fi

    sleep "${poll_interval}"
    elapsed=$((elapsed + poll_interval))
  done

  # Shared failure path for both the stalled and the timed-out case.
  echo "${failure}" >&2
  # Diagnostics are best-effort; their failure must not mask the real error.
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
|
||||
|
||||
wait_for_flux_oci_helm_release() {
|
||||
local oci_name="$1"
|
||||
local release_name="$2"
|
||||
@@ -266,10 +299,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
|
||||
eso_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
||||
}
|
||||
|
||||
flux_helm_diagnostics() {
|
||||
@@ -315,10 +345,7 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
|
||||
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
||||
exit 1
|
||||
fi
|
||||
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
||||
}
|
||||
|
||||
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
||||
@@ -349,7 +376,7 @@ jobs:
|
||||
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
|
||||
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
||||
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 1800s 1800s
|
||||
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
|
||||
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
|
||||
wait_for_resource "" crd/externalsecrets.external-secrets.io 900
|
||||
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
|
||||
@@ -376,9 +403,9 @@ jobs:
|
||||
namespace: external-secrets
|
||||
EOF
|
||||
# Wait for the storage layer and private access components
|
||||
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 1200s 1800s 1800s
|
||||
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
|
||||
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
||||
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 1200s 1800s 1800s
|
||||
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
|
||||
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
||||
kubectl get storageclass flash-nfs
|
||||
|
||||
@@ -422,12 +449,45 @@ jobs:
|
||||
--overwrite
|
||||
}
|
||||
|
||||
# Poll a Flux HelmRelease in the flux-system namespace until it reports Ready.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed as diagnostics on failure
#   $3 - timeout in whole seconds; a trailing "s" (e.g. "600s", the form
#        used for kubectl --timeout) is accepted and stripped
# Returns:
#   0 as soon as the Ready condition has status "True".
# Exits (terminating the calling shell) with status 1 when:
#   - the Stalled condition has status "True",
#   - the timeout elapses before Ready becomes "True", or
#   - the timeout is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Accept "600s" as well as "600": the numeric test below would otherwise
  # error out on the suffix and the loop would never poll.
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local ready
  local stalled
  local failure

  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout '${3}' for HelmRelease ${release_name}: expected whole seconds (e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  failure="Timed out waiting for HelmRelease ${release_name} to become Ready"

  # elapsed only counts time spent sleeping, so the real wall-clock wait can
  # exceed the timeout by however long the kubectl calls take.
  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # Query errors are swallowed on purpose: a failed lookup yields an empty
    # status and is treated as "not ready yet".
    ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      # Fail fast instead of waiting out the remaining timeout.
      failure="HelmRelease ${release_name} is stalled"
      break
    fi

    sleep "${poll_interval}"
    elapsed=$((elapsed + poll_interval))
  done

  # Shared failure path for both the stalled and the timed-out case.
  echo "${failure}" >&2
  # Diagnostics are best-effort; their failure must not mask the real error.
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
|
||||
|
||||
echo "Waiting for Rancher..."
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
|
||||
kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
||||
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
|
||||
reconcile_helmrelease rancher
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=1800s
|
||||
wait_for_helmrelease_ready rancher cattle-system 900
|
||||
wait_for_resource "" namespace/cattle-system 600
|
||||
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
|
||||
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
|
||||
@@ -441,8 +501,8 @@ jobs:
|
||||
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
|
||||
reconcile_helmrelease rancher-backup-crd
|
||||
reconcile_helmrelease rancher-backup
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup-crd --timeout=1200s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup --timeout=1200s
|
||||
wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
|
||||
wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
|
||||
wait_for_resource "" namespace/cattle-resources-system 600
|
||||
kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
|
||||
|
||||
|
||||
Reference in New Issue
Block a user