fix: fail fast on stalled Flux Helm releases
This commit is contained in:
+74
-14
@@ -244,6 +244,39 @@ jobs:
|
|||||||
kubectl -n external-secrets get pods -o wide || true
|
kubectl -n external-secrets get pods -o wide || true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Poll a Flux HelmRelease in the flux-system namespace until its Ready
# condition is "True", failing fast if its Stalled condition becomes "True"
# instead of waiting out the whole timeout.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed as diagnostics on failure
#   $3 - timeout in seconds; a kubectl-style trailing "s" ("600s") is accepted
# Returns:
#   0 as soon as the Ready condition reports "True".
# Exits (terminates the calling shell, not just the function):
#   1 when the release is stalled, when the timeout elapses, or when the
#   timeout argument is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Strip an optional "s" suffix: the numeric comparison below needs a bare
  # integer, while kubectl --timeout values are written as "600s".
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local ready
  local stalled

  # Reject anything that is not a plain non-negative integer. Otherwise `[`
  # errors out, the loop is skipped, and a misleading "Timed out" is reported
  # without a single poll having happened.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "wait_for_helmrelease_ready: invalid timeout '$3' (expected seconds, e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # A failing kubectl call (e.g. the object does not exist yet) is
    # deliberately swallowed: it yields an empty string and polling continues.
    ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release_name} is stalled" >&2
      # Best-effort diagnostics; their own failures must not mask the error.
      kubectl -n flux-system describe "helmrelease/${release_name}" || true
      kubectl -n "${target_namespace}" get pods -o wide || true
      exit 1
    fi

    sleep "${poll_interval}"
    # Only the sleep intervals are counted; time spent inside kubectl is not.
    elapsed=$((elapsed + poll_interval))
  done

  echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
|
||||||
|
|
||||||
wait_for_flux_oci_helm_release() {
|
wait_for_flux_oci_helm_release() {
|
||||||
local oci_name="$1"
|
local oci_name="$1"
|
||||||
local release_name="$2"
|
local release_name="$2"
|
||||||
@@ -266,10 +299,7 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
|
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
||||||
eso_diagnostics
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
flux_helm_diagnostics() {
|
flux_helm_diagnostics() {
|
||||||
@@ -315,10 +345,7 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
|
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
||||||
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
||||||
@@ -349,7 +376,7 @@ jobs:
|
|||||||
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
|
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
|
||||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
|
||||||
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
||||||
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 1800s 1800s
|
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
|
||||||
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
|
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
|
||||||
wait_for_resource "" crd/externalsecrets.external-secrets.io 900
|
wait_for_resource "" crd/externalsecrets.external-secrets.io 900
|
||||||
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
|
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
|
||||||
@@ -376,9 +403,9 @@ jobs:
|
|||||||
namespace: external-secrets
|
namespace: external-secrets
|
||||||
EOF
|
EOF
|
||||||
# Wait for the storage layer and private access components
|
# Wait for the storage layer and private access components
|
||||||
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 1200s 1800s 1800s
|
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
|
||||||
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
||||||
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 1200s 1800s 1800s
|
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
|
||||||
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
||||||
kubectl get storageclass flash-nfs
|
kubectl get storageclass flash-nfs
|
||||||
|
|
||||||
@@ -422,12 +449,45 @@ jobs:
|
|||||||
--overwrite
|
--overwrite
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Poll a Flux HelmRelease in the flux-system namespace until its Ready
# condition is "True", failing fast if its Stalled condition becomes "True"
# instead of waiting out the whole timeout.
#
# Arguments:
#   $1 - HelmRelease name (looked up in the flux-system namespace)
#   $2 - namespace whose pods are listed as diagnostics on failure
#   $3 - timeout in seconds; a kubectl-style trailing "s" ("600s") is accepted
# Returns:
#   0 as soon as the Ready condition reports "True".
# Exits (terminates the calling shell, not just the function):
#   1 when the release is stalled, when the timeout elapses, or when the
#   timeout argument is not a whole number of seconds.
wait_for_helmrelease_ready() {
  local release_name="$1"
  local target_namespace="$2"
  # Strip an optional "s" suffix: the numeric comparison below needs a bare
  # integer, while kubectl --timeout values are written as "600s".
  local timeout_seconds="${3%s}"
  local poll_interval=10
  local elapsed=0
  local ready
  local stalled

  # Reject anything that is not a plain non-negative integer. Otherwise `[`
  # errors out, the loop is skipped, and a misleading "Timed out" is reported
  # without a single poll having happened.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "wait_for_helmrelease_ready: invalid timeout '$3' (expected seconds, e.g. 600 or 600s)" >&2
      exit 1
      ;;
  esac

  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # A failing kubectl call (e.g. the object does not exist yet) is
    # deliberately swallowed: it yields an empty string and polling continues.
    ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"

    if [ "${ready}" = "True" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release_name} is stalled" >&2
      # Best-effort diagnostics; their own failures must not mask the error.
      kubectl -n flux-system describe "helmrelease/${release_name}" || true
      kubectl -n "${target_namespace}" get pods -o wide || true
      exit 1
    fi

    sleep "${poll_interval}"
    # Only the sleep intervals are counted; time spent inside kubectl is not.
    elapsed=$((elapsed + poll_interval))
  done

  echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
  kubectl -n flux-system describe "helmrelease/${release_name}" || true
  kubectl -n "${target_namespace}" get pods -o wide || true
  exit 1
}
|
||||||
|
|
||||||
echo "Waiting for Rancher..."
|
echo "Waiting for Rancher..."
|
||||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
|
||||||
kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
||||||
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
|
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
|
||||||
reconcile_helmrelease rancher
|
reconcile_helmrelease rancher
|
||||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=1800s
|
wait_for_helmrelease_ready rancher cattle-system 900
|
||||||
wait_for_resource "" namespace/cattle-system 600
|
wait_for_resource "" namespace/cattle-system 600
|
||||||
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
|
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
|
||||||
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
|
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
|
||||||
@@ -441,8 +501,8 @@ jobs:
|
|||||||
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
|
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
|
||||||
reconcile_helmrelease rancher-backup-crd
|
reconcile_helmrelease rancher-backup-crd
|
||||||
reconcile_helmrelease rancher-backup
|
reconcile_helmrelease rancher-backup
|
||||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup-crd --timeout=1200s
|
wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
|
||||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup --timeout=1200s
|
wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
|
||||||
wait_for_resource "" namespace/cattle-resources-system 600
|
wait_for_resource "" namespace/cattle-resources-system 600
|
||||||
kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
|
kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user