fix: cap long flux reconcile waits

This commit is contained in:
2026-05-04 03:38:04 +00:00
parent 095a1fcde2
commit eec7375268
+74 -14
View File
@@ -268,6 +268,24 @@ jobs:
return 0 return 0
fi fi
# Early-exit probe: read the resource's current status so the caller can stop
# waiting for the reconcile token when the resource is already in good shape.
# `namespace`, `resource` and `reconcile_at` are set outside the visible lines.
# Each lookup is best-effort: stderr is discarded and `|| true` leaves the
# variable empty when kubectl fails, rather than aborting.
# NOTE(review): these are five separate kubectl calls, so the values are not
# read from a single snapshot of the object — confirm that is acceptable.
ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)"
reconciling="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Reconciling")].status}' 2>/dev/null || true)"
generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
# Only resources named with the short `helmrelease/` prefix take this branch:
# a HelmRelease reporting Reconciling=True returns success immediately.
if [[ "${resource}" == helmrelease/* ]] && [ "${reconciling}" = "True" ]; then
echo "${resource} is actively reconciling; continuing without waiting for reconcile token ${reconcile_at}"
return 0
fi
# Otherwise return early only if Ready=True and status.observedGeneration
# equals metadata.generation.
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
# An empty Healthy value (condition not reported, or the lookup failed) is
# accepted; any non-empty value other than "True" keeps the caller waiting.
if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then
echo "${resource} is already Ready; continuing without waiting for reconcile token ${reconcile_at}"
return 0
fi
fi
sleep 5 sleep 5
elapsed=$((elapsed + 5)) elapsed=$((elapsed + 5))
done done
@@ -640,6 +658,24 @@ jobs:
return 0 return 0
fi fi
# Early-exit probe: read the resource's current status so the caller can stop
# waiting for the reconcile token when the resource is already in good shape.
# `namespace`, `resource` and `reconcile_at` are set outside the visible lines.
# Each lookup is best-effort: stderr is discarded and `|| true` leaves the
# variable empty when kubectl fails, rather than aborting.
# NOTE(review): these are five separate kubectl calls, so the values are not
# read from a single snapshot of the object — confirm that is acceptable.
ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)"
reconciling="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Reconciling")].status}' 2>/dev/null || true)"
generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
# Only resources named with the short `helmrelease/` prefix take this branch:
# a HelmRelease reporting Reconciling=True returns success immediately.
if [[ "${resource}" == helmrelease/* ]] && [ "${reconciling}" = "True" ]; then
echo "${resource} is actively reconciling; continuing without waiting for reconcile token ${reconcile_at}"
return 0
fi
# Otherwise return early only if Ready=True and status.observedGeneration
# equals metadata.generation.
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
# An empty Healthy value (condition not reported, or the lookup failed) is
# accepted; any non-empty value other than "True" keeps the caller waiting.
if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then
echo "${resource} is already Ready; continuing without waiting for reconcile token ${reconcile_at}"
return 0
fi
fi
sleep 5 sleep 5
elapsed=$((elapsed + 5)) elapsed=$((elapsed + 5))
done done
@@ -911,20 +947,20 @@ jobs:
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=900s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=900s
wait_for_rancher_bootstrap_secrets 900 wait_for_rancher_bootstrap_secrets 900
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600 wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
reconcile_flux_resource flux-system kustomization/addon-rancher 1800 reconcile_flux_resource flux-system kustomization/addon-rancher 600
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600 wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
reconcile_helmrelease rancher 300 reconcile_helmrelease rancher 300
wait_for_helmchart_ready flux-system-rancher rancher 180s 5 wait_for_helmchart_ready flux-system-rancher rancher 120s 3
wait_for_helmrelease_ready rancher cattle-system 900 wait_for_helmrelease_ready rancher cattle-system 600
wait_for_resource "" namespace/cattle-system 600 wait_for_resource "" namespace/cattle-system 600
wait_for_resource cattle-system deployment/cattle-system-rancher 600 wait_for_resource cattle-system deployment/cattle-system-rancher 600
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=600s
wait_for_resource cattle-system deployment/rancher-webhook 900 wait_for_resource cattle-system deployment/rancher-webhook 600
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=600s
wait_for_resource cattle-system issuer/cattle-system-rancher 900 wait_for_resource cattle-system issuer/cattle-system-rancher 600
wait_for_resource cattle-system certificate/tls-rancher-ingress 900 wait_for_resource cattle-system certificate/tls-rancher-ingress 600
kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=600s
kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=600s
- name: Reconcile observability stack - name: Reconcile observability stack
env: env:
@@ -1051,16 +1087,40 @@ jobs:
wait_for_ocirepository_ready_or_cached() { wait_for_ocirepository_ready_or_cached() {
local repository="$1" local repository="$1"
local timeout="$2" local timeout="$2"
local attempts="${3:-6}" local attempts="${3:-3}"
local artifact_storage local artifact_storage
local attempt local attempt
local ready
for attempt in $(seq 1 "${attempts}"); do for attempt in $(seq 1 "${attempts}"); do
reconcile_flux_resource "ocirepository/${repository}" 300 if ! kubectl get --raw=/readyz --request-timeout=10s >/dev/null 2>&1; then
echo "Kubernetes API is not ready while waiting for OCIRepository ${repository}; failing fast" >&2
kubectl -n kube-system get pods -o wide || true
exit 1
fi
ready="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
if [ "${ready}" = "True" ]; then
return 0
fi
artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
if [ "${artifact_storage}" = "True" ]; then
echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
return 0
fi
reconcile_flux_resource "ocirepository/${repository}" 120
if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
return 0 return 0
fi fi
if ! kubectl get --raw=/readyz --request-timeout=10s >/dev/null 2>&1; then
echo "Kubernetes API became unavailable while waiting for OCIRepository ${repository}; failing fast" >&2
kubectl -n kube-system get pods -o wide || true
exit 1
fi
artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)" artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
if [ "${artifact_storage}" = "True" ]; then if [ "${artifact_storage}" = "True" ]; then
echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2 echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
@@ -1117,8 +1177,8 @@ jobs:
wait_for_flux_ready kustomization/addon-observability 300s wait_for_flux_ready kustomization/addon-observability 300s
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300 wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300 wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
wait_for_ocirepository_ready_or_cached loki 300s wait_for_ocirepository_ready_or_cached loki 90s 3
wait_for_ocirepository_ready_or_cached promtail 300s wait_for_ocirepository_ready_or_cached promtail 90s 3
for release in kube-prometheus-stack loki promtail; do for release in kube-prometheus-stack loki promtail; do
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300 wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
request_helmrelease_reconcile "${release}" request_helmrelease_reconcile "${release}"