From eec7375268b4bcc91ff6fa05d5c7636a8e8ba081 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Mon, 4 May 2026 03:38:04 +0000 Subject: [PATCH] fix: cap long flux reconcile waits --- .gitea/workflows/deploy.yml | 88 +++++++++++++++++++++++++++++++------ 1 file changed, 74 insertions(+), 14 deletions(-) diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 915b006..d2f3a10 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -268,6 +268,24 @@ jobs: return 0 fi + ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" + healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)" + reconciling="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Reconciling")].status}' 2>/dev/null || true)" + generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)" + observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)" + + if [[ "${resource}" == helmrelease/* ]] && [ "${reconciling}" = "True" ]; then + echo "${resource} is actively reconciling; continuing without waiting for reconcile token ${reconcile_at}" + return 0 + fi + + if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then + if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then + echo "${resource} is already Ready; continuing without waiting for reconcile token ${reconcile_at}" + return 0 + fi + fi + sleep 5 elapsed=$((elapsed + 5)) done @@ -640,6 +658,24 @@ jobs: return 0 fi + ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" + healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)" + reconciling="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Reconciling")].status}' 2>/dev/null || true)" + generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)" + observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)" + + if [[ "${resource}" == helmrelease/* ]] && [ "${reconciling}" = "True" ]; then + echo "${resource} is actively reconciling; continuing without waiting for reconcile token ${reconcile_at}" + return 0 + fi + + if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then + if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then + echo "${resource} is already Ready; continuing without waiting for reconcile token ${reconcile_at}" + return 0 + fi + fi + sleep 5 elapsed=$((elapsed + 5)) done @@ -911,20 +947,20 @@ jobs: kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=900s wait_for_rancher_bootstrap_secrets 900 wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600 - reconcile_flux_resource flux-system kustomization/addon-rancher 1800 + reconcile_flux_resource flux-system kustomization/addon-rancher 600 wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600 reconcile_helmrelease rancher 300 - wait_for_helmchart_ready flux-system-rancher rancher 180s 5 - wait_for_helmrelease_ready rancher cattle-system 900 + wait_for_helmchart_ready flux-system-rancher rancher 120s 3 + wait_for_helmrelease_ready rancher cattle-system 600 wait_for_resource "" namespace/cattle-system 600 wait_for_resource cattle-system deployment/cattle-system-rancher 600 - kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s - wait_for_resource cattle-system deployment/rancher-webhook 900 - kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s - wait_for_resource cattle-system issuer/cattle-system-rancher 900 - wait_for_resource cattle-system certificate/tls-rancher-ingress 900 - kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s - kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s + kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=600s + wait_for_resource cattle-system deployment/rancher-webhook 600 + kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=600s + wait_for_resource cattle-system issuer/cattle-system-rancher 600 + wait_for_resource cattle-system certificate/tls-rancher-ingress 600 + kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=600s + kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=600s - name: Reconcile observability stack env: @@ -1051,16 +1087,40 @@ jobs: wait_for_ocirepository_ready_or_cached() { local repository="$1" local timeout="$2" - local attempts="${3:-6}" + local attempts="${3:-3}" local artifact_storage local attempt + local ready for attempt in $(seq 1 "${attempts}"); do - reconcile_flux_resource "ocirepository/${repository}" 300 + if ! kubectl get --raw=/readyz --request-timeout=10s >/dev/null 2>&1; then + echo "Kubernetes API is not ready while waiting for OCIRepository ${repository}; failing fast" >&2 + kubectl -n kube-system get pods -o wide || true + exit 1 + fi + + ready="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" + if [ "${ready}" = "True" ]; then + return 0 + fi + + artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)" + if [ "${artifact_storage}" = "True" ]; then + echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2 + return 0 + fi + + reconcile_flux_resource "ocirepository/${repository}" 120 if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then return 0 fi + if ! kubectl get --raw=/readyz --request-timeout=10s >/dev/null 2>&1; then + echo "Kubernetes API became unavailable while waiting for OCIRepository ${repository}; failing fast" >&2 + kubectl -n kube-system get pods -o wide || true + exit 1 + fi + artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)" if [ "${artifact_storage}" = "True" ]; then echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2 @@ -1117,8 +1177,8 @@ jobs: wait_for_flux_ready kustomization/addon-observability 300s wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300 wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300 - wait_for_ocirepository_ready_or_cached loki 300s - wait_for_ocirepository_ready_or_cached promtail 300s + wait_for_ocirepository_ready_or_cached loki 90s 3 + wait_for_ocirepository_ready_or_cached promtail 90s 3 for release in kube-prometheus-stack loki promtail; do wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300 request_helmrelease_reconcile "${release}"