diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index ee2b8de..0979aab 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -244,6 +244,56 @@ jobs:
             kubectl -n external-secrets get pods -o wide || true
           }
 
+          # Poll a Flux HelmRelease in flux-system until its Ready condition is True.
+          # Arguments:
+          #   $1 - HelmRelease name
+          #   $2 - namespace whose pods are listed when the wait fails
+          #   $3 - timeout in whole seconds; a kubectl-style trailing "s" is accepted
+          # Returns 0 once Ready=True. Exits 1, after printing diagnostics, when the
+          # release reports Stalled=True, the timeout elapses, or $3 is not numeric.
+          wait_for_helmrelease_ready() {
+            local release_name="$1"
+            local target_namespace="$2"
+            # Other waits in this script take "600s"; strip the suffix so the -lt test stays numeric.
+            local timeout_seconds="${3%s}"
+            local elapsed=0
+            local ready
+            local stalled
+
+            case "${timeout_seconds}" in
+              ''|*[!0-9]*)
+                echo "Invalid timeout '${3}' for HelmRelease ${release_name} (expected seconds, e.g. 600 or 600s)" >&2
+                exit 1
+                ;;
+            esac
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              # Lookup errors are swallowed and leave the value empty, which counts as "not ready yet".
+              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+
+              if [ "${ready}" = "True" ]; then
+                return 0
+              fi
+
+              if [ "${stalled}" = "True" ]; then
+                echo "HelmRelease ${release_name} is stalled" >&2
+                kubectl -n flux-system describe "helmrelease/${release_name}" || true
+                kubectl -n "${target_namespace}" get pods -o wide || true
+                exit 1
+              fi
+
+              # elapsed counts sleep time only, not time spent in the kubectl calls.
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+
+            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
+            kubectl -n flux-system describe "helmrelease/${release_name}" || true
+            kubectl -n "${target_namespace}" get pods -o wide || true
+            exit 1
+          }
+
           wait_for_flux_oci_helm_release() {
             local oci_name="$1"
             local release_name="$2"
@@ -266,10 +316,7 @@ jobs:
               exit 1
             fi
 
-            if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
-              eso_diagnostics
-              exit 1
-            fi
+            wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
           }
 
           flux_helm_diagnostics() {
@@ -315,10 +362,7 @@ jobs:
               exit 1
             fi
 
-            if ! kubectl -n flux-system wait --for=condition=Ready "helmrelease/${release_name}" --timeout="${release_timeout}"; then
-              flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
-              exit 1
-            fi
+            wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
           }
 
           kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
@@ -349,7 +393,7 @@ jobs:
           # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
           kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
-          wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 1800s 1800s
+          wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
           wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
           wait_for_resource "" crd/externalsecrets.external-secrets.io 900
           kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
@@ -376,9 +420,9 @@ jobs:
             namespace: external-secrets
           EOF
           # Wait for the storage layer and private access components
-          wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 1200s 1800s 1800s
+          wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
           kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
-          wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 1200s 1800s 1800s
+          wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
           kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
           kubectl get storageclass flash-nfs
 
@@ -422,12 +466,62 @@ jobs:
               --overwrite
           }
 
+          # Poll a Flux HelmRelease in flux-system until its Ready condition is True.
+          # Arguments:
+          #   $1 - HelmRelease name
+          #   $2 - namespace whose pods are listed when the wait fails
+          #   $3 - timeout in whole seconds; a kubectl-style trailing "s" is accepted
+          # Returns 0 once Ready=True. Exits 1, after printing diagnostics, when the
+          # release reports Stalled=True, the timeout elapses, or $3 is not numeric.
+          wait_for_helmrelease_ready() {
+            local release_name="$1"
+            local target_namespace="$2"
+            # Other waits in this script take "600s"; strip the suffix so the -lt test stays numeric.
+            local timeout_seconds="${3%s}"
+            local elapsed=0
+            local ready
+            local stalled
+
+            case "${timeout_seconds}" in
+              ''|*[!0-9]*)
+                echo "Invalid timeout '${3}' for HelmRelease ${release_name} (expected seconds, e.g. 600 or 600s)" >&2
+                exit 1
+                ;;
+            esac
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              # Lookup errors are swallowed and leave the value empty, which counts as "not ready yet".
+              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+
+              if [ "${ready}" = "True" ]; then
+                return 0
+              fi
+
+              if [ "${stalled}" = "True" ]; then
+                echo "HelmRelease ${release_name} is stalled" >&2
+                kubectl -n flux-system describe "helmrelease/${release_name}" || true
+                kubectl -n "${target_namespace}" get pods -o wide || true
+                exit 1
+              fi
+
+              # elapsed counts sleep time only, not time spent in the kubectl calls.
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+
+            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
+            kubectl -n flux-system describe "helmrelease/${release_name}" || true
+            kubectl -n "${target_namespace}" get pods -o wide || true
+            exit 1
+          }
+
           echo "Waiting for Rancher..."
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
           kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
           wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
           reconcile_helmrelease rancher
-          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=1800s
+          wait_for_helmrelease_ready rancher cattle-system 900
           wait_for_resource "" namespace/cattle-system 600
           kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
           kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
@@ -441,8 +535,8 @@ jobs:
           wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
           reconcile_helmrelease rancher-backup-crd
           reconcile_helmrelease rancher-backup
-          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup-crd --timeout=1200s
-          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher-backup --timeout=1200s
+          wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
+          wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
           wait_for_resource "" namespace/cattle-resources-system 600
           kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
 