fix: shorten observability iteration loop
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
Reconcile Observability / Observability (push) Failing after 6m15s

This commit is contained in:
2026-05-01 19:37:26 +00:00
parent e9327b0c61
commit bd71017a85
3 changed files with 364 additions and 32 deletions
+105 -18
View File
@@ -4,6 +4,16 @@ on:
push:
branches:
- main
paths-ignore:
- "ansible/dashboards.yml"
- "ansible/roles/observability-content/**"
- "infrastructure/addons/observability/**"
- "infrastructure/addons/observability-content/**"
- "infrastructure/addons/observability-secrets/**"
- "infrastructure/addons/kustomization-observability.yaml"
- "infrastructure/addons/kustomization-observability-content.yaml"
- "infrastructure/addons/kustomization-observability-secrets.yaml"
- "infrastructure/charts/kube-prometheus-stack/**"
pull_request:
branches:
- main
@@ -879,6 +889,20 @@ jobs:
run: |
set -euo pipefail
# Print a best-effort snapshot of the observability stack for troubleshooting.
#
# Lists the Flux source/release objects in flux-system, describes each
# observability-related Flux object, then lists workloads and events in the
# observability namespace. Every kubectl call is followed by `|| true`, so a
# failing command never aborts the caller and the function returns 0.
observability_diagnostics() {
  local flux_object

  # High-level overview of all Flux objects first.
  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Detailed status per object: kustomizations, then OCI sources, then releases.
  for flux_object in \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail; do
    kubectl -n flux-system describe "${flux_object}" || true
  done

  # Workload view: pods, volume claims, services, then events ordered by time.
  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
wait_for_resource() {
local namespace="$1"
local resource="$2"
@@ -888,7 +912,7 @@ jobs:
until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
echo "Timed out waiting for ${resource} to exist" >&2
kubectl -n flux-system get kustomizations,helmreleases || true
observability_diagnostics
exit 1
fi
@@ -915,7 +939,7 @@ jobs:
done
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
kubectl -n flux-system describe "${resource}" || true
observability_diagnostics
exit 1
}
@@ -928,7 +952,7 @@ jobs:
wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
}
reconcile_helmrelease() {
request_helmrelease_reconcile() {
local release="$1"
local reconcile_at
reconcile_at="$(date +%s%N)"
@@ -937,25 +961,88 @@ jobs:
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
}
# Block until a flux-system resource reports condition Ready.
#
# Arguments:
#   $1 - resource reference passed to kubectl (e.g. kustomization/<name>)
#   $2 - timeout string handed to `kubectl wait --timeout=`
# Returns 0 when kubectl wait succeeds. Otherwise prints
# observability_diagnostics and terminates the script with exit status 1.
wait_for_flux_ready() {
  local target="$1"
  local wait_timeout="$2"

  # Happy path first: nothing else to do once the resource is Ready.
  kubectl -n flux-system wait --for=condition=Ready "${target}" --timeout="${wait_timeout}" && return 0

  observability_diagnostics
  exit 1
}
# Wait for a Flux OCIRepository in flux-system to become Ready, tolerating a
# not-Ready source as long as an artifact is already in storage.
#
# Arguments:
#   $1 - OCIRepository name (without the "ocirepository/" prefix)
#   $2 - timeout string handed to `kubectl wait --timeout=`
# Returns 0 when the repository is Ready, or when its ArtifactInStorage
# condition has status "True" (a notice is written to stderr in that case).
# Otherwise prints observability_diagnostics and exits the script with 1.
wait_for_ocirepository_ready_or_cached() {
  local repository="$1"
  local timeout="$2"
  local oci_ref="ocirepository/${repository}"
  local cached_artifact

  if ! kubectl -n flux-system wait --for=condition=Ready "${oci_ref}" --timeout="${timeout}"; then
    # Lookup errors are swallowed on purpose; an empty answer counts as "no cache".
    cached_artifact="$(kubectl -n flux-system get "${oci_ref}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"

    if [ "${cached_artifact}" != "True" ]; then
      observability_diagnostics
      exit 1
    fi

    echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  fi

  return 0
}
# Poll a Flux HelmRelease in flux-system until it is Ready for its current
# generation.
#
# Arguments:
#   $1 - HelmRelease name (without the "helmrelease/" prefix)
#   $2 - timeout in whole seconds; polls happen every 10 seconds
# Returns 0 once the Ready condition is "True" and status.observedGeneration
# equals metadata.generation.
# Prints observability_diagnostics and exits the script with status 1 when the
# Stalled condition is "True" or when the timeout elapses.
wait_for_helmrelease_ready() {
  local release="$1"
  local timeout_seconds="$2"
  local elapsed=0
  local snapshot
  local remainder
  local ready
  local stalled
  local generation
  local observed_generation

  while :; do
    # Fetch all four fields in ONE request so they describe the same moment;
    # separate reads could pair an old Ready status with a newer generation.
    # Lookup errors are swallowed on purpose and yield four empty fields.
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}{"|"}{.status.conditions[?(@.type=="Stalled")].status}{"|"}{.metadata.generation}{"|"}{.status.observedGeneration}' 2>/dev/null || true)"

    # Split on "|" with parameter expansion so empty fields keep their position.
    ready="${snapshot%%|*}"
    remainder="${snapshot#*|}"
    stalled="${remainder%%|*}"
    remainder="${remainder#*|}"
    generation="${remainder%%|*}"
    observed_generation="${remainder#*|}"

    if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi

    # Check the deadline only after polling, so the state reached during the
    # final sleep interval is still inspected before giving up.
    if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
      break
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
reconcile_flux_resource kustomization/addon-observability-secrets 600
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s
reconcile_flux_resource kustomization/addon-observability-secrets 300
wait_for_flux_ready kustomization/addon-observability-secrets 300s
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
reconcile_flux_resource kustomization/addon-observability 1800
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then
kubectl -n flux-system describe kustomization/addon-observability || true
kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
kubectl -n flux-system describe helmrelease/loki || true
kubectl -n flux-system describe helmrelease/promtail || true
kubectl -n observability get pods -o wide || true
exit 1
fi
reconcile_flux_resource kustomization/addon-observability 600
wait_for_flux_ready kustomization/addon-observability 300s
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
wait_for_ocirepository_ready_or_cached loki 300s
wait_for_ocirepository_ready_or_cached promtail 300s
for release in kube-prometheus-stack loki promtail; do
reconcile_helmrelease "${release}"
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
request_helmrelease_reconcile "${release}"
wait_for_helmrelease_ready "${release}" 600
done
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
reconcile_flux_resource kustomization/addon-observability-content 300
wait_for_flux_ready kustomization/addon-observability-content 300s
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
- name: Post-deploy cluster health checks
@@ -977,9 +1064,9 @@ jobs:
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"