fix: shorten observability iteration loop
This commit is contained in:
+105
-18
@@ -4,6 +4,16 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths-ignore:
|
||||
- "ansible/dashboards.yml"
|
||||
- "ansible/roles/observability-content/**"
|
||||
- "infrastructure/addons/observability/**"
|
||||
- "infrastructure/addons/observability-content/**"
|
||||
- "infrastructure/addons/observability-secrets/**"
|
||||
- "infrastructure/addons/kustomization-observability.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-content.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-secrets.yaml"
|
||||
- "infrastructure/charts/kube-prometheus-stack/**"
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
@@ -879,6 +889,20 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Print best-effort diagnostics for the observability stack.
#
# Lists the Flux sources/kustomizations/HelmReleases in flux-system, describes
# each observability-related Flux object, then shows pods, PVCs, services and
# events in the observability namespace.
#
# Every kubectl call is followed by `|| true` so that a failing query never
# aborts the caller (the surrounding script runs under `set -euo pipefail`)
# and never hides the original error that triggered the dump.
#
# Arguments: none
# Returns:   always 0
observability_diagnostics() {
  local target

  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Same objects, in the same order, as the individual describe calls.
  for target in \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail; do
    kubectl -n flux-system describe "${target}" || true
  done

  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
|
||||
|
||||
wait_for_resource() {
|
||||
local namespace="$1"
|
||||
local resource="$2"
|
||||
@@ -888,7 +912,7 @@ jobs:
|
||||
until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
|
||||
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
|
||||
echo "Timed out waiting for ${resource} to exist" >&2
|
||||
kubectl -n flux-system get kustomizations,helmreleases || true
|
||||
observability_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -915,7 +939,7 @@ jobs:
|
||||
done
|
||||
|
||||
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
|
||||
kubectl -n flux-system describe "${resource}" || true
|
||||
observability_diagnostics
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -928,7 +952,7 @@ jobs:
|
||||
wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
|
||||
}
|
||||
|
||||
reconcile_helmrelease() {
|
||||
request_helmrelease_reconcile() {
|
||||
local release="$1"
|
||||
local reconcile_at
|
||||
reconcile_at="$(date +%s%N)"
|
||||
@@ -937,25 +961,88 @@ jobs:
|
||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||
--overwrite
|
||||
wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
|
||||
}
|
||||
|
||||
# Block until a Flux object in flux-system reports condition Ready.
#
# Arguments:
#   $1 - resource reference passed to kubectl (e.g. kustomization/<name>)
#   $2 - kubectl --timeout value, including its unit (e.g. 300s)
# Outputs:   on failure, an error line on stderr followed by the
#            observability diagnostics dump
# Returns:   0 when Ready; otherwise terminates the script with exit 1
wait_for_flux_ready() {
  local resource="$1"
  local timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
    return 0
  fi

  # Name the failing resource explicitly so the log line is greppable, the
  # same way the other wait helpers report what they were waiting for.
  echo "${resource} did not become Ready within ${timeout}" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Wait for an OCIRepository in flux-system to become Ready, tolerating a
# not-Ready repository as long as it still reports an artifact in storage.
#
# Arguments:
#   $1 - OCIRepository name (without the "ocirepository/" prefix)
#   $2 - kubectl --timeout value, including its unit (e.g. 300s)
# Outputs:   a notice on stderr when falling back to the cached artifact;
#            on failure, an error line on stderr plus the diagnostics dump
# Returns:   0 when Ready or when ArtifactInStorage is "True";
#            otherwise terminates the script with exit 1
wait_for_ocirepository_ready_or_cached() {
  local repository="$1"
  local timeout="$2"
  local artifact_storage

  if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
    return 0
  fi

  # A failed lookup is treated the same as "no cached artifact": the status
  # stays empty and the function falls through to the failure path below.
  artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" \
    -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"

  if [ "${artifact_storage}" = "True" ]; then
    echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
    return 0
  fi

  echo "OCIRepository ${repository} is not Ready and has no artifact in storage" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Poll a HelmRelease in flux-system until it is Ready for its current
# generation, failing fast when Flux marks it Stalled.
#
# Arguments:
#   $1 - HelmRelease name (without the "helmrelease/" prefix)
#   $2 - timeout in whole seconds
# Outputs:   on stall or timeout, an error line on stderr followed by the
#            observability diagnostics dump
# Returns:   0 once Ready is "True" and status.observedGeneration equals
#            metadata.generation; otherwise terminates the script with exit 1
wait_for_helmrelease_ready() {
  local release="$1"
  local timeout_seconds="$2"
  local poll_interval=10
  local deadline
  local snapshot
  local ready
  local stalled
  local generation
  local observed_generation

  # Use bash's wall-clock SECONDS counter so time spent inside kubectl counts
  # against the timeout, not just the time spent sleeping.
  deadline=$((SECONDS + timeout_seconds))

  while [ "${SECONDS}" -lt "${deadline}" ]; do
    # One API read per poll: all four fields come from the same object
    # snapshot, so generation and conditions cannot be mixed across updates.
    # A failed lookup yields an empty snapshot, i.e. "not ready yet".
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" \
      -o jsonpath='{.metadata.generation}|{.status.observedGeneration}|{.status.conditions[?(@.type=="Ready")].status}|{.status.conditions[?(@.type=="Stalled")].status}' \
      2>/dev/null || true)"

    # '|' is a non-whitespace separator, so empty fields are preserved.
    IFS='|' read -r generation observed_generation ready stalled <<<"${snapshot}"

    # Ready only counts when the controller has observed the latest spec.
    if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi

    sleep "${poll_interval}"
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
|
||||
reconcile_flux_resource kustomization/addon-observability-secrets 600
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s
|
||||
reconcile_flux_resource kustomization/addon-observability-secrets 300
|
||||
wait_for_flux_ready kustomization/addon-observability-secrets 300s
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
|
||||
reconcile_flux_resource kustomization/addon-observability 1800
|
||||
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then
|
||||
kubectl -n flux-system describe kustomization/addon-observability || true
|
||||
kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
|
||||
kubectl -n flux-system describe helmrelease/loki || true
|
||||
kubectl -n flux-system describe helmrelease/promtail || true
|
||||
kubectl -n observability get pods -o wide || true
|
||||
exit 1
|
||||
fi
|
||||
reconcile_flux_resource kustomization/addon-observability 600
|
||||
wait_for_flux_ready kustomization/addon-observability 300s
|
||||
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
|
||||
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
|
||||
wait_for_ocirepository_ready_or_cached loki 300s
|
||||
wait_for_ocirepository_ready_or_cached promtail 300s
|
||||
for release in kube-prometheus-stack loki promtail; do
|
||||
reconcile_helmrelease "${release}"
|
||||
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
|
||||
request_helmrelease_reconcile "${release}"
|
||||
wait_for_helmrelease_ready "${release}" 600
|
||||
done
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
|
||||
reconcile_flux_resource kustomization/addon-observability-content 300
|
||||
wait_for_flux_ready kustomization/addon-observability-content 300s
|
||||
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|
||||
|
||||
- name: Post-deploy cluster health checks
|
||||
@@ -977,9 +1064,9 @@ jobs:
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
|
||||
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
|
||||
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
|
||||
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
|
||||
|
||||
Reference in New Issue
Block a user