fix: shorten observability iteration loop
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
Reconcile Observability / Observability (push) Failing after 6m15s

This commit is contained in:
2026-05-01 19:37:26 +00:00
parent e9327b0c61
commit bd71017a85
3 changed files with 364 additions and 32 deletions
+105 -18
View File
@@ -4,6 +4,16 @@ on:
push:
branches:
- main
paths-ignore:
- "ansible/dashboards.yml"
- "ansible/roles/observability-content/**"
- "infrastructure/addons/observability/**"
- "infrastructure/addons/observability-content/**"
- "infrastructure/addons/observability-secrets/**"
- "infrastructure/addons/kustomization-observability.yaml"
- "infrastructure/addons/kustomization-observability-content.yaml"
- "infrastructure/addons/kustomization-observability-secrets.yaml"
- "infrastructure/charts/kube-prometheus-stack/**"
pull_request:
branches:
- main
@@ -879,6 +889,20 @@ jobs:
run: |
set -euo pipefail
observability_diagnostics() {
  # Print a best-effort snapshot of Flux and observability state for triage.
  # Every kubectl call is followed by `|| true`, so a failing query neither
  # aborts the caller under `set -e` nor changes this function's exit status.
  local flux_object

  # One table listing every Flux source and reconciler in flux-system.
  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Full description of each Flux object this job reconciles.
  for flux_object in \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail
  do
    kubectl -n flux-system describe "${flux_object}" || true
  done

  # Workload view: pods, volume claims, services, then events sorted by
  # their lastTimestamp field.
  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
wait_for_resource() {
local namespace="$1"
local resource="$2"
@@ -888,7 +912,7 @@ jobs:
until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
echo "Timed out waiting for ${resource} to exist" >&2
kubectl -n flux-system get kustomizations,helmreleases || true
observability_diagnostics
exit 1
fi
@@ -915,7 +939,7 @@ jobs:
done
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
kubectl -n flux-system describe "${resource}" || true
observability_diagnostics
exit 1
}
@@ -928,7 +952,7 @@ jobs:
wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
}
reconcile_helmrelease() {
request_helmrelease_reconcile() {
local release="$1"
local reconcile_at
reconcile_at="$(date +%s%N)"
@@ -937,25 +961,88 @@ jobs:
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
}
wait_for_flux_ready() {
  # Block until a flux-system object reports condition Ready.
  # Arguments: $1 - resource as kind/name (e.g. kustomization/foo)
  #            $2 - kubectl duration string for --timeout (e.g. 300s)
  # On failure, prints the observability diagnostics and exits the script
  # with status 1.
  local resource="$1"
  local timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
    return 0
  fi

  observability_diagnostics
  exit 1
}
wait_for_ocirepository_ready_or_cached() {
  # Wait for an OCIRepository in flux-system to become Ready. If it does
  # not, accept it anyway when its ArtifactInStorage condition is "True".
  # Arguments: $1 - OCIRepository name
  #            $2 - kubectl duration string for --timeout (e.g. 300s)
  # Exits the script with status 1, after printing diagnostics, when the
  # repository is neither Ready nor reporting a stored artifact.
  local repository="$1"
  local timeout="$2"
  local target="ocirepository/${repository}"
  local cached_query='{.status.conditions[?(@.type=="ArtifactInStorage")].status}'
  local cached

  if ! kubectl -n flux-system wait --for=condition=Ready "${target}" --timeout="${timeout}"; then
    # A failed lookup is treated the same as "no artifact stored".
    cached="$(kubectl -n flux-system get "${target}" -o jsonpath="${cached_query}" 2>/dev/null || true)"
    if [ "${cached}" != "True" ]; then
      observability_diagnostics
      exit 1
    fi
    echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  fi

  return 0
}
wait_for_helmrelease_ready() {
  # Poll a HelmRelease in flux-system until it is Ready for its current
  # generation, it reports Stalled, or the timeout elapses.
  # Arguments: $1 - HelmRelease name
  #            $2 - timeout in whole seconds (polled every 10 seconds)
  # Returns 0 once Ready is "True" and status.observedGeneration equals
  # metadata.generation. Exits the script with status 1, after printing
  # diagnostics, when the release is Stalled or the timeout is reached.
  #
  # NOTE(review): success depends only on Ready and the generation match, so
  # a release that was already Ready before a reconcile was requested can
  # satisfy this check on the first poll. Confirm that is acceptable.
  local release="$1"
  local timeout_seconds="$2"
  local elapsed=0
  local snapshot
  local ready
  local stalled
  local generation
  local observed_generation
  # All four fields are read in ONE request, separated by '|', so they
  # describe the same version of the object. Separate requests could pair a
  # stale Ready=True with a newer observedGeneration and report success
  # early.
  local status_query='{.status.conditions[?(@.type=="Ready")].status}|{.status.conditions[?(@.type=="Stalled")].status}|{.metadata.generation}|{.status.observedGeneration}'

  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # A failed lookup yields an empty snapshot, i.e. "not ready yet".
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath="${status_query}" 2>/dev/null || true)"
    # '|' is a non-whitespace separator, so empty fields (for example an
    # absent Stalled condition) keep their position when split.
    IFS='|' read -r ready stalled generation observed_generation <<<"${snapshot}" || true

    if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
reconcile_flux_resource kustomization/addon-observability-secrets 600
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s
reconcile_flux_resource kustomization/addon-observability-secrets 300
wait_for_flux_ready kustomization/addon-observability-secrets 300s
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
reconcile_flux_resource kustomization/addon-observability 1800
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then
kubectl -n flux-system describe kustomization/addon-observability || true
kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
kubectl -n flux-system describe helmrelease/loki || true
kubectl -n flux-system describe helmrelease/promtail || true
kubectl -n observability get pods -o wide || true
exit 1
fi
reconcile_flux_resource kustomization/addon-observability 600
wait_for_flux_ready kustomization/addon-observability 300s
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
wait_for_ocirepository_ready_or_cached loki 300s
wait_for_ocirepository_ready_or_cached promtail 300s
for release in kube-prometheus-stack loki promtail; do
reconcile_helmrelease "${release}"
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
request_helmrelease_reconcile "${release}"
wait_for_helmrelease_ready "${release}" 600
done
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
reconcile_flux_resource kustomization/addon-observability-content 300
wait_for_flux_ready kustomization/addon-observability-content 300s
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
- name: Post-deploy cluster health checks
@@ -977,9 +1064,9 @@ jobs:
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
+258
View File
@@ -0,0 +1,258 @@
# Workflow that reconciles only the observability add-ons on the cluster.
# NOTE(review): the deploy workflow's paths-ignore list also names
# "ansible/dashboards.yml" and "ansible/roles/observability-content/**",
# which are not listed under `paths` here. Confirm another workflow covers
# pushes that touch only those paths.
name: Reconcile Observability
on:
push:
branches:
- main
# Pushes to main trigger this workflow only when they touch one of these paths.
paths:
- "infrastructure/addons/observability/**"
- "infrastructure/addons/observability-content/**"
- "infrastructure/addons/observability-secrets/**"
- "infrastructure/addons/kustomization-observability.yaml"
- "infrastructure/addons/kustomization-observability-content.yaml"
- "infrastructure/addons/kustomization-observability-secrets.yaml"
- "infrastructure/charts/kube-prometheus-stack/**"
# Also allows the workflow to be started manually.
workflow_dispatch:
# Runs in the "prod-cluster" concurrency group do not execute in parallel;
# cancel-in-progress: false means a run already in progress is not cancelled
# when a newer one is queued.
concurrency:
group: prod-cluster
cancel-in-progress: false
env:
# Tool versions used by the setup steps of this workflow.
TF_VERSION: "1.14.9"
KUBECTL_VERSION: "v1.34.6"
# Terraform reads TF_VAR_<name> environment variables as input variable
# values. This workflow only runs `terraform init` and `terraform output`.
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
observability:
name: Observability
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
terraform_wrapper: false
- name: Setup SSH Keys
  # The key material is handed to the script through environment variables
  # rather than spliced into the script text with ${{ }}. Expression values
  # are substituted before the shell parses the script, so a quote, `$` or
  # backtick inside a secret would otherwise be reinterpreted by the shell.
  env:
    SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
    SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
  run: |
    set -euo pipefail
    # Create the directory and key files owner-only from the start, so the
    # private key is never readable by others between write and chmod.
    umask 077
    mkdir -p ~/.ssh
    printf '%s\n' "${SSH_PRIVATE_KEY}" > ~/.ssh/id_ed25519
    chmod 600 ~/.ssh/id_ed25519
    printf '%s\n' "${SSH_PUBLIC_KEY}" > ~/.ssh/id_ed25519.pub
    chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-lockfile=readonly \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Install kubectl
run: |
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Validate observability manifests
run: |
set -euo pipefail
kubectl kustomize infrastructure/addons/observability >/dev/null
kubectl kustomize infrastructure/addons/observability-secrets >/dev/null
kubectl kustomize infrastructure/addons/observability-content >/dev/null
kubectl kustomize infrastructure/addons >/dev/null
- name: Refresh kubeconfig
run: |
set -euo pipefail
mkdir -p outputs
PRIMARY_IP="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
SSH_KEY="$HOME/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${PRIMARY_IP}"
- name: Reconcile observability
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
observability_diagnostics() {
  # Print a best-effort snapshot of Flux and observability state for triage.
  # Every kubectl call is followed by `|| true`, so a failing query neither
  # aborts the caller under `set -e` nor changes this function's exit status.
  local flux_object

  # One table listing every Flux source and reconciler in flux-system.
  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Full description of each Flux object this job reconciles.
  for flux_object in \
    gitrepository/platform \
    kustomization/infrastructure \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail
  do
    kubectl -n flux-system describe "${flux_object}" || true
  done

  # Workload view: pods, volume claims, services, then events sorted by
  # their lastTimestamp field.
  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
wait_for_resource() {
  # Poll until a Kubernetes object exists.
  # Arguments: $1 - namespace
  #            $2 - resource as kind/name
  #            $3 - timeout in whole seconds (polled every 10 seconds)
  # Returns 0 as soon as `kubectl get` succeeds. When the timeout is reached
  # it prints a message to stderr, dumps diagnostics and exits the script
  # with status 1.
  local namespace="$1"
  local resource="$2"
  local timeout_seconds="$3"
  local poll_interval=10
  local waited=0

  while true; do
    # Only the exit status matters here; all kubectl output is discarded.
    if kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; then
      return 0
    fi

    if [ "${waited}" -ge "${timeout_seconds}" ]; then
      echo "Timed out waiting for ${resource} to exist" >&2
      observability_diagnostics
      exit 1
    fi

    sleep "${poll_interval}"
    waited=$((waited + poll_interval))
  done
}
wait_for_reconcile_handled() {
  # Poll a flux-system object until its status.lastHandledReconcileAt equals
  # the given reconcile token.
  # Arguments: $1 - resource as kind/name
  #            $2 - token previously written to the requestedAt annotation
  #            $3 - timeout in whole seconds (polled every 5 seconds)
  # Returns 0 on a match. On timeout, prints a message to stderr, dumps
  # diagnostics and exits the script with status 1.
  local resource="$1"
  local reconcile_at="$2"
  local timeout_seconds="$3"
  local waited=0
  local last_handled

  until [ "${waited}" -ge "${timeout_seconds}" ]; do
    # A failed lookup yields an empty value, which never matches the token.
    last_handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
    [ "${last_handled}" != "${reconcile_at}" ] || return 0
    sleep 5
    waited=$((waited + 5))
  done

  echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
  observability_diagnostics
  exit 1
}
reconcile_flux_resource() {
  # Request a reconcile of a flux-system object and wait until it is handled.
  # Arguments: $1 - resource as kind/name
  #            $2 - optional timeout in seconds for the wait (default 300)
  # The token written to the requestedAt annotation is the current time from
  # `date +%s%N`; the same token is then passed to
  # wait_for_reconcile_handled.
  local resource="$1"
  local timeout_seconds="${2:-300}"
  local request_token
  local annotation

  request_token="$(date +%s%N)"
  annotation="reconcile.fluxcd.io/requestedAt=${request_token}"

  kubectl -n flux-system annotate "${resource}" "${annotation}" --overwrite
  wait_for_reconcile_handled "${resource}" "${request_token}" "${timeout_seconds}"
}
request_helmrelease_reconcile() {
  # Annotate a HelmRelease in flux-system with the requestedAt, resetAt and
  # forceAt reconcile annotations, all carrying the same `date +%s%N` token.
  # Arguments: $1 - HelmRelease name
  # This only writes the annotations; it does not wait for the controller.
  local release="$1"
  local request_token
  local annotation_key
  local -a annotations=()

  request_token="$(date +%s%N)"
  for annotation_key in requestedAt resetAt forceAt; do
    annotations+=("reconcile.fluxcd.io/${annotation_key}=${request_token}")
  done

  kubectl -n flux-system annotate "helmrelease/${release}" "${annotations[@]}" --overwrite
}
wait_for_flux_ready() {
  # Block until a flux-system object reports condition Ready.
  # Arguments: $1 - resource as kind/name (e.g. kustomization/foo)
  #            $2 - kubectl duration string for --timeout (e.g. 300s)
  # On failure, prints the observability diagnostics and exits the script
  # with status 1.
  local resource="$1"
  local timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
    return 0
  fi

  observability_diagnostics
  exit 1
}
wait_for_ocirepository_ready_or_cached() {
  # Wait for an OCIRepository in flux-system to become Ready. If it does
  # not, accept it anyway when its ArtifactInStorage condition is "True".
  # Arguments: $1 - OCIRepository name
  #            $2 - kubectl duration string for --timeout (e.g. 300s)
  # Exits the script with status 1, after printing diagnostics, when the
  # repository is neither Ready nor reporting a stored artifact.
  local repository="$1"
  local timeout="$2"
  local target="ocirepository/${repository}"
  local cached_query='{.status.conditions[?(@.type=="ArtifactInStorage")].status}'
  local cached

  if ! kubectl -n flux-system wait --for=condition=Ready "${target}" --timeout="${timeout}"; then
    # A failed lookup is treated the same as "no artifact stored".
    cached="$(kubectl -n flux-system get "${target}" -o jsonpath="${cached_query}" 2>/dev/null || true)"
    if [ "${cached}" != "True" ]; then
      observability_diagnostics
      exit 1
    fi
    echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  fi

  return 0
}
wait_for_helmrelease_ready() {
  # Poll a HelmRelease in flux-system until it is Ready for its current
  # generation, it reports Stalled, or the timeout elapses.
  # Arguments: $1 - HelmRelease name
  #            $2 - timeout in whole seconds (polled every 10 seconds)
  # Returns 0 once Ready is "True" and status.observedGeneration equals
  # metadata.generation. Exits the script with status 1, after printing
  # diagnostics, when the release is Stalled or the timeout is reached.
  #
  # NOTE(review): success depends only on Ready and the generation match, so
  # a release that was already Ready before a reconcile was requested can
  # satisfy this check on the first poll. Confirm that is acceptable.
  local release="$1"
  local timeout_seconds="$2"
  local elapsed=0
  local snapshot
  local ready
  local stalled
  local generation
  local observed_generation
  # All four fields are read in ONE request, separated by '|', so they
  # describe the same version of the object. Separate requests could pair a
  # stale Ready=True with a newer observedGeneration and report success
  # early.
  local status_query='{.status.conditions[?(@.type=="Ready")].status}|{.status.conditions[?(@.type=="Stalled")].status}|{.metadata.generation}|{.status.observedGeneration}'

  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    # A failed lookup yields an empty snapshot, i.e. "not ready yet".
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath="${status_query}" 2>/dev/null || true)"
    # '|' is a non-whitespace separator, so empty fields (for example an
    # absent Stalled condition) keep their position when split.
    IFS='|' read -r ready stalled generation observed_generation <<<"${snapshot}" || true

    if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
# Reconcile the Git source and then the parent kustomization, waiting for
# each to report Ready before moving on.
for flux_parent in gitrepository/platform kustomization/infrastructure; do
  reconcile_flux_resource "${flux_parent}" 300
  wait_for_flux_ready "${flux_parent}" 300s
done

# Fully qualified resource types used when waiting for objects to exist.
kustomization_api="kustomization.kustomize.toolkit.fluxcd.io"
ocirepository_api="ocirepository.source.toolkit.fluxcd.io"
helmrelease_api="helmrelease.helm.toolkit.fluxcd.io"

# Secrets kustomization: wait for it to exist, reconcile, wait for Ready.
wait_for_resource flux-system "${kustomization_api}/addon-observability-secrets" 300
reconcile_flux_resource kustomization/addon-observability-secrets 300
wait_for_flux_ready kustomization/addon-observability-secrets 300s

# Main observability kustomization; its reconcile gets a longer 600 s
# handling window than the other kustomizations.
wait_for_resource flux-system "${kustomization_api}/addon-observability" 300
reconcile_flux_resource kustomization/addon-observability 600
wait_for_flux_ready kustomization/addon-observability 300s

# Both OCI sources must exist before either is checked for readiness.
for oci_source in loki promtail; do
  wait_for_resource flux-system "${ocirepository_api}/${oci_source}" 300
done
for oci_source in loki promtail; do
  wait_for_ocirepository_ready_or_cached "${oci_source}" 300s
done

# Helm releases are handled one at a time: exist, request reconcile, Ready.
for helm_release in kube-prometheus-stack loki promtail; do
  wait_for_resource flux-system "${helmrelease_api}/${helm_release}" 300
  request_helmrelease_reconcile "${helm_release}"
  wait_for_helmrelease_ready "${helm_release}" 600
done

# Content kustomization is reconciled after the Helm releases.
wait_for_resource flux-system "${kustomization_api}/addon-observability-content" 300
reconcile_flux_resource kustomization/addon-observability-content 300
wait_for_flux_ready kustomization/addon-observability-content 300s

# Restart Grafana; a failure of the restart is deliberately ignored.
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true