fix: shorten observability iteration loop
This commit is contained in:
+105
-18
@@ -4,6 +4,16 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths-ignore:
|
||||
- "ansible/dashboards.yml"
|
||||
- "ansible/roles/observability-content/**"
|
||||
- "infrastructure/addons/observability/**"
|
||||
- "infrastructure/addons/observability-content/**"
|
||||
- "infrastructure/addons/observability-secrets/**"
|
||||
- "infrastructure/addons/kustomization-observability.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-content.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-secrets.yaml"
|
||||
- "infrastructure/charts/kube-prometheus-stack/**"
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
@@ -879,6 +889,20 @@ jobs:
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Best-effort dump of Flux and workload state, used when a wait fails.
# Every command is followed by `|| true` so that a failing diagnostic never
# aborts the script (it runs under `set -euo pipefail`).
observability_diagnostics() {
  local flux_object

  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Describe each Flux object involved in the observability rollout, in the
  # order they are reconciled.
  for flux_object in \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail; do
    kubectl -n flux-system describe "${flux_object}" || true
  done

  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
|
||||
|
||||
wait_for_resource() {
|
||||
local namespace="$1"
|
||||
local resource="$2"
|
||||
@@ -888,7 +912,7 @@ jobs:
|
||||
until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
|
||||
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
|
||||
echo "Timed out waiting for ${resource} to exist" >&2
|
||||
kubectl -n flux-system get kustomizations,helmreleases || true
|
||||
observability_diagnostics
|
||||
exit 1
|
||||
fi
|
||||
|
||||
@@ -915,7 +939,7 @@ jobs:
|
||||
done
|
||||
|
||||
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
|
||||
kubectl -n flux-system describe "${resource}" || true
|
||||
observability_diagnostics
|
||||
exit 1
|
||||
}
|
||||
|
||||
@@ -928,7 +952,7 @@ jobs:
|
||||
wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
|
||||
}
|
||||
|
||||
reconcile_helmrelease() {
|
||||
request_helmrelease_reconcile() {
|
||||
local release="$1"
|
||||
local reconcile_at
|
||||
reconcile_at="$(date +%s%N)"
|
||||
@@ -937,25 +961,88 @@ jobs:
|
||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||
--overwrite
|
||||
wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
|
||||
}
|
||||
|
||||
# Wait for a flux-system resource to report condition Ready.
# Arguments:
#   $1 - resource reference (kind/name)
#   $2 - timeout in `kubectl wait` syntax, e.g. 300s
# On failure dumps diagnostics and exits the script with status 1.
wait_for_flux_ready() {
  local target="$1"
  local wait_timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${target}" --timeout="${wait_timeout}"; then
    return 0
  fi

  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Wait for an OCIRepository to become Ready, but tolerate a non-Ready source
# as long as its ArtifactInStorage condition is True.
# Arguments:
#   $1 - OCIRepository name in the flux-system namespace
#   $2 - timeout in `kubectl wait` syntax, e.g. 300s
# Exits the script with status 1 (after dumping diagnostics) when the
# repository is neither Ready nor reports an artifact in storage.
wait_for_ocirepository_ready_or_cached() {
  local repository="$1"
  local timeout="$2"
  local oci_ref="ocirepository/${repository}"
  local in_storage

  # Happy path: the source is Ready within the timeout.
  kubectl -n flux-system wait --for=condition=Ready "${oci_ref}" --timeout="${timeout}" && return 0

  # Lookup errors are swallowed on purpose; an empty value counts as "not cached".
  in_storage="$(kubectl -n flux-system get "${oci_ref}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
  if [ "${in_storage}" != "True" ]; then
    observability_diagnostics
    exit 1
  fi

  echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  return 0
}
|
||||
|
||||
# Poll a HelmRelease every 10 seconds until it is Ready for its current
# generation.
# Arguments:
#   $1 - HelmRelease name in the flux-system namespace
#   $2 - timeout in whole seconds; a trailing "s" (e.g. 600s) is accepted
# Returns 0 once Ready=True and observedGeneration equals metadata.generation.
# Exits the script with status 1 (after dumping diagnostics) when the release
# is Stalled or the timeout elapses, and with status 1 on an invalid timeout.
#
# If the object carries a reconcile.fluxcd.io/requestedAt annotation, its
# Ready/Stalled conditions are ignored until status.lastHandledReconcileAt
# matches that annotation. Without this, a call made right after the
# annotation was written could act on conditions left over from the previous
# reconciliation.
wait_for_helmrelease_ready() {
  local release="$1"
  local timeout_seconds="${2%s}"
  local elapsed=0
  local query
  local snapshot
  local ready
  local stalled
  local generation
  local observed_generation
  local requested
  local handled

  # A non-numeric timeout would make the numeric test below error out.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout for HelmRelease ${release}: ${2}" >&2
      exit 1
      ;;
  esac

  # Fetch every field in one request so all values come from the same object
  # version. "|" is used as separator because, unlike whitespace, repeated
  # non-whitespace IFS characters keep empty fields in place for `read`.
  query='{.status.conditions[?(@.type=="Ready")].status}{"|"}'
  query="${query}"'{.status.conditions[?(@.type=="Stalled")].status}{"|"}'
  query="${query}"'{.metadata.generation}{"|"}'
  query="${query}"'{.status.observedGeneration}{"|"}'
  query="${query}"'{.metadata.annotations.reconcile\.fluxcd\.io/requestedAt}{"|"}'
  query="${query}"'{.status.lastHandledReconcileAt}'

  while :; do
    # Lookup errors are tolerated; an empty snapshot just means "not ready yet".
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath="${query}" 2>/dev/null || true)"
    IFS='|' read -r ready stalled generation observed_generation requested handled <<<"${snapshot}" || true

    # Only trust the conditions once any pending reconcile request was handled.
    if [ -z "${requested}" ] || [ "${requested}" = "${handled}" ]; then
      if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
        return 0
      fi

      if [ "${stalled}" = "True" ]; then
        echo "HelmRelease ${release} is stalled" >&2
        observability_diagnostics
        exit 1
      fi
    fi

    # Checked after polling so the state is also inspected at the deadline.
    if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
      break
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
|
||||
reconcile_flux_resource kustomization/addon-observability-secrets 600
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s
|
||||
reconcile_flux_resource kustomization/addon-observability-secrets 300
|
||||
wait_for_flux_ready kustomization/addon-observability-secrets 300s
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
|
||||
reconcile_flux_resource kustomization/addon-observability 1800
|
||||
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then
|
||||
kubectl -n flux-system describe kustomization/addon-observability || true
|
||||
kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
|
||||
kubectl -n flux-system describe helmrelease/loki || true
|
||||
kubectl -n flux-system describe helmrelease/promtail || true
|
||||
kubectl -n observability get pods -o wide || true
|
||||
exit 1
|
||||
fi
|
||||
reconcile_flux_resource kustomization/addon-observability 600
|
||||
wait_for_flux_ready kustomization/addon-observability 300s
|
||||
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
|
||||
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
|
||||
wait_for_ocirepository_ready_or_cached loki 300s
|
||||
wait_for_ocirepository_ready_or_cached promtail 300s
|
||||
for release in kube-prometheus-stack loki promtail; do
|
||||
reconcile_helmrelease "${release}"
|
||||
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
|
||||
request_helmrelease_reconcile "${release}"
|
||||
wait_for_helmrelease_ready "${release}" 600
|
||||
done
|
||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
|
||||
reconcile_flux_resource kustomization/addon-observability-content 300
|
||||
wait_for_flux_ready kustomization/addon-observability-content 300s
|
||||
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|
||||
|
||||
- name: Post-deploy cluster health checks
|
||||
@@ -977,9 +1064,9 @@ jobs:
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
|
||||
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
|
||||
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
|
||||
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
|
||||
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
|
||||
|
||||
@@ -0,0 +1,258 @@
|
||||
name: Reconcile Observability
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "infrastructure/addons/observability/**"
|
||||
- "infrastructure/addons/observability-content/**"
|
||||
- "infrastructure/addons/observability-secrets/**"
|
||||
- "infrastructure/addons/kustomization-observability.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-content.yaml"
|
||||
- "infrastructure/addons/kustomization-observability-secrets.yaml"
|
||||
- "infrastructure/charts/kube-prometheus-stack/**"
|
||||
workflow_dispatch:
|
||||
|
||||
concurrency:
|
||||
group: prod-cluster
|
||||
cancel-in-progress: false
|
||||
|
||||
env:
|
||||
TF_VERSION: "1.14.9"
|
||||
KUBECTL_VERSION: "v1.34.6"
|
||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
|
||||
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
|
||||
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
|
||||
TF_VAR_proxmox_insecure: "true"
|
||||
|
||||
jobs:
|
||||
observability:
|
||||
name: Observability
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Terraform
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_version: ${{ env.TF_VERSION }}
|
||||
terraform_wrapper: false
|
||||
|
||||
- name: Setup SSH Keys
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
||||
chmod 600 ~/.ssh/id_ed25519
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Terraform Init
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform init \
|
||||
-lockfile=readonly \
|
||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||
-backend-config="region=auto" \
|
||||
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
|
||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||
-backend-config="skip_requesting_account_id=true"
|
||||
|
||||
- name: Install kubectl
|
||||
run: |
|
||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
|
||||
chmod +x /usr/local/bin/kubectl
|
||||
|
||||
- name: Validate observability manifests
|
||||
run: |
|
||||
set -euo pipefail
|
||||
kubectl kustomize infrastructure/addons/observability >/dev/null
|
||||
kubectl kustomize infrastructure/addons/observability-secrets >/dev/null
|
||||
kubectl kustomize infrastructure/addons/observability-content >/dev/null
|
||||
kubectl kustomize infrastructure/addons >/dev/null
|
||||
|
||||
- name: Refresh kubeconfig
|
||||
run: |
|
||||
set -euo pipefail
|
||||
mkdir -p outputs
|
||||
PRIMARY_IP="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
|
||||
SSH_KEY="$HOME/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${PRIMARY_IP}"
|
||||
|
||||
- name: Reconcile observability
|
||||
env:
|
||||
KUBECONFIG: outputs/kubeconfig
|
||||
run: |
|
||||
set -euo pipefail
|
||||
|
||||
# Best-effort dump of Flux and workload state, used when a wait fails.
# Every command is followed by `|| true` so that a failing diagnostic never
# aborts the script (it runs under `set -euo pipefail`).
observability_diagnostics() {
  local flux_object

  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Describe each Flux object touched by this workflow, in the order the
  # workflow reconciles them.
  for flux_object in \
    gitrepository/platform \
    kustomization/infrastructure \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail; do
    kubectl -n flux-system describe "${flux_object}" || true
  done

  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
|
||||
|
||||
# Block until a Kubernetes object exists, polling every 10 seconds.
# Arguments:
#   $1 - namespace to look in
#   $2 - resource reference passed to `kubectl get` (e.g. kind/name)
#   $3 - timeout in whole seconds; a trailing "s" (e.g. 300s) is accepted
# On timeout prints a message to stderr, dumps diagnostics and exits the
# script with status 1. An invalid timeout also exits with status 1.
wait_for_resource() {
  local namespace="$1"
  local resource="$2"
  # Accept both "300" and "300s": sibling helpers take the kubectl-style
  # "300s" form, so mixing the two up is an easy mistake.
  local timeout_seconds="${3%s}"
  local elapsed=0

  # A non-numeric value would make the `[ -ge ]` test below error out; inside
  # `if` that counts as false, so the loop would never time out.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout for ${resource}: ${3}" >&2
      exit 1
      ;;
  esac

  # kubectl output is discarded; only its exit status matters here.
  until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
    if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
      echo "Timed out waiting for ${resource} to exist" >&2
      observability_diagnostics
      exit 1
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done
}
|
||||
|
||||
# Poll a flux-system resource every 5 seconds until its
# status.lastHandledReconcileAt equals the given request token.
# Arguments:
#   $1 - resource reference (kind/name)
#   $2 - token previously written to the reconcile request annotation
#   $3 - timeout in whole seconds; a trailing "s" (e.g. 300s) is accepted
# Returns 0 once the token is reported as handled. On timeout prints a message
# to stderr, dumps diagnostics and exits the script with status 1. An invalid
# timeout also exits with status 1.
wait_for_reconcile_handled() {
  local resource="$1"
  local reconcile_at="$2"
  local timeout_seconds="${3%s}"
  local elapsed=0
  local handled

  # A non-numeric value would make the numeric test below error out and the
  # function would report a timeout without ever polling.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout for ${resource}: ${3}" >&2
      exit 1
      ;;
  esac

  while :; do
    # Lookup errors are tolerated; an empty value simply does not match.
    handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
    if [ "${handled}" = "${reconcile_at}" ]; then
      return 0
    fi

    # Checked after polling so the state is also inspected at the deadline.
    if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
      break
    fi

    sleep 5
    elapsed=$((elapsed + 5))
  done

  echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Request a reconcile of a flux-system resource and wait until the request is
# reported as handled.
# Arguments:
#   $1 - resource reference (kind/name)
#   $2 - seconds to wait for the request to be handled (default 300)
reconcile_flux_resource() {
  local target="$1" wait_seconds="${2:-300}" token

  # Nanosecond timestamp doubles as a unique request token.
  token="$(date +%s%N)"

  kubectl -n flux-system annotate "${target}" \
    "reconcile.fluxcd.io/requestedAt=${token}" \
    --overwrite

  wait_for_reconcile_handled "${target}" "${token}" "${wait_seconds}"
}
|
||||
|
||||
# Annotate a HelmRelease in flux-system to request a reconcile. The same
# timestamp is written to the requestedAt, resetAt and forceAt annotations.
# This only writes the annotations; it does not wait for any result.
# Arguments:
#   $1 - HelmRelease name
request_helmrelease_reconcile() {
  local release="$1"
  local stamp

  stamp="$(date +%s%N)"

  kubectl -n flux-system annotate "helmrelease/${release}" \
    "reconcile.fluxcd.io/requestedAt=${stamp}" \
    "reconcile.fluxcd.io/resetAt=${stamp}" \
    "reconcile.fluxcd.io/forceAt=${stamp}" \
    --overwrite
}
|
||||
|
||||
# Wait for a flux-system resource to report condition Ready.
# Arguments:
#   $1 - resource reference (kind/name)
#   $2 - timeout in `kubectl wait` syntax, e.g. 300s
# On failure dumps diagnostics and exits the script with status 1.
wait_for_flux_ready() {
  local target="$1"
  local wait_timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${target}" --timeout="${wait_timeout}"; then
    return 0
  fi

  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Wait for an OCIRepository to become Ready, but tolerate a non-Ready source
# as long as its ArtifactInStorage condition is True.
# Arguments:
#   $1 - OCIRepository name in the flux-system namespace
#   $2 - timeout in `kubectl wait` syntax, e.g. 300s
# Exits the script with status 1 (after dumping diagnostics) when the
# repository is neither Ready nor reports an artifact in storage.
wait_for_ocirepository_ready_or_cached() {
  local repository="$1"
  local timeout="$2"
  local oci_ref="ocirepository/${repository}"
  local in_storage

  # Happy path: the source is Ready within the timeout.
  kubectl -n flux-system wait --for=condition=Ready "${oci_ref}" --timeout="${timeout}" && return 0

  # Lookup errors are swallowed on purpose; an empty value counts as "not cached".
  in_storage="$(kubectl -n flux-system get "${oci_ref}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
  if [ "${in_storage}" != "True" ]; then
    observability_diagnostics
    exit 1
  fi

  echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  return 0
}
|
||||
|
||||
# Poll a HelmRelease every 10 seconds until it is Ready for its current
# generation.
# Arguments:
#   $1 - HelmRelease name in the flux-system namespace
#   $2 - timeout in whole seconds; a trailing "s" (e.g. 600s) is accepted
# Returns 0 once Ready=True and observedGeneration equals metadata.generation.
# Exits the script with status 1 (after dumping diagnostics) when the release
# is Stalled or the timeout elapses, and with status 1 on an invalid timeout.
#
# If the object carries a reconcile.fluxcd.io/requestedAt annotation, its
# Ready/Stalled conditions are ignored until status.lastHandledReconcileAt
# matches that annotation. Without this, a call made right after the
# annotation was written could act on conditions left over from the previous
# reconciliation.
wait_for_helmrelease_ready() {
  local release="$1"
  local timeout_seconds="${2%s}"
  local elapsed=0
  local query
  local snapshot
  local ready
  local stalled
  local generation
  local observed_generation
  local requested
  local handled

  # A non-numeric timeout would make the numeric test below error out.
  case "${timeout_seconds}" in
    '' | *[!0-9]*)
      echo "Invalid timeout for HelmRelease ${release}: ${2}" >&2
      exit 1
      ;;
  esac

  # Fetch every field in one request so all values come from the same object
  # version. "|" is used as separator because, unlike whitespace, repeated
  # non-whitespace IFS characters keep empty fields in place for `read`.
  query='{.status.conditions[?(@.type=="Ready")].status}{"|"}'
  query="${query}"'{.status.conditions[?(@.type=="Stalled")].status}{"|"}'
  query="${query}"'{.metadata.generation}{"|"}'
  query="${query}"'{.status.observedGeneration}{"|"}'
  query="${query}"'{.metadata.annotations.reconcile\.fluxcd\.io/requestedAt}{"|"}'
  query="${query}"'{.status.lastHandledReconcileAt}'

  while :; do
    # Lookup errors are tolerated; an empty snapshot just means "not ready yet".
    snapshot="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath="${query}" 2>/dev/null || true)"
    IFS='|' read -r ready stalled generation observed_generation requested handled <<<"${snapshot}" || true

    # Only trust the conditions once any pending reconcile request was handled.
    if [ -z "${requested}" ] || [ "${requested}" = "${handled}" ]; then
      if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
        return 0
      fi

      if [ "${stalled}" = "True" ]; then
        echo "HelmRelease ${release} is stalled" >&2
        observability_diagnostics
        exit 1
      fi
    fi

    # Checked after polling so the state is also inspected at the deadline.
    if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
      break
    fi

    sleep 10
    elapsed=$((elapsed + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
|
||||
|
||||
# Wait for an addon Kustomization to exist, request a reconcile, then wait
# for it to become Ready.
#   $1 - Kustomization name in flux-system
#   $2 - seconds to wait for the reconcile request to be handled
reconcile_addon_kustomization() {
  local addon="$1"
  local handled_timeout="$2"

  wait_for_resource flux-system "kustomization.kustomize.toolkit.fluxcd.io/${addon}" 300
  reconcile_flux_resource "kustomization/${addon}" "${handled_timeout}"
  wait_for_flux_ready "kustomization/${addon}" 300s
}

# Source and parent kustomization first, then the addon kustomizations.
for flux_target in gitrepository/platform kustomization/infrastructure; do
  reconcile_flux_resource "${flux_target}" 300
  wait_for_flux_ready "${flux_target}" 300s
done

reconcile_addon_kustomization addon-observability-secrets 300
reconcile_addon_kustomization addon-observability 600

# Both chart sources must exist before their readiness is checked.
for chart_source in loki promtail; do
  wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${chart_source}" 300
done
for chart_source in loki promtail; do
  wait_for_ocirepository_ready_or_cached "${chart_source}" 300s
done

# Releases are handled one after another, in this fixed order.
for release in kube-prometheus-stack loki promtail; do
  wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
  request_helmrelease_reconcile "${release}"
  wait_for_helmrelease_ready "${release}" 600
done

reconcile_addon_kustomization addon-observability-content 300

# Best effort: a failed Grafana restart must not fail the job.
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|
||||
Reference in New Issue
Block a user