283 lines
12 KiB
YAML
283 lines
12 KiB
YAML
# Reconciles the observability add-ons whenever their sources change.
name: Reconcile Observability

on:
  # Manual trigger.
  workflow_dispatch:
  # Automatic trigger: pushes to main that touch any path listed below.
  push:
    branches:
      - main
    paths:
      - 'infrastructure/addons/observability/**'
      - 'infrastructure/addons/observability-content/**'
      - 'infrastructure/addons/observability-secrets/**'
      - 'infrastructure/addons/kustomization-observability.yaml'
      - 'infrastructure/addons/kustomization-observability-content.yaml'
      - 'infrastructure/addons/kustomization-observability-secrets.yaml'
      - 'infrastructure/charts/kube-prometheus-stack/**'

# Every run joins the "prod-cluster" concurrency group, and a run that is
# already in progress is not cancelled when a newer one is queued.
concurrency:
  group: prod-cluster
  cancel-in-progress: false
|
|
|
|
env:
  # Tool versions consumed by the setup/install steps. Quoted so they are
  # always read as strings.
  TF_VERSION: '1.14.9'
  KUBECTL_VERSION: 'v1.34.6'

  # Terraform input variables (TF_VAR_<name>), populated from repository
  # secrets: S3 state/object storage.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}

  # Tailscale.
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}

  # Proxmox API access.
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  # NOTE(review): presumably disables TLS verification towards Proxmox;
  # confirm this is intended. Kept as a string, not a YAML boolean.
  TF_VAR_proxmox_insecure: 'true'
|
|
|
|
jobs:
|
|
observability:
|
|
name: Observability
|
|
runs-on: ubuntu-22.04
|
|
steps:
|
|
# Fetch the repository contents into the workspace.
- name: Checkout
  uses: actions/checkout@v4

# Install the Terraform CLI at the version pinned in env.TF_VERSION.
- name: Setup Terraform
  uses: hashicorp/setup-terraform@v3
  with:
    # NOTE(review): the wrapper is disabled, presumably so that
    # `terraform output -raw` in a later step yields clean stdout; confirm.
    terraform_wrapper: false
    terraform_version: ${{ env.TF_VERSION }}
|
|
|
|
# Writes the SSH key pair from repository secrets to ~/.ssh for later steps.
- name: Setup SSH Keys
  # The secrets reach the script through environment variables rather than
  # being interpolated into the script text, so quotes, dollar signs or
  # backticks inside a value cannot alter or break the shell script.
  env:
    SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
    SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
  run: |
    set -euo pipefail
    # Owner-only umask: the private key is never readable by others, not
    # even in the window between file creation and chmod.
    umask 077
    mkdir -p ~/.ssh
    printf '%s\n' "${SSH_PRIVATE_KEY}" > ~/.ssh/id_ed25519
    chmod 600 ~/.ssh/id_ed25519
    printf '%s\n' "${SSH_PUBLIC_KEY}" > ~/.ssh/id_ed25519.pub
    chmod 644 ~/.ssh/id_ed25519.pub
|
|
|
|
# Initialises Terraform in ./terraform against the S3-compatible backend.
- name: Terraform Init
  working-directory: terraform
  run: |
    set -euo pipefail
    # Backend credentials are read from the workflow-level TF_VAR_s3_*
    # environment variables instead of being interpolated into the script
    # text, so special characters in a secret cannot change the command.
    # With `set -u` a missing variable fails the step immediately.
    terraform init \
      -lockfile=readonly \
      -backend-config="endpoint=${TF_VAR_s3_endpoint}" \
      -backend-config="bucket=${TF_VAR_s3_bucket}" \
      -backend-config="region=auto" \
      -backend-config="access_key=${TF_VAR_s3_access_key}" \
      -backend-config="secret_key=${TF_VAR_s3_secret_key}" \
      -backend-config="skip_requesting_account_id=true"
|
|
|
|
# Installs kubectl ${KUBECTL_VERSION} to /usr/local/bin after verifying the
# download against the checksum published next to the binary.
- name: Install kubectl
  run: |
    set -euo pipefail
    base_url="https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64"
    workdir="$(mktemp -d)"
    trap 'rm -rf "${workdir}"' EXIT

    curl -fsSL -o "${workdir}/kubectl" "${base_url}/kubectl"
    curl -fsSL -o "${workdir}/kubectl.sha256" "${base_url}/kubectl.sha256"
    # The .sha256 file holds only the digest; build a "digest  path" line
    # for sha256sum. A mismatch fails the step before anything is installed.
    echo "$(cat "${workdir}/kubectl.sha256")  ${workdir}/kubectl" | sha256sum --check --quiet

    cp "${workdir}/kubectl" /usr/local/bin/kubectl
    chmod +x /usr/local/bin/kubectl
|
|
|
|
# Renders each kustomization with `kubectl kustomize`; the rendered output is
# discarded, so only a build failure (non-zero exit) has any effect.
- name: Validate observability manifests
  run: |
    set -euo pipefail
    for overlay in \
      infrastructure/addons/observability \
      infrastructure/addons/observability-secrets \
      infrastructure/addons/observability-content \
      infrastructure/addons
    do
      kubectl kustomize "${overlay}" >/dev/null
    done
|
|
|
|
# Reads the primary control-plane IP from the Terraform outputs and passes it
# to scripts/refresh-kubeconfig.sh, with the SSH key path supplied via SSH_KEY.
- name: Refresh kubeconfig
  run: |
    set -euo pipefail
    mkdir -p outputs
    primary_ip="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
    SSH_KEY="${HOME}/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${primary_ip}"
|
|
|
|
- name: Reconcile observability
|
|
env:
|
|
KUBECONFIG: outputs/kubeconfig
|
|
run: |
|
|
set -euo pipefail
|
|
|
|
# observability_diagnostics
# Dumps the state of the Flux objects, the secret-sync objects and the
# observability namespace. Every command is best-effort (`|| true`), so a
# failing query never aborts the dump.
observability_diagnostics() {
  local target

  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true

  # Source and kustomizations.
  for target in \
    gitrepository/platform \
    kustomization/infrastructure \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content
  do
    kubectl -n flux-system describe "${target}" || true
  done

  # Secret store, ExternalSecret and the secret it should produce.
  kubectl describe clustersecretstore/doppler-hetznerterra || true
  kubectl -n observability describe externalsecret/grafana-admin || true
  kubectl -n observability get secret/grafana-admin-credentials || true

  # OCI sources and Helm releases.
  for target in \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail
  do
    kubectl -n flux-system describe "${target}" || true
  done

  # Workload state and events, oldest first.
  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
|
|
|
|
# wait_for_resource NAMESPACE RESOURCE TIMEOUT_SECONDS
# Polls `kubectl get` every 10 seconds until RESOURCE exists in NAMESPACE.
# Once the counted wait reaches TIMEOUT_SECONDS it prints an error, dumps
# diagnostics and exits the script with status 1.
wait_for_resource() {
  local ns="$1"
  local resource="$2"
  local budget="$3"
  local waited=0

  while ! kubectl -n "${ns}" get "${resource}" >/dev/null 2>&1; do
    # Budget not yet used up: wait and poll again.
    if ! [ "${waited}" -ge "${budget}" ]; then
      sleep 10
      waited=$((waited + 10))
      continue
    fi

    echo "Timed out waiting for ${resource} to exist" >&2
    observability_diagnostics
    exit 1
  done
}
|
|
|
|
# wait_for_reconcile_handled RESOURCE RECONCILE_AT TIMEOUT_SECONDS
# Polls RESOURCE in flux-system every 5 seconds until its
# .status.lastHandledReconcileAt equals RECONCILE_AT, then returns 0.
# A failed lookup is treated as "not handled yet". On timeout it prints an
# error, dumps diagnostics and exits the script with status 1.
wait_for_reconcile_handled() {
  local resource="$1"
  local reconcile_at="$2"
  local budget="$3"
  local waited=0
  local seen

  while [ "${waited}" -lt "${budget}" ]; do
    seen="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
    [ "${seen}" != "${reconcile_at}" ] || return 0

    sleep 5
    waited=$((waited + 5))
  done

  echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
  observability_diagnostics
  exit 1
}
|
|
|
|
# reconcile_flux_resource RESOURCE [TIMEOUT_SECONDS=300]
# Stamps RESOURCE in flux-system with a fresh reconcile.fluxcd.io/requestedAt
# annotation (nanosecond epoch) and waits until that request is reported as
# handled.
reconcile_flux_resource() {
  local resource="$1"
  local budget="${2:-300}"
  local stamp

  # Assigned separately from `local` so a failing `date` is not masked.
  stamp="$(date +%s%N)"
  kubectl -n flux-system annotate "${resource}" \
    reconcile.fluxcd.io/requestedAt="${stamp}" \
    --overwrite
  wait_for_reconcile_handled "${resource}" "${stamp}" "${budget}"
}
|
|
|
|
# request_helmrelease_reconcile RELEASE
# Sets the requestedAt, resetAt and forceAt reconcile annotations on
# helmrelease/RELEASE in flux-system to one shared timestamp. It only issues
# the request and does not wait for it to be handled.
request_helmrelease_reconcile() {
  local release="$1"
  local stamp
  local key
  local -a annotations=()

  stamp="$(date +%s%N)"
  for key in requestedAt resetAt forceAt; do
    annotations+=("reconcile.fluxcd.io/${key}=${stamp}")
  done

  kubectl -n flux-system annotate "helmrelease/${release}" "${annotations[@]}" --overwrite
}
|
|
|
|
# wait_for_flux_ready RESOURCE TIMEOUT
# Blocks on `kubectl wait --for=condition=Ready` for RESOURCE in flux-system.
# TIMEOUT is passed to kubectl unchanged (callers use values such as 300s).
# If the wait fails, dumps diagnostics and exits the script with status 1.
wait_for_flux_ready() {
  local resource="$1"
  local timeout="$2"

  if kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
    return 0
  fi

  observability_diagnostics
  exit 1
}
|
|
|
|
# wait_for_grafana_secret TIMEOUT_SECONDS
# Waits until the Doppler ClusterSecretStore is Ready, the grafana-admin
# ExternalSecret is Ready, and secret/grafana-admin-credentials exists in the
# observability namespace. Returns 0 as soon as all three hold. Once
# TIMEOUT_SECONDS of real time have passed without success it prints an
# error, dumps diagnostics and exits the script with status 1.
wait_for_grafana_secret() {
  local timeout_seconds="$1"
  # Measure real elapsed time with bash's SECONDS counter. One pass can take
  # anywhere from about 15s (kubectl fails immediately) to about 75s (both
  # waits run their full 30s), so charging a fixed amount per pass would cut
  # the budget short whenever the waits fail fast.
  local deadline=$((SECONDS + timeout_seconds))

  while [ "${SECONDS}" -lt "${deadline}" ]; do
    if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \
      && kubectl -n observability wait --for=condition=Ready externalsecret/grafana-admin --timeout=30s \
      && kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
      return 0
    fi

    sleep 15
  done

  echo "Timed out waiting for Grafana admin ExternalSecret to sync" >&2
  observability_diagnostics
  exit 1
}
|
|
|
|
# wait_for_ocirepository_ready_or_cached REPOSITORY TIMEOUT
# Waits for ocirepository/REPOSITORY in flux-system to become Ready. If that
# wait fails but the object's ArtifactInStorage condition is "True", it logs
# a notice and still returns 0. Otherwise it dumps diagnostics and exits the
# script with status 1.
wait_for_ocirepository_ready_or_cached() {
  local repository="$1"
  local timeout="$2"
  local in_storage

  if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
    return 0
  fi

  in_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
  if [ "${in_storage}" != "True" ]; then
    observability_diagnostics
    exit 1
  fi

  echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
  return 0
}
|
|
|
|
# wait_for_helmrelease_ready RELEASE TIMEOUT_SECONDS
# Polls helmrelease/RELEASE in flux-system every 10 seconds. Returns 0 once
# its Ready condition is "True" and .status.observedGeneration equals
# .metadata.generation. If the Stalled condition is "True", or the counted
# wait reaches TIMEOUT_SECONDS, it prints an error, dumps diagnostics and
# exits the script with status 1. Failed lookups yield empty values.
wait_for_helmrelease_ready() {
  local release="$1"
  local budget="$2"
  local waited=0
  local ready stalled generation observed

  while [ "${waited}" -lt "${budget}" ]; do
    ready="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
    generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
    observed="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"

    # Success is checked before the stalled state, as a Ready release wins.
    if [ "${ready}" = "True" ] && [ "${observed}" = "${generation}" ]; then
      return 0
    fi

    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi

    sleep 10
    waited=$((waited + 10))
  done

  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
|
|
|
|
# Main sequence. Each stage must finish before the next one starts.

# 1. Git source, then the top-level infrastructure kustomization.
for resource in gitrepository/platform kustomization/infrastructure; do
  reconcile_flux_resource "${resource}" 300
  wait_for_flux_ready "${resource}" 300s
done

# 2. Secrets kustomization, then the Grafana admin secret it should produce.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 300
reconcile_flux_resource kustomization/addon-observability-secrets 300
wait_for_flux_ready kustomization/addon-observability-secrets 300s
wait_for_grafana_secret 900

# 3. Observability kustomization (longer reconcile budget than the others).
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 300
reconcile_flux_resource kustomization/addon-observability 600
wait_for_flux_ready kustomization/addon-observability 300s

# 4. OCI sources: both must exist first, then each must be Ready or cached.
for repository in loki promtail; do
  wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${repository}" 300
done
for repository in loki promtail; do
  wait_for_ocirepository_ready_or_cached "${repository}" 300s
done

# 5. Helm releases, one at a time.
for release in kube-prometheus-stack loki promtail; do
  wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
  request_helmrelease_reconcile "${release}"
  wait_for_helmrelease_ready "${release}" 600
done

# 6. Content kustomization.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
reconcile_flux_resource kustomization/addon-observability-content 300
wait_for_flux_ready kustomization/addon-observability-content 300s

# Best-effort Grafana restart; a failure here does not fail the job.
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|