# Validates the observability manifests and then drives Flux, resource by
# resource, through a reconcile of the observability stack on the cluster.
name: Reconcile Observability

on:
  push:
    branches:
      - main
    paths:
      - "infrastructure/addons/observability/**"
      - "infrastructure/addons/observability-content/**"
      - "infrastructure/addons/observability-secrets/**"
      - "infrastructure/addons/kustomization-observability.yaml"
      - "infrastructure/addons/kustomization-observability-content.yaml"
      - "infrastructure/addons/kustomization-observability-secrets.yaml"
      - "infrastructure/charts/kube-prometheus-stack/**"
  workflow_dispatch:

# Runs in the prod-cluster concurrency group; an in-progress run is never
# cancelled by a newer one.
concurrency:
  group: prod-cluster
  cancel-in-progress: false

env:
  TF_VERSION: "1.14.9"
  KUBECTL_VERSION: "v1.34.6"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"

jobs:
  observability:
    name: Observability
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
          terraform_wrapper: false

      # Secrets are handed to the script through the environment instead of
      # being pasted into the script text, so characters such as quotes, `$`
      # or backticks in a secret value are never interpreted by the shell.
      - name: Setup SSH Keys
        env:
          SSH_PRIVATE_KEY: ${{ secrets.SSH_PRIVATE_KEY }}
          SSH_PUBLIC_KEY: ${{ secrets.SSH_PUBLIC_KEY }}
        run: |
          mkdir -p ~/.ssh
          printf '%s\n' "${SSH_PRIVATE_KEY}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          printf '%s\n' "${SSH_PUBLIC_KEY}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      # The backend settings reuse the workflow-level TF_VAR_s3_* variables,
      # which are populated from the same S3_* secrets, rather than expanding
      # the secrets inside the shell script.
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -lockfile=readonly \
            -backend-config="endpoint=${TF_VAR_s3_endpoint}" \
            -backend-config="bucket=${TF_VAR_s3_bucket}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${TF_VAR_s3_access_key}" \
            -backend-config="secret_key=${TF_VAR_s3_secret_key}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Install kubectl
        run: |
          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
          chmod +x /usr/local/bin/kubectl

      # Render every kustomization locally; output is discarded, only the
      # exit status matters.
      - name: Validate observability manifests
        run: |
          set -euo pipefail
          kubectl kustomize infrastructure/addons/observability >/dev/null
          kubectl kustomize infrastructure/addons/observability-secrets >/dev/null
          kubectl kustomize infrastructure/addons/observability-content >/dev/null
          kubectl kustomize infrastructure/addons >/dev/null

      - name: Refresh kubeconfig
        run: |
          set -euo pipefail
          mkdir -p outputs
          PRIMARY_IP="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
          # SSH_KEY is passed as a command-scoped environment variable.
          SSH_KEY="$HOME/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${PRIMARY_IP}"

      - name: Reconcile observability
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail

          # Dump Flux and workload state for troubleshooting. Every command is
          # best-effort so one failing query never hides the others.
          observability_diagnostics() {
            kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true
            kubectl -n flux-system describe gitrepository/platform || true
            kubectl -n flux-system describe kustomization/infrastructure || true
            kubectl -n flux-system describe kustomization/addon-observability-secrets || true
            kubectl -n flux-system describe kustomization/addon-observability || true
            kubectl -n flux-system describe kustomization/addon-observability-content || true
            kubectl -n flux-system describe ocirepository/loki || true
            kubectl -n flux-system describe ocirepository/promtail || true
            kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
            kubectl -n flux-system describe helmrelease/loki || true
            kubectl -n flux-system describe helmrelease/promtail || true
            kubectl -n observability get pods,pvc,svc -o wide || true
            kubectl -n observability get events --sort-by=.lastTimestamp || true
          }

          # wait_for_resource NAMESPACE RESOURCE TIMEOUT_SECONDS
          # Poll every 10s until RESOURCE exists; on timeout print diagnostics
          # and fail the step.
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0

            until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                observability_diagnostics
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          # wait_for_reconcile_handled RESOURCE RECONCILE_AT TIMEOUT_SECONDS
          # Poll every 5s until .status.lastHandledReconcileAt of RESOURCE
          # equals RECONCILE_AT; on timeout print diagnostics and fail.
          wait_for_reconcile_handled() {
            local resource="$1"
            local reconcile_at="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local handled

            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
              if [ "${handled}" = "${reconcile_at}" ]; then
                return 0
              fi
              sleep 5
              elapsed=$((elapsed + 5))
            done

            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
            observability_diagnostics
            exit 1
          }

          # reconcile_flux_resource RESOURCE [TIMEOUT_SECONDS=300]
          # Request a reconcile via the requestedAt annotation and block until
          # the request has been handled.
          reconcile_flux_resource() {
            local resource="$1"
            local timeout_seconds="${2:-300}"
            local reconcile_at

            reconcile_at="$(date +%s%N)"
            kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
            wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
          }

          # request_helmrelease_reconcile RELEASE [RECONCILE_AT]
          # Annotate the HelmRelease with the same token for requestedAt,
          # resetAt and forceAt. RECONCILE_AT defaults to a fresh timestamp;
          # pass it explicitly when the caller needs to wait on that token.
          request_helmrelease_reconcile() {
            local release="$1"
            local reconcile_at="${2:-$(date +%s%N)}"

            kubectl -n flux-system annotate "helmrelease/${release}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
          }

          # wait_for_flux_ready RESOURCE TIMEOUT
          # TIMEOUT is a kubectl duration (e.g. 300s). Print diagnostics and
          # fail when the Ready condition is not reached in time.
          wait_for_flux_ready() {
            local resource="$1"
            local timeout="$2"

            if ! kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
              observability_diagnostics
              exit 1
            fi
          }

          # wait_for_ocirepository_ready_or_cached REPOSITORY TIMEOUT
          # Succeed when the OCIRepository becomes Ready, or, failing that,
          # when its ArtifactInStorage condition is True (a previously fetched
          # artifact is still available). Otherwise print diagnostics and fail.
          wait_for_ocirepository_ready_or_cached() {
            local repository="$1"
            local timeout="$2"
            local artifact_storage

            if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
              return 0
            fi

            artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
            if [ "${artifact_storage}" = "True" ]; then
              echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
              return 0
            fi

            observability_diagnostics
            exit 1
          }

          # wait_for_helmrelease_ready RELEASE TIMEOUT_SECONDS [RECONCILE_AT]
          # Poll every 10s until the HelmRelease is Ready for its current
          # generation; fail fast when it reports Stalled, or on timeout.
          #
          # When RECONCILE_AT is given, Ready/Stalled are ignored until
          # .status.lastHandledReconcileAt equals it. Annotating a release
          # does not change .metadata.generation, so without this gate the
          # first poll can read the status left by the previous run: a stale
          # Ready=True would pass before the requested reconcile ran, and a
          # stale Stalled=True would fail before the reset took effect.
          wait_for_helmrelease_ready() {
            local release="$1"
            local timeout_seconds="$2"
            local reconcile_at="${3:-}"
            local elapsed=0
            local handled
            local ready
            local stalled
            local generation
            local observed_generation

            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              handled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"

              if [ -z "${reconcile_at}" ] || [ "${handled}" = "${reconcile_at}" ]; then
                ready="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
                stalled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
                generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
                observed_generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"

                if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
                  return 0
                fi

                if [ "${stalled}" = "True" ]; then
                  echo "HelmRelease ${release} is stalled" >&2
                  observability_diagnostics
                  exit 1
                fi
              fi

              sleep 10
              elapsed=$((elapsed + 10))
            done

            echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
            observability_diagnostics
            exit 1
          }

          # Source first, then the parent kustomization, then each addon in
          # order: secrets -> stack -> chart sources -> releases -> content.
          reconcile_flux_resource gitrepository/platform 300
          wait_for_flux_ready gitrepository/platform 300s

          reconcile_flux_resource kustomization/infrastructure 300
          wait_for_flux_ready kustomization/infrastructure 300s

          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 300
          reconcile_flux_resource kustomization/addon-observability-secrets 300
          wait_for_flux_ready kustomization/addon-observability-secrets 300s

          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 300
          reconcile_flux_resource kustomization/addon-observability 600
          wait_for_flux_ready kustomization/addon-observability 300s

          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
          wait_for_ocirepository_ready_or_cached loki 300s
          wait_for_ocirepository_ready_or_cached promtail 300s

          for release in kube-prometheus-stack loki promtail; do
            wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
            # Generate the token here so the readiness wait can confirm that
            # this exact request was handled before trusting the status.
            helm_reconcile_at="$(date +%s%N)"
            request_helmrelease_reconcile "${release}" "${helm_reconcile_at}"
            wait_for_helmrelease_ready "${release}" 600 "${helm_reconcile_at}"
          done

          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
          reconcile_flux_resource kustomization/addon-observability-content 300
          wait_for_flux_ready kustomization/addon-observability-content 300s

          # Best-effort Grafana restart; a failure here does not fail the job.
          kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true