From df3d49c0d42a4ce3350deb8d01ba79ba18f7c9a7 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Sat, 2 May 2026 00:07:47 +0000
Subject: [PATCH] fix: remove separate observability workflow

---
Review notes (this area is ignored when the patch is applied):
  * The patch only deletes .gitea/workflows/observability.yml (the
    "Reconcile Observability" workflow); it adds nothing in its place.
  * The line structure below was restored from a whitespace-collapsed copy.
    The removed-line count (282) matches the hunk header and the diffstat,
    but the indentation of the removed lines is reconstructed; verify it
    against blob 0feb587 before applying.

 .gitea/workflows/observability.yml | 282 -----------------------------
 1 file changed, 282 deletions(-)
 delete mode 100644 .gitea/workflows/observability.yml

diff --git a/.gitea/workflows/observability.yml b/.gitea/workflows/observability.yml
deleted file mode 100644
index 0feb587..0000000
--- a/.gitea/workflows/observability.yml
+++ /dev/null
@@ -1,282 +0,0 @@
-name: Reconcile Observability
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "infrastructure/addons/observability/**"
-      - "infrastructure/addons/observability-content/**"
-      - "infrastructure/addons/observability-secrets/**"
-      - "infrastructure/addons/kustomization-observability.yaml"
-      - "infrastructure/addons/kustomization-observability-content.yaml"
-      - "infrastructure/addons/kustomization-observability-secrets.yaml"
-      - "infrastructure/charts/kube-prometheus-stack/**"
-  workflow_dispatch:
-
-concurrency:
-  group: prod-cluster
-  cancel-in-progress: false
-
-env:
-  TF_VERSION: "1.14.9"
-  KUBECTL_VERSION: "v1.34.6"
-  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
-  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
-  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
-  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
-  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
-  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
-  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
-  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
-  TF_VAR_proxmox_insecure: "true"
-
-jobs:
-  observability:
-    name: Observability
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-
-      - name: Setup Terraform
-        uses: hashicorp/setup-terraform@v3
-        with:
-          terraform_version: ${{ env.TF_VERSION }}
-          terraform_wrapper: false
-
-      - name: Setup SSH Keys
-        run: |
-          mkdir -p ~/.ssh
-          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
-          chmod 600 ~/.ssh/id_ed25519
-          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
-          chmod 644 ~/.ssh/id_ed25519.pub
-
-      - name: Terraform Init
-        working-directory: terraform
-        run: |
-          terraform init \
-            -lockfile=readonly \
-            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-            -backend-config="region=auto" \
-            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-            -backend-config="skip_requesting_account_id=true"
-
-      - name: Install kubectl
-        run: |
-          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
-          chmod +x /usr/local/bin/kubectl
-
-      - name: Validate observability manifests
-        run: |
-          set -euo pipefail
-          kubectl kustomize infrastructure/addons/observability >/dev/null
-          kubectl kustomize infrastructure/addons/observability-secrets >/dev/null
-          kubectl kustomize infrastructure/addons/observability-content >/dev/null
-          kubectl kustomize infrastructure/addons >/dev/null
-
-      - name: Refresh kubeconfig
-        run: |
-          set -euo pipefail
-          mkdir -p outputs
-          PRIMARY_IP="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
-          SSH_KEY="$HOME/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${PRIMARY_IP}"
-
-      - name: Reconcile observability
-        env:
-          KUBECONFIG: outputs/kubeconfig
-        run: |
-          set -euo pipefail
-
-          observability_diagnostics() {
-            kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true
-            kubectl -n flux-system describe gitrepository/platform || true
-            kubectl -n flux-system describe kustomization/infrastructure || true
-            kubectl -n flux-system describe kustomization/addon-observability-secrets || true
-            kubectl -n flux-system describe kustomization/addon-observability || true
-            kubectl -n flux-system describe kustomization/addon-observability-content || true
-            kubectl describe clustersecretstore/doppler-hetznerterra || true
-            kubectl -n observability describe externalsecret/grafana-admin || true
-            kubectl -n observability get secret/grafana-admin-credentials || true
-            kubectl -n flux-system describe ocirepository/loki || true
-            kubectl -n flux-system describe ocirepository/promtail || true
-            kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
-            kubectl -n flux-system describe helmrelease/loki || true
-            kubectl -n flux-system describe helmrelease/promtail || true
-            kubectl -n observability get pods,pvc,svc -o wide || true
-            kubectl -n observability get events --sort-by=.lastTimestamp || true
-          }
-
-          wait_for_resource() {
-            local namespace="$1"
-            local resource="$2"
-            local timeout_seconds="$3"
-            local elapsed=0
-
-            until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
-              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
-                echo "Timed out waiting for ${resource} to exist" >&2
-                observability_diagnostics
-                exit 1
-              fi
-
-              sleep 10
-              elapsed=$((elapsed + 10))
-            done
-          }
-
-          wait_for_reconcile_handled() {
-            local resource="$1"
-            local reconcile_at="$2"
-            local timeout_seconds="$3"
-            local elapsed=0
-            local handled
-
-            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
-              handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
-              if [ "${handled}" = "${reconcile_at}" ]; then
-                return 0
-              fi
-
-              sleep 5
-              elapsed=$((elapsed + 5))
-            done
-
-            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
-            observability_diagnostics
-            exit 1
-          }
-
-          reconcile_flux_resource() {
-            local resource="$1"
-            local timeout_seconds="${2:-300}"
-            local reconcile_at
-            reconcile_at="$(date +%s%N)"
-            kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
-            wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
-          }
-
-          request_helmrelease_reconcile() {
-            local release="$1"
-            local reconcile_at
-            reconcile_at="$(date +%s%N)"
-            kubectl -n flux-system annotate "helmrelease/${release}" \
-              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
-              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
-              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
-              --overwrite
-          }
-
-          wait_for_flux_ready() {
-            local resource="$1"
-            local timeout="$2"
-            if ! kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
-              observability_diagnostics
-              exit 1
-            fi
-          }
-
-          wait_for_grafana_secret() {
-            local timeout_seconds="$1"
-            local elapsed=0
-
-            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
-              if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \
-                && kubectl -n observability wait --for=condition=Ready externalsecret/grafana-admin --timeout=30s \
-                && kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
-                return 0
-              fi
-
-              sleep 15
-              elapsed=$((elapsed + 75))
-            done
-
-            echo "Timed out waiting for Grafana admin ExternalSecret to sync" >&2
-            observability_diagnostics
-            exit 1
-          }
-
-          wait_for_ocirepository_ready_or_cached() {
-            local repository="$1"
-            local timeout="$2"
-            local artifact_storage
-
-            if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
-              return 0
-            fi
-
-            artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
-            if [ "${artifact_storage}" = "True" ]; then
-              echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
-              return 0
-            fi
-
-            observability_diagnostics
-            exit 1
-          }
-
-          wait_for_helmrelease_ready() {
-            local release="$1"
-            local timeout_seconds="$2"
-            local elapsed=0
-            local ready
-            local stalled
-            local generation
-            local observed_generation
-
-            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
-              ready="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
-              stalled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
-              generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
-              observed_generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
-
-              if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
-                return 0
-              fi
-
-              if [ "${stalled}" = "True" ]; then
-                echo "HelmRelease ${release} is stalled" >&2
-                observability_diagnostics
-                exit 1
-              fi
-
-              sleep 10
-              elapsed=$((elapsed + 10))
-            done
-
-            echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
-            observability_diagnostics
-            exit 1
-          }
-
-          reconcile_flux_resource gitrepository/platform 300
-          wait_for_flux_ready gitrepository/platform 300s
-          reconcile_flux_resource kustomization/infrastructure 300
-          wait_for_flux_ready kustomization/infrastructure 300s
-
-          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 300
-          reconcile_flux_resource kustomization/addon-observability-secrets 300
-          wait_for_flux_ready kustomization/addon-observability-secrets 300s
-          wait_for_grafana_secret 900
-          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 300
-          reconcile_flux_resource kustomization/addon-observability 600
-          wait_for_flux_ready kustomization/addon-observability 300s
-
-          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
-          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
-          wait_for_ocirepository_ready_or_cached loki 300s
-          wait_for_ocirepository_ready_or_cached promtail 300s
-
-          for release in kube-prometheus-stack loki promtail; do
-            wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
-            request_helmrelease_reconcile "${release}"
-            wait_for_helmrelease_ready "${release}" 600
-          done
-
-          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
-          reconcile_flux_resource kustomization/addon-observability-content 300
-          wait_for_flux_ready kustomization/addon-observability-content 300s
-          kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true