diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 0234439..25c2096 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -4,6 +4,16 @@ on:
   push:
     branches:
       - main
+    paths-ignore:
+      - "ansible/dashboards.yml"
+      - "ansible/roles/observability-content/**"
+      - "infrastructure/addons/observability/**"
+      - "infrastructure/addons/observability-content/**"
+      - "infrastructure/addons/observability-secrets/**"
+      - "infrastructure/addons/kustomization-observability.yaml"
+      - "infrastructure/addons/kustomization-observability-content.yaml"
+      - "infrastructure/addons/kustomization-observability-secrets.yaml"
+      - "infrastructure/charts/kube-prometheus-stack/**"
   pull_request:
     branches:
       - main
@@ -879,6 +889,20 @@ jobs:
         run: |
           set -euo pipefail
 
+          observability_diagnostics() {
+            kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true
+            kubectl -n flux-system describe kustomization/addon-observability-secrets || true
+            kubectl -n flux-system describe kustomization/addon-observability || true
+            kubectl -n flux-system describe kustomization/addon-observability-content || true
+            kubectl -n flux-system describe ocirepository/loki || true
+            kubectl -n flux-system describe ocirepository/promtail || true
+            kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
+            kubectl -n flux-system describe helmrelease/loki || true
+            kubectl -n flux-system describe helmrelease/promtail || true
+            kubectl -n observability get pods,pvc,svc -o wide || true
+            kubectl -n observability get events --sort-by=.lastTimestamp || true
+          }
+
           wait_for_resource() {
             local namespace="$1"
             local resource="$2"
@@ -888,7 +912,7 @@ jobs:
             until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
               if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                 echo "Timed out waiting for ${resource} to exist" >&2
-                kubectl -n flux-system get kustomizations,helmreleases || true
+                observability_diagnostics
                 exit 1
               fi
 
@@ -915,7 +939,7 @@ jobs:
             done
 
             echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
-            kubectl -n flux-system describe "${resource}" || true
+            observability_diagnostics
             exit 1
           }
 
@@ -928,7 +952,7 @@ jobs:
             wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
           }
 
-          reconcile_helmrelease() {
+          request_helmrelease_reconcile() {
             local release="$1"
             local reconcile_at
             reconcile_at="$(date +%s%N)"
@@ -937,25 +961,102 @@ jobs:
               reconcile.fluxcd.io/resetAt="${reconcile_at}" \
               reconcile.fluxcd.io/forceAt="${reconcile_at}" \
               --overwrite
-            wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
+            # Publish the token so the caller can wait for this exact request.
+            last_helmrelease_reconcile_at="${reconcile_at}"
+          }
+
+          # Wait for Flux resource $1 to be Ready within $2; dump diagnostics and fail otherwise.
+          wait_for_flux_ready() {
+            local resource="$1"
+            local timeout="$2"
+            if ! kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
+              observability_diagnostics
+              exit 1
+            fi
+          }
+
+          # Wait for OCIRepository $1 to be Ready within $2, tolerating a not-Ready
+          # source when its ArtifactInStorage condition is True.
+          wait_for_ocirepository_ready_or_cached() {
+            local repository="$1"
+            local timeout="$2"
+            local artifact_storage
+
+            if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
+              return 0
+            fi
+
+            artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
+            if [ "${artifact_storage}" = "True" ]; then
+              echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
+              return 0
+            fi
+
+            observability_diagnostics
+            exit 1
+          }
+
+          # Wait until HelmRelease $1 is Ready at its current generation, polling for
+          # up to $2 seconds. When $3 (a reconcile request token) is given, Ready and
+          # Stalled are ignored until status.lastHandledReconcileAt equals that token,
+          # so conditions left over from before the request cannot end the wait early.
+          wait_for_helmrelease_ready() {
+            local release="$1"
+            local timeout_seconds="$2"
+            local reconcile_at="${3:-}"
+            local elapsed=0
+            local handled
+            local ready
+            local stalled
+            local generation
+            local observed_generation
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              ready="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+              stalled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+              generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
+              observed_generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
+
+              if [ -z "${reconcile_at}" ] || [ "${handled}" = "${reconcile_at}" ]; then
+                if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
+                  return 0
+                fi
+
+                if [ "${stalled}" = "True" ]; then
+                  echo "HelmRelease ${release} is stalled" >&2
+                  observability_diagnostics
+                  exit 1
+                fi
+              fi
+
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+
+            echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
+            observability_diagnostics
+            exit 1
           }
 
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
-          reconcile_flux_resource kustomization/addon-observability-secrets 600
-          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s
+          reconcile_flux_resource kustomization/addon-observability-secrets 300
+          wait_for_flux_ready kustomization/addon-observability-secrets 300s
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
-          reconcile_flux_resource kustomization/addon-observability 1800
-          if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then
-            kubectl -n flux-system describe kustomization/addon-observability || true
-            kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
-            kubectl -n flux-system describe helmrelease/loki || true
-            kubectl -n flux-system describe helmrelease/promtail || true
-            kubectl -n observability get pods -o wide || true
-            exit 1
-          fi
+          reconcile_flux_resource kustomization/addon-observability 600
+          wait_for_flux_ready kustomization/addon-observability 300s
+          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
+          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
+          wait_for_ocirepository_ready_or_cached loki 300s
+          wait_for_ocirepository_ready_or_cached promtail 300s
           for release in kube-prometheus-stack loki promtail; do
-            reconcile_helmrelease "${release}"
+            wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
+            request_helmrelease_reconcile "${release}"
+            wait_for_helmrelease_ready "${release}" 600 "${last_helmrelease_reconcile_at}"
           done
+          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
+          reconcile_flux_resource kustomization/addon-observability-content 300
+          wait_for_flux_ready kustomization/addon-observability-content 300s
           kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
 
       - name: Post-deploy cluster health checks
@@ -977,9 +1078,9 @@ jobs:
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
-          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
-          kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
+          kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
           kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
           kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
           kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
diff --git a/.gitea/workflows/observability.yml b/.gitea/workflows/observability.yml
new file mode 100644
index 0000000..1621c0d
--- /dev/null
+++ b/.gitea/workflows/observability.yml
@@ -0,0 +1,278 @@
+name: Reconcile Observability
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "infrastructure/addons/observability/**"
+      - "infrastructure/addons/observability-content/**"
+      - "infrastructure/addons/observability-secrets/**"
+      - "infrastructure/addons/kustomization-observability.yaml"
+      - "infrastructure/addons/kustomization-observability-content.yaml"
+      - "infrastructure/addons/kustomization-observability-secrets.yaml"
+      - "infrastructure/charts/kube-prometheus-stack/**"
+  workflow_dispatch:
+
+concurrency:
+  group: prod-cluster
+  cancel-in-progress: false
+
+env:
+  TF_VERSION: "1.14.9"
+  KUBECTL_VERSION: "v1.34.6"
+  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
+  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
+  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
+  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
+  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"
+
+jobs:
+  observability:
+    name: Observability
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+          terraform_wrapper: false
+
+      - name: Setup SSH Keys
+        run: |
+          mkdir -p ~/.ssh
+          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
+          chmod 644 ~/.ssh/id_ed25519.pub
+
+      - name: Terraform Init
+        working-directory: terraform
+        run: |
+          terraform init \
+            -lockfile=readonly \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+
+      - name: Install kubectl
+        run: |
+          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
+          chmod +x /usr/local/bin/kubectl
+
+      - name: Validate observability manifests
+        run: |
+          set -euo pipefail
+          kubectl kustomize infrastructure/addons/observability >/dev/null
+          kubectl kustomize infrastructure/addons/observability-secrets >/dev/null
+          kubectl kustomize infrastructure/addons/observability-content >/dev/null
+          kubectl kustomize infrastructure/addons >/dev/null
+
+      - name: Refresh kubeconfig
+        run: |
+          set -euo pipefail
+          mkdir -p outputs
+          PRIMARY_IP="$(terraform -chdir=terraform output -raw primary_control_plane_ip)"
+          SSH_KEY="$HOME/.ssh/id_ed25519" scripts/refresh-kubeconfig.sh "${PRIMARY_IP}"
+
+      - name: Reconcile observability
+        env:
+          KUBECONFIG: outputs/kubeconfig
+        run: |
+          set -euo pipefail
+
+          # Dump Flux and observability-namespace state to help debug a failed run.
+          observability_diagnostics() {
+            kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true
+            kubectl -n flux-system describe gitrepository/platform || true
+            kubectl -n flux-system describe kustomization/infrastructure || true
+            kubectl -n flux-system describe kustomization/addon-observability-secrets || true
+            kubectl -n flux-system describe kustomization/addon-observability || true
+            kubectl -n flux-system describe kustomization/addon-observability-content || true
+            kubectl -n flux-system describe ocirepository/loki || true
+            kubectl -n flux-system describe ocirepository/promtail || true
+            kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true
+            kubectl -n flux-system describe helmrelease/loki || true
+            kubectl -n flux-system describe helmrelease/promtail || true
+            kubectl -n observability get pods,pvc,svc -o wide || true
+            kubectl -n observability get events --sort-by=.lastTimestamp || true
+          }
+
+          # Poll every 10s until resource $2 exists in namespace $1; fail after $3 seconds.
+          wait_for_resource() {
+            local namespace="$1"
+            local resource="$2"
+            local timeout_seconds="$3"
+            local elapsed=0
+
+            until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
+              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
+                echo "Timed out waiting for ${resource} to exist" >&2
+                observability_diagnostics
+                exit 1
+              fi
+
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+          }
+
+          # Poll every 5s until status.lastHandledReconcileAt of resource $1 equals
+          # token $2; dump diagnostics and fail after $3 seconds.
+          wait_for_reconcile_handled() {
+            local resource="$1"
+            local reconcile_at="$2"
+            local timeout_seconds="$3"
+            local elapsed=0
+            local handled
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              if [ "${handled}" = "${reconcile_at}" ]; then
+                return 0
+              fi
+
+              sleep 5
+              elapsed=$((elapsed + 5))
+            done
+
+            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
+            observability_diagnostics
+            exit 1
+          }
+
+          # Request a reconcile of Flux resource $1 and wait up to $2 seconds (default 300) for it to be handled.
+          reconcile_flux_resource() {
+            local resource="$1"
+            local timeout_seconds="${2:-300}"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+            wait_for_reconcile_handled "${resource}" "${reconcile_at}" "${timeout_seconds}"
+          }
+
+          # Annotate HelmRelease $1 with a fresh requestedAt/resetAt/forceAt token.
+          request_helmrelease_reconcile() {
+            local release="$1"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n flux-system annotate "helmrelease/${release}" \
+              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
+              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
+              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
+              --overwrite
+            # Publish the token so the caller can wait for this exact request.
+            last_helmrelease_reconcile_at="${reconcile_at}"
+          }
+
+          # Wait for Flux resource $1 to be Ready within $2; dump diagnostics and fail otherwise.
+          wait_for_flux_ready() {
+            local resource="$1"
+            local timeout="$2"
+            if ! kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
+              observability_diagnostics
+              exit 1
+            fi
+          }
+
+          # Wait for OCIRepository $1 to be Ready within $2, tolerating a not-Ready
+          # source when its ArtifactInStorage condition is True.
+          wait_for_ocirepository_ready_or_cached() {
+            local repository="$1"
+            local timeout="$2"
+            local artifact_storage
+
+            if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
+              return 0
+            fi
+
+            artifact_storage="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
+            if [ "${artifact_storage}" = "True" ]; then
+              echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
+              return 0
+            fi
+
+            observability_diagnostics
+            exit 1
+          }
+
+          # Wait until HelmRelease $1 is Ready at its current generation, polling for
+          # up to $2 seconds. When $3 (a reconcile request token) is given, Ready and
+          # Stalled are ignored until status.lastHandledReconcileAt equals that token,
+          # so conditions left over from before the request cannot end the wait early.
+          wait_for_helmrelease_ready() {
+            local release="$1"
+            local timeout_seconds="$2"
+            local reconcile_at="${3:-}"
+            local elapsed=0
+            local handled
+            local ready
+            local stalled
+            local generation
+            local observed_generation
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              ready="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
+              stalled="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+              generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
+              observed_generation="$(kubectl -n flux-system get "helmrelease/${release}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
+
+              if [ -z "${reconcile_at}" ] || [ "${handled}" = "${reconcile_at}" ]; then
+                if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
+                  return 0
+                fi
+
+                if [ "${stalled}" = "True" ]; then
+                  echo "HelmRelease ${release} is stalled" >&2
+                  observability_diagnostics
+                  exit 1
+                fi
+              fi
+
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+
+            echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
+            observability_diagnostics
+            exit 1
+          }
+
+          reconcile_flux_resource gitrepository/platform 300
+          wait_for_flux_ready gitrepository/platform 300s
+          reconcile_flux_resource kustomization/infrastructure 300
+          wait_for_flux_ready kustomization/infrastructure 300s
+
+          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 300
+          reconcile_flux_resource kustomization/addon-observability-secrets 300
+          wait_for_flux_ready kustomization/addon-observability-secrets 300s
+          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 300
+          reconcile_flux_resource kustomization/addon-observability 600
+          wait_for_flux_ready kustomization/addon-observability 300s
+
+          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
+          wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
+          wait_for_ocirepository_ready_or_cached loki 300s
+          wait_for_ocirepository_ready_or_cached promtail 300s
+
+          for release in kube-prometheus-stack loki promtail; do
+            wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
+            request_helmrelease_reconcile "${release}"
+            wait_for_helmrelease_ready "${release}" 600 "${last_helmrelease_reconcile_at}"
+          done
+
+          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
+          reconcile_flux_resource kustomization/addon-observability-content 300
+          wait_for_flux_ready kustomization/addon-observability-content 300s
+          kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
diff --git a/infrastructure/addons/kustomization-observability.yaml b/infrastructure/addons/kustomization-observability.yaml
index 82d5849..8640129 100644
--- a/infrastructure/addons/kustomization-observability.yaml
+++ b/infrastructure/addons/kustomization-observability.yaml
@@ -16,18 +16,5 @@ spec:
     - name: addon-tailscale-operator
     - name: addon-tailscale-proxyclass
   wait: false
-  healthChecks:
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: kube-prometheus-stack
-      namespace: flux-system
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: loki
-      namespace: flux-system
-    - apiVersion: helm.toolkit.fluxcd.io/v2
-      kind: HelmRelease
-      name: promtail
-      namespace: flux-system
-  timeout: 30m
+  timeout: 5m
   suspend: false