Compare commits

196 Commits

Author SHA1 Message Date
ceefcc3b29 cleanup: Remove obsolete port-forwarding, deferred Traefik files, and CI workaround
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 13m9s
- Remove ansible/roles/private-access/ (replaced by Tailscale LB services)
- Remove deferred observability ingress/traefik files (replaced by direct Tailscale LBs)
- Remove orphaned kustomization-traefik-config.yaml (no backing directory)
- Simplify CI: remove SA patch + job deletion workaround for rancher-backup
  (now handled by postRenderer in HelmRelease)
- Update AGENTS.md to reflect current architecture
2026-04-02 01:21:23 +00:00
0d339b3163 fix: Use rancher/kubectl image for rancher-backup hook
All checks were successful
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m41s
bitnami/kubectl:1.34 tag doesn't exist. rancher/kubectl is already
available in the cluster's image cache.
2026-04-02 01:00:27 +00:00
30ccf13c82 fix: Use postRenderer to replace broken kuberlr-kubectl image in rancher-backup hook
Some checks failed
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook hardcodes rancher/kuberlr-kubectl which
can't download kubectl. Use Flux postRenderers to patch the job image
to bitnami/kubectl at render time.
2026-04-02 00:51:50 +00:00
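The postRenderers approach this commit describes can be sketched roughly as follows. The hook job name, image tag, and patch path are illustrative assumptions, not taken from the repository:

```yaml
# Sketch: Flux HelmRelease postRenderer that rewrites a chart hook job's image
# at render time. Job name and image tag are assumed for illustration.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: cattle-resources-system
spec:
  # chart / interval / values elided
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: patch-sa            # assumed hook job name
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: rancher/kubectl:v1.30.0   # assumed working tag
```

Because the patch is applied to the rendered manifests, the chart itself stays unmodified and upgrades cleanly.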
75e3604f30 fix: Skip post-install hooks for rancher-backup HelmRelease
Some checks failed
Deploy Cluster / Terraform (push) Successful in 57s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook uses rancher/kuberlr-kubectl which fails
to download kubectl. The SA automountServiceAccountToken is managed
manually, so the hook is unnecessary.
2026-04-02 00:45:03 +00:00
e4235a6e58 fix: Correct Flux UI pod selector labels to match deployed weave-gitops labels
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 20m36s
Actual labels are app.kubernetes.io/name=weave-gitops and
app.kubernetes.io/instance=flux-system-weave-gitops.
2026-04-01 02:08:12 +00:00
ea2d534171 fix: Use admin.existingSecret for Grafana creds from Doppler
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 20m42s
Revert to idiomatic Grafana chart approach. ExternalSecret creates the
secret with admin-user/admin-password keys before Grafana's first start
on fresh cluster creation.
2026-04-01 01:41:49 +00:00
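The pattern described here — an ExternalSecret materializing the credentials before Grafana's first start, consumed via `admin.existingSecret` — might look like this. Namespace, store name, and Doppler key names are assumptions:

```yaml
# Sketch: Doppler-backed admin secret with the key names the Grafana chart
# expects (admin-user / admin-password). Store and Doppler keys are assumed.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: monitoring              # assumed namespace
spec:
  secretStoreRef:
    name: doppler                    # assumed ClusterSecretStore name
    kind: ClusterSecretStore
  target:
    name: grafana-admin
  data:
    - secretKey: admin-user
      remoteRef:
        key: GRAFANA_ADMIN_USER      # assumed Doppler key
    - secretKey: admin-password
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD  # assumed Doppler key
```

The chart then references it from its values, e.g. `admin.existingSecret: grafana-admin`.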
a1b9fe6aa6 fix: Use Flux valuesFrom to inject Doppler Grafana creds as Helm values
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 20m38s
Switch from admin.existingSecret to valuesFrom so Flux reads the
Doppler-managed secret and injects credentials as standard Helm values.
2026-03-31 23:40:54 +00:00
33765657ec fix: Correct pod selectors for Prometheus and Flux Tailscale services, use Doppler for Grafana creds
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 21m0s
Prometheus needs operator.prometheus.io/name label selector. Flux UI pods
are labeled gitops-server not weave-gitops. Grafana now reads admin creds
from Doppler via ExternalSecret instead of hardcoded values.
2026-03-31 22:54:57 +00:00
b8f64fa952 feat: Expose Grafana, Prometheus, and Flux UI via Tailscale LoadBalancer services
All checks were successful
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 20m47s
Replace Ansible port-forwarding + tailscale serve with direct Tailscale LB
services matching the existing Rancher pattern. Each service gets its own
tailnet hostname (grafana/prometheus/flux.silverside-gopher.ts.net).
2026-03-31 08:53:28 +00:00
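A per-service Tailscale LoadBalancer of the kind this commit introduces can be sketched as below (using Grafana as the example); the namespace, port numbers, and pod labels are assumptions:

```yaml
# Sketch: expose a workload on its own tailnet hostname via the Tailscale
# Kubernetes operator. Selector labels and ports are assumed.
apiVersion: v1
kind: Service
metadata:
  name: grafana-tailscale
  namespace: monitoring                # assumed
  annotations:
    tailscale.com/hostname: grafana    # -> grafana.silverside-gopher.ts.net
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana    # assumed pod labels
  ports:
    - port: 80
      targetPort: 3000                 # Grafana's default container port
```

Each exposed app gets one such Service, mirroring the existing Rancher pattern.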
569d741751 push
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m37s
Deploy Cluster / Ansible (push) Successful in 25m37s
2026-03-31 02:46:55 +00:00
89e53d9ec9 fix: Handle restricted B2 keys and safe JSON parsing in restore step
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 20m48s
2026-03-31 01:43:04 +00:00
5a2551f40a fix: Correct flux CLI download URL (GitHub release URL needs v prefix on version)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Failing after 21m52s
2026-03-30 03:11:40 +00:00
8c7b62c024 feat: Automate Rancher backup restore in CI pipeline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Failing after 6m28s
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
2026-03-30 01:56:29 +00:00
a1f07f863a docs: Update restore template with real Backup CR format
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m2s
Include actual restore CR spec and note the latest backup filename for reference.
2026-03-30 00:09:53 +00:00
2c3a49c2e0 fix: Rename B2 secret keys to match rancher-backup operator expectations
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
The operator expects accessKey/secretKey, not aws_access_key_id/aws_secret_access_key.
2026-03-30 00:05:13 +00:00
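The key rename amounts to remapping the Doppler-held values onto the names the operator reads. A minimal sketch, with store name and namespace assumed:

```yaml
# Sketch: ExternalSecret emitting accessKey/secretKey (operator-expected
# names) from the Doppler-held B2 credentials.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: b2-credentials
  namespace: cattle-resources-system
spec:
  secretStoreRef:
    name: doppler                  # assumed ClusterSecretStore name
    kind: ClusterSecretStore
  target:
    name: b2-credentials
  data:
    - secretKey: accessKey         # was aws_access_key_id
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: secretKey         # was aws_secret_access_key
      remoteRef:
        key: B2_APPLICATION_KEY
```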
a7ce3dcc1a fix: Remove s3 block from rancher-backup HelmRelease values
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m12s
The S3 config caused the operator to try downloading kubectl, which fails in the container.
S3 credentials are correctly configured in the Backup CR and ExternalSecret instead.
2026-03-29 23:47:21 +00:00
0ab9418458 fix: Re-add HTTPS port to Tailscale LB for Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 6m6s
Rancher now manages its own TLS (no longer tls:external), so it serves
HTTPS on port 443. The Tailscale LoadBalancer needs to expose both
HTTP (80) and HTTPS (443) targeting the corresponding container ports.
2026-03-29 23:04:49 +00:00
c251672618 fix: Configure S3 bucketName for rancher-backup operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 23:01:18 +00:00
89364e8f37 fix: Add dependsOn for rancher-backup operator to wait for CRDs
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 22:57:22 +00:00
20d7a6f777 fix: Install rancher-backup CRD chart before operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
The rancher-backup operator requires CRDs from the rancher-backup-crd
chart to be installed first.
2026-03-29 22:51:34 +00:00
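The CRDs-before-operator ordering is expressed in Flux with `dependsOn` between Kustomizations. A sketch, with resource names and path assumed:

```yaml
# Sketch: the operator Kustomization waits for the CRD-chart Kustomization.
# Names and path are illustrative assumptions.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup
  namespace: flux-system
spec:
  dependsOn:
    - name: addon-rancher-backup-crd   # assumed CRD Kustomization name
  interval: 10m
  prune: true
  path: ./kubernetes/addons/rancher-backup   # assumed path
  sourceRef:
    kind: GitRepository
    name: flux-system
```

Flux will not reconcile this Kustomization until the dependency reports Ready, so the operator's CRs never hit a cluster without their CRDs.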
22ce5fd6f4 feat: Add cert-manager as dependency for Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m59s
Rancher requires cert-manager when managing its own TLS (not tls:external).
Added cert-manager HelmRelease with CRDs enabled.
2026-03-29 22:36:30 +00:00
afb1782d38 fix: Separate Backup CRs into their own kustomization
All checks were successful
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m57s
The Backup and Restore CRs need the rancher-backup CRDs to exist first.
Moved them to a separate kustomization that depends on the operator being ready.
2026-03-29 22:22:29 +00:00
48870433bf fix: Remove tls:external from Rancher HelmRelease
Some checks failed
Deploy Cluster / Terraform (push) Failing after 55s
Deploy Cluster / Ansible (push) Has been skipped
With Tailscale LoadBalancer, TLS is not actually terminated at the edge.
The Tailscale proxy does TCP passthrough, so Rancher must serve its own
TLS certs. Setting tls: external caused Rancher to listen HTTP-only,
which broke HTTPS access through Tailscale.
2026-03-29 22:19:23 +00:00
f2c506b350 refactor: Replace CNPG external DB with rancher-backup operator
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m5s
Rancher 2.x uses embedded etcd, not an external PostgreSQL database.
The CATTLE_DB_CATTLE_* env vars are Rancher v1 only and were ignored.

- Remove all CNPG (CloudNativePG) cluster, operator, and related configs
- Remove external DB env vars from Rancher HelmRelease
- Remove rancher-db-password ExternalSecret
- Add rancher-backup operator HelmRelease (v106.0.2+up8.1.0)
- Add B2 credentials ExternalSecret for backup storage
- Add recurring Backup CR (daily at 03:00, 7 day retention)
- Add commented-out Restore CR for rebuild recovery
- Update Flux dependency graph accordingly
2026-03-29 21:53:16 +00:00
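The recurring Backup CR from the list above (daily at 03:00, 7 backups retained) can be sketched with the rancher-backup operator's schema; bucket, region, endpoint, and secret names are assumptions:

```yaml
# Sketch: recurring rancher-backup Backup CR targeting B2's S3-compatible
# API. Storage details are illustrative assumptions.
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-daily
spec:
  resourceSetName: rancher-resource-set
  schedule: "0 3 * * *"        # daily at 03:00
  retentionCount: 7            # keep the last 7 backups
  storageLocation:
    s3:
      credentialSecretName: b2-credentials          # assumed
      credentialSecretNamespace: cattle-resources-system
      bucketName: rancher-backups                   # assumed
      region: us-west-004                           # assumed B2 region
      endpoint: s3.us-west-004.backblazeb2.com      # assumed B2 endpoint
```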
efdf13976a fix: Handle missing 'online' field in Tailscale API response
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m12s
Deploy Cluster / Ansible (push) Successful in 9m19s
2026-03-29 13:52:23 +00:00
5269884408 feat: Auto-cleanup stale Tailscale devices before cluster boot
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m17s
Deploy Cluster / Ansible (push) Failing after 6m35s
Adds tailscale-cleanup Ansible role that uses the Tailscale API to
delete offline devices matching reserved hostnames (e.g. rancher).
Runs during site.yml before Finalize to prevent hostname collisions
like rancher-1 on rebuild.

Requires TAILSCALE_API_KEY (API access token) passed as extra var.
2026-03-29 11:47:53 +00:00
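The cleanup role's core logic — list devices, delete the offline ones whose hostname matches a reserved name — might be sketched as two Ansible tasks against the public Tailscale v2 API. Variable names are assumptions; the `default(false)` guard also covers responses that omit the `online` field:

```yaml
# Sketch of the tailscale-cleanup role's tasks. tailscale_api_key and
# reserved_hostnames are assumed role variables.
- name: List tailnet devices
  ansible.builtin.uri:
    url: "https://api.tailscale.com/api/v2/tailnet/-/devices"
    headers:
      Authorization: "Bearer {{ tailscale_api_key }}"
  register: ts_devices

- name: Delete offline devices with reserved hostnames
  ansible.builtin.uri:
    url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
    method: DELETE
    headers:
      Authorization: "Bearer {{ tailscale_api_key }}"
  loop: "{{ ts_devices.json.devices | default([]) }}"
  when:
    - not (item.online | default(false))
    # strip numeric suffixes so rancher-1 matches the reserved name rancher
    - (item.hostname | regex_replace('-\\d+$', '')) in reserved_hostnames
```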
6e5b0518be feat: Add kubeconfig refresh script and fix Ansible Finalize to use public IP
All checks were successful
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m25s
- scripts/refresh-kubeconfig.sh fetches a fresh kubeconfig from CP1
- Ansible site.yml Finalize step now uses public IP instead of Tailscale
  hostname for the kubeconfig server address
- Updated AGENTS.md with kubeconfig refresh instructions
2026-03-29 03:31:36 +00:00
905d069e91 fix: Add serverName to CNPG externalClusters for B2 recovery
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
CNPG uses the external cluster name (b2-backup) as the barman server
name by default, but the backups were stored under server name rancher-db.
2026-03-29 03:22:19 +00:00
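The `serverName` fix maps onto the CNPG Cluster spec like this; the destination path, endpoint, and credential key names are assumptions:

```yaml
# Sketch: recovery from an external cluster whose WAL archive was written
# under a different barman server name than the external cluster's name.
spec:
  bootstrap:
    recovery:
      source: b2-backup
  externalClusters:
    - name: b2-backup
      barmanObjectStore:
        serverName: rancher-db        # name the backups were stored under
        destinationPath: s3://rancher-backups/            # assumed
        endpointURL: https://s3.us-west-004.backblazeb2.com   # assumed
        s3Credentials:
          accessKeyId:
            name: b2-credentials
            key: ACCESS_KEY_ID        # assumed key name
          secretAccessKey:
            name: b2-credentials
            key: ACCESS_SECRET_KEY    # assumed key name
```

Without `serverName`, CNPG defaults to the external cluster name (`b2-backup`) and finds no archive.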
25ba4b7115 fix: Add skipEmptyWalArchiveCheck annotation and B2 secret healthcheck to CNPG
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
- Skip WAL archive emptiness check so recovery works when restoring over
  an existing backup archive in B2
- Add healthCheck for b2-credentials secret in CNPG kustomization to
  prevent recovery from starting before ExternalSecret has synced
2026-03-29 03:15:23 +00:00
6a593fd559 feat: Add B2 recovery bootstrap to CNPG cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m6s
Deploy Cluster / Ansible (push) Successful in 8m16s
2026-03-29 00:22:24 +00:00
936f54a1b5 fix: Restore canonical Rancher tailnet hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-29 00:00:39 +00:00
c9df11e65f fix: Align Rancher tailnet hostname with live proxy
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-28 23:47:09 +00:00
a3c238fda9 fix: Apply Rancher server URL after chart install
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m43s
Deploy Cluster / Ansible (push) Successful in 10m39s
2026-03-28 23:12:59 +00:00
a15fa50302 fix: Use Doppler-backed Rancher bootstrap password
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m43s
2026-03-28 22:51:38 +00:00
0f4f0b09fb fix: Add Rancher DB password ExternalSecret
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m42s
2026-03-28 22:42:05 +00:00
4c002a870c fix: Remove invalid Rancher server-url manifest
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 22:39:31 +00:00
43d11ac7e6 docs: Add agent guidance and sync Rancher docs
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m33s
Deploy Cluster / Ansible (push) Successful in 9m44s
2026-03-28 22:13:37 +00:00
8c5edcf0a1 fix: Set Rancher server URL to tailnet hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 1m0s
Deploy Cluster / Ansible (push) Successful in 6m27s
2026-03-28 04:07:44 +00:00
a81da0d178 feat: Expose Rancher via Tailscale hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m42s
2026-03-28 03:59:02 +00:00
2a72527c79 fix: Switch Traefik from LoadBalancer to NodePort, remove unused Hetzner LB
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m25s
2026-03-28 03:21:19 +00:00
7cb3b84ecb feat: Replace custom pgdump job with CNPG ScheduledBackup
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m30s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 03:15:39 +00:00
d4930235fa fix: Point CNPG backups at the existing B2 bucket
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m17s
2026-03-26 23:35:19 +00:00
ee8dc4b451 fix: Add Role for B2 credentials access
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m29s
2026-03-26 23:04:40 +00:00
144d40e7ac feat: Add RBAC for CNP to read B2 credentials secret
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m38s
2026-03-26 22:56:00 +00:00
cc14e32572 fix: Use gzip instead of lzop for backup compression
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:51:10 +00:00
a207a5a7fd fix: Remove invalid encryption field from CNP backup config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:49:29 +00:00
4e1772c175 feat: Add B2 backup configuration to CNP Cluster
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:47:31 +00:00
ff70b12084 chore: Add HTTP/HTTPS firewall rules for Load Balancer
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m56s
2026-03-26 22:36:13 +00:00
a3963c56e6 cleanup: Remove traefik-config, simplify traefik helmrelease
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 03:16:56 +00:00
612435c42c fix: Add Hetzner LB health check config to Traefik
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:11:10 +00:00
ac42f671a2 fix: Remove addon-traefik-config dependency from flux-ui
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:05:58 +00:00
dbe7ec0468 fix: Remove expose boolean from traefik ports config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:01:13 +00:00
816ac8b3c0 fix: Use official Traefik helm repo instead of rancher-stable
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:59:00 +00:00
6f7998639f fix: Use standard kustomize API in traefik addon
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:56:52 +00:00
7a14f89ad1 fix: Correct traefik kustomization path and sourceRef
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:55:37 +00:00
786901c5d7 fix: Correct traefik kustomization reference (directory not file)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:54:29 +00:00
46f3d1130b feat: Add Flux-managed Traefik HelmRelease with Hetzner LB config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:52:49 +00:00
2fe5a626d4 fix: Add Hetzner network zone annotation to Traefik LoadBalancer
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 02:30:43 +00:00
2ef68c8087 fix: Remove deprecated enablePodMonitor field in CNP Cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 10m15s
2026-03-26 01:01:53 +00:00
e2cae18f5f fix: Remove backup config for initial deployment - add backup after DB is running
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m56s
2026-03-26 00:46:50 +00:00
e0c1e41ee9 fix: Remove bootstrap recovery - create fresh DB (recovery only needed after first backup)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:43:49 +00:00
63533de901 fix: Fix retentionPolicy format (14d not keep14)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:41:44 +00:00
1b39710f63 fix: Move retentionPolicy to correct location in backup spec
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:39:25 +00:00
8c034323dc fix: Fix Cluster CR with correct barmanObjectStore schema
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:35:23 +00:00
5fa2b411ee fix: Fix Cluster CR schema - use barmanObjectStore instead of b2
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:33:04 +00:00
3ea28e525f fix: Fix CNP operator image repository (cloudnative-pg not postgresql)
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 4m55s
2026-03-26 00:21:09 +00:00
4b95ba113d fix: Remove LPP helm (already installed by k3s), fix CNP chart version to 0.27.1
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 5m7s
2026-03-26 00:13:22 +00:00
13627bf81f fix: Split CNP operator from CNP cluster to fix CRD dependency
All checks were successful
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 5m0s
- Move CNP operator HelmRelease to cnpg-operator folder
- Create addon-cnpg-operator kustomization (deploys operator first)
- Update addon-cnpg to dependOn addon-cnpg-operator
- Add addon-cnpg as dependency for addon-rancher (needs database)
2026-03-26 00:06:34 +00:00
ef3fb2489a fix: Convert kustomization-lpp and kustomization-cnpg to Flux Kustomization CRs
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:03:53 +00:00
7097495d72 fix: Add missing metadata.name to kustomization-lpp and kustomization-cnpg
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m7s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-25 23:39:45 +00:00
9d601dc77c feat: Add CloudNativePG with B2 backups for persistent Rancher database
Some checks failed
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s
- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
2026-03-25 23:06:45 +00:00
f36445d99a Fix CNI: configure flannel to use private network interface (enp7s0) instead of public
All checks were successful
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Successful in 8m42s
2026-03-25 01:44:33 +00:00
89c2c99963 Fix Rancher: remove conflicting LoadBalancer, add HTTPS port-forward, use tailscale serve only
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m2s
2026-03-25 00:59:16 +00:00
4a35cfb549 Fix Rancher: use correct targetPort 444 for HTTPS
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 18m56s
2026-03-24 23:30:58 +00:00
3d50bfc534 Fix Rancher service selector: use cattle-system-rancher label
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-24 23:25:36 +00:00
ab2f287bfb Fix Rancher: use correct service name cattle-system-rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m23s
2026-03-24 22:30:49 +00:00
dcb2675b67 Upgrade Rancher to 2.13.3 for K8s 1.34 compatibility
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m13s
2026-03-24 21:42:51 +00:00
b40bec7e0e Fix Rancher: use Doppler secret instead of hardcoded password
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m12s
2026-03-24 21:13:23 +00:00
efe0c0cfd5 Fix Rancher: upgrade to 2.10.3 for K8s 1.34 compatibility
All checks were successful
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 4m20s
2026-03-24 20:29:38 +00:00
c61d9f9c1d Remove traefik-config dependency from Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m5s
Deploy Cluster / Ansible (push) Successful in 8m18s
2026-03-24 20:02:08 +00:00
60ceac4624 Fix Rancher access: add kubectl port-forward + tailscale serve setup
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-24 20:01:57 +00:00
47b384a337 Fix Rancher access: add Tailscale service for Traefik with port 9442, fix deployment order
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m18s
2026-03-24 19:40:37 +00:00
ecf17113fb Fix Rancher deployment: add cattle-system namespace, fix Traefik config with port 9442
All checks were successful
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 4m27s
2026-03-24 19:09:28 +00:00
4ffbcfa312 Add Rancher management UI
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 8m52s
2026-03-24 01:53:04 +00:00
8745bcda47 Fix Weave GitOps image tag - remove invalid v0.41.0
All checks were successful
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Successful in 4m33s
The v0.41.0 tag doesn't exist in the registry. Remove the explicit
image tag so the chart falls back to its default compatible version.
2026-03-24 01:39:48 +00:00
e47ec2a3e7 Update Weave GitOps to v0.41.0 to support HelmRelease v2 API
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m30s
Fixes error: 'no matches for kind HelmRelease in version v2beta1'

The cluster uses HelmRelease v2 API but Weave GitOps v0.38.0 was looking
for the old v2beta1 API. Updated image tag to v0.41.0 which supports
the newer API version.
2026-03-24 01:33:10 +00:00
45c899d2bd Configure Weave GitOps to use Doppler-managed admin credentials
All checks were successful
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m41s
Changes:
- Enable adminUser creation but disable Helm-managed secret
- Use ExternalSecret (cluster-user-auth) from Doppler instead
- Doppler secrets: WEAVE_GITOPS_ADMIN_USERNAME and WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH
- Added cluster-user-auth to viewSecretsResourceNames for RBAC

Login credentials are now managed via Doppler and External Secrets Operator.
2026-03-24 01:01:30 +00:00
0e52d8f159 Use Tailscale DNS names instead of IPs for TLS SANs
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m0s
Changed from hardcoded Tailscale IPs to DNS names:
- k8s-cluster-cp-1.silverside-gopher.ts.net
- k8s-cluster-cp-2.silverside-gopher.ts.net
- k8s-cluster-cp-3.silverside-gopher.ts.net

This is more robust: Tailscale IPs change on rebuild, while DNS names
remain consistent.

After next rebuild, cluster accessible via:
- kubectl --server=https://k8s-cluster-cp-1.silverside-gopher.ts.net:6443
2026-03-23 23:50:48 +00:00
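In k3s server configuration, the DNS-name SANs listed above land in `tls-san`, e.g.:

```yaml
# /etc/rancher/k3s/config.yaml (sketch; placement of this file is the
# standard k3s location, other server options elided)
tls-san:
  - k8s-cluster-cp-1.silverside-gopher.ts.net
  - k8s-cluster-cp-2.silverside-gopher.ts.net
  - k8s-cluster-cp-3.silverside-gopher.ts.net
```

k3s regenerates the serving certificate to include these names, so kubectl can verify the API server over the tailnet regardless of which IPs Tailscale assigns after a rebuild.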
4726db2b5b Add Tailscale IPs to k3s TLS SANs for secure tailnet access
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m30s
Deploy Cluster / Ansible (push) Successful in 9m48s
Changes:
- Add tailscale_control_plane_ips list to k3s-server defaults
- Include all 3 control plane Tailscale IPs (100.120.55.97, 100.108.90.123, 100.92.149.85)
- Update primary k3s install to add Tailscale IPs to TLS certificates
- Enables kubectl access via Tailscale without certificate errors

After next deploy, cluster will be accessible via:
- kubectl --server=https://100.120.55.97:6443 (or any CP tailscale IP)
- kubectl --server=https://k8s-cluster-cp-1:6443 (via tailscale DNS)
2026-03-23 23:04:00 +00:00
90d105e5ea Fix kube_api_endpoint variable passing for HA cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Successful in 8m55s
- Remove circular variable reference in site.yml
- Add kube_api_endpoint default to k3s-server role
- Variable is set via inventory group_vars and passed to role
- Primary CP now correctly adds LB IP to TLS SANs

Note: Existing cluster needs destroy/rebuild to regenerate certificates.
2026-03-23 03:01:53 +00:00
952a80a742 Fix HA cluster join via Load Balancer private IP
Some checks failed
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Failing after 3m5s
Changes:
- Use LB private IP (10.0.1.5) instead of public IP for cluster joins
- Add LB private IP to k3s TLS SANs on primary control plane
- This allows secondary CPs and workers to verify certificates when joining via LB

Fixes x509 certificate validation error when joining via LB public IP.
2026-03-23 02:56:41 +00:00
4965017b86 Fix Load Balancer network attachment
Some checks failed
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Failing after 3m44s
Add hcloud_load_balancer_network resource to attach LB to private network.
This is required before targets can use use_private_ip=true.
LB gets IP 10.0.1.5 on the private network.
2026-03-23 02:44:35 +00:00
b2b9c38b91 Fix Load Balancer output attribute - use ipv4 instead of ipv4_address
Some checks failed
Deploy Cluster / Terraform (push) Failing after 1m37s
Deploy Cluster / Ansible (push) Has been skipped
2026-03-23 02:40:50 +00:00
ff31cb4e74 Implement HA control plane with Load Balancer (3-3 topology)
Some checks failed
Deploy Cluster / Terraform (push) Failing after 10s
Deploy Cluster / Ansible (push) Has been skipped
Major changes:
- Terraform: Scale to 3 control planes (cx23) + 3 workers (cx33)
- Terraform: Add Hetzner Load Balancer (lb11) for Kubernetes API
- Terraform: Add kube_api_lb_ip output
- Ansible: Add community.network collection to requirements
- Ansible: Update inventory to include LB endpoint
- Ansible: Configure secondary CPs and workers to join via LB
- Ansible: Add k3s_join_endpoint variable for HA joins
- Workflow: Add imports for cp-2, cp-3, and worker-3
- Docs: Update STABLE_BASELINE.md with HA topology and phase gates

Topology:
- 3 control planes (cx23 - 2 vCPU, 8GB RAM each)
- 3 workers (cx33 - 4 vCPU, 16GB RAM each)
- 1 Load Balancer (lb11) routing to all 3 control planes on port 6443
- Workers and secondary CPs join via LB endpoint for HA

Cost impact: +~€26/month (2 extra CPs + 1 extra worker + LB)
2026-03-23 02:39:39 +00:00
8b4a445b37 Update STABLE_BASELINE.md - CCM/CSI integration achieved
All checks were successful
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 3m36s
Document the successful completion of Hetzner CCM and CSI integration:
- CCM deployed via Ansible before workers join (fixes uninitialized taint)
- CSI provides hcloud-volumes StorageClass for persistent storage
- Two consecutive rebuilds passed all phase gates
- PVC provisioning tested and working

Platform now has full cloud provider integration with persistent volumes.
2026-03-23 02:25:00 +00:00
e447795395 Install helm binary in ccm-deploy role before using it
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Successful in 6m35s
The kubernetes.core.helm module requires helm CLI to be installed on
the target node. Added check and install step using the official
helm install script.
2026-03-23 00:07:39 +00:00
31b82c9371 Deploy CCM via Ansible before workers join to fix external cloud provider
Some checks failed
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 1m48s
This fixes the chicken-and-egg problem where workers with
--kubelet-arg=cloud-provider=external couldn't join because CCM wasn't
running yet to remove the node.cloudprovider.kubernetes.io/uninitialized taint.

Changes:
- Create ansible/roles/ccm-deploy/ to deploy CCM via Helm during Ansible phase
- Reorder site.yml: CCM deploys after secrets but before workers join
- CCM runs on control_plane[0] with proper tolerations for control plane nodes
- Add 10s pause after CCM ready to ensure it can process new nodes
- Workers can now successfully join with external cloud provider enabled

Flux still manages CCM for updates, but initial install happens in Ansible.
2026-03-22 23:58:03 +00:00
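The ccm-deploy role's central task can be sketched with `kubernetes.core.helm`, run once from the first control plane; release name and chart values are assumptions:

```yaml
# Sketch: install Hetzner CCM via Helm from control_plane[0] before the
# worker-join plays run. Release name and values are assumed.
- name: Deploy hcloud-cloud-controller-manager
  kubernetes.core.helm:
    name: hccm
    chart_ref: hcloud-cloud-controller-manager
    chart_repo_url: https://charts.hetzner.cloud
    release_namespace: kube-system
    wait: true
    values:
      networking:
        enabled: true          # assumed: route pod traffic via private network
  run_once: true
  delegate_to: "{{ groups['control_plane'][0] }}"
```

Once the CCM is Ready it strips the `uninitialized` taint from new nodes, so workers started afterwards with `cloud-provider=external` can schedule normally.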
cadfedacf1 Fix providerID health check - use shell module for piped grep
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m47s
Deploy Cluster / Ansible (push) Failing after 18m4s
2026-03-22 22:55:55 +00:00
561cd67b0c Enable Hetzner CCM and CSI for cloud provider integration
Some checks failed
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 3m21s
- Enable --kubelet-arg=cloud-provider=external on all nodes (control planes and workers)
- Activate CCM Kustomization with 10m timeout for Hetzner cloud-controller-manager
- Activate CSI Kustomization with dependsOn CCM and 10m timeout for hcloud-csi
- Update deploy workflow to wait for CCM/CSI readiness (600s timeout)
- Add providerID verification to post-deploy health checks

This enables proper cloud provider integration with Hetzner CCM for node
labeling and Hetzner CSI for persistent volume provisioning.
2026-03-22 22:26:21 +00:00
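The providerID verification can be reduced to a line filter: the Hetzner CCM sets each node's `spec.providerID` to an `hcloud://<server-id>` value, so the health check only needs to confirm no node is missing that prefix. A sketch; the helper name is hypothetical.

```shell
# Hypothetical check over "name<TAB>providerID" lines, e.g. produced by:
#   kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.providerID}{"\n"}{end}'
# Fails if any node lacks an hcloud:// providerID.
all_nodes_have_provider_id() {
  ! awk -F'\t' '$2 !~ /^hcloud:\/\//' | grep -q .
}
```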
4eebbca648 docs: update README for deferred observability baseline
All checks were successful
Deploy Cluster / Terraform (push) Successful in 1m41s
Deploy Cluster / Ansible (push) Successful in 5m37s
2026-03-22 01:04:53 +00:00
7b5d794dfc fix: update health checks for deferred observability
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:04:27 +00:00
8643bbfc12 fix: defer observability to get clean baseline
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:03:55 +00:00
84f446c2e6 fix: restore observability timeouts to 5 minutes
Some checks failed
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 8m38s
2026-03-22 00:43:37 +00:00
d446e86ece fix: use static grafana password, remove externalsecret dependency
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 00:43:21 +00:00
90c7f565e0 fix: remove tailscale ingress dependencies from observability
Some checks failed
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-22 00:42:35 +00:00
989848fa89 fix: increase observability timeouts to 10 minutes
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Failing after 13m54s
2026-03-21 19:34:43 +00:00
56e5807474 fix: create doppler ClusterSecretStore after ESO is installed
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 8m31s
2026-03-21 19:19:43 +00:00
df0511148c fix: unsuspend tailscale operator for stable baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-21 19:03:39 +00:00
894e6275b1 docs: update stable baseline to defer ccm/csi
Some checks failed
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 8m35s
2026-03-21 18:41:36 +00:00
a01cf435d4 fix: skip ccm/csi waits for stable baseline - using k3s embedded
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-21 18:40:53 +00:00
84f77c4a68 fix: use kubectl patch instead of apply for flux controller nodeSelector
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 9m41s
2026-03-21 18:05:41 +00:00
2e4196688c fix: bootstrap flux in phases - crds first, then resources
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m19s
2026-03-21 17:42:39 +00:00
8d1f9f4944 fix: add k3s reset logic for primary control plane
Some checks failed
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Failing after 4m19s
2026-03-21 16:10:17 +00:00
d4fd43e2f5 refactor: simplify k3s-server bootstrap for
2026-03-21 15:48:33 +00:00
48a80c362c fix: disable external cloud-provider kubelet arg for stable baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Failing after 4m21s
2026-03-21 14:36:54 +00:00
fcf7f139ff fix: use public api endpoint for flux bootstrap
Some checks failed
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 2m16s
2026-03-21 00:07:51 +00:00
7139ae322d fix: bootstrap flux during cluster deploy
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m21s
2026-03-20 10:37:11 +00:00
528a8dc210 fix: defer doppler store until eso is installed
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 24m34s
2026-03-20 09:30:17 +00:00
349f75729a fix: bootstrap tailscale namespace before secret
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Failing after 3m30s
2026-03-20 09:24:35 +00:00
522626a52b refactor: simplify stable cluster baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m48s
Deploy Cluster / Ansible (push) Failing after 4m7s
2026-03-20 02:24:37 +00:00
5bd4c41c2d fix: restore k3s agent bootstrap
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Failing after 18m16s
2026-03-20 01:50:16 +00:00
3e41f71b1b fix: harden terraform destroy workflow
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m28s
Deploy Cluster / Ansible (push) Failing after 20m4s
2026-03-19 23:26:03 +00:00
9d2f30de32 fix: prepare k3s for external cloud provider
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 4m4s
2026-03-17 01:21:23 +00:00
08a3031276 refactor: retire imperative addon roles
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 4m2s
2026-03-17 01:04:02 +00:00
e3ce91db62 fix: align flux ccm with live deployment
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 3m56s
2026-03-11 18:17:16 +00:00
bed8e4afc8 feat: migrate core addons toward flux
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 4m6s
2026-03-11 17:43:35 +00:00
2d4de6cff8 fix: bootstrap doppler store outside flux
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:58:26 +00:00
4a83d981c8 fix: skip dry-run validation for doppler store sync
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:52:08 +00:00
d188a51ef6 fix: move doppler store manifests out of ignored path
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:45:46 +00:00
646ef16258 fix: stabilize flux and external secrets reconciliation
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:25:27 +00:00
6f2e056b98 feat: sync runtime secrets from doppler
All checks were successful
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Successful in 9m56s
2026-03-09 00:25:41 +00:00
e10a70475f fix: right-size flux observability workloads
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m37s
2026-03-08 05:17:22 +00:00
f95e0051a5 feat: automate private tailnet access on cp1
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m45s
2026-03-08 04:16:06 +00:00
7c15ac5846 feat: add flux ui on shared tailscale endpoint
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m40s
2026-03-07 12:30:17 +00:00
4c104f74e8 feat: route observability through one tailscale endpoint
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 9m33s
2026-03-07 01:04:03 +00:00
be04602bfb fix: make flux bootstrap reachable from cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m59s
2026-03-07 00:38:29 +00:00
06c1356f1e feat: expose flux observability services over tailscale
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m14s
2026-03-05 00:43:29 +00:00
86fb5d5b90 fix: move observability gitops gating to role level
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 9m17s
2026-03-05 00:17:25 +00:00
8b403cd1d6 feat: migrate observability stack to flux gitops
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m11s
2026-03-04 23:38:40 +00:00
480a079dc8 fix: fail fast when loki datasource has no labels
All checks were successful
Deploy Grafana Content / Grafana Content (push) Successful in 1m59s
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m51s
2026-03-04 21:00:01 +00:00
ff8e32daf5 fix: add loki nodeport fallback for grafana datasource reachability
All checks were successful
Deploy Grafana Content / Grafana Content (push) Successful in 2m18s
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 22m59s
2026-03-04 19:39:16 +00:00
eb1ad0bea7 fix: make grafana prometheus datasource resilient with nodeport fallback
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-04 19:22:31 +00:00
9ff9d1e633 fix: clear stale helm pending revisions before kube-prometheus upgrade
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m22s
2026-03-04 18:35:55 +00:00
6177b581e4 fix: correct dashboard verification checks and retry helm upgrade lock
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Grafana Content / Grafana Content (push) Successful in 1m29s
Deploy Cluster / Ansible (push) Failing after 11m11s
2026-03-04 08:48:30 +00:00
b1e21c4a4b fix: speed up dashboards workflow firewall apply and set TF_VAR env
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Grafana Content / Grafana Content (push) Failing after 1m22s
Deploy Cluster / Ansible (push) Failing after 9m2s
2026-03-04 03:54:56 +00:00
2f166ed9e7 feat: manage grafana content as code with fast dashboard workflow
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Grafana Content / Grafana Content (push) Has been cancelled
2026-03-04 03:36:01 +00:00
1c39274df7 feat: stabilize tailscale observability exposure with declarative proxy class
All checks were successful
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Successful in 22m19s
2026-03-04 01:37:00 +00:00
28eaa36ec4 fix: use tag:k8s for tailscale operator default tags
All checks were successful
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 24m25s
2026-03-04 00:57:33 +00:00
02fa71c0aa fix: use tag:k8 for tailscale operator default tag
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 23m16s
2026-03-04 00:27:47 +00:00
2bbf05cdca fix: make tailscale operator non-blocking by default and gate observability patching on readiness
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m44s
2026-03-03 21:47:16 +00:00
213c1fb4e4 fix: detect tailscale tag permission errors and clean access output
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 14m7s
2026-03-03 08:51:25 +00:00
414ac73c25 fix: fail fast on tailscale oauth 403 with actionable message
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 27m37s
2026-03-02 23:57:53 +00:00
542d7a6be5 fix: align tailscale proxy tags with operator tags
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 23:36:18 +00:00
210b617cc9 fix: pin tailscale operator to control-plane node for DNS stability
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 23:32:36 +00:00
3686249e31 chore: remove blocking Tailscale endpoint retries in observability
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 22m17s
2026-03-02 22:47:55 +00:00
f56d1447c1 fix: make Tailscale endpoint wait non-blocking in observability
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 31m1s
2026-03-02 22:08:36 +00:00
63247b79a6 fix: harden Tailscale operator rollout with preflight and diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 21:39:47 +00:00
f6e159406a ci: retrigger with correct chart name
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 14m4s
2026-03-02 21:15:44 +00:00
0ae1c9395c fix: use correct chart name tailscale/tailscale-operator
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:15:37 +00:00
272c5ddc6e ci: retrigger with fixed Tailscale operator version
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 21:04:01 +00:00
eb6bf3862a fix: update Tailscale operator chart version to 1.95.91
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:03:53 +00:00
5a3f7550fe docs: add Gitea secrets setup guide for Tailscale operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 20:29:19 +00:00
a0ed6523ec feat: add Tailscale Kubernetes Operator for Grafana/Prometheus access
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 20:28:51 +00:00
4f61a840c7 ci: retrigger with non-blocking Loki install
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m38s
2026-03-02 19:41:55 +00:00
d876430703 fix: remove Helm wait flag, check Loki rollout separately
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:41:46 +00:00
56b6216257 ci: retrigger after Helm lock cleanup
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 19:18:29 +00:00
91fe2e658c fix: clear stuck Helm lock before Loki install
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:18:23 +00:00
13cec1aa28 ci: retrigger with YAML fix
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m26s
2026-03-02 18:29:25 +00:00
bc133e65d3 fix: quote failed_when expression for YAML parsing
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 18:29:16 +00:00
df4fdb5496 ci: retrigger with Loki fixes
2026-03-02 18:21:23 +00:00
cec7c42efb fix: disable Loki caches and canary, use chart v6.10.0
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m22s
2026-03-02 18:21:22 +00:00
ee692620b5 ci: retrigger with Loki v6.10.0
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m22s
2026-03-02 17:59:37 +00:00
a6d327fa1f fix: re-enable Loki with v6.10.0 and minimal working config
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:59:31 +00:00
fe6cb39eaf ci: retrigger with Loki disabled
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 19m38s
2026-03-02 17:07:59 +00:00
feaefd28a1 fix: disable Loki to unblock pipeline - will re-enable separately
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:07:51 +00:00
80ab59e22d ci: retrigger with enhanced Loki diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 14:44:31 +00:00
6c0282e9d5 fix: add more Loki diagnostics - values content and Helm releases
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 14:44:12 +00:00
45aa616741 ci: retrigger for Loki diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:42:42 +00:00
b595c1738a fix: show detailed Loki template and resource diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:41:40 +00:00
1c4dfd7fae ci: retrigger with fixed Loki values
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Failing after 19m42s
2026-03-02 13:58:31 +00:00
6b9fc1f6b8 fix: add all required replica=0 settings for Loki v6 singleBinary
Some checks failed
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:58:15 +00:00
2b5cad9d15 ci: retrigger for Loki template debug
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:57:03 +00:00
71a1495fbc fix: add Loki template validation and resource debugging
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 13:56:48 +00:00
fe3814e0e3 ci: retrigger to see Loki Helm error
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m33s
2026-03-02 12:45:16 +00:00
5ab3c7a0ac fix: show Loki Helm install output on failure
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:45:03 +00:00
9bc708ea4b ci: retrigger workflow after Loki cleanup fix
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m39s
2026-03-02 12:13:36 +00:00
c0a4275f15 fix: remove legacy Loki PDBs and wait for cleanup
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:13:21 +00:00
3dcf71a84f fix: trim Loki readiness output in failure guard
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 10:09:15 +00:00
124fe94d0e fix: tolerate Loki rollout false negatives when core pod is ready
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 09:12:47 +00:00
2d3f63424a fix: disable Loki gateway and use direct service endpoints
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m41s
2026-03-02 08:37:37 +00:00
2a583d1bba fix: avoid Helm wait hang and use explicit Loki rollout check
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m27s
2026-03-02 03:35:31 +00:00
27711e0661 fix: increase Loki install timeout and add failure diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 02:55:33 +00:00
10ee303995 fix: add Loki schema config and chart preflight validation
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 29m39s
2026-03-02 02:23:18 +00:00
558f34e2b1 fix: set Loki chart to single binary deployment mode
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m49s
2026-03-02 02:01:23 +00:00
58fabf23f8 refactor: move embedded Kubernetes manifests to role templates
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Failing after 9m46s
2026-03-02 01:45:30 +00:00
b30977a158 feat: deploy lightweight observability stack via Ansible
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 01:33:41 +00:00
123 changed files with 9928 additions and 574 deletions

View File

@@ -0,0 +1,99 @@
name: Deploy Grafana Content
on:
  push:
    branches:
      - main
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:
env:
  TF_VERSION: "1.7.0"
  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Detect runner egress IP
        run: |
          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
          echo "Runner egress IP: ${RUNNER_IP}"
      - name: Open SSH/API for current runner CIDR
        working-directory: terraform
        run: |
          terraform apply \
            -refresh=false \
            -target=hcloud_firewall.cluster \
            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
            -var="allowed_api_ips=${RUNNER_CIDR}" \
            -auto-approve
      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml
      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py
      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Verify Grafana content resources
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

View File

@@ -17,6 +17,8 @@ env:
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
+  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
 jobs:
   terraform:
@@ -91,7 +93,6 @@ jobs:
           ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
           ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
           ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
-          ensure_import 'hcloud_server.workers[3]' 'k8s-cluster-worker-4'
       - name: Terraform Plan
         id: plan
@@ -226,16 +227,188 @@ jobs:
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \ -e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \ -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \ -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
-e "cluster_name=k8s-cluster" -e "cluster_name=k8s-cluster"
env: env:
ANSIBLE_HOST_KEY_CHECKING: "False" ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Install kubectl
run: |
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Install flux CLI
run: |
curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.5.1/flux_2.5.1_linux_amd64.tar.gz | tar xz -C /tmp
mv /tmp/flux /usr/local/bin/flux
chmod +x /usr/local/bin/flux
- name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
- name: Bootstrap Flux source and reconciliation graph
env:
KUBECONFIG: outputs/kubeconfig
FLUX_GIT_HOST: 64.176.189.59
FLUX_GIT_PORT: "2222"
run: |
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
kubectl -n flux-system create secret generic flux-system \
--from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f -
# Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established
kubectl wait --for=condition=Established crd --all --timeout=120s
# Then apply custom resources
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
# Patch Flux controllers to run on cp-1 only
kubectl -n flux-system patch deployment source-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment helm-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment notification-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
# Create Doppler ClusterSecretStore now that ESO CRDs are available
kubectl apply -f - <<'EOF'
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
# Wait for CCM and CSI (Hetzner cloud integration)
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
- name: Wait for Rancher and backup operator
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
echo "Waiting for Rancher..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher -n flux-system --timeout=300s
echo "Waiting for rancher-backup operator..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
- name: Restore Rancher from latest B2 backup
env:
KUBECONFIG: outputs/kubeconfig
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
run: |
echo "Finding latest backup in B2..."
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
import json,sys
resp = json.load(sys.stdin)
bid = resp.get('allowed', {}).get('bucketId')
if bid:
print(bid)
else:
print('')
")
if [ -z "$BUCKET_ID" ]; then
echo "Restricted B2 key - resolving bucket ID by name..."
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
fi
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
| python3 -c "
import json,sys
files = json.load(sys.stdin).get('files', [])
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
if not tars:
print('NONE')
else:
tars.sort()
print(tars[-1])
")
if [ "$LATEST" = "NONE" ]; then
echo "No backups found in B2. Skipping restore."
exit 0
fi
BACKUP_FILE=$(basename "$LATEST")
echo "Latest backup: ${BACKUP_FILE}"
echo "Creating Restore CR..."
kubectl apply -f - <<EOF
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
name: restore-from-b2
namespace: cattle-resources-system
spec:
backupFilename: ${BACKUP_FILE}
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
EOF
echo "Waiting for restore to complete..."
for i in $(seq 1 60); do
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
echo " Restore status: ${STATUS} - ${MESSAGE}"
if [ "$STATUS" = "True" ]; then
echo "Restore completed successfully!"
exit 0
fi
sleep 10
done
echo "Restore did not complete within timeout. Continuing anyway."
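The restore step above picks the newest archive by a plain lexicographic sort, which works because the backup filenames carry zero-padded timestamps, so string order is also chronological order. A minimal sketch of that selection logic (the filenames are illustrative, not real bucket contents):

```python
# Pick the newest .tar.gz by name. Zero-padded timestamps make the
# lexicographic maximum also the chronological maximum, mirroring the
# tars.sort(); tars[-1] logic in the workflow step above.
def latest_backup(file_names):
    tars = [n for n in file_names if n.endswith(".tar.gz")]
    return max(tars) if tars else None

# Illustrative names only (not captured from a real bucket).
files = [
    "rancher-backups/daily-2026-03-30T01-00-00Z.tar.gz",
    "rancher-backups/daily-2026-04-01T01-00-00Z.tar.gz",
    "rancher-backups/readme.txt",
]
print(latest_backup(files))  # rancher-backups/daily-2026-04-01T01-00-00Z.tar.gz
```

Returning `None` for an empty list corresponds to the `NONE` sentinel the workflow uses to skip the restore.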
- name: Post-deploy cluster health checks
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"


@@ -16,10 +16,12 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
@@ -51,11 +53,143 @@ jobs:
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNPG is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNPG namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup pod to complete..."
kubectl -n cnpg-cluster wait --for=jsonpath='{.status.phase}'=Succeeded pod/pgdump-manual --timeout=300s || true
kubectl -n cnpg-cluster logs pod/pgdump-manual || true
kubectl -n cnpg-cluster delete pod pgdump-manual --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Terraform Destroy
id: destroy
working-directory: terraform
run: |
set +e
for attempt in 1 2 3; do
echo "Terraform destroy attempt ${attempt}/3"
terraform destroy \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-auto-approve
rc=$?
if [ "$rc" -eq 0 ]; then
exit 0
fi
if [ "$attempt" -lt 3 ]; then
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
sleep 30
terraform refresh \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
fi
done
exit "$rc"
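The retry loop above can be expressed as a small function. `run_destroy` and `run_refresh` are hypothetical stand-ins for the two terraform invocations; the state refresh between attempts re-syncs Terraform's view before retrying (the workflow waits 30 seconds between attempts):

```python
import time

# Sketch of the destroy retry loop: up to `attempts` tries, refreshing
# state and sleeping between failures, returning the last exit code.
def destroy_with_retries(run_destroy, run_refresh, attempts=3, delay=30):
    rc = 1
    for attempt in range(1, attempts + 1):
        rc = run_destroy()
        if rc == 0:
            return 0
        if attempt < attempts:
            time.sleep(delay)
            run_refresh()  # re-sync state so the next attempt sees reality
    return rc
```

A flaky destroy that succeeds on the second attempt would produce the call sequence destroy, refresh, destroy.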
- name: Hetzner destroy diagnostics
if: failure() && steps.destroy.outcome == 'failure'
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set +e
echo "== Terraform state list =="
terraform -chdir=terraform state list || true
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
if [ -z "$network_id" ]; then
network_id="11988935"
fi
echo "== Hetzner network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
echo "== Hetzner servers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
echo "== Hetzner load balancers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
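The jq filters above keep only resources whose `private_net` list references the target network. The same selection in Python, over illustrative data (not real Hetzner API output):

```python
# Python equivalent of:
#   .servers[] | select(any(.private_net[]?; .network == $id)) | {id, name}
# Keep resources attached to the given network id.
def attached_to(resources, network_id):
    return [
        {"id": r["id"], "name": r["name"]}
        for r in resources
        if any(p.get("network") == network_id for p in r.get("private_net", []))
    ]

# Illustrative sample, not captured API output.
servers = [
    {"id": 1, "name": "cp-1", "private_net": [{"network": 11988935}]},
    {"id": 2, "name": "stray", "private_net": []},
]
print(attached_to(servers, 11988935))  # [{'id': 1, 'name': 'cp-1'}]
```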

AGENTS.md Normal file

@@ -0,0 +1,144 @@
# AGENTS.md
Repository guide for agentic contributors working in this repo.
## Scope
- Infrastructure repo for a Hetzner + k3s + Flux stack running Rancher.
- Primary areas: `terraform/`, `ansible/`, `clusters/`, `infrastructure/`, `apps/`, `.gitea/workflows/`.
- Treat `README.md` and `STABLE_BASELINE.md` as user-facing context, but prefer current manifests and workflows as source of truth.
- Keep changes small and reviewable; prefer the narrowest file set that solves the task.
## Architecture
- **Terraform** provisions Hetzner servers, network, firewall, load balancer, SSH keys.
- **Ansible** bootstraps OS, installs k3s (with external cloud provider), deploys Hetzner CCM, Tailscale, Doppler token.
- **Flux** reconciles all cluster addons from this repo after Ansible hands off.
- **Rancher** stores state in embedded etcd (NOT an external DB). Backup/restore uses the `rancher-backup` operator to B2.
- **cert-manager** is required — Tailscale LoadBalancer does L4 TCP passthrough, so Rancher serves its own TLS.
- **Secrets flow**: Doppler → `ClusterSecretStore` (doppler-hetznerterra) → `ExternalSecret` resources → k8s Secrets.
- Rancher is reachable only over Tailscale at `https://rancher.silverside-gopher.ts.net/`.
- Grafana, Prometheus, and Flux UI are also exposed via dedicated Tailscale LoadBalancer services at `http://grafana.silverside-gopher.ts.net/`, `http://prometheus.silverside-gopher.ts.net/`, `http://flux.silverside-gopher.ts.net:9001/`.
## Important Files
- `terraform/main.tf` — provider and version pins
- `terraform/variables.tf` — input surface and defaults
- `terraform/firewall.tf` — firewall rules (tailnet CIDR, internal cluster ports)
- `ansible/site.yml` — ordered bootstrap playbook (roles: common → k3s-server → ccm → k3s-agent → doppler → tailscale-cleanup)
- `ansible/generate_inventory.py` — renders `ansible/inventory.ini` from Terraform outputs via Jinja2
- `clusters/prod/flux-system/` — Flux GitRepository and top-level Kustomization resources
- `infrastructure/addons/kustomization.yaml` — root addon graph with dependency ordering
- `infrastructure/addons/<addon>/` — each addon is a self-contained dir with its own `kustomization.yaml`
- `.gitea/workflows/deploy.yml` — canonical CI: terraform → ansible → flux bootstrap → B2 restore → health checks
## Build / Validate / Test
### Terraform
- Format: `terraform -chdir=terraform fmt -recursive`
- Check formatting: `terraform -chdir=terraform fmt -check -recursive`
- Validate: `terraform -chdir=terraform validate`
- Plan (full): `terraform -chdir=terraform plan -var-file=../terraform.tfvars`
- Plan one resource: `terraform -chdir=terraform plan -var-file=../terraform.tfvars -target=hcloud_server.control_plane[0]`
- Apply: `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
- State inspection: `terraform -chdir=terraform state list` / `terraform state show <address>`
### Ansible
- Install collections: `ansible-galaxy collection install -r ansible/requirements.yml`
- Generate inventory: `cd ansible && python3 generate_inventory.py` (requires Terraform outputs)
- Syntax check: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`
- Dry-run one host: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --check --diff -l control_plane[0]`
- Full bootstrap: `ansible-playbook ansible/site.yml`
- Targeted: `ansible-playbook ansible/site.yml -t upgrade` or `-t reset`
- Dashboards only: `ansible-playbook ansible/dashboards.yml`
### Python
- Syntax check: `python3 -m py_compile ansible/generate_inventory.py`
- Run: `cd ansible && python3 generate_inventory.py`
### Kubernetes / Flux manifests
- Render single addon: `kubectl kustomize infrastructure/addons/<addon>`
- Render cluster bootstrap: `kubectl kustomize clusters/prod/flux-system`
- Validate only the directory you edited, not the whole repo.
### Kubeconfig refresh
- Preferred: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
- Manual: `ssh -i ~/.ssh/infra root@<cp1-ip> "cat /etc/rancher/k3s/k3s.yaml" | sed 's/127.0.0.1/<cp1-ip>/g' > outputs/kubeconfig`
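The manual command works because the k3s kubeconfig advertises `127.0.0.1` as the API server; the sed call just substitutes the control plane's address. A sketch of the same rewrite (the IP is an example value):

```python
# Swap the loopback API address in a k3s kubeconfig for the node's IP,
# mirroring the sed 's/127.0.0.1/<cp1-ip>/g' invocation above.
def rewrite_kubeconfig(text, cp_ip):
    return text.replace("127.0.0.1", cp_ip)

raw = "server: https://127.0.0.1:6443"
print(rewrite_kubeconfig(raw, "203.0.113.10"))  # server: https://203.0.113.10:6443
```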
## Code Style
### General
- Match existing style in adjacent files. No new tools/frameworks unless the repo already uses them.
- Prefer ASCII. Keep diffs minimal. No unrelated cleanup.
- No comments unless the logic is non-obvious.
### Terraform / HCL
- 2-space indent. `terraform {}` block first, then providers, locals, variables, resources, outputs.
- `snake_case` for variables, locals, resources. Descriptions on all variables/outputs.
- `sensitive = true` on secrets. Run `terraform fmt` instead of hand-formatting.
- Use `locals` for reused or non-trivial logic. Explicit `depends_on` only when required.
### Ansible / YAML
- 2-space YAML indent. Descriptive task names in sentence case.
- Idempotent tasks: `changed_when: false` and `failed_when: false` for probes.
- `command`/`shell` only when no dedicated module fits. `shell` only for pipes/redirection/heredocs.
- `when` guards and `default(...)` filters over duplicated tasks.
- Role names and filenames: kebab-case. Variables: snake_case.
- Multi-line shell in workflows: `set -e` or `set -euo pipefail` for fail-fast.
### Kubernetes / Flux YAML
- One object per file. Kebab-case filenames matching repo patterns: `helmrelease-*.yaml`, `kustomization-*.yaml`, `*-externalsecret.yaml`.
- Addon manifests live in `infrastructure/addons/<addon>/` with a `kustomization.yaml`.
- Flux graph objects in `clusters/prod/flux-system/`.
- Each addon gets a `kustomization-<addon>.yaml` entry in `infrastructure/addons/` with `dependsOn` for ordering.
- Quote strings with `:`, `*`, cron expressions, or shell-sensitive chars.
- Preserve existing labels/annotations unless the change specifically needs them.
### Python
- PEP 8. Imports ordered: stdlib, third-party, local. `snake_case` for functions/variables.
- Scripts small and explicit. Exit non-zero on failure. Clear subprocess error handling.
## Known Issues & Workarounds
- **rancher-backup post-install job** (`rancher-backup-patch-sa`) uses a postRenderer in the HelmRelease to replace the broken `rancher/kuberlr-kubectl` image with `rancher/kubectl`. Do NOT set `s3` block in HelmRelease values — put S3 config in the Backup CR instead.
- **B2 ExternalSecret** must use key names `accessKey` and `secretKey` (not `aws_access_key_id`/`aws_secret_access_key`).
- **Stale Tailscale devices**: After cluster rebuild, delete stale offline `rancher` devices before booting. The `tailscale-cleanup` Ansible role handles this via the Tailscale API.
- **Restricted B2 keys**: `b2_authorize_account` may return `allowed.bucketId: null`. CI falls back to `b2_list_buckets` to resolve bucket ID by name.
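The restricted-key fallback in the last bullet hinges on `allowed.bucketId` coming back null. A sketch of the detection, with illustrative (not captured) responses; a `None` result means the caller must resolve the bucket via `b2_list_buckets`:

```python
import json

# b2_authorize_account returns allowed.bucketId for bucket-scoped keys,
# but it can be null for keys restricted by bucket name only.
def bucket_id_from_auth(auth_resp_json):
    resp = json.loads(auth_resp_json)
    return resp.get("allowed", {}).get("bucketId")  # None -> fall back

unrestricted = json.dumps({"allowed": {"bucketId": "abc123"}})
restricted = json.dumps({"allowed": {"bucketId": None}})
print(bucket_id_from_auth(unrestricted))  # abc123
print(bucket_id_from_auth(restricted))   # None
```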
## Secrets / Security
- Never commit tokens, passwords, kubeconfigs, private keys, or generated secrets.
- Runtime secrets via Gitea secrets (CI), Doppler, or External Secrets Operator.
- `terraform.tfvars` and `outputs/` are gitignored. Never print secret values in logs or commits.
## CI Pipeline (`.gitea/workflows/deploy.yml`)
1. Terraform: fmt check → init → validate → import existing servers → plan → apply (main only)
2. Ansible: install deps → generate inventory → run site.yml with extra vars (secrets injected from Gitea)
3. Flux bootstrap: install kubectl/flux → rewrite kubeconfig → apply CRDs → apply graph → wait for addons
4. Rancher wait: wait for Rancher and backup operator to be ready
5. B2 restore: authorize B2 → find latest backup → create Restore CR → poll until ready
6. Health checks: nodes, Flux objects, pods, storage class
## Editing Practices
- Read target file and adjacent patterns before editing.
- Run the narrowest validation command after edits.
- If you make a live-cluster workaround, also update the declarative manifests so Flux can own it.
- Changes spanning Terraform + Ansible + Flux: update and verify each layer separately.
- Check `git status` before and after changes.
## Cursor / Copilot Rules
- No `.cursor/rules/`, `.cursorrules`, or `.github/copilot-instructions.md` files exist.
- If added later, mirror their guidance here and treat them as authoritative.

README.md

@@ -10,8 +10,8 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
| **Workers** | 4x CX33 |
| **Total Cost** | €28.93/mo |
| **K8s** | k3s (latest, HA) |
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
| **Access** | SSH/API and Rancher UI restricted to Tailnet |
| **Bootstrap** | Terraform + Ansible |
### Cluster Resources
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
- **terraform-plan**: Runs on PRs, shows planned changes
- **terraform-apply**: Runs on main branch after merge
- **ansible-deploy**: Runs after terraform apply
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
### Required Gitea Secrets
@@ -166,10 +167,167 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for Kubernetes Operator |
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
| `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content |
## GitOps (Flux)
This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap.
### Stable private-only baseline
The current default target is a deliberately simplified baseline:
- `1` control plane node
- `2` worker nodes
- private Hetzner network only
- Tailscale for operator access
- Flux-managed core addons only
Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
This is the default until rebuilds are consistently green. High availability, public ingress, and app-layer expansion come later.
### Runtime secrets
Runtime cluster secrets are moving to Doppler + External Secrets Operator.
- Doppler project: `hetznerterra`
- Initial auth: service token via `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- First synced secrets:
- `GRAFANA_ADMIN_PASSWORD`
- `WEAVE_GITOPS_ADMIN_USERNAME`
- `WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH`
Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed by Doppler.
### Repository layout
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
- `infrastructure/`: infrastructure addon reconciliation graph
- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
- `apps/`: application workload layer (currently scaffolded)
### Reconciliation graph
- `infrastructure` (top-level)
- `addon-ccm`
- `addon-csi` depends on `addon-ccm`
- `addon-tailscale-operator`
- `addon-observability`
- `addon-observability-content` depends on `addon-observability`
- `apps` depends on `infrastructure`
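Flux derives apply order from these `dependsOn` edges. A minimal Kahn-style topological sort over the graph above illustrates the ordering constraint (a sketch of the concept, not Flux's implementation):

```python
from collections import deque

# Topological order over "node depends on parents" edges: any order
# emitted applies every dependency before its dependents.
def topo_order(deps):
    nodes = set(deps) | {d for ds in deps.values() for d in ds}
    indeg = {n: len(deps.get(n, [])) for n in nodes}
    ready = deque(sorted(n for n, d in indeg.items() if d == 0))
    order = []
    while ready:
        n = ready.popleft()
        order.append(n)
        for m, ds in deps.items():
            if n in ds:
                indeg[m] -= 1
                if indeg[m] == 0:
                    ready.append(m)
    return order

# The edges listed above.
deps = {
    "addon-csi": ["addon-ccm"],
    "addon-observability-content": ["addon-observability"],
    "apps": ["infrastructure"],
}
order = topo_order(deps)
print(order)
```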
### Bootstrap notes
1. Install Flux controllers in `flux-system`.
2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons for stable baseline: `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`.
- Deferred addons: `addon-ccm`, `addon-csi`, `addon-observability`, `addon-observability-content` (to be added after baseline is stable).
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success.
### Rancher access
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/dashboard/`.
- The public Hetzner load balancer path is not used for Rancher.
- Rancher uses the CNPG-backed PostgreSQL cluster in `cnpg-cluster`.
### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention:
- Terraform create succeeds for the default `1` control plane and `2` workers.
- Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`.
- Flux core reconciliation is healthy.
- External Secrets Operator is ready.
- Tailscale operator is ready.
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
_Note: Observability stack (Grafana/Prometheus) is deferred and will be added once the core platform baseline is stable._
## Observability Stack
Flux deploys a lightweight observability stack in the `observability` namespace:
- `kube-prometheus-stack` (Prometheus + Grafana)
- `loki`
- `promtail`
Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/`.
Grafana and Prometheus are exposed through a single Tailscale front door backed by Traefik when the Tailscale Kubernetes Operator is healthy.
### Access Grafana and Prometheus
Preferred private access:
- Grafana: `http://k8s-cluster-cp-1.<your-tailnet>:30080/`
- Prometheus: `http://k8s-cluster-cp-1.<your-tailnet>:30990/`
- Flux UI: `http://k8s-cluster-cp-1.<your-tailnet>:30901/`
This access path is bootstrapped automatically by Ansible on `control_plane[0]` using persistent `kubectl port-forward` systemd services plus `tailscale serve`, so it survives cluster rebuilds.
Fallback (port-forward from a tailnet-connected machine):
Run from a tailnet-connected machine:
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
```
Then open:
- Grafana: http://127.0.0.1:3000
- Prometheus: http://127.0.0.1:9090
Grafana user: `admin`
Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown by Ansible output)
### Verify Tailscale exposure
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n tailscale-system get pods
kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus
kubectl -n observability describe svc kube-prometheus-stack-grafana | grep TailscaleProxyReady
kubectl -n observability describe svc kube-prometheus-stack-prometheus | grep TailscaleProxyReady
```
If `TailscaleProxyReady=False`, check:
```bash
kubectl -n tailscale-system logs deployment/operator --tail=100
```
Common cause: OAuth client missing tag/scopes permissions.
### Fast dashboard iteration workflow
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
It avoids full cluster provisioning and only applies Grafana content resources:
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
- `ansible/dashboards.yml`
## File Structure
```
@@ -191,13 +349,15 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
│ │ ├── common/
│ │ ├── k3s-server/
│ │ ├── k3s-agent/
│ │ ├── addon-secrets-bootstrap/
│ │ ├── observability-content/
│ │ └── observability/
│ └── ansible.cfg
├── .gitea/
│ └── workflows/
│   ├── terraform.yml
│   ├── ansible.yml
│   └── dashboards.yml
├── outputs/
├── terraform.tfvars.example
└── README.md

SECRETS_SETUP.md Normal file

@@ -0,0 +1,93 @@
# Gitea Secrets Setup
This document describes the secrets required for the HetznerTerra deployment workflow.
## Required Secrets
Add these secrets in your Gitea repository settings:
**Settings → Secrets → Actions**
### Infrastructure Secrets
#### `HCLOUD_TOKEN`
- Hetzner Cloud API token
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
- Permissions: Read & Write
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
- Get from: https://secure.backblaze.com/b2_buckets.htm
- Create application key with access to your terraform state bucket
#### `S3_ENDPOINT`
- Backblaze B2 S3 endpoint
- Example: `https://s3.eu-central-003.backblazeb2.com`
#### `S3_BUCKET`
- Backblaze B2 bucket name for Terraform state
- Example: `k8s-terraform-state`
### SSH Secrets
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)
### Tailscale Secrets
#### `TAILSCALE_AUTH_KEY`
- Tailscale auth key for node registration
- Get from: https://login.tailscale.com/admin/settings/keys
- Type: Reusable, Ephemeral
- Scope: `devices:core:write`
#### `TAILSCALE_TAILNET`
- Your Tailscale network name
- Example: `tail7ec33.ts.net` or your custom domain
#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
- OAuth credentials for Tailscale Kubernetes Operator
- Get from: https://login.tailscale.com/admin/settings/oauth
- Create OAuth client with scope: `devices:core:write`
### Application Secrets
#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- Doppler service token for the `hetznerterra` project runtime secrets
- Used by External Secrets Operator bootstrap
- Recommended scope: `hetznerterra` project, `prod` config only
#### `GRAFANA_ADMIN_PASSWORD`
- Transitional fallback only while migrating observability secrets to Doppler
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`
## Setting Up Secrets
1. Go to your Gitea repository
2. Navigate to **Settings → Secrets → Actions**
3. Click **Add Secret**
4. Enter the secret name (exact match from above)
5. Paste the secret value
6. Click **Add Secret**
7. Repeat for all secrets
## Verification
After adding all secrets, trigger a workflow run:
```bash
git commit --allow-empty -m "ci: trigger workflow with new secrets"
git push
```
Check the workflow logs to verify all secrets are being used correctly.
## Security Notes
- Never commit secrets to the repository
- Use strong, unique passwords for Grafana and other services
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment

STABLE_BASELINE.md Normal file

@@ -0,0 +1,65 @@
# Stable Private-Only Baseline
This document defines the current engineering target for this repository.
## Topology
- 3 control planes (HA etcd cluster)
- 3 workers
- Hetzner Load Balancer for Kubernetes API
- private Hetzner network
- Tailscale operator access
- Rancher UI exposed only through Tailscale (`rancher.silverside-gopher.ts.net`)
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap with external cloud provider
- **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API**
- **Hetzner CCM deployed via Ansible (before workers join)**
- **Hetzner CSI for persistent volumes (via Flux)**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access
- Persistent volume provisioning validated
## Deferred for Later Phases
- Observability stack (deferred - complex helm release needs separate debugging)
## Out of Scope
- public ingress or DNS
- public TLS
- app workloads
- DR / backup strategy
- upgrade strategy
## Phase Gates
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
2. Load Balancer is healthy with all 3 control plane targets.
3. Primary control plane bootstraps with `--cluster-init`.
4. Secondary control planes join via Load Balancer endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets.
12. Tailscale private access works, including Rancher UI access.
13. Terraform destroy succeeds cleanly or via workflow retry.
## Success Criteria
**ACHIEVED** - HA Cluster with CCM/CSI:
- Build 1: Initial CCM/CSI deployment and validation (2026-03-23)
- Build 2: Full destroy/rebuild cycle successful (2026-03-23)
🔄 **IN PROGRESS** - HA Control Plane Validation:
- Build 3: Deploy 3-3 topology with Load Balancer
- Build 4: Destroy/rebuild to validate HA configuration
Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes.

ansible/dashboards.yml Normal file

@@ -0,0 +1,7 @@
---
- name: Provision Grafana dashboards and datasources
hosts: control_plane[0]
become: true
roles:
- observability-content


@@ -32,6 +32,7 @@ def main():
worker_names = outputs["worker_names"]["value"]
worker_ips = outputs["worker_ips"]["value"]
worker_private_ips = outputs["worker_private_ips"]["value"]
kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])
control_planes = [
{
@@ -59,6 +60,7 @@
"control_planes": control_planes,
"workers": workers,
"private_key_file": outputs["ssh_private_key_path"]["value"],
"kube_api_lb_ip": kube_api_lb_ip,
}
env = Environment(loader=FileSystemLoader("."))
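The `outputs.get(...)` chain added here degrades gracefully: when Terraform exposes no `kube_api_lb_ip` output (e.g. a single-control-plane baseline with no load balancer), the first control plane IP is used instead. A sketch of the same lookup:

```python
# Optional Terraform output with a fallback: missing key or missing
# "value" both resolve to the first control plane IP.
def api_endpoint(outputs, control_plane_ips):
    return outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])

print(api_endpoint({}, ["10.0.1.1"]))  # 10.0.1.1
print(api_endpoint({"kube_api_lb_ip": {"value": "10.0.1.100"}}, ["10.0.1.1"]))  # 10.0.1.100
```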

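The hunk above makes the Load Balancer output optional so inventories can still be generated for clusters built before the LB existed. The lookup pattern, as a hypothetical standalone helper:

```python
def output_value(outputs: dict, key: str, default=None):
    # Terraform's `output -json` wraps every value as {"key": {"value": ...}};
    # an optional output may be absent entirely, so chain two fallbacks:
    # missing key -> {} -> missing "value" -> default.
    return outputs.get(key, {}).get("value", default)

# Fall back to the first control plane IP when no LB output is present.
outputs = {"control_plane_ips": {"value": ["10.0.1.1", "10.0.1.2"]}}
kube_api_lb_ip = output_value(outputs, "kube_api_lb_ip",
                              outputs["control_plane_ips"]["value"][0])
```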

@@ -17,3 +17,4 @@ ansible_user=root
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest
kube_api_endpoint={{ kube_api_lb_ip }}


@@ -3,3 +3,5 @@ collections:
version: ">=2.4.0"
- name: community.general
version: ">=8.0.0"
- name: community.network
version: ">=5.0.0"


@@ -0,0 +1,41 @@
---
- name: Apply Hetzner cloud secret
shell: >-
kubectl -n kube-system create secret generic hcloud
--from-literal=token='{{ hcloud_token }}'
--from-literal=network='{{ cluster_name }}-network'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when: hcloud_token | default('') | length > 0
- name: Ensure Tailscale operator namespace exists
command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
--dry-run=client -o yaml
register: tailscale_namespace_manifest
changed_when: false
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator namespace
command: kubectl apply -f -
args:
stdin: "{{ tailscale_namespace_manifest.stdout }}"
changed_when: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator OAuth secret
shell: >-
kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
--from-literal=client_id='{{ tailscale_oauth_client_id }}'
--from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0


@@ -0,0 +1,82 @@
---
- name: Check if hcloud secret exists
command: kubectl -n kube-system get secret hcloud
register: hcloud_secret_check
changed_when: false
failed_when: false
- name: Fail if hcloud secret is missing
fail:
msg: "hcloud secret not found in kube-system namespace. CCM requires it."
when: hcloud_secret_check.rc != 0
- name: Check if helm is installed
command: which helm
register: helm_check
changed_when: false
failed_when: false
- name: Install helm
when: helm_check.rc != 0
block:
- name: Download helm install script
get_url:
url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
dest: /tmp/get-helm-3.sh
mode: "0755"
- name: Run helm install script
command: /tmp/get-helm-3.sh
args:
creates: /usr/local/bin/helm
- name: Add Hetzner Helm repository
kubernetes.core.helm_repository:
name: hcloud
repo_url: https://charts.hetzner.cloud
kubeconfig: /etc/rancher/k3s/k3s.yaml
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Deploy Hetzner Cloud Controller Manager
kubernetes.core.helm:
name: hcloud-cloud-controller-manager
chart_ref: hcloud/hcloud-cloud-controller-manager
release_namespace: kube-system
create_namespace: true
values:
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: "{{ inventory_hostname }}"
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
kubeconfig: /etc/rancher/k3s/k3s.yaml
wait: true
wait_timeout: 300s
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Wait for CCM to be ready
command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
changed_when: false
register: ccm_rollout
until: ccm_rollout.rc == 0
retries: 3
delay: 10
- name: Pause to ensure CCM is fully ready to process new nodes
pause:
seconds: 10
- name: Verify CCM is removing uninitialized taints
command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
register: uninitialized_taints
changed_when: false
failed_when: false
- name: Display taint status
debug:
msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"

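The taint verification task above uses a jsonpath query; the same check against `kubectl get nodes -o json` output, as an offline sketch (function name illustrative):

```python
import json

UNINITIALIZED = "node.cloudprovider.kubernetes.io/uninitialized"

def nodes_with_uninitialized_taint(nodes_json: str) -> list:
    """Names of nodes still carrying the cloud-provider uninitialized taint.

    The CCM removes this taint once it has initialized a node, so a
    non-empty result after the rollout indicates a CCM problem.
    """
    names = []
    for node in json.loads(nodes_json).get("items", []):
        taints = node.get("spec", {}).get("taints") or []
        if any(t.get("key") == UNINITIALIZED for t in taints):
            names.append(node["metadata"]["name"])
    return names
```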

@@ -1,4 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
hcloud_lb_location: "nbg1"


@@ -1,88 +0,0 @@
---
- name: Check if Hetzner CCM is already deployed
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
register: ccm_namespace
failed_when: false
changed_when: false
- name: Create Hetzner cloud secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CCM
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
changed_when: true
- name: Detect CCM workload kind
shell: |
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo deployment
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo daemonset
else
echo missing
fi
register: ccm_workload_kind
changed_when: false
- name: Wait for CCM deployment rollout
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_deploy
until: ccm_rollout_deploy.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "deployment"
- name: Wait for CCM daemonset rollout
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_ds
until: ccm_rollout_ds.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "daemonset"
- name: Set default Hetzner load balancer location for Traefik service
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
register: traefik_annotation
changed_when: true
failed_when: false
- name: Show Traefik service when annotation patch fails
command: kubectl -n kube-system get service traefik -o yaml
register: traefik_service_dump
changed_when: false
failed_when: false
when: traefik_annotation.rc != 0
- name: Fail when Traefik load balancer annotation cannot be set
fail:
msg: |
Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
Command output:
{{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
Service dump:
{{ traefik_service_dump.stdout | default('n/a') }}
when: traefik_annotation.rc != 0
- name: Show CCM namespace objects when workload missing
command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
register: ccm_ns_objects
changed_when: false
when: ccm_workload_kind.stdout == "missing"
- name: Fail when CCM workload is missing
fail:
msg: |
hcloud-cloud-controller-manager workload not found after applying manifest.
Namespace objects:
{{ ccm_ns_objects.stdout | default('n/a') }}
when: ccm_workload_kind.stdout == "missing"


@@ -1,15 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
csi_rollout_timeout_seconds: 30
csi_rollout_retries: 8
csi_rollout_delay_seconds: 5
csi_failure_log_tail_lines: 120
csi_smoke_test_enabled: true
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
csi_smoke_test_base_storage_class: "hcloud-volumes"
csi_smoke_test_size: "1Gi"
csi_smoke_test_pvc_timeout_seconds: 300
csi_smoke_test_job_timeout_seconds: 300
csi_smoke_test_required: false


@@ -1,425 +0,0 @@
---
- name: Create Hetzner CSI secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CSI
command: kubectl apply -f {{ csi_manifest_url }}
changed_when: true
- name: Ensure CSI controller endpoint is set for sidecars
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Ensure CSI node endpoint is set for sidecars
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Restart CSI controller to pick up current secret
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
changed_when: true
- name: Wait for CSI controller deployment generation
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
failed_when: false
changed_when: false
- name: Wait for CSI controller rollout
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_controller_rollout
until: csi_controller_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Show CSI controller status on failure
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
register: csi_controller_deploy_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI controller pods on failure
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
register: csi_controller_pods_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller deployment on failure
command: kubectl -n kube-system describe deployment hcloud-csi-controller
register: csi_controller_deploy_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller pod on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_controller_pod_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver logs on failure
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
register: csi_driver_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
fi
register: csi_driver_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show sidecar previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
for container in csi-attacher csi-resizer csi-provisioner; do
echo "===== $container ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
fi
register: csi_sidecar_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show recent kube-system events on failure
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
register: csi_recent_events
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Fail with CSI controller diagnostics
fail:
msg: |
CSI controller rollout failed.
Deployment status:
{{ csi_controller_deploy_status.stdout | default('n/a') }}
Pods status:
{{ csi_controller_pods_status.stdout | default('n/a') }}
Deployment describe:
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
Pod describe:
{{ csi_controller_pod_describe.stdout | default('n/a') }}
hcloud-csi-driver logs:
{{ csi_driver_logs.stdout | default('n/a') }}
hcloud-csi-driver previous logs:
{{ csi_driver_previous_logs.stdout | default('n/a') }}
Sidecar previous logs:
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
Recent kube-system events:
{{ csi_recent_events.stdout | default('n/a') }}
when: csi_controller_rollout.rc != 0
- name: Wait for CSI node daemonset rollout
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_node_rollout
until: csi_node_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Fail when CSI node daemonset rollout does not complete
fail:
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
when: csi_node_rollout.rc != 0
- name: Generate CSI smoke test run identifier
set_fact:
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
when: csi_smoke_test_enabled | bool
- name: Generate unique CSI smoke test resource names
set_fact:
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
when: csi_smoke_test_enabled | bool
- name: Cleanup stale CSI smoke test resources before apply
shell: |
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Apply CSI smoke test resources
shell: |
kubectl apply -f - <<'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: {{ csi_smoke_test_storage_class }}
provisioner: csi.hetzner.cloud
reclaimPolicy: Delete
volumeBindingMode: Immediate
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ csi_smoke_test_pvc_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ csi_smoke_test_size }}
storageClassName: {{ csi_smoke_test_storage_class }}
---
apiVersion: batch/v1
kind: Job
metadata:
name: {{ csi_smoke_test_job_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
backoffLimit: 0
template:
spec:
restartPolicy: Never
containers:
- name: write-and-read
image: busybox:1.36
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
volumeMounts:
- name: data
mountPath: /data
volumes:
- name: data
persistentVolumeClaim:
claimName: {{ csi_smoke_test_pvc_name }}
EOF
changed_when: true
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke PVC to bind
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
register: csi_smoke_pvc_wait
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke Job completion
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
register: csi_smoke_job_wait
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc == 0
- name: Show CSI smoke job logs
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
register: csi_smoke_job_logs
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Show CSI smoke PVC on failure
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
register: csi_smoke_pvc_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke Job on failure
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_job_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pods on failure
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_pod_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI smoke PVC on failure
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
register: csi_smoke_pvc_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show storage classes on failure
command: kubectl get storageclass
register: csi_storageclasses
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Get CSI controller pod name on smoke failure
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
register: csi_controller_pod_name
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI controller pod on smoke failure
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
register: csi_controller_pod_smoke_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI controller container logs on smoke failure
shell: |
pod="{{ csi_controller_pod_name.stdout }}"
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
echo "===== ${container}: current ====="
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
echo "===== ${container}: previous ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
register: csi_controller_container_logs
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI driver and node driver objects on smoke failure
shell: |
echo "===== CSIDriver ====="
kubectl get csidriver csi.hetzner.cloud -o yaml || true
echo "===== CSINode ====="
kubectl get csinode -o wide || true
register: csi_driver_objects
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pod describe on failure
shell: |
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_smoke_pod_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Fail when CSI smoke test fails
fail:
msg: |
CSI smoke test failed.
PVC wait:
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait:
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
PVC:
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
Job:
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
Pod list:
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
PVC describe:
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
Storage classes:
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
CSI controller pod:
{{ csi_controller_pod_name.stdout | default('n/a') }}
CSI controller pod describe:
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
CSI controller container logs:
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
CSI driver objects:
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
Pod describe:
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
Job logs:
{{ csi_smoke_job_logs.stdout | default('n/a') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_smoke_test_required | bool
- name: Warn when CSI smoke test fails but is non-blocking
debug:
msg: |
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- not (csi_smoke_test_required | bool)
- name: Cleanup CSI smoke test resources
shell: |
kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool


@@ -0,0 +1,50 @@
---
- name: Ensure Doppler service token is provided
assert:
that:
- doppler_hetznerterra_service_token | length > 0
fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
- name: Ensure external-secrets namespace exists
shell: kubectl create namespace external-secrets --dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Apply Doppler service token secret
shell: >-
kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
--from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Check for ClusterSecretStore CRD
command: kubectl get crd clustersecretstores.external-secrets.io
register: doppler_clustersecretstore_crd
changed_when: false
failed_when: false
- name: Apply Doppler ClusterSecretStore
shell: |
cat <<'EOF' | kubectl apply -f -
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
changed_when: true
when: doppler_clustersecretstore_crd.rc == 0
- name: Note pending Doppler ClusterSecretStore bootstrap
debug:
msg: >-
Skipping Doppler ClusterSecretStore bootstrap because the External Secrets CRD
is not available yet. Re-run after External Secrets is installed.
when: doppler_clustersecretstore_crd.rc != 0


@@ -3,3 +3,4 @@ k3s_version: latest
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: true


@@ -12,14 +12,42 @@
when: not k3s_agent_binary.stat.exists
- name: Install k3s agent
when: not k3s_agent_binary.stat.exists
block:
- name: Run k3s agent install
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_URL: "{{ k3s_server_url }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args:
creates: /usr/local/bin/k3s-agent
rescue:
- name: Show k3s-agent service status after failed install
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status_after_install
changed_when: false
failed_when: false
- name: Show recent k3s-agent logs after failed install
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal_after_install
changed_when: false
failed_when: false
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent install failed on {{ inventory_hostname }}.
Service status:
{{ k3s_agent_status_after_install.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal_after_install.stdout | default('n/a') }}
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent


@@ -3,3 +3,14 @@ k3s_version: latest
k3s_token: ""
k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: true
# Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
tailscale_control_plane_names:
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
- "k8s-cluster-cp-3.silverside-gopher.ts.net"


@@ -15,9 +15,9 @@
set_fact:
k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
- name: Wait for API endpoint on 6443 (secondary only)
wait_for:
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
port: 6443
state: started
timeout: 120
@@ -28,27 +28,22 @@
stat:
path: /usr/local/bin/k3s-uninstall.sh
register: k3s_uninstall_script
when: k3s_install_needed
- name: Reset broken k3s install before reinstall
command: /usr/local/bin/k3s-uninstall.sh
when:
- k3s_install_needed
- k3s_uninstall_script.stat.exists
- name: Remove stale k3s data
file:
path: "{{ item }}"
state: absent
loop:
- /etc/rancher/k3s
- /var/lib/rancher/k3s
when: k3s_install_needed
- name: Download k3s install script
get_url:
@@ -61,8 +56,20 @@
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }}
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -75,7 +82,15 @@
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install
rescue:

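The templated install command assembles its flags conditionally from the role variables. The same logic as a plain function — a sketch mirroring the variables above, not a real module:

```python
def k3s_server_args(primary_ip, node_ip, public_ip, lb_endpoint, ts_names,
                    disable_embedded_ccm=True, disable_servicelb=True,
                    cloud_provider_external=True):
    args = [
        "server", "--cluster-init",
        f"--advertise-address={primary_ip}",
        f"--node-ip={node_ip}",
        "--flannel-iface=enp7s0",
        f"--tls-san={primary_ip}",
        f"--tls-san={public_ip}",
        f"--tls-san={lb_endpoint}",  # LB IP must be a SAN so HA joins verify TLS
    ]
    args += [f"--tls-san={name}" for name in ts_names]  # tailnet DNS names
    if disable_embedded_ccm:
        args.append("--disable-cloud-controller")  # Hetzner CCM replaces it
    if disable_servicelb:
        args.append("--disable=servicelb")  # Hetzner LBs replace klipper-lb
    if cloud_provider_external:
        args.append("--kubelet-arg=cloud-provider=external")
    return args
```

Every endpoint a kubeconfig or joining node might dial (private IP, public IP, LB, tailnet names) must appear as a `--tls-san`, which is why the list is built up rather than hardcoded.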

@@ -0,0 +1,9 @@
---
observability_namespace: "observability"
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"
loki_enabled: true
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
grafana_use_prometheus_nodeport_fallback: true
grafana_use_loki_nodeport_fallback: true


@@ -0,0 +1,173 @@
---
- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

- name: Wait for Grafana deployment rollout
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Set default Prometheus datasource URL
  set_fact:
    grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
    grafana_loki_effective_url: "{{ grafana_loki_url }}"

- name: Get Grafana pod name
  command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
  register: grafana_pod_name
  changed_when: false

- name: Probe Prometheus from Grafana pod via default datasource URL
  shell: >-
    kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
    sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
  register: grafana_prometheus_probe
  changed_when: false
  failed_when: false

- name: Probe Loki from Grafana pod via default datasource URL
  shell: >-
    kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
    sh -c 'wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null'
  register: grafana_loki_probe
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Get Prometheus pod host IP for fallback
  command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
  register: prometheus_host_ip
  changed_when: false
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0

- name: Get Prometheus service NodePort for fallback
  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
  register: prometheus_nodeport
  changed_when: false
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0

- name: Enable Prometheus NodePort fallback datasource URL
  set_fact:
    grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0
    - prometheus_host_ip.stdout | length > 0
    - prometheus_nodeport.stdout | length > 0

- name: Ensure Loki service uses NodePort for fallback
  command: kubectl -n {{ observability_namespace }} patch svc loki -p '{"spec":{"type":"NodePort"}}'
  changed_when: false
  failed_when: false
  when:
    - loki_enabled
    - grafana_use_loki_nodeport_fallback | bool
    - grafana_loki_probe.rc != 0

- name: Get Loki pod host IP for fallback
  command: kubectl -n {{ observability_namespace }} get pod loki-0 -o jsonpath='{.status.hostIP}'
  register: loki_host_ip
  changed_when: false
  when:
    - loki_enabled
    - grafana_use_loki_nodeport_fallback | bool
    - grafana_loki_probe.rc != 0

- name: Get Loki service NodePort for fallback
  command: kubectl -n {{ observability_namespace }} get svc loki -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
  register: loki_nodeport
  changed_when: false
  when:
    - loki_enabled
    - grafana_use_loki_nodeport_fallback | bool
    - grafana_loki_probe.rc != 0

- name: Enable Loki NodePort fallback datasource URL
  set_fact:
    grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
  when:
    - loki_enabled
    - grafana_use_loki_nodeport_fallback | bool
    - grafana_loki_probe.rc != 0
    - loki_host_ip.stdout | length > 0
    - loki_nodeport.stdout | length > 0

- name: Query Loki labels endpoint from Grafana pod
  shell: >-
    kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
    sh -c 'wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels'
  register: grafana_loki_labels
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Fail when Loki is reachable but has zero indexed labels
  fail:
    msg: >-
      Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
      This usually means no logs are ingested yet. Check Promtail and tenant configuration.
  when:
    - loki_enabled
    - grafana_loki_labels.rc == 0
    - "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
    - "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"

- name: Write default Prometheus datasource ConfigMap patch
  template:
    src: grafana-default-prometheus-datasource.yaml.j2
    dest: /tmp/grafana-default-prometheus-datasource.yaml
    mode: "0644"

- name: Apply default Prometheus datasource ConfigMap patch
  command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
  changed_when: true

- name: Remove legacy Loki datasource ConfigMap
  command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
  changed_when: false
  failed_when: false

- name: Write Grafana datasources ConfigMap
  template:
    src: grafana-datasources.yaml.j2
    dest: /tmp/grafana-datasources.yaml
    mode: "0644"
  when: loki_enabled

- name: Apply Grafana datasources ConfigMap
  command: kubectl apply -f /tmp/grafana-datasources.yaml
  changed_when: true
  when: loki_enabled

- name: Restart Grafana to load datasource updates deterministically
  command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
  changed_when: true

- name: Wait for Grafana rollout after datasource update
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Write Grafana dashboard ConfigMap
  template:
    src: grafana-dashboard-k8s-overview.yaml.j2
    dest: /tmp/grafana-dashboard-k8s-overview.yaml
    mode: "0644"

- name: Apply Grafana dashboard ConfigMap
  command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
  changed_when: true

- name: Show Grafana content provisioning summary
  debug:
    msg: |
      Grafana content applied.
      Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
      Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
      Loki datasource URL: {{ grafana_loki_effective_url }}
      Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
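The probe-then-fallback logic in the tasks above reduces to one rule: keep the in-cluster datasource URL when the readiness probe succeeds, and switch to `http://<hostIP>:<nodePort>` only when the probe failed and both fallback facts are non-empty. A minimal Python sketch of that selection (function and argument names are illustrative, not part of the role):

```python
def effective_datasource_url(default_url: str, probe_rc: int,
                             host_ip: str, node_port: str,
                             fallback_enabled: bool = True) -> str:
    """Mirror the role's set_fact logic: fall back to the NodePort URL only
    when the in-cluster probe failed and both fallback facts are non-empty."""
    if probe_rc == 0 or not fallback_enabled or not host_ip or not node_port:
        return default_url
    return f"http://{host_ip}:{node_port}"
```

With a failed probe (`probe_rc != 0`), host IP `10.0.0.5`, and NodePort `30090`, this yields `http://10.0.0.5:30090`; in every other case the original cluster-local URL is kept, which matches the guarded `set_fact` conditions above.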


@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ grafana_dashboard_configmap_name }}
  namespace: {{ observability_namespace }}
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {"list": []},
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }


@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ grafana_datasource_configmap_name }}
  namespace: {{ observability_namespace }}
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
{% if loki_enabled %}
      - name: Loki
        type: loki
        access: proxy
        url: "{{ grafana_loki_effective_url }}"
        isDefault: false
{% endif %}


@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-prometheus-stack-grafana-datasource
  namespace: {{ observability_namespace }}
data:
  datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: "Prometheus"
        type: prometheus
        uid: prometheus
        url: "{{ grafana_prometheus_effective_url }}/"
        access: proxy
        isDefault: true
        jsonData:
          httpMethod: POST
          timeInterval: 30s
      - name: "Alertmanager"
        type: alertmanager
        uid: alertmanager
        url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
        access: proxy
        jsonData:
          handleGrafanaManagedAlerts: false
          implementation: prometheus


@@ -0,0 +1,27 @@
---
observability_namespace: "observability"
prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"
grafana_admin_password: ""
prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"
prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"
loki_enabled: true
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""
observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"


@@ -0,0 +1,252 @@
---
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

- name: Install Helm
  shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  when: helm_check.rc != 0
  changed_when: true

- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"

- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0644"

- name: Add Prometheus Helm repo
  command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
  changed_when: add_prom_repo.rc == 0

- name: Add Grafana Helm repo
  command: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
  changed_when: add_grafana_repo.rc == 0

- name: Update Helm repos
  command: helm repo update
  changed_when: false

- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
  shell: >-
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-upgrade -o name)
    --ignore-not-found=true;
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-install -o name)
    --ignore-not-found=true;
    kubectl -n {{ observability_namespace }} delete
    $(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-rollback -o name)
    --ignore-not-found=true
  changed_when: false
  failed_when: false

- name: Install kube-prometheus-stack
  command: >-
    helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
    --namespace {{ observability_namespace }}
    --version {{ prometheus_chart_version }}
    --values /tmp/kube-prometheus-stack-values.yaml
    --wait
    --timeout 10m
  register: kube_prom_install
  retries: 12
  delay: 15
  until: kube_prom_install.rc == 0
  changed_when: true

- name: Wait for Grafana deployment rollout
  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
    -c grafana -- grafana cli admin reset-admin-password '{{ grafana_password_effective }}'
  changed_when: true

- name: Write Loki values
  template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: "0644"
  when: loki_enabled

- name: Validate Loki chart produces resources
  command: >-
    helm template loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false
  failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
  when: loki_enabled

- name: Remove legacy Loki resources
  command: >-
    kubectl -n {{ observability_namespace }} delete
    deployment/loki-gateway
    statefulset/loki
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    poddisruptionbudget/loki-memcached-chunks-cache
    poddisruptionbudget/loki-memcached-results-cache
    --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Clear stuck Helm lock for Loki
  command: kubectl -n {{ observability_namespace }} delete secret sh.helm.release.v1.loki.v1 --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Uninstall failed Loki release (if stuck)
  command: helm uninstall loki -n {{ observability_namespace }}
  changed_when: false
  failed_when: false
  when: loki_enabled

- name: Install Loki
  command: >-
    helm upgrade --install loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_install
  changed_when: true
  when: loki_enabled

- name: Wait for Loki StatefulSet
  command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false
  when: loki_enabled

- name: Show Loki pod status
  command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
  register: loki_pods
  changed_when: false
  when: loki_enabled

- name: Debug Loki pods
  debug:
    msg: "{{ loki_pods.stdout }}"
  when: loki_enabled

- name: Write Promtail values
  template:
    src: promtail-values.yaml.j2
    dest: /tmp/promtail-values.yaml
    mode: "0644"
  when: loki_enabled

- name: Install Promtail
  command: >-
    helm upgrade --install promtail grafana/promtail
    --namespace {{ observability_namespace }}
    --version {{ promtail_chart_version }}
    --values /tmp/promtail-values.yaml
    --wait
    --timeout 10m
  changed_when: true
  when: loki_enabled

- name: Check Tailscale service readiness for Grafana
  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: grafana_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale service readiness for Prometheus
  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: prometheus_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale endpoint (IP/hostname) for Grafana
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: grafana_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale endpoint (IP/hostname) for Prometheus
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: prometheus_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Show Tailscale access details
  debug:
    msg: |
      Observability stack deployed with Tailscale access!
      Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
      Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}
      Login: admin / {{ grafana_password_effective }}
      Tailscale readiness:
        - Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
        - Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}
      Access via:
        - MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
        - Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
        - Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Show observability access details (fallback)
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}
      {% if loki_enabled %}
      Loki: Enabled - logs available in Grafana
      {% else %}
      Loki: Disabled
      {% endif %}
  when:
    - not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))
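The "Set Grafana admin password" task above falls back to `lookup('password', '/dev/null length=32 chars=ascii_letters,digits')`, i.e. a throwaway 32-character alphanumeric secret generated at run time. A rough equivalent in plain Python, for readers unfamiliar with that lookup (the function name is illustrative):

```python
import secrets
import string

def random_admin_password(length: int = 32) -> str:
    """Generate a random alphanumeric password, like Ansible's password lookup
    with chars=ascii_letters,digits."""
    alphabet = string.ascii_letters + string.digits
    return "".join(secrets.choice(alphabet) for _ in range(length))
```

Note one difference: writing the lookup against `/dev/null` means Ansible does not persist the password, so each run generates a new value unless `grafana_admin_password` is set explicitly.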


@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasource-loki
  namespace: {{ observability_namespace }}
  labels:
    grafana_datasource: "1"
data:
  loki-datasource.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
        isDefault: false


@@ -0,0 +1,46 @@
grafana:
  enabled: true
  adminPassword: {{ grafana_password_effective }}
  persistence:
    enabled: true
    storageClassName: {{ grafana_storage_class }}
    size: {{ grafana_storage_size }}
  service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ grafana_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
prometheus:
  service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
    type: LoadBalancer
    loadBalancerClass: tailscale
    annotations:
      tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
      tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
    type: ClusterIP
{% endif %}
  prometheusSpec:
    retention: 7d
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: {{ prometheus_storage_class }}
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: {{ prometheus_storage_size }}
alertmanager:
  enabled: false
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false


@@ -0,0 +1,75 @@
deploymentMode: SingleBinary

loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  schemaConfig:
    configs:
      - from: "2024-04-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: loki_index_
          period: 24h
  storage:
    type: filesystem
  limits_config:
    allow_structured_metadata: true
    volume_enabled: true
    retention_period: 168h
  pattern_ingester:
    enabled: true
  ruler:
    enable_api: true

singleBinary:
  replicas: 1
  persistence:
    size: {{ loki_storage_size }}
    storageClass: {{ loki_storage_class }}
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 1Gi

backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
ingester:
  replicas: 0
querier:
  replicas: 0
queryFrontend:
  replicas: 0
queryScheduler:
  replicas: 0
distributor:
  replicas: 0
compactor:
  replicas: 0
indexGateway:
  replicas: 0
bloomCompactor:
  replicas: 0
bloomGateway:
  replicas: 0

gateway:
  enabled: false
test:
  enabled: false
monitoring:
  selfMonitoring:
    enabled: false
lokiCanary:
  enabled: false


@@ -0,0 +1,3 @@
config:
  clients:
    - url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push


@@ -0,0 +1,53 @@
---
- name: Delete stale Tailscale devices with reserved hostnames
  block:
    - name: Get Tailscale devices from API
      uri:
        url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
        method: GET
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        return_content: true
      register: ts_devices

    - name: Find stale devices matching reserved hostnames
      set_fact:
        stale_devices: >-
          {{ ts_devices.json.devices | default([])
             | selectattr('hostname', 'defined')
             | selectattr('hostname', 'in', tailscale_reserved_hostnames)
             | rejectattr('online', 'defined')
             | list
             +
             ts_devices.json.devices | default([])
             | selectattr('hostname', 'defined')
             | selectattr('hostname', 'in', tailscale_reserved_hostnames)
             | selectattr('online', 'defined')
             | rejectattr('online', 'equalto', true)
             | list }}

    - name: Delete stale devices
      uri:
        url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
        method: DELETE
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        status_code: 200
      loop: "{{ stale_devices }}"
      loop_control:
        label: "{{ item.name }} ({{ item.id }})"
      when: stale_devices | length > 0

    - name: Report cleaned devices
      debug:
        msg: "Deleted stale Tailscale device: {{ item.name }}"
      loop: "{{ stale_devices }}"
      when: stale_devices | length > 0

    - name: No stale devices found
      debug:
        msg: "No stale Tailscale devices found."
      when: stale_devices | length == 0
  when:
    - tailscale_api_key is defined
    - tailscale_api_key | length > 0
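The two `selectattr`/`rejectattr` chains in the `set_fact` above implement a single predicate: a device is stale when its hostname is in the reserved set and it is not online (either the `online` field is absent, or it is false). A compact Python sketch of the same selection (function name is illustrative; ordering differs slightly, since the Jinja version lists devices with no `online` field first):

```python
def find_stale_devices(devices: list, reserved: set) -> list:
    """Select devices whose hostname is reserved and which are not online.
    A missing 'online' field is treated the same as online=False."""
    return [d for d in devices
            if d.get("hostname") in reserved and not d.get("online", False)]
```

Keeping the check this narrow matters: an online device with a reserved hostname is the live Tailscale proxy and must not be deleted; only dead leftovers holding the name are cleaned up so the operator can re-register it.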


@@ -24,6 +24,7 @@
     k3s_primary_public_ip: "{{ ansible_host }}"
     k3s_primary_ip: "{{ k3s_private_ip }}"
     k3s_node_ip: "{{ k3s_private_ip }}"
+    # kube_api_endpoint is set in inventory group_vars
   roles:
     - k3s-server
@@ -49,6 +50,20 @@
       dest: ../outputs/kubeconfig
       flat: true
+
+- name: Bootstrap addon prerequisite secrets
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - addon-secrets-bootstrap
+
+- name: Deploy Hetzner CCM (required for workers with external cloud provider)
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - ccm-deploy
 
 - name: Setup secondary control planes
   hosts: control_plane[1:]
   become: true
@@ -59,6 +74,8 @@
     k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
     k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
     k3s_node_ip: "{{ k3s_private_ip }}"
+    # Use Load Balancer for HA - all control planes join via LB endpoint
+    k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"
   roles:
     - k3s-server
@@ -69,25 +86,45 @@
   vars:
     k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
-    k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443"
+    # Use Load Balancer for HA - workers join via LB endpoint
+    k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
     k3s_node_ip: "{{ k3s_private_ip }}"
   roles:
     - k3s-agent
 
-- name: Deploy Hetzner CCM
+- name: Deploy observability stack
   hosts: control_plane[0]
   become: true
   roles:
-    - ccm
+    - role: observability
+      when: not (observability_gitops_enabled | default(true) | bool)
 
-- name: Deploy Hetzner CSI
+- name: Provision Grafana content
   hosts: control_plane[0]
   become: true
   roles:
-    - csi
+    - role: observability-content
+      when: not (observability_gitops_enabled | default(true) | bool)
+
+- name: Bootstrap Doppler access for External Secrets
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - doppler-bootstrap
+
+- name: Clean up stale Tailscale devices
+  hosts: localhost
+  connection: local
+  vars:
+    tailscale_reserved_hostnames:
+      - rancher
+  roles:
+    - tailscale-cleanup
 
 - name: Finalize
   hosts: localhost
@@ -95,7 +132,7 @@
   tasks:
     - name: Update kubeconfig server address
       command: |
-        sed -i 's/127.0.0.1/{{ groups["control_plane"][0] }}.{{ tailscale_tailnet }}/g' ../outputs/kubeconfig
+        sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
       changed_when: true
     - name: Display success message

apps/kustomization.yaml

@@ -0,0 +1,3 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []


@@ -0,0 +1,12 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: platform
  namespace: flux-system
spec:
  interval: 1m
  ref:
    branch: main
  url: ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git
  secretRef:
    name: flux-system

File diff suppressed because it is too large.


@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: source-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kustomize-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: helm-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: notification-controller
  namespace: flux-system
spec:
  template:
    spec:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: apps
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./apps
  dependsOn:
    - name: infrastructure
  wait: true
  timeout: 5m
  suspend: true


@@ -0,0 +1,14 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: infrastructure
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure
  wait: false
  timeout: 5m


@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - gotk-components.yaml
  - gitrepository-platform.yaml
  - kustomization-infrastructure.yaml
  - kustomization-apps.yaml
patchesStrategicMerge:
  - gotk-controller-cp1-patches.yaml


@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - flux-system


@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-cloud-controller-manager
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-cloud-controller-manager
      version: 1.30.1
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    selectorLabels:
      app: hcloud-cloud-controller-manager
    args:
      secure-port: "0"
    networking:
      enabled: true
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    additionalTolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.hetzner.cloud


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-ccm.yaml


@@ -0,0 +1,34 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: cert-manager
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cert-manager
  chart:
    spec:
      chart: cert-manager
      version: "v1.17.2"
      sourceRef:
        kind: HelmRepository
        name: jetstack
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    crds:
      enabled: true
    replicaCount: 1
    resources:
      requests:
        cpu: 50m
        memory: 128Mi
      limits:
        cpu: 250m
        memory: 256Mi


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: jetstack
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.jetstack.io


@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-cert-manager.yaml
  - helmrelease-cert-manager.yaml


@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager
  labels:
    kustomize.toolkit.fluxcd.io/prune: disabled


@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: hcloud-csi
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: hcloud-csi
      version: 2.20.0
      sourceRef:
        kind: HelmRepository
        name: hcloud
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    controller:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
      tolerations:
        - key: node-role.kubernetes.io/control-plane
          operator: Exists
          effect: NoSchedule
    hcloudVolumeDefaultLocation: nbg1
    storageClasses:
      - name: hcloud-volumes
        defaultStorageClass: true
        reclaimPolicy: Delete


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: hcloud
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.hetzner.cloud


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-hcloud.yaml
  - helmrelease-hcloud-csi.yaml


@@ -0,0 +1,13 @@
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
  name: doppler-hetznerterra
spec:
  provider:
    doppler:
      auth:
        secretRef:
          dopplerToken:
            name: doppler-hetznerterra-service-token
            key: dopplerToken
            namespace: external-secrets


@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: external-secrets
  chart:
    spec:
      chart: external-secrets
      version: 2.1.0
      sourceRef:
        kind: HelmRepository
        name: external-secrets
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    installCRDs: true
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    webhook:
      failurePolicy: Ignore
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    certController:
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
    serviceMonitor:
      enabled: false


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: external-secrets
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.external-secrets.io


@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-external-secrets.yaml
  - helmrelease-external-secrets.yaml


@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: external-secrets


@@ -0,0 +1,25 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: cluster-user-auth
  namespace: flux-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: cluster-user-auth
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        username: "{{ .fluxAdminUsername }}"
        password: "{{ .fluxAdminPasswordHash }}"
  data:
    - secretKey: fluxAdminUsername
      remoteRef:
        key: WEAVE_GITOPS_ADMIN_USERNAME
    - secretKey: fluxAdminPasswordHash
      remoteRef:
        key: WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH


@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: flux-tailscale
  namespace: flux-system
  annotations:
    tailscale.com/hostname: flux
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: weave-gitops
    app.kubernetes.io/instance: flux-system-weave-gitops
  ports:
    - name: http
      port: 9001
      protocol: TCP
      targetPort: http
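
This Service (like the Grafana and Prometheus ones below) selects the `infra-stable` ProxyClass via the `tailscale.com/proxy-class` annotation; the ProxyClass itself is defined elsewhere in the repo. A minimal sketch of such a resource — the pod settings shown are illustrative assumptions, not the repo's actual configuration:

```yaml
apiVersion: tailscale.com/v1alpha1
kind: ProxyClass
metadata:
  name: infra-stable
spec:
  statefulSet:
    pod:
      # Example only: pin the Tailscale proxy pods the same way other
      # infra workloads in this repo are pinned to the control plane.
      nodeSelector:
        kubernetes.io/hostname: k8s-cluster-cp-1
```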


@@ -0,0 +1,10 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: weave-gitops
  namespace: flux-system
spec:
  interval: 1h
  url: https://github.com/weaveworks/weave-gitops
  ref:
    tag: v0.39.0-rc.2


@@ -0,0 +1,38 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: weave-gitops
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: flux-system
  chart:
    spec:
      chart: ./charts/gitops-server
      sourceRef:
        kind: GitRepository
        name: weave-gitops
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    service:
      type: ClusterIP
      port: 9001
    adminUser:
      create: true
      createClusterRole: true
      createSecret: false # Secret is managed by an ExternalSecret sourced from Doppler
      username: admin
    rbac:
      create: true
      impersonationResourceNames:
        - admin
      viewSecretsResourceNames:
        - cluster-user-auth
        - oidc-auth


@@ -0,0 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - cluster-user-auth-externalsecret.yaml
  - gitrepository-weave-gitops.yaml
  - helmrelease-weave-gitops.yaml
  - flux-tailscale-service.yaml


@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-ccm
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/ccm
  wait: true
  timeout: 10m
  suspend: false


@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-cert-manager
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/cert-manager
  wait: true
  timeout: 10m
  suspend: false


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/csi
  dependsOn:
    - name: addon-ccm
  wait: true
  timeout: 10m
  suspend: false


@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-external-secrets
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/external-secrets
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,19 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-flux-ui
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/flux-ui
  dependsOn:
    - name: addon-external-secrets
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability-content
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability-content
  dependsOn:
    - name: addon-observability
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,19 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-observability
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/observability
  dependsOn:
    - name: addon-external-secrets
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,16 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup-config
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-backup-config
  timeout: 5m
  suspend: false
  dependsOn:
    - name: addon-rancher-backup


@@ -0,0 +1,18 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-backup
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-backup
  wait: true
  timeout: 10m
  suspend: false
  dependsOn:
    - name: addon-external-secrets
    - name: addon-rancher


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher-config
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher-config
  dependsOn:
    - name: addon-rancher
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,20 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-rancher
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/rancher
  wait: true
  timeout: 15m
  suspend: false
  dependsOn:
    - name: addon-tailscale-operator
    - name: addon-tailscale-proxyclass
    - name: addon-external-secrets
    - name: addon-cert-manager


@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-operator
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-operator
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-tailscale-proxyclass
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform
  path: ./infrastructure/addons/tailscale-proxyclass
  dependsOn:
    - name: addon-tailscale-operator
  wait: true
  timeout: 5m
  suspend: false


@@ -0,0 +1,17 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - kustomization-ccm.yaml
  - kustomization-csi.yaml
  - kustomization-external-secrets.yaml
  - kustomization-cert-manager.yaml
  - kustomization-tailscale-operator.yaml
  - kustomization-tailscale-proxyclass.yaml
  - traefik
  - kustomization-flux-ui.yaml
  - kustomization-observability.yaml
  - kustomization-observability-content.yaml
  - kustomization-rancher.yaml
  - kustomization-rancher-config.yaml
  - kustomization-rancher-backup.yaml
  - kustomization-rancher-backup-config.yaml


@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboard-k8s-overview
  namespace: observability
  labels:
    grafana_dashboard: "1"
data:
  k8s-overview.json: |
    {
      "annotations": {"list": []},
      "editable": true,
      "fiscalYearStartMonth": 0,
      "graphTooltip": 0,
      "id": null,
      "links": [],
      "panels": [
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
          "id": 1,
          "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
          "targets": [
            {
              "expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
              "legendFormat": "ready",
              "refId": "A"
            }
          ],
          "title": "Ready Nodes",
          "type": "stat"
        },
        {
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
          "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
          "id": 2,
          "targets": [
            {
              "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
              "legendFormat": "cpu",
              "refId": "A"
            }
          ],
          "title": "Cluster CPU Usage",
          "type": "timeseries"
        }
      ],
      "refresh": "30s",
      "schemaVersion": 39,
      "style": "dark",
      "tags": ["kubernetes", "infrastructure"],
      "templating": {"list": []},
      "time": {"from": "now-1h", "to": "now"},
      "timezone": "browser",
      "title": "K8s Cluster Overview",
      "uid": "k8s-cluster-overview",
      "version": 1
    }


@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources-core
  namespace: observability
  labels:
    grafana_datasource: "1"
data:
  datasources.yaml: |
    apiVersion: 1
    datasources:
      - name: Loki
        type: loki
        access: proxy
        url: "http://loki.observability.svc.cluster.local:3100"
        isDefault: false


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - grafana-datasources-core-configmap.yaml
  - grafana-dashboard-k8s-overview-configmap.yaml


@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: observability
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: grafana-admin-credentials
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        admin-user: admin
        admin-password: "{{ .grafanaAdminPassword }}"
  data:
    - secretKey: grafanaAdminPassword
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD


@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
  name: grafana-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: grafana
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana
  ports:
    - name: http
      port: 80
      protocol: TCP
      targetPort: 3000


@@ -0,0 +1,75 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  chart:
    spec:
      chart: kube-prometheus-stack
      version: 68.4.4
      sourceRef:
        kind: HelmRepository
        name: prometheus-community
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    grafana:
      enabled: true
      admin:
        existingSecret: grafana-admin-credentials
      grafana.ini:
        server:
          root_url: http://grafana.silverside-gopher.ts.net/
          serve_from_sub_path: false
      persistence:
        enabled: true
        storageClassName: local-path
        size: 5Gi
      service:
        type: ClusterIP
      sidecar:
        datasources:
          enabled: true
          label: grafana_datasource
          searchNamespace: observability
        dashboards:
          enabled: true
          label: grafana_dashboard
          searchNamespace: observability
    prometheus:
      service:
        type: ClusterIP
      prometheusSpec:
        externalUrl: http://prometheus.silverside-gopher.ts.net/
        routePrefix: /
        retention: 7d
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: local-path
              accessModes:
                - ReadWriteOnce
              resources:
                requests:
                  storage: 10Gi
    alertmanager:
      enabled: false
    kubeEtcd:
      enabled: false
    kubeControllerManager:
      enabled: false
    kubeScheduler:
      enabled: false
    prometheus-node-exporter:
      hostNetwork: false
      service:
        hostPort: false


@@ -0,0 +1,99 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: loki
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  chart:
    spec:
      chart: loki
      version: 6.10.0
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    deploymentMode: SingleBinary
    loki:
      auth_enabled: false
      commonConfig:
        replication_factor: 1
      schemaConfig:
        configs:
          - from: "2024-04-01"
            store: tsdb
            object_store: filesystem
            schema: v13
            index:
              prefix: loki_index_
              period: 24h
      storage:
        type: filesystem
      limits_config:
        allow_structured_metadata: true
        volume_enabled: true
        retention_period: 168h
      pattern_ingester:
        enabled: true
      ruler:
        enable_api: true
    singleBinary:
      replicas: 1
      persistence:
        size: 10Gi
        storageClass: local-path
      resources:
        requests:
          cpu: 100m
          memory: 256Mi
        limits:
          cpu: 500m
          memory: 1Gi
    backend:
      replicas: 0
    read:
      replicas: 0
    write:
      replicas: 0
    ingester:
      replicas: 0
    querier:
      replicas: 0
    queryFrontend:
      replicas: 0
    queryScheduler:
      replicas: 0
    distributor:
      replicas: 0
    compactor:
      replicas: 0
    indexGateway:
      replicas: 0
    bloomCompactor:
      replicas: 0
    bloomGateway:
      replicas: 0
    gateway:
      enabled: false
    test:
      enabled: false
    chunksCache:
      enabled: true
      allocatedMemory: 128
    resultsCache:
      enabled: true
      allocatedMemory: 128
    monitoring:
      selfMonitoring:
        enabled: false
      lokiCanary:
        enabled: false


@@ -0,0 +1,27 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: promtail
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: observability
  chart:
    spec:
      chart: promtail
      version: 6.16.6
      sourceRef:
        kind: HelmRepository
        name: grafana
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    config:
      clients:
        - url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: grafana
  namespace: flux-system
spec:
  interval: 1h
  url: https://grafana.github.io/helm-charts


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: prometheus-community
  namespace: flux-system
spec:
  interval: 1h
  url: https://prometheus-community.github.io/helm-charts


@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - grafana-admin-externalsecret.yaml
  - helmrepository-prometheus-community.yaml
  - helmrepository-grafana.yaml
  - helmrelease-kube-prometheus-stack.yaml
  - helmrelease-loki.yaml
  - helmrelease-promtail.yaml
  - grafana-tailscale-service.yaml
  - prometheus-tailscale-service.yaml


@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: observability


@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
  name: prometheus-tailscale
  namespace: observability
  annotations:
    tailscale.com/hostname: prometheus
    tailscale.com/proxy-class: infra-stable
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: prometheus
    operator.prometheus.io/name: observability-kube-prometh-prometheus
  ports:
    - name: http
      port: 9090
      protocol: TCP
      targetPort: 9090


@@ -0,0 +1,17 @@
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-b2-recurring
  namespace: cattle-resources-system
spec:
  resourceSetName: rancher-resource-set-full
  storageLocation:
    s3:
      credentialSecretName: rancher-b2-creds
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-east-005.backblazeb2.com
      region: us-east-005
  schedule: "0 3 * * *"
  retentionCount: 7
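
Besides the recurring schedule above, a one-off backup can be taken against the same storage location by creating a Backup without a `schedule`; the backup-restore operator runs it once. A sketch, reusing the B2 settings from this file (the resource name is illustrative):

```yaml
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-b2-manual # hypothetical name for an ad-hoc backup
  namespace: cattle-resources-system
spec:
  resourceSetName: rancher-resource-set-full
  storageLocation:
    s3:
      credentialSecretName: rancher-b2-creds
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-east-005.backblazeb2.com
      region: us-east-005
  # no schedule: the operator performs a single backup
```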


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - backup-recurring.yaml
  - restore-from-b2.yaml


@@ -0,0 +1,19 @@
# Uncomment and set backupFilename to restore from a specific backup on rebuild.
# Find the latest backup filename in B2: rancher-backups/ folder.
# After the restore succeeds, Rancher will have all users/settings from the backup.
#
# apiVersion: resources.cattle.io/v1
# kind: Restore
# metadata:
#   name: restore-from-b2
#   namespace: cattle-resources-system
# spec:
#   backupFilename: rancher-b2-manual-test-0a416444-2c8a-4d34-8a07-d9e406750374-2026-03-30T00-08-02Z.tar.gz
#   storageLocation:
#     s3:
#       credentialSecretName: rancher-b2-creds
#       credentialSecretNamespace: cattle-resources-system
#       bucketName: HetznerTerra
#       folder: rancher-backups
#       endpoint: s3.us-east-005.backblazeb2.com
#       region: us-east-005


@@ -0,0 +1,25 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: rancher-b2-creds
  namespace: cattle-resources-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: rancher-b2-creds
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        accessKey: "{{ .B2_ACCOUNT_ID }}"
        secretKey: "{{ .B2_APPLICATION_KEY }}"
  data:
    - secretKey: B2_ACCOUNT_ID
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: B2_APPLICATION_KEY
      remoteRef:
        key: B2_APPLICATION_KEY


@@ -0,0 +1,23 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup-crd
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  chart:
    spec:
      chart: rancher-backup-crd
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3


@@ -0,0 +1,42 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cattle-resources-system
  dependsOn:
    - name: rancher-backup-crd
  chart:
    spec:
      chart: rancher-backup
      version: "106.0.2+up8.1.0"
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    image:
      repository: rancher/backup-restore-operator
    kubectl:
      image:
        repository: rancher/kubectl
        tag: "v1.34.0"
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: rancher-backup-patch-sa
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: rancher/kubectl:v1.34.0


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  name: rancher-charts
  namespace: flux-system
spec:
  interval: 1h
  url: https://charts.rancher.io


@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
  - helmrepository-rancher-backup.yaml
  - helmrelease-rancher-backup-crd.yaml
  - helmrelease-rancher-backup.yaml
  - b2-credentials-externalsecret.yaml


@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cattle-resources-system


@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - server-url-setting.yaml

Some files were not shown because too many files have changed in this diff.