205 Commits

Author SHA1 Message Date
micqdf 6c6b9d20ca update README
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-04-22 01:14:21 +00:00
micqdf c3a2f25c94 docs: record validated Rancher restore drill
Deploy Cluster / Terraform (push) Successful in 2m11s
Deploy Cluster / Ansible (push) Successful in 10m9s
Update the baseline to treat Rancher backup and restore validation as part
of the accepted platform state, and capture the successful live drill run
performed on 2026-04-18.
2026-04-18 21:27:42 +00:00
micqdf 7385c2263e fix: add tailnet smoke checks and move Tailscale operator to stable
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m55s
Add a post-deploy smoke test that validates Tailscale DNS, proxy readiness,
reachability, and service responses for Rancher, Grafana, and Prometheus.
Move the operator to the stable Helm repo/version and align the baseline docs
with the current HA private-only architecture.
2026-04-18 19:59:13 +00:00
micqdf 60f466ab98 remove Weave GitOps addon
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m37s
Drop the Flux UI addon and its Tailscale exposure because the UI lags the
current Flux APIs and reports misleading HelmRelease errors. Keep Flux managed
through the controllers themselves and use Rancher or the flux CLI for access.
2026-04-18 18:44:55 +00:00
micqdf b20356e9fe fix: only clean stale Tailscale names before proxies exist
Deploy Cluster / Terraform (push) Failing after 51s
Deploy Cluster / Ansible (push) Has been skipped
The Tailscale cleanup role was deleting reserved service hostnames on later
deploy runs, which removed the live Rancher/Grafana/Prometheus/Flux proxy
nodes from the tailnet. Skip cleanup whenever the current cluster already has
those Tailscale services, while still allowing cleanup on fresh rebuilds.
2026-04-18 18:16:27 +00:00
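The guard described in this commit might look like the following Ansible sketch (task and variable names are assumptions, not the repository's actual code):

```yaml
# Hypothetical sketch: skip tailnet cleanup when the current cluster
# already owns the reserved Tailscale proxy services, so live
# Rancher/Grafana/Prometheus/Flux nodes are never deleted.
- name: Check whether Tailscale proxy services already exist
  kubernetes.core.k8s_info:
    kind: Service
    namespace: tailscale
  register: ts_services

- name: Clean stale tailnet devices (fresh rebuilds only)
  include_tasks: cleanup.yml
  when: ts_services.resources | length == 0
```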
micqdf 2ba6b6a896 fix: remove unused Flux CLI install from deploy workflow
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m40s
The deploy pipeline never uses the flux binary after installation, so the
GitHub release download only adds a flaky failure point. Remove the step and
keep the bootstrap path kubectl-only.
2026-04-18 17:45:59 +00:00
micqdf 9126de1423 fix: Align Prometheus external URL with Tailscale service port
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 4m52s
Prometheus is exposed on port 9090 through the Tailscale LoadBalancer
service, so the configured external URL and repo docs should match the
actual address users reach after rebuilds.
2026-04-18 17:11:16 +00:00
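The alignment described here amounts to a one-line chart values change; a hedged sketch, assuming the kube-prometheus-stack values layout:

```yaml
# Hypothetical values fragment: make the advertised external URL match
# the address served by the Tailscale LoadBalancer on port 9090.
prometheus:
  prometheusSpec:
    externalUrl: http://prometheus.silverside-gopher.ts.net:9090
```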
micqdf 4532b9ed74 chore: trigger rebuild
Deploy Cluster / Terraform (push) Successful in 2m8s
Deploy Cluster / Ansible (push) Successful in 12m54s
2026-04-18 06:09:54 +00:00
micqdf 68dbd2e5b7 fix: Reserve Tailscale service hostnames and tag exposed proxies
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 6m3s
Reserve grafana/prometheus/flux alongside rancher during rebuild cleanup so
stale tailnet devices do not force -1 hostnames. Tag the exposed Tailscale
services so operator-managed proxies are provisioned with explicit prod/service
tags from the tailnet policy.
2026-04-18 05:48:26 +00:00
micqdf ceefcc3b29 cleanup: Remove obsolete port-forwarding, deferred Traefik files, and CI workaround
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 13m9s
- Remove ansible/roles/private-access/ (replaced by Tailscale LB services)
- Remove deferred observability ingress/traefik files (replaced by direct Tailscale LBs)
- Remove orphaned kustomization-traefik-config.yaml (no backing directory)
- Simplify CI: remove SA patch + job deletion workaround for rancher-backup
  (now handled by postRenderer in HelmRelease)
- Update AGENTS.md to reflect current architecture
2026-04-02 01:21:23 +00:00
micqdf 0d339b3163 fix: Use rancher/kubectl image for rancher-backup hook
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m41s
bitnami/kubectl:1.34 tag doesn't exist. rancher/kubectl is already
available in the cluster's image cache.
2026-04-02 01:00:27 +00:00
micqdf 30ccf13c82 fix: Use postRenderer to replace broken kuberlr-kubectl image in rancher-backup hook
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook hardcodes rancher/kuberlr-kubectl which
can't download kubectl. Use Flux postRenderers to patch the job image
to bitnami/kubectl at render time.
2026-04-02 00:51:50 +00:00
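A Flux postRenderer patch of this kind might look like the sketch below (the Job name and image tag are assumptions; a later commit in this log swaps the image to rancher/kubectl because the bitnami tag was unavailable):

```yaml
# Hypothetical HelmRelease fragment: patch the chart's hardcoded
# post-install hook image at render time via a kustomize JSON patch.
spec:
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: patch-sa
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: bitnami/kubectl:latest  # tag must exist in the registry
```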
micqdf 75e3604f30 fix: Skip post-install hooks for rancher-backup HelmRelease
Deploy Cluster / Terraform (push) Successful in 57s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook uses rancher/kuberlr-kubectl which fails
to download kubectl. The SA automountServiceAccountToken is managed
manually, so the hook is unnecessary.
2026-04-02 00:45:03 +00:00
micqdf e4235a6e58 fix: Correct Flux UI pod selector labels to match deployed weave-gitops labels
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 20m36s
Actual labels are app.kubernetes.io/name=weave-gitops and
app.kubernetes.io/instance=flux-system-weave-gitops.
2026-04-01 02:08:12 +00:00
micqdf ea2d534171 fix: Use admin.existingSecret for Grafana creds from Doppler
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 20m42s
Revert to idiomatic Grafana chart approach. ExternalSecret creates the
secret with admin-user/admin-password keys before Grafana's first start
on fresh cluster creation.
2026-04-01 01:41:49 +00:00
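The idiomatic approach described above can be sketched as follows (store, namespace, and Doppler key names are assumptions; the `admin-user`/`admin-password` keys are what the Grafana chart expects):

```yaml
# Hypothetical ExternalSecret: materialize Doppler credentials into the
# Secret that admin.existingSecret points at, before Grafana first starts.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  secretStoreRef:
    name: doppler
    kind: ClusterSecretStore
  target:
    name: grafana-admin
  data:
    - secretKey: admin-user
      remoteRef:
        key: GRAFANA_ADMIN_USER
    - secretKey: admin-password
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD
```

With the corresponding Grafana chart values fragment being `admin.existingSecret: grafana-admin`.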
micqdf a1b9fe6aa6 fix: Use Flux valuesFrom to inject Doppler Grafana creds as Helm values
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 20m38s
Switch from admin.existingSecret to valuesFrom so Flux reads the
Doppler-managed secret and injects credentials as standard Helm values.
2026-03-31 23:40:54 +00:00
micqdf 33765657ec fix: Correct pod selectors for Prometheus and Flux Tailscale services, use Doppler for Grafana creds
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 21m0s
Prometheus needs operator.prometheus.io/name label selector. Flux UI pods
are labeled gitops-server not weave-gitops. Grafana now reads admin creds
from Doppler via ExternalSecret instead of hardcoded values.
2026-03-31 22:54:57 +00:00
micqdf b8f64fa952 feat: Expose Grafana, Prometheus, and Flux UI via Tailscale LoadBalancer services
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 20m47s
Replace Ansible port-forwarding + tailscale serve with direct Tailscale LB
services matching the existing Rancher pattern. Each service gets its own
tailnet hostname (grafana/prometheus/flux.silverside-gopher.ts.net).
2026-03-31 08:53:28 +00:00
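The Tailscale LoadBalancer pattern referenced here can be sketched like this (selector and ports are assumptions; `loadBalancerClass: tailscale` and the hostname annotation are the Tailscale operator's documented mechanism):

```yaml
# Hypothetical per-service manifest: the operator provisions a tailnet
# proxy and publishes it as grafana.silverside-gopher.ts.net.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
  annotations:
    tailscale.com/hostname: grafana
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana
  ports:
    - port: 80
      targetPort: 3000
```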
micqdf 569d741751 push
Deploy Cluster / Terraform (push) Successful in 2m37s
Deploy Cluster / Ansible (push) Successful in 25m37s
2026-03-31 02:46:55 +00:00
micqdf 89e53d9ec9 fix: Handle restricted B2 keys and safe JSON parsing in restore step
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 20m48s
2026-03-31 01:43:04 +00:00
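The "safe JSON parsing" fix suggests a defensive pattern like the following (an illustrative Python sketch, not the repository's actual restore code; the function and field names are assumptions):

```python
import json


def parse_b2_response(raw: str) -> list[dict]:
    """Parse a B2 API response defensively: tolerate empty bodies,
    invalid JSON, and a missing 'files' field, returning [] instead
    of crashing the restore step."""
    if not raw or not raw.strip():
        return []
    try:
        payload = json.loads(raw)
    except json.JSONDecodeError:
        return []
    files = payload.get("files")
    return files if isinstance(files, list) else []
```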
micqdf 5a2551f40a fix: Fix flux CLI download URL - use correct GitHub URL with v prefix on version
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Failing after 21m52s
2026-03-30 03:11:40 +00:00
micqdf 8c7b62c024 feat: Automate Rancher backup restore in CI pipeline
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Failing after 6m28s
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
2026-03-30 01:56:29 +00:00
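The Restore CR created by the pipeline might look like this sketch (the backup filename is a placeholder and the S3 region/endpoint are assumptions; the bucket and folder match the backup location recorded later in this log):

```yaml
# Hypothetical Restore CR for the rancher-backup operator, pointing at
# the newest backup file discovered via the Backblaze API.
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
  name: restore-from-b2
spec:
  backupFilename: rancher-backup-example.tar.gz  # placeholder
  storageLocation:
    s3:
      credentialSecretName: b2-credentials
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-west-004.backblazeb2.com  # assumed region/endpoint
```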
micqdf a1f07f863a docs: Update restore template with real Backup CR format
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m2s
Include actual restore CR spec and note the latest backup filename for reference.
2026-03-30 00:09:53 +00:00
micqdf 2c3a49c2e0 fix: Rename B2 secret keys to match rancher-backup operator expectations
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
The operator expects accessKey/secretKey, not aws_access_key_id/aws_secret_access_key.
2026-03-30 00:05:13 +00:00
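The key rename amounts to remapping the ExternalSecret data entries; a minimal sketch, assuming the Doppler secret names recorded later in this log:

```yaml
# Hypothetical fragment: the operator consumes accessKey/secretKey,
# so map the Doppler-side names onto those exact keys.
data:
  - secretKey: accessKey
    remoteRef:
      key: B2_ACCOUNT_ID
  - secretKey: secretKey
    remoteRef:
      key: B2_APPLICATION_KEY
```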
micqdf a7ce3dcc1a fix: Remove s3 block from rancher-backup HelmRelease values
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m12s
The S3 config caused the operator to try downloading kubectl, which fails in the container.
S3 credentials are correctly configured in the Backup CR and ExternalSecret instead.
2026-03-29 23:47:21 +00:00
micqdf 0ab9418458 fix: Re-add HTTPS port to Tailscale LB for Rancher
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 6m6s
Rancher now manages its own TLS (no longer tls:external), so it serves
HTTPS on port 443. The Tailscale LoadBalancer needs to expose both
HTTP (80) and HTTPS (443) targeting the corresponding container ports.
2026-03-29 23:04:49 +00:00
micqdf c251672618 fix: Configure S3 bucketName for rancher-backup operator
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 23:01:18 +00:00
micqdf 89364e8f37 fix: Add dependsOn for rancher-backup operator to wait for CRDs
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 22:57:22 +00:00
micqdf 20d7a6f777 fix: Install rancher-backup CRD chart before operator
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
The rancher-backup operator requires CRDs from the rancher-backup-crd
chart to be installed first.
2026-03-29 22:51:34 +00:00
micqdf 22ce5fd6f4 feat: Add cert-manager as dependency for Rancher
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m59s
Rancher requires cert-manager when managing its own TLS (not tls:external).
Added cert-manager HelmRelease with CRDs enabled.
2026-03-29 22:36:30 +00:00
micqdf afb1782d38 fix: Separate Backup CRs into their own kustomization
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m57s
The Backup and Restore CRs need the rancher-backup CRDs to exist first.
Moved them to a separate kustomization that depends on the operator being ready.
2026-03-29 22:22:29 +00:00
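The separate kustomization with an operator dependency might be sketched as (names and paths are assumptions):

```yaml
# Hypothetical Flux Kustomization: Backup/Restore CRs apply only after
# the rancher-backup operator (and its CRDs) are ready.
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: rancher-backup-resources
  namespace: flux-system
spec:
  dependsOn:
    - name: addon-rancher-backup
  interval: 10m
  path: ./kubernetes/rancher-backup/resources
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
```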
micqdf 48870433bf fix: Remove tls:external from Rancher HelmRelease
Deploy Cluster / Terraform (push) Failing after 55s
Deploy Cluster / Ansible (push) Has been skipped
With Tailscale LoadBalancer, TLS is not actually terminated at the edge.
The Tailscale proxy does TCP passthrough, so Rancher must serve its own
TLS certs. Setting tls: external caused Rancher to listen HTTP-only,
which broke HTTPS access through Tailscale.
2026-03-29 22:19:23 +00:00
micqdf f2c506b350 refactor: Replace CNPG external DB with rancher-backup operator
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m5s
Rancher 2.x uses embedded etcd, not an external PostgreSQL database.
The CATTLE_DB_CATTLE_* env vars are Rancher v1 only and were ignored.

- Remove all CNPG (CloudNativePG) cluster, operator, and related configs
- Remove external DB env vars from Rancher HelmRelease
- Remove rancher-db-password ExternalSecret
- Add rancher-backup operator HelmRelease (v106.0.2+up8.1.0)
- Add B2 credentials ExternalSecret for backup storage
- Add recurring Backup CR (daily at 03:00, 7 day retention)
- Add commented-out Restore CR for rebuild recovery
- Update Flux dependency graph accordingly
2026-03-29 21:53:16 +00:00
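The recurring Backup CR described above can be sketched like this (storage details beyond the bucket/folder are assumptions; the schedule and retention follow the commit body):

```yaml
# Hypothetical Backup CR: daily at 03:00, keeping the 7 newest backups.
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-nightly
spec:
  resourceSetName: rancher-resource-set
  schedule: "0 3 * * *"
  retentionCount: 7
  storageLocation:
    s3:
      credentialSecretName: b2-credentials
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      endpoint: s3.us-west-004.backblazeb2.com  # assumed endpoint
```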
micqdf efdf13976a fix: Handle missing 'online' field in Tailscale API response
Deploy Cluster / Terraform (push) Successful in 2m12s
Deploy Cluster / Ansible (push) Successful in 9m19s
2026-03-29 13:52:23 +00:00
micqdf 5269884408 feat: Auto-cleanup stale Tailscale devices before cluster boot
Deploy Cluster / Terraform (push) Successful in 2m17s
Deploy Cluster / Ansible (push) Failing after 6m35s
Adds tailscale-cleanup Ansible role that uses the Tailscale API to
delete offline devices matching reserved hostnames (e.g. rancher).
Runs during site.yml before Finalize to prevent hostname collisions
like rancher-1 on rebuild.

Requires TAILSCALE_API_KEY (API access token) passed as extra var.
2026-03-29 11:47:53 +00:00
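The selection logic for stale devices might look like this Python sketch (illustrative only, not the Ansible role's actual code); note it also treats a missing `online` field as offline, the edge case the following commit fixes:

```python
def stale_device_ids(devices: list[dict], reserved: set[str]) -> list[str]:
    """Return IDs of tailnet devices to delete before a rebuild:
    offline devices whose hostname matches a reserved name, including
    collision suffixes like 'rancher-1'. A missing 'online' field is
    treated as offline, since the Tailscale API may omit it."""
    stale = []
    for dev in devices:
        hostname = dev.get("hostname", "")
        # Strip a trailing "-<digits>" collision suffix, if any.
        base = hostname.rstrip("0123456789").rstrip("-") or hostname
        if not dev.get("online", False) and base in reserved:
            stale.append(dev["id"])
    return stale
```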
micqdf 6e5b0518be feat: Add kubeconfig refresh script and fix Ansible Finalize to use public IP
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m25s
- scripts/refresh-kubeconfig.sh fetches a fresh kubeconfig from CP1
- Ansible site.yml Finalize step now uses public IP instead of Tailscale
  hostname for the kubeconfig server address
- Updated AGENTS.md with kubeconfig refresh instructions
2026-03-29 03:31:36 +00:00
micqdf 905d069e91 fix: Add serverName to CNPG externalClusters for B2 recovery
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
CNPG uses the external cluster name (b2-backup) as the barman server
name by default, but the backups were stored under server name rancher-db.
2026-03-29 03:22:19 +00:00
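In CNPG terms, the fix pins `serverName` inside the external cluster's object store config; a hedged fragment (paths, endpoint, and secret keys are assumptions, while the `b2-backup`/`rancher-db` names come from the commit body):

```yaml
# Hypothetical Cluster fragment: the external cluster is named
# b2-backup, but barman must look up WALs under server name rancher-db.
externalClusters:
  - name: b2-backup
    barmanObjectStore:
      serverName: rancher-db
      destinationPath: s3://HetznerTerra/rancher-backups/
      endpointURL: https://s3.us-west-004.backblazeb2.com
      s3Credentials:
        accessKeyId:
          name: b2-credentials
          key: ACCESS_KEY_ID
        secretAccessKey:
          name: b2-credentials
          key: ACCESS_SECRET_KEY
```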
micqdf 25ba4b7115 fix: Add skipEmptyWalArchiveCheck annotation and B2 secret healthcheck to CNPG
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
- Skip WAL archive emptiness check so recovery works when restoring over
  an existing backup archive in B2
- Add healthCheck for b2-credentials secret in CNPG kustomization to
  prevent recovery from starting before ExternalSecret has synced
2026-03-29 03:15:23 +00:00
micqdf 6a593fd559 feat: Add B2 recovery bootstrap to CNPG cluster
Deploy Cluster / Terraform (push) Successful in 2m6s
Deploy Cluster / Ansible (push) Successful in 8m16s
2026-03-29 00:22:24 +00:00
micqdf 936f54a1b5 fix: Restore canonical Rancher tailnet hostname
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-29 00:00:39 +00:00
micqdf c9df11e65f fix: Align Rancher tailnet hostname with live proxy
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-28 23:47:09 +00:00
micqdf a3c238fda9 fix: Apply Rancher server URL after chart install
Deploy Cluster / Terraform (push) Successful in 2m43s
Deploy Cluster / Ansible (push) Successful in 10m39s
2026-03-28 23:12:59 +00:00
micqdf a15fa50302 fix: Use Doppler-backed Rancher bootstrap password
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m43s
2026-03-28 22:51:38 +00:00
micqdf 0f4f0b09fb fix: Add Rancher DB password ExternalSecret
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m42s
2026-03-28 22:42:05 +00:00
micqdf 4c002a870c fix: Remove invalid Rancher server-url manifest
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 22:39:31 +00:00
micqdf 43d11ac7e6 docs: Add agent guidance and sync Rancher docs
Deploy Cluster / Terraform (push) Successful in 2m33s
Deploy Cluster / Ansible (push) Successful in 9m44s
2026-03-28 22:13:37 +00:00
micqdf 8c5edcf0a1 fix: Set Rancher server URL to tailnet hostname
Deploy Cluster / Terraform (push) Successful in 1m0s
Deploy Cluster / Ansible (push) Successful in 6m27s
2026-03-28 04:07:44 +00:00
micqdf a81da0d178 feat: Expose Rancher via Tailscale hostname
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m42s
2026-03-28 03:59:02 +00:00
micqdf 2a72527c79 fix: Switch Traefik from LoadBalancer to NodePort, remove unused Hetzner LB
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m25s
2026-03-28 03:21:19 +00:00
micqdf 7cb3b84ecb feat: Replace custom pgdump job with CNPG ScheduledBackup
Deploy Cluster / Terraform (push) Successful in 1m30s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 03:15:39 +00:00
micqdf d4930235fa fix: Point CNPG backups at the existing B2 bucket
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m17s
2026-03-26 23:35:19 +00:00
micqdf ee8dc4b451 fix: Add Role for B2 credentials access
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m29s
2026-03-26 23:04:40 +00:00
micqdf 144d40e7ac feat: Add RBAC for CNP to read B2 credentials secret
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m38s
2026-03-26 22:56:00 +00:00
micqdf cc14e32572 fix: Use gzip instead of lzop for backup compression
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:51:10 +00:00
micqdf a207a5a7fd fix: Remove invalid encryption field from CNP backup config
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:49:29 +00:00
micqdf 4e1772c175 feat: Add B2 backup configuration to CNP Cluster
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:47:31 +00:00
micqdf ff70b12084 chore: Add HTTP/HTTPS firewall rules for Load Balancer
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m56s
2026-03-26 22:36:13 +00:00
micqdf a3963c56e6 cleanup: Remove traefik-config, simplify traefik helmrelease
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 03:16:56 +00:00
micqdf 612435c42c fix: Add Hetzner LB health check config to Traefik
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:11:10 +00:00
micqdf ac42f671a2 fix: Remove addon-traefik-config dependency from flux-ui
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:05:58 +00:00
micqdf dbe7ec0468 fix: Remove expose boolean from traefik ports config
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:01:13 +00:00
micqdf 816ac8b3c0 fix: Use official Traefik helm repo instead of rancher-stable
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:59:00 +00:00
micqdf 6f7998639f fix: Use standard kustomize API in traefik addon
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:56:52 +00:00
micqdf 7a14f89ad1 fix: Correct traefik kustomization path and sourceRef
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:55:37 +00:00
micqdf 786901c5d7 fix: Correct traefik kustomization reference (directory not file)
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:54:29 +00:00
micqdf 46f3d1130b feat: Add Flux-managed Traefik HelmRelease with Hetzner LB config
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:52:49 +00:00
micqdf 2fe5a626d4 fix: Add Hetzner network zone annotation to Traefik LoadBalancer
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 02:30:43 +00:00
micqdf 2ef68c8087 fix: Remove deprecated enablePodMonitor field in CNP Cluster
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 10m15s
2026-03-26 01:01:53 +00:00
micqdf e2cae18f5f fix: Remove backup config for initial deployment - add backup after DB is running
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m56s
2026-03-26 00:46:50 +00:00
micqdf e0c1e41ee9 fix: Remove bootstrap recovery - create fresh DB (recovery only needed after first backup)
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:43:49 +00:00
micqdf 63533de901 fix: Fix retentionPolicy format (14d not keep14)
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:41:44 +00:00
micqdf 1b39710f63 fix: Move retentionPolicy to correct location in backup spec
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:39:25 +00:00
micqdf 8c034323dc fix: Fix Cluster CR with correct barmanObjectStore schema
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:35:23 +00:00
micqdf 5fa2b411ee fix: Fix Cluster CR schema - use barmanObjectStore instead of b2
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:33:04 +00:00
micqdf 3ea28e525f fix: Fix CNP operator image repository (cloudnative-pg not postgresql)
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 4m55s
2026-03-26 00:21:09 +00:00
micqdf 4b95ba113d fix: Remove LPP helm (already installed by k3s), fix CNP chart version to 0.27.1
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 5m7s
2026-03-26 00:13:22 +00:00
micqdf 13627bf81f fix: Split CNP operator from CNP cluster to fix CRD dependency
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 5m0s
- Move CNP operator HelmRelease to cnpg-operator folder
- Create addon-cnpg-operator kustomization (deploys operator first)
- Update addon-cnpg to dependOn addon-cnpg-operator
- Add addon-cnpg as dependency for addon-rancher (needs database)
2026-03-26 00:06:34 +00:00
micqdf ef3fb2489a fix: Convert kustomization-lpp and kustomization-cnpg to Flux Kustomization CRs
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:03:53 +00:00
micqdf 7097495d72 fix: Add missing metadata.name to kustomization-lpp and kustomization-cnpg
Deploy Cluster / Terraform (push) Successful in 1m7s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-25 23:39:45 +00:00
micqdf 9d601dc77c feat: Add CloudNativePG with B2 backups for persistent Rancher database
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s
- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
2026-03-25 23:06:45 +00:00
micqdf f36445d99a Fix CNI: configure flannel to use private network interface (enp7s0) instead of public
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Successful in 8m42s
2026-03-25 01:44:33 +00:00
micqdf 89c2c99963 Fix Rancher: remove conflicting LoadBalancer, add HTTPS port-forward, use tailscale serve only
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m2s
2026-03-25 00:59:16 +00:00
micqdf 4a35cfb549 Fix Rancher: use correct targetPort 444 for HTTPS
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 18m56s
2026-03-24 23:30:58 +00:00
micqdf 3d50bfc534 Fix Rancher service selector: use cattle-system-rancher label
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-24 23:25:36 +00:00
micqdf ab2f287bfb Fix Rancher: use correct service name cattle-system-rancher
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m23s
2026-03-24 22:30:49 +00:00
micqdf dcb2675b67 Upgrade Rancher to 2.13.3 for K8s 1.34 compatibility
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m13s
2026-03-24 21:42:51 +00:00
micqdf b40bec7e0e Fix Rancher: use Doppler secret instead of hardcoded password
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m12s
2026-03-24 21:13:23 +00:00
micqdf efe0c0cfd5 Fix Rancher: upgrade to 2.10.3 for K8s 1.34 compatibility
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 4m20s
2026-03-24 20:29:38 +00:00
micqdf c61d9f9c1d Remove traefik-config dependency from Rancher
Deploy Cluster / Terraform (push) Successful in 2m5s
Deploy Cluster / Ansible (push) Successful in 8m18s
2026-03-24 20:02:08 +00:00
micqdf 60ceac4624 Fix Rancher access: add kubectl port-forward + tailscale serve setup
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-24 20:01:57 +00:00
micqdf 47b384a337 Fix Rancher access: add Tailscale service for Traefik with port 9442, fix deployment order
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m18s
2026-03-24 19:40:37 +00:00
micqdf ecf17113fb Fix Rancher deployment: add cattle-system namespace, fix Traefik config with port 9442
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 4m27s
2026-03-24 19:09:28 +00:00
micqdf 4ffbcfa312 Add Rancher management UI
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 8m52s
2026-03-24 01:53:04 +00:00
micqdf 8745bcda47 Fix Weave GitOps image tag - remove invalid v0.41.0
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Successful in 4m33s
The version v0.41.0 doesn't exist in the registry. Removing explicit
image tag to let the chart use its default compatible version.
2026-03-24 01:39:48 +00:00
micqdf e47ec2a3e7 Update Weave GitOps to v0.41.0 to support HelmRelease v2 API
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m30s
Fixes error: 'no matches for kind HelmRelease in version v2beta1'

The cluster uses HelmRelease v2 API but Weave GitOps v0.38.0 was looking
for the old v2beta1 API. Updated image tag to v0.41.0 which supports
the newer API version.
2026-03-24 01:33:10 +00:00
micqdf 45c899d2bd Configure Weave GitOps to use Doppler-managed admin credentials
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m41s
Changes:
- Enable adminUser creation but disable Helm-managed secret
- Use ExternalSecret (cluster-user-auth) from Doppler instead
- Doppler secrets: WEAVE_GITOPS_ADMIN_USERNAME and WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH
- Added cluster-user-auth to viewSecretsResourceNames for RBAC

Login credentials are now managed via Doppler and External Secrets Operator.
2026-03-24 01:01:30 +00:00
micqdf 0e52d8f159 Use Tailscale DNS names instead of IPs for TLS SANs
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m0s
Changed from hardcoded Tailscale IPs to DNS names:
- k8s-cluster-cp-1.silverside-gopher.ts.net
- k8s-cluster-cp-2.silverside-gopher.ts.net
- k8s-cluster-cp-3.silverside-gopher.ts.net

This is more robust: Tailscale IPs change on rebuild,
while DNS names remain consistent.

After next rebuild, cluster accessible via:
- kubectl --server=https://k8s-cluster-cp-1.silverside-gopher.ts.net:6443
2026-03-23 23:50:48 +00:00
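In k3s terms, the change above lands in the server's `tls-san` list; a minimal sketch of the rendered config (the file path is the k3s default):

```yaml
# /etc/rancher/k3s/config.yaml (fragment): extra SANs baked into the
# API server certificate so tailnet DNS names verify cleanly.
tls-san:
  - k8s-cluster-cp-1.silverside-gopher.ts.net
  - k8s-cluster-cp-2.silverside-gopher.ts.net
  - k8s-cluster-cp-3.silverside-gopher.ts.net
```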
micqdf 4726db2b5b Add Tailscale IPs to k3s TLS SANs for secure tailnet access
Deploy Cluster / Terraform (push) Successful in 2m30s
Deploy Cluster / Ansible (push) Successful in 9m48s
Changes:
- Add tailscale_control_plane_ips list to k3s-server defaults
- Include all 3 control plane Tailscale IPs (100.120.55.97, 100.108.90.123, 100.92.149.85)
- Update primary k3s install to add Tailscale IPs to TLS certificates
- Enables kubectl access via Tailscale without certificate errors

After next deploy, cluster will be accessible via:
- kubectl --server=https://100.120.55.97:6443 (or any CP tailscale IP)
- kubectl --server=https://k8s-cluster-cp-1:6443 (via tailscale DNS)
2026-03-23 23:04:00 +00:00
micqdf 90d105e5ea Fix kube_api_endpoint variable passing for HA cluster
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Successful in 8m55s
- Remove circular variable reference in site.yml
- Add kube_api_endpoint default to k3s-server role
- Variable is set via inventory group_vars and passed to role
- Primary CP now correctly adds LB IP to TLS SANs

Note: Existing cluster needs destroy/rebuild to regenerate certificates.
2026-03-23 03:01:53 +00:00
micqdf 952a80a742 Fix HA cluster join via Load Balancer private IP
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Failing after 3m5s
Changes:
- Use LB private IP (10.0.1.5) instead of public IP for cluster joins
- Add LB private IP to k3s TLS SANs on primary control plane
- This allows secondary CPs and workers to verify certificates when joining via LB

Fixes x509 certificate validation error when joining via LB public IP.
2026-03-23 02:56:41 +00:00
micqdf 4965017b86 Fix Load Balancer network attachment
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Failing after 3m44s
Add hcloud_load_balancer_network resource to attach LB to private network.
This is required before targets can use use_private_ip=true.
LB gets IP 10.0.1.5 on the private network.
2026-03-23 02:44:35 +00:00
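The attachment might look like this Terraform sketch (resource names are assumptions; the attribute names follow the hcloud provider):

```hcl
# Hypothetical fragment: attach the LB to the private network first,
# then register control-plane targets by private IP.
resource "hcloud_load_balancer_network" "kube_api" {
  load_balancer_id = hcloud_load_balancer.kube_api.id
  network_id       = hcloud_network.private.id
  ip               = "10.0.1.5"
}

resource "hcloud_load_balancer_target" "cp" {
  count            = 3
  type             = "server"
  load_balancer_id = hcloud_load_balancer.kube_api.id
  server_id        = hcloud_server.control_plane[count.index].id
  use_private_ip   = true

  depends_on = [hcloud_load_balancer_network.kube_api]
}
```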
micqdf b2b9c38b91 Fix Load Balancer output attribute - use ipv4 instead of ipv4_address
Deploy Cluster / Terraform (push) Failing after 1m37s
Deploy Cluster / Ansible (push) Has been skipped
2026-03-23 02:40:50 +00:00
micqdf ff31cb4e74 Implement HA control plane with Load Balancer (3-3 topology)
Deploy Cluster / Terraform (push) Failing after 10s
Deploy Cluster / Ansible (push) Has been skipped
Major changes:
- Terraform: Scale to 3 control planes (cx23) + 3 workers (cx33)
- Terraform: Add Hetzner Load Balancer (lb11) for Kubernetes API
- Terraform: Add kube_api_lb_ip output
- Ansible: Add community.network collection to requirements
- Ansible: Update inventory to include LB endpoint
- Ansible: Configure secondary CPs and workers to join via LB
- Ansible: Add k3s_join_endpoint variable for HA joins
- Workflow: Add imports for cp-2, cp-3, and worker-3
- Docs: Update STABLE_BASELINE.md with HA topology and phase gates

Topology:
- 3 control planes (cx23 - 2 vCPU, 8GB RAM each)
- 3 workers (cx33 - 4 vCPU, 16GB RAM each)
- 1 Load Balancer (lb11) routing to all 3 control planes on port 6443
- Workers and secondary CPs join via LB endpoint for HA

Cost impact: +~€26/month (2 extra CPs + 1 extra worker + LB)
2026-03-23 02:39:39 +00:00
micqdf 8b4a445b37 Update STABLE_BASELINE.md - CCM/CSI integration achieved
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 3m36s
Document the successful completion of Hetzner CCM and CSI integration:
- CCM deployed via Ansible before workers join (fixes uninitialized taint)
- CSI provides hcloud-volumes StorageClass for persistent storage
- Two consecutive rebuilds passed all phase gates
- PVC provisioning tested and working

Platform now has full cloud provider integration with persistent volumes.
2026-03-23 02:25:00 +00:00
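The "PVC provisioning tested" claim above can be reproduced with a throwaway claim. A sketch under assumptions: `csi-smoke` is a hypothetical name, and the class is assumed to bind immediately (if `hcloud-volumes` uses WaitForFirstConsumer binding mode, a pod mounting the claim is needed before it goes Bound):

```shell
# Emit a minimal PVC manifest against the Hetzner CSI StorageClass.
pvc_manifest() {
  cat <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: csi-smoke
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: hcloud-volumes
  resources:
    requests:
      storage: 10Gi
EOF
}

# Only run against a live cluster.
if kubectl version >/dev/null 2>&1; then
  pvc_manifest | kubectl apply -f -
  kubectl wait pvc/csi-smoke --for=jsonpath='{.status.phase}'=Bound --timeout=120s
  kubectl delete pvc csi-smoke
fi
```

10Gi is used because it is the minimum Hetzner Cloud volume size.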
micqdf e447795395 Install helm binary in ccm-deploy role before using it
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Successful in 6m35s
The kubernetes.core.helm module requires helm CLI to be installed on
the target node. Added check and install step using the official
helm install script.
2026-03-23 00:07:39 +00:00
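The check-and-install step reduces to a small guard. `ensure_tool` is a hypothetical helper; the URL is the official Helm install script:

```shell
# Run an installer command only when the named tool is absent from PATH.
ensure_tool() {
  command -v "$1" >/dev/null 2>&1 || eval "$2"
}

ensure_tool helm \
  'curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash'
```

Keeping the guard idempotent means repeat deploy runs skip the download entirely.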
micqdf 31b82c9371 Deploy CCM via Ansible before workers join to fix external cloud provider
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 1m48s
This fixes the chicken-and-egg problem where workers with
--kubelet-arg=cloud-provider=external couldn't join because CCM wasn't
running yet to remove the node.cloudprovider.kubernetes.io/uninitialized taint.

Changes:
- Create ansible/roles/ccm-deploy/ to deploy CCM via Helm during Ansible phase
- Reorder site.yml: CCM deploys after secrets but before workers join
- CCM runs on control_plane[0] with proper tolerations for control plane nodes
- Add 10s pause after CCM ready to ensure it can process new nodes
- Workers can now successfully join with external cloud provider enabled

Flux still manages CCM for updates, but initial install happens in Ansible.
2026-03-22 23:58:03 +00:00
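The ordering above can be verified after a deploy. A hedged sketch: `uninitialized_count` is a hypothetical helper, and the deployment name assumes the chart default `hcloud-cloud-controller-manager`:

```shell
# Count occurrences of the taint the CCM is expected to clear.
uninitialized_count() {
  grep -c 'node.cloudprovider.kubernetes.io/uninitialized' || true
}

# Only meaningful against a live cluster.
if kubectl version >/dev/null 2>&1; then
  kubectl -n kube-system rollout status deploy/hcloud-cloud-controller-manager --timeout=300s
  sleep 10   # mirror the grace period the role adds before workers join
  kubectl get nodes -o json | uninitialized_count   # 0 once every node is initialized
fi
```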
micqdf cadfedacf1 Fix providerID health check - use shell module for piped grep
Deploy Cluster / Terraform (push) Successful in 1m47s
Deploy Cluster / Ansible (push) Failing after 18m4s
2026-03-22 22:55:55 +00:00
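The piped check that forced the move to the shell module can be sketched like this; `hcloud_provider_ids` is a hypothetical helper that real runs feed with `kubectl get nodes` output:

```shell
# Count nodes whose spec.providerID was set by the Hetzner CCM.
hcloud_provider_ids() {
  grep -c 'hcloud://' || true
}

# Only meaningful against a live cluster.
if kubectl version >/dev/null 2>&1; then
  kubectl get nodes --no-headers \
    -o custom-columns='NAME:.metadata.name,PID:.spec.providerID' \
    | hcloud_provider_ids   # should equal the node count when CCM is healthy
fi
```

The pipe is the whole point: Ansible's `command` module cannot express `kubectl ... | grep`, so the health check needs `shell`.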
micqdf 561cd67b0c Enable Hetzner CCM and CSI for cloud provider integration
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 3m21s
- Enable --kubelet-arg=cloud-provider=external on all nodes (control planes and workers)
- Activate CCM Kustomization with 10m timeout for Hetzner cloud-controller-manager
- Activate CSI Kustomization with dependsOn CCM and 10m timeout for hcloud-csi
- Update deploy workflow to wait for CCM/CSI readiness (600s timeout)
- Add providerID verification to post-deploy health checks

This enables proper cloud provider integration with Hetzner CCM for node
labeling and Hetzner CSI for persistent volume provisioning.
2026-03-22 22:26:21 +00:00
micqdf 4eebbca648 docs: update README for deferred observability baseline
Deploy Cluster / Terraform (push) Successful in 1m41s
Deploy Cluster / Ansible (push) Successful in 5m37s
2026-03-22 01:04:53 +00:00
micqdf 7b5d794dfc fix: update health checks for deferred observability
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:04:27 +00:00
micqdf 8643bbfc12 fix: defer observability to get clean baseline
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:03:55 +00:00
micqdf 84f446c2e6 fix: restore observability timeouts to 5 minutes
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 8m38s
2026-03-22 00:43:37 +00:00
micqdf d446e86ece fix: use static grafana password, remove externalsecret dependency
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 00:43:21 +00:00
micqdf 90c7f565e0 fix: remove tailscale ingress dependencies from observability
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-22 00:42:35 +00:00
micqdf 989848fa89 fix: increase observability timeouts to 10 minutes
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Failing after 13m54s
2026-03-21 19:34:43 +00:00
micqdf 56e5807474 fix: create doppler ClusterSecretStore after ESO is installed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 8m31s
2026-03-21 19:19:43 +00:00
micqdf df0511148c fix: unsuspend tailscale operator for stable baseline
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-21 19:03:39 +00:00
micqdf 894e6275b1 docs: update stable baseline to defer ccm/csi
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 8m35s
2026-03-21 18:41:36 +00:00
micqdf a01cf435d4 fix: skip ccm/csi waits for stable baseline - using k3s embedded
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-21 18:40:53 +00:00
micqdf 84f77c4a68 fix: use kubectl patch instead of apply for flux controller nodeSelector
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 9m41s
2026-03-21 18:05:41 +00:00
micqdf 2e4196688c fix: bootstrap flux in phases - crds first, then resources
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m19s
2026-03-21 17:42:39 +00:00
micqdf 8d1f9f4944 fix: add k3s reset logic for primary control plane
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Failing after 4m19s
2026-03-21 16:10:17 +00:00
micqdf d4fd43e2f5 refactor: simplify k3s-server bootstrap
2026-03-21 15:48:33 +00:00
micqdf 48a80c362c fix: disable external cloud-provider kubelet arg for stable baseline
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Failing after 4m21s
2026-03-21 14:36:54 +00:00
micqdf fcf7f139ff fix: use public api endpoint for flux bootstrap
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 2m16s
2026-03-21 00:07:51 +00:00
micqdf 7139ae322d fix: bootstrap flux during cluster deploy
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m21s
2026-03-20 10:37:11 +00:00
micqdf 528a8dc210 fix: defer doppler store until eso is installed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 24m34s
2026-03-20 09:30:17 +00:00
micqdf 349f75729a fix: bootstrap tailscale namespace before secret
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Failing after 3m30s
2026-03-20 09:24:35 +00:00
micqdf 522626a52b refactor: simplify stable cluster baseline
Deploy Cluster / Terraform (push) Successful in 1m48s
Deploy Cluster / Ansible (push) Failing after 4m7s
2026-03-20 02:24:37 +00:00
micqdf 5bd4c41c2d fix: restore k3s agent bootstrap
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Failing after 18m16s
2026-03-20 01:50:16 +00:00
micqdf 3e41f71b1b fix: harden terraform destroy workflow
Deploy Cluster / Terraform (push) Successful in 2m28s
Deploy Cluster / Ansible (push) Failing after 20m4s
2026-03-19 23:26:03 +00:00
micqdf 9d2f30de32 fix: prepare k3s for external cloud provider
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 4m4s
2026-03-17 01:21:23 +00:00
micqdf 08a3031276 refactor: retire imperative addon roles
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 4m2s
2026-03-17 01:04:02 +00:00
micqdf e3ce91db62 fix: align flux ccm with live deployment
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 3m56s
2026-03-11 18:17:16 +00:00
micqdf bed8e4afc8 feat: migrate core addons toward flux
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 4m6s
2026-03-11 17:43:35 +00:00
micqdf 2d4de6cff8 fix: bootstrap doppler store outside flux
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:58:26 +00:00
micqdf 4a83d981c8 fix: skip dry-run validation for doppler store sync
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:52:08 +00:00
micqdf d188a51ef6 fix: move doppler store manifests out of ignored path
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:45:46 +00:00
micqdf 646ef16258 fix: stabilize flux and external secrets reconciliation
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:25:27 +00:00
micqdf 6f2e056b98 feat: sync runtime secrets from doppler
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Successful in 9m56s
2026-03-09 00:25:41 +00:00
micqdf e10a70475f fix: right-size flux observability workloads
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m37s
2026-03-08 05:17:22 +00:00
micqdf f95e0051a5 feat: automate private tailnet access on cp1
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m45s
2026-03-08 04:16:06 +00:00
micqdf 7c15ac5846 feat: add flux ui on shared tailscale endpoint
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m40s
2026-03-07 12:30:17 +00:00
micqdf 4c104f74e8 feat: route observability through one tailscale endpoint
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 9m33s
2026-03-07 01:04:03 +00:00
micqdf be04602bfb fix: make flux bootstrap reachable from cluster
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m59s
2026-03-07 00:38:29 +00:00
micqdf 06c1356f1e feat: expose flux observability services over tailscale
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m14s
2026-03-05 00:43:29 +00:00
micqdf 86fb5d5b90 fix: move observability gitops gating to role level
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 9m17s
2026-03-05 00:17:25 +00:00
micqdf 8b403cd1d6 feat: migrate observability stack to flux gitops
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m11s
2026-03-04 23:38:40 +00:00
micqdf 480a079dc8 fix: fail fast when loki datasource has no labels
Deploy Grafana Content / Grafana Content (push) Successful in 1m59s
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m51s
2026-03-04 21:00:01 +00:00
micqdf ff8e32daf5 fix: add loki nodeport fallback for grafana datasource reachability
Deploy Grafana Content / Grafana Content (push) Successful in 2m18s
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 22m59s
2026-03-04 19:39:16 +00:00
micqdf eb1ad0bea7 fix: make grafana prometheus datasource resilient with nodeport fallback
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-04 19:22:31 +00:00
micqdf 9ff9d1e633 fix: clear stale helm pending revisions before kube-prometheus upgrade
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m22s
2026-03-04 18:35:55 +00:00
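Clearing a stale pending revision comes down to deleting the Helm release-state secret stuck in a `pending-*` status (Helm stores release state as secrets labeled `owner=helm`). A sketch with hypothetical helper names:

```shell
# Label selector for Helm release secrets stuck in a pending state.
pending_selector() {
  printf 'owner=helm,name=%s,status in (pending-install,pending-upgrade,pending-rollback)' "$1"
}

# Delete the stuck revision so "helm upgrade" stops failing with
# "another operation (install/upgrade/rollback) is in progress".
clear_pending() {
  kubectl -n "$1" delete secret -l "$(pending_selector "$2")"
}

# e.g. clear_pending observability kube-prometheus-stack
```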
micqdf 6177b581e4 fix: correct dashboard verification checks and retry helm upgrade lock
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Grafana Content / Grafana Content (push) Successful in 1m29s
Deploy Cluster / Ansible (push) Failing after 11m11s
2026-03-04 08:48:30 +00:00
micqdf b1e21c4a4b fix: speed up dashboards workflow firewall apply and set TF_VAR env
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Grafana Content / Grafana Content (push) Failing after 1m22s
Deploy Cluster / Ansible (push) Failing after 9m2s
2026-03-04 03:54:56 +00:00
micqdf 2f166ed9e7 feat: manage grafana content as code with fast dashboard workflow
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Grafana Content / Grafana Content (push) Has been cancelled
2026-03-04 03:36:01 +00:00
micqdf 1c39274df7 feat: stabilize tailscale observability exposure with declarative proxy class
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Successful in 22m19s
2026-03-04 01:37:00 +00:00
micqdf 28eaa36ec4 fix: use tag:k8s for tailscale operator default tags
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 24m25s
2026-03-04 00:57:33 +00:00
micqdf 02fa71c0aa fix: use tag:k8 for tailscale operator default tag
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 23m16s
2026-03-04 00:27:47 +00:00
micqdf 2bbf05cdca fix: make tailscale operator non-blocking by default and gate observability patching on readiness
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m44s
2026-03-03 21:47:16 +00:00
micqdf 213c1fb4e4 fix: detect tailscale tag permission errors and clean access output
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 14m7s
2026-03-03 08:51:25 +00:00
micqdf 414ac73c25 fix: fail fast on tailscale oauth 403 with actionable message
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 27m37s
2026-03-02 23:57:53 +00:00
micqdf 542d7a6be5 fix: align tailscale proxy tags with operator tags
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 23:36:18 +00:00
micqdf 210b617cc9 fix: pin tailscale operator to control-plane node for DNS stability
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 23:32:36 +00:00
micqdf 3686249e31 chore: remove blocking Tailscale endpoint retries in observability
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 22m17s
2026-03-02 22:47:55 +00:00
micqdf f56d1447c1 fix: make Tailscale endpoint wait non-blocking in observability
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 31m1s
2026-03-02 22:08:36 +00:00
micqdf 63247b79a6 fix: harden Tailscale operator rollout with preflight and diagnostics
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 21:39:47 +00:00
micqdf f6e159406a ci: retrigger with correct chart name
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 14m4s
2026-03-02 21:15:44 +00:00
micqdf 0ae1c9395c fix: use correct chart name tailscale/tailscale-operator
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:15:37 +00:00
micqdf 272c5ddc6e ci: retrigger with fixed Tailscale operator version
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 21:04:01 +00:00
micqdf eb6bf3862a fix: update Tailscale operator chart version to 1.95.91
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:03:53 +00:00
micqdf 5a3f7550fe docs: add Gitea secrets setup guide for Tailscale operator
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 20:29:19 +00:00
micqdf a0ed6523ec feat: add Tailscale Kubernetes Operator for Grafana/Prometheus access
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 20:28:51 +00:00
micqdf 4f61a840c7 ci: retrigger with non-blocking Loki install
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m38s
2026-03-02 19:41:55 +00:00
micqdf d876430703 fix: remove Helm wait flag, check Loki rollout separately
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:41:46 +00:00
micqdf 56b6216257 ci: retrigger after Helm lock cleanup
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 19:18:29 +00:00
micqdf 91fe2e658c fix: clear stuck Helm lock before Loki install
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:18:23 +00:00
micqdf 13cec1aa28 ci: retrigger with YAML fix
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m26s
2026-03-02 18:29:25 +00:00
micqdf bc133e65d3 fix: quote failed_when expression for YAML parsing
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 18:29:16 +00:00
micqdf df4fdb5496 ci: retrigger with Loki fixes 2026-03-02 18:21:23 +00:00
micqdf cec7c42efb fix: disable Loki caches and canary, use chart v6.10.0
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m22s
2026-03-02 18:21:22 +00:00
micqdf ee692620b5 ci: retrigger with Loki v6.10.0
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m22s
2026-03-02 17:59:37 +00:00
micqdf a6d327fa1f fix: re-enable Loki with v6.10.0 and minimal working config
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:59:31 +00:00
micqdf fe6cb39eaf ci: retrigger with Loki disabled
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 19m38s
2026-03-02 17:07:59 +00:00
micqdf feaefd28a1 fix: disable Loki to unblock pipeline - will re-enable separately
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:07:51 +00:00
micqdf 80ab59e22d ci: retrigger with enhanced Loki diagnostics
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 14:44:31 +00:00
micqdf 6c0282e9d5 fix: add more Loki diagnostics - values content and Helm releases
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 14:44:12 +00:00
micqdf 45aa616741 ci: retrigger for Loki diagnostics
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:42:42 +00:00
micqdf b595c1738a fix: show detailed Loki template and resource diagnostics
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:41:40 +00:00
micqdf 1c4dfd7fae ci: retrigger with fixed Loki values
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Failing after 19m42s
2026-03-02 13:58:31 +00:00
micqdf 6b9fc1f6b8 fix: add all required replica=0 settings for Loki v6 singleBinary
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:58:15 +00:00
micqdf 2b5cad9d15 ci: retrigger for Loki template debug
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:57:03 +00:00
micqdf 71a1495fbc fix: add Loki template validation and resource debugging
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 13:56:48 +00:00
micqdf fe3814e0e3 ci: retrigger to see Loki Helm error
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m33s
2026-03-02 12:45:16 +00:00
micqdf 5ab3c7a0ac fix: show Loki Helm install output on failure
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:45:03 +00:00
micqdf 9bc708ea4b ci: retrigger workflow after Loki cleanup fix
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m39s
2026-03-02 12:13:36 +00:00
micqdf c0a4275f15 fix: remove legacy Loki PDBs and wait for cleanup
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:13:21 +00:00
micqdf 3dcf71a84f fix: trim Loki readiness output in failure guard
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 10:09:15 +00:00
micqdf 124fe94d0e fix: tolerate Loki rollout false negatives when core pod is ready
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 09:12:47 +00:00
micqdf 2d3f63424a fix: disable Loki gateway and use direct service endpoints
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m41s
2026-03-02 08:37:37 +00:00
micqdf 2a583d1bba fix: avoid Helm wait hang and use explicit Loki rollout check
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m27s
2026-03-02 03:35:31 +00:00
micqdf 27711e0661 fix: increase Loki install timeout and add failure diagnostics
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 02:55:33 +00:00
micqdf 10ee303995 fix: add Loki schema config and chart preflight validation
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 29m39s
2026-03-02 02:23:18 +00:00
micqdf 558f34e2b1 fix: set Loki chart to single binary deployment mode
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m49s
2026-03-02 02:01:23 +00:00
micqdf 58fabf23f8 refactor: move embedded Kubernetes manifests to role templates
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Failing after 9m46s
2026-03-02 01:45:30 +00:00
micqdf b30977a158 feat: deploy lightweight observability stack via Ansible
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 01:33:41 +00:00
118 changed files with 9830 additions and 587 deletions
+99
@@ -0,0 +1,99 @@
name: Deploy Grafana Content
on:
  push:
    branches:
      - main
    paths:
      - "ansible/dashboards.yml"
      - "ansible/roles/observability-content/**"
      - ".gitea/workflows/dashboards.yml"
  workflow_dispatch:
env:
  TF_VERSION: "1.7.0"
  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
jobs:
  dashboards:
    name: Grafana Content
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Detect runner egress IP
        run: |
          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
          echo "Runner egress IP: ${RUNNER_IP}"
      - name: Open SSH/API for current runner CIDR
        working-directory: terraform
        run: |
          terraform apply \
            -refresh=false \
            -target=hcloud_firewall.cluster \
            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
            -var="allowed_api_ips=${RUNNER_CIDR}" \
            -auto-approve
      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml
      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py
      - name: Apply dashboards and datasources
        working-directory: ansible
        run: |
          ansible-playbook dashboards.yml \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Verify Grafana content resources
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
+175 -1
@@ -17,6 +17,8 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
jobs:
terraform:
@@ -91,7 +93,6 @@ jobs:
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
ensure_import 'hcloud_server.workers[3]' 'k8s-cluster-worker-4'
- name: Terraform Plan
  id: plan
@@ -226,16 +227,189 @@ jobs:
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
-e "cluster_name=k8s-cluster"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Install kubectl
run: |
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
- name: Bootstrap Flux source and reconciliation graph
env:
KUBECONFIG: outputs/kubeconfig
FLUX_GIT_HOST: 64.176.189.59
FLUX_GIT_PORT: "2222"
run: |
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
kubectl -n flux-system create secret generic flux-system \
--from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f -
# Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established
kubectl wait --for=condition=Established crd --all --timeout=120s
# Then apply custom resources
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
# Patch Flux controllers to run on cp-1 only
kubectl -n flux-system patch deployment source-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment helm-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment notification-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
# Create Doppler ClusterSecretStore now that ESO CRDs are available
kubectl apply -f - <<'EOF'
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
# Wait for CCM and CSI (Hetzner cloud integration)
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
- name: Wait for Rancher and backup operator
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
echo "Waiting for Rancher..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher -n flux-system --timeout=300s
echo "Waiting for rancher-backup operator..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
- name: Restore Rancher from latest B2 backup
env:
KUBECONFIG: outputs/kubeconfig
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
run: |
echo "Finding latest backup in B2..."
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
import json,sys
resp = json.load(sys.stdin)
bid = resp.get('allowed', {}).get('bucketId')
if bid:
print(bid)
else:
print('')
")
if [ -z "$BUCKET_ID" ]; then
echo "Restricted B2 key - resolving bucket ID by name..."
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
fi
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
| python3 -c "
import json,sys
files = json.load(sys.stdin).get('files', [])
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
if not tars:
print('NONE')
else:
tars.sort()
print(tars[-1])
")
if [ "$LATEST" = "NONE" ]; then
echo "No backups found in B2. Skipping restore."
exit 0
fi
BACKUP_FILE=$(basename "$LATEST")
echo "Latest backup: ${BACKUP_FILE}"
echo "Creating Restore CR..."
kubectl apply -f - <<EOF
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
name: restore-from-b2
namespace: cattle-resources-system
spec:
backupFilename: ${BACKUP_FILE}
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
EOF
echo "Waiting for restore to complete..."
for i in $(seq 1 60); do
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
echo " Restore status: ${STATUS} - ${MESSAGE}"
if [ "$STATUS" = "True" ]; then
echo "Restore completed successfully!"
exit 0
fi
sleep 10
done
echo "Restore did not complete within timeout. Continuing anyway."
- name: Post-deploy cluster health checks
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Post-deploy tailnet smoke checks
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
@@ -16,10 +16,12 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
@@ -51,11 +53,143 @@ jobs:
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNP is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNP namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
# Create the dump as a Job so the completion wait below can track it
kubectl create job pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup job to complete..."
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
kubectl logs job/pgdump-manual -n cnpg-cluster || true
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Terraform Destroy
id: destroy
working-directory: terraform
run: |
set +e
for attempt in 1 2 3; do
echo "Terraform destroy attempt ${attempt}/3"
terraform destroy \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-auto-approve
rc=$?
if [ "$rc" -eq 0 ]; then
exit 0
fi
if [ "$attempt" -lt 3 ]; then
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
sleep 30
terraform refresh \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
fi
done
exit "$rc"
- name: Hetzner destroy diagnostics
if: failure() && steps.destroy.outcome == 'failure'
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set +e
echo "== Terraform state list =="
terraform -chdir=terraform state list || true
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '$1 == "id" && $2 == "=" {gsub(/"/, "", $3); print $3; exit}')
if [ -z "$network_id" ]; then
network_id="11988935"
fi
echo "== Hetzner network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
echo "== Hetzner servers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
echo "== Hetzner load balancers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
@@ -0,0 +1,48 @@
# AGENTS.md
Repository guide for OpenCode sessions in this repo.
## Read First
- Trust manifests and workflows over prose when they conflict.
- Highest-value sources: `terraform/main.tf`, `terraform/variables.tf`, `ansible/site.yml`, `clusters/prod/flux-system/`, `infrastructure/addons/kustomization.yaml`, `.gitea/workflows/deploy.yml`, `.gitea/workflows/destroy.yml`, `README.md`, `STABLE_BASELINE.md`, `scripts/refresh-kubeconfig.sh`, `scripts/smoke-check-tailnet-services.sh`.
## Current Baseline
- HA private cluster: 3 control planes, 3 workers.
- Tailscale is the private access path for Rancher and shared services.
- Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed.
- `apps/` is suspended by default.
- Rancher stores state in embedded etcd; backup/restore uses `rancher-backup` to B2.
## Common Commands
- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
- Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml`
- Flux/Kustomize: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize clusters/prod/flux-system`
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
- Tailnet smoke check: `ssh root@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`
## Workflow Rules
- Keep diffs small and validate only the directory you edited.
- Update manifests and docs together when behavior changes.
- Use `set -euo pipefail` in workflow shell blocks.
- CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks.
- One object per Kubernetes YAML file; keep filenames kebab-case.
- If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP.
## Repo-Specific Gotchas
- `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR.
- Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted.
- Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it.
- Current private URLs:
- Rancher: `https://rancher.silverside-gopher.ts.net/`
- Grafana: `http://grafana.silverside-gopher.ts.net/`
- Prometheus: `http://prometheus.silverside-gopher.ts.net:9090/`
## Secrets
- Runtime secrets live in Doppler + External Secrets.
- Bootstrap and CI secrets stay in Gitea; never commit secrets, kubeconfigs, or private keys.
@@ -7,18 +7,11 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
| Component | Details |
|-----------|---------|
| **Control Plane** | 3x CX23 (HA) |
| **Workers** | 3x CX33 |
| **K8s** | k3s (latest, HA) |
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
| **Access** | SSH/API and private services restricted to Tailnet |
| **Bootstrap** | Terraform + Ansible + Flux |
## Prerequisites
@@ -143,15 +136,15 @@ export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl get nodes
```
Kubeconfig endpoint is rewritten to the primary control-plane tailnet hostname (`k8s-cluster-cp-1.<your-tailnet>`). Use `scripts/refresh-kubeconfig.sh <cp1-public-ip>` to refresh kubeconfig against the primary control-plane public IP after rebuilds.
## Gitea CI/CD
This repository includes Gitea workflows for:
- **deploy**: End-to-end Terraform + Ansible + Flux bootstrap + restore + health checks
- **destroy**: Cluster teardown with backup-aware cleanup
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
### Required Gitea Secrets
@@ -166,10 +159,163 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
| `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for Kubernetes Operator |
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
| `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content |
## GitOps (Flux)
This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap.
### Stable private-only baseline
The current default target is the HA private baseline:
- `3` control plane nodes
- `3` worker nodes
- private Hetzner network only
- Tailscale for operator and service access
- Flux-managed platform addons with `apps` suspended by default
Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
This is the default until rebuilds are consistently green. Public ingress and app-layer expansion come later.
### Runtime secrets
Runtime cluster secrets are moving to Doppler + External Secrets Operator.
- Doppler project: `hetznerterra`
- Initial auth: service token via `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- First synced secrets:
- `GRAFANA_ADMIN_PASSWORD`
Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed by Doppler.
### Repository layout
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
- `infrastructure/`: infrastructure addon reconciliation graph
- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
- `apps/`: application workload layer (currently scaffolded)
### Reconciliation graph
- `infrastructure` (top-level)
- `addon-ccm`
- `addon-csi` depends on `addon-ccm`
- `addon-tailscale-operator`
- `addon-observability`
- `addon-observability-content` depends on `addon-observability`
- `apps` depends on `infrastructure`
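Each edge in this graph is expressed with `dependsOn` on a Flux `Kustomization`. A sketch of the `addon-csi` node (the `path` and interval are assumptions based on the repo layout):

```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  interval: 10m                 # assumed reconcile interval
  path: ./infrastructure/addons/csi   # assumed path
  prune: true
  sourceRef:
    kind: GitRepository
    name: flux-system
  dependsOn:
    - name: addon-ccm           # CSI waits for CCM to be Ready
```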
### Bootstrap notes
1. Install Flux controllers in `flux-system`.
2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
- `apps` remains suspended until workload rollout is explicitly enabled.
- Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization.
- Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations.
### Rancher access
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`.
- The public Hetzner load balancer path is not used for Rancher.
- Rancher stores state in embedded etcd; no external database is used.
### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention:
- Terraform create succeeds for the default `3` control planes and `3` workers.
- Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`.
- Flux core reconciliation is healthy.
- External Secrets Operator is ready.
- Tailscale operator is ready.
- Tailnet smoke checks pass for Rancher, Grafana, and Prometheus.
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
## Observability Stack
Flux deploys a lightweight observability stack in the `observability` namespace:
- `kube-prometheus-stack` (Prometheus + Grafana)
- `loki`
- `promtail`
Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/`.
Grafana and Prometheus are exposed through dedicated Tailscale LoadBalancer services when the Tailscale Kubernetes Operator is healthy.
### Access Grafana and Prometheus
Preferred private access:
- Grafana: `http://grafana.silverside-gopher.ts.net/`
- Prometheus: `http://prometheus.silverside-gopher.ts.net:9090/`
Fallback: port-forward from a tailnet-connected machine:
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
```
Then open:
- Grafana: http://127.0.0.1:3000
- Prometheus: http://127.0.0.1:9090
Grafana user: `admin`
Grafana password: value of `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown by Ansible output)
### Verify Tailscale exposure
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n tailscale-system get pods
kubectl -n cattle-system get svc rancher-tailscale
kubectl -n observability get svc grafana-tailscale prometheus-tailscale
kubectl -n cattle-system describe svc rancher-tailscale | grep TailscaleProxyReady
kubectl -n observability describe svc grafana-tailscale | grep TailscaleProxyReady
kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxyReady
```
If `TailscaleProxyReady=False`, check:
```bash
kubectl -n tailscale-system logs deployment/operator --tail=100
```
Common cause: OAuth client missing tag/scopes permissions.
### Fast dashboard iteration workflow
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
It avoids full cluster provisioning and only applies Grafana content resources:
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
- `ansible/dashboards.yml`
## File Structure
```
@@ -191,13 +337,15 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
│ │ ├── common/
│ │ ├── k3s-server/
│ │ ├── k3s-agent/
│ │ ├── addon-secrets-bootstrap/
│ │ ├── observability-content/
│ │ └── observability/
│ └── ansible.cfg
├── .gitea/
│ └── workflows/
│ ├── terraform.yml
│ ├── ansible.yml
│ └── dashboards.yml
├── outputs/
├── terraform.tfvars.example
└── README.md
@@ -0,0 +1,93 @@
# Gitea Secrets Setup
This document describes the secrets required for the HetznerTerra deployment workflow.
## Required Secrets
Add these secrets in your Gitea repository settings:
**Settings → Secrets → Actions**
### Infrastructure Secrets
#### `HCLOUD_TOKEN`
- Hetzner Cloud API token
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
- Permissions: Read & Write
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
- Get from: https://secure.backblaze.com/b2_buckets.htm
- Create application key with access to your terraform state bucket
#### `S3_ENDPOINT`
- Backblaze B2 S3 endpoint
- Example: `https://s3.eu-central-003.backblazeb2.com`
#### `S3_BUCKET`
- Backblaze B2 bucket name for Terraform state
- Example: `k8s-terraform-state`
### SSH Secrets
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)
### Tailscale Secrets
#### `TAILSCALE_AUTH_KEY`
- Tailscale auth key for node registration
- Get from: https://login.tailscale.com/admin/settings/keys
- Type: Reusable, Ephemeral
- Scope: `devices:core:write`
#### `TAILSCALE_TAILNET`
- Your Tailscale network name
- Example: `tail7ec33.ts.net` or your custom domain
#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
- OAuth credentials for Tailscale Kubernetes Operator
- Get from: https://login.tailscale.com/admin/settings/oauth
- Create OAuth client with scope: `devices:core:write`
### Application Secrets
#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- Doppler service token for the `hetznerterra` project runtime secrets
- Used by External Secrets Operator bootstrap
- Recommended scope: `hetznerterra` project, `prod` config only
#### `GRAFANA_ADMIN_PASSWORD`
- Transitional fallback only while migrating observability secrets to Doppler
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`
## Setting Up Secrets
1. Go to your Gitea repository
2. Navigate to **Settings → Secrets → Actions**
3. Click **Add Secret**
4. Enter the secret name (exact match from above)
5. Paste the secret value
6. Click **Add Secret**
7. Repeat for all secrets
## Verification
After adding all secrets, trigger a workflow run:
```bash
git commit --allow-empty -m "ci: trigger workflow with new secrets"
git push
```
Check the workflow logs to verify all secrets are being used correctly.
## Security Notes
- Never commit secrets to the repository
- Use strong, unique passwords for Grafana and other services
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment
@@ -0,0 +1,75 @@
# Stable Private-Only Baseline
This document defines the current engineering target for this repository.
## Topology
- 3 control planes (HA etcd cluster)
- 3 workers
- Hetzner Load Balancer for Kubernetes API
- private Hetzner network
- Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
- Prometheus exposed through Tailscale (`prometheus.silverside-gopher.ts.net:9090`)
- `apps` Kustomization suspended by default
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap with external cloud provider
- **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API**
- **Hetzner CCM deployed via Ansible (before workers join)**
- **Hetzner CSI for persistent volumes (via Flux)**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation
- cert-manager
- Rancher and rancher-backup
- Rancher backup/restore validation
- Observability stack (Grafana, Prometheus, Loki, Promtail)
- Persistent volume provisioning validated
## Deferred for Later Phases
- app workloads in `apps/`
## Out of Scope
- public ingress or DNS
- public TLS
- app workloads
- cross-region / multi-cluster disaster recovery strategy
- upgrade strategy
## Phase Gates
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
2. Load Balancer is healthy with all 3 control plane targets.
3. Primary control plane bootstraps with `--cluster-init`.
4. Secondary control planes join via Load Balancer endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets.
12. Tailscale private access works for Rancher, Grafana, and Prometheus.
13. CI smoke checks pass for Tailscale DNS resolution, `tailscale ping`, and HTTP reachability.
14. A fresh Rancher backup can be created and restored successfully.
15. Terraform destroy succeeds cleanly or via workflow retry.
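Gate 10 can be exercised with a throwaway claim against the `hcloud-volumes` StorageClass (the claim name and namespace here are illustrative):

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: pvc-provisioning-test   # illustrative name; delete after the check
  namespace: default
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: hcloud-volumes
  resources:
    requests:
      storage: 10Gi
```

The gate passes when the claim binds and a matching Hetzner volume appears; note the StorageClass may use wait-for-first-consumer binding, in which case the PVC stays `Pending` until a pod mounts it.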
## Success Criteria
Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes, no manual `kubectl` patching, and no manual Tailscale proxy recreation.
## Validated Drills
- 2026-04-18: live Rancher backup/restore drill succeeded on the current cluster.
- A fresh one-time backup was created, restored back onto the same cluster, and post-restore validation confirmed:
- all nodes remained `Ready`
- Flux infrastructure stayed healthy
- Rancher backup/restore resources reported `Completed`
- Rancher, Grafana, and Prometheus remained reachable through the Tailscale smoke checks
@@ -0,0 +1,7 @@
---
- name: Provision Grafana dashboards and datasources
hosts: control_plane[0]
become: true
roles:
- observability-content
@@ -32,6 +32,7 @@ def main():
worker_names = outputs["worker_names"]["value"]
worker_ips = outputs["worker_ips"]["value"]
worker_private_ips = outputs["worker_private_ips"]["value"]
kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])
control_planes = [
{
@@ -59,6 +60,7 @@
"control_planes": control_planes,
"workers": workers,
"private_key_file": outputs["ssh_private_key_path"]["value"],
"kube_api_lb_ip": kube_api_lb_ip,
}
env = Environment(loader=FileSystemLoader("."))
@@ -17,3 +17,4 @@ ansible_user=root
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest
kube_api_endpoint={{ kube_api_lb_ip }}
@@ -3,3 +3,5 @@ collections:
version: ">=2.4.0"
- name: community.general
version: ">=8.0.0"
- name: community.network
version: ">=5.0.0"
@@ -0,0 +1,41 @@
---
- name: Apply Hetzner cloud secret
shell: >-
kubectl -n kube-system create secret generic hcloud
--from-literal=token='{{ hcloud_token }}'
--from-literal=network='{{ cluster_name }}-network'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when: hcloud_token | default('') | length > 0
- name: Ensure Tailscale operator namespace exists
command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
--dry-run=client -o yaml
register: tailscale_namespace_manifest
changed_when: false
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator namespace
command: kubectl apply -f -
args:
stdin: "{{ tailscale_namespace_manifest.stdout }}"
changed_when: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator OAuth secret
shell: >-
kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
--from-literal=client_id='{{ tailscale_oauth_client_id }}'
--from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
@@ -0,0 +1,82 @@
---
- name: Check if hcloud secret exists
command: kubectl -n kube-system get secret hcloud
register: hcloud_secret_check
changed_when: false
failed_when: false
- name: Fail if hcloud secret is missing
fail:
msg: "hcloud secret not found in kube-system namespace. CCM requires it."
when: hcloud_secret_check.rc != 0
- name: Check if helm is installed
command: which helm
register: helm_check
changed_when: false
failed_when: false
- name: Install helm
when: helm_check.rc != 0
block:
- name: Download helm install script
get_url:
url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
dest: /tmp/get-helm-3.sh
mode: "0755"
- name: Run helm install script
command: /tmp/get-helm-3.sh
args:
creates: /usr/local/bin/helm
- name: Add Hetzner Helm repository
kubernetes.core.helm_repository:
name: hcloud
repo_url: https://charts.hetzner.cloud
kubeconfig: /etc/rancher/k3s/k3s.yaml
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Deploy Hetzner Cloud Controller Manager
kubernetes.core.helm:
name: hcloud-cloud-controller-manager
chart_ref: hcloud/hcloud-cloud-controller-manager
release_namespace: kube-system
create_namespace: true
values:
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: "{{ inventory_hostname }}"
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
kubeconfig: /etc/rancher/k3s/k3s.yaml
wait: true
wait_timeout: 300s
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Wait for CCM to be ready
command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
changed_when: false
register: ccm_rollout
until: ccm_rollout.rc == 0
retries: 3
delay: 10
- name: Pause to ensure CCM is fully ready to process new nodes
pause:
seconds: 10
- name: Verify CCM is removing uninitialized taints
command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
register: uninitialized_taints
changed_when: false
failed_when: false
- name: Display taint status
debug:
msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"
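The rollout and taint checks above can be repeated by hand after a deploy. A minimal sketch, assuming shell access on a control-plane node and the same kubeconfig the tasks use:

```shell
# Same kubeconfig the Ansible tasks point at.
export KUBECONFIG=/etc/rancher/k3s/k3s.yaml

# The CCM deployment should report a successful rollout.
kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s

# Empty output means the CCM has removed the uninitialized taint from every node.
kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
```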
@@ -1,4 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
hcloud_lb_location: "nbg1"
@@ -1,88 +0,0 @@
---
- name: Check if Hetzner CCM is already deployed
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
register: ccm_namespace
failed_when: false
changed_when: false
- name: Create Hetzner cloud secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CCM
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
changed_when: true
- name: Detect CCM workload kind
shell: |
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo deployment
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo daemonset
else
echo missing
fi
register: ccm_workload_kind
changed_when: false
- name: Wait for CCM deployment rollout
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_deploy
until: ccm_rollout_deploy.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "deployment"
- name: Wait for CCM daemonset rollout
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_ds
until: ccm_rollout_ds.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "daemonset"
- name: Set default Hetzner load balancer location for Traefik service
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
register: traefik_annotation
changed_when: true
failed_when: false
- name: Show Traefik service when annotation patch fails
command: kubectl -n kube-system get service traefik -o yaml
register: traefik_service_dump
changed_when: false
failed_when: false
when: traefik_annotation.rc != 0
- name: Fail when Traefik load balancer annotation cannot be set
fail:
msg: |
Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
Command output:
{{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
Service dump:
{{ traefik_service_dump.stdout | default('n/a') }}
when: traefik_annotation.rc != 0
- name: Show CCM namespace objects when workload missing
command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
register: ccm_ns_objects
changed_when: false
when: ccm_workload_kind.stdout == "missing"
- name: Fail when CCM workload is missing
fail:
msg: |
hcloud-cloud-controller-manager workload not found after applying manifest.
Namespace objects:
{{ ccm_ns_objects.stdout | default('n/a') }}
when: ccm_workload_kind.stdout == "missing"
@@ -1,15 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
csi_rollout_timeout_seconds: 30
csi_rollout_retries: 8
csi_rollout_delay_seconds: 5
csi_failure_log_tail_lines: 120
csi_smoke_test_enabled: true
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
csi_smoke_test_base_storage_class: "hcloud-volumes"
csi_smoke_test_size: "1Gi"
csi_smoke_test_pvc_timeout_seconds: 300
csi_smoke_test_job_timeout_seconds: 300
csi_smoke_test_required: false
@@ -1,425 +0,0 @@
---
- name: Create Hetzner CSI secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CSI
command: kubectl apply -f {{ csi_manifest_url }}
changed_when: true
- name: Ensure CSI controller endpoint is set for sidecars
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Ensure CSI node endpoint is set for sidecars
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Restart CSI controller to pick up current secret
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
changed_when: true
- name: Wait for CSI controller deployment generation
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
failed_when: false
changed_when: false
- name: Wait for CSI controller rollout
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_controller_rollout
until: csi_controller_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Show CSI controller status on failure
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
register: csi_controller_deploy_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI controller pods on failure
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
register: csi_controller_pods_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller deployment on failure
command: kubectl -n kube-system describe deployment hcloud-csi-controller
register: csi_controller_deploy_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller pod on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_controller_pod_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver logs on failure
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
register: csi_driver_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
fi
register: csi_driver_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show sidecar previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
for container in csi-attacher csi-resizer csi-provisioner; do
echo "===== $container ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
fi
register: csi_sidecar_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show recent kube-system events on failure
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
register: csi_recent_events
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Fail with CSI controller diagnostics
fail:
msg: |
CSI controller rollout failed.
Deployment status:
{{ csi_controller_deploy_status.stdout | default('n/a') }}
Pods status:
{{ csi_controller_pods_status.stdout | default('n/a') }}
Deployment describe:
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
Pod describe:
{{ csi_controller_pod_describe.stdout | default('n/a') }}
hcloud-csi-driver logs:
{{ csi_driver_logs.stdout | default('n/a') }}
hcloud-csi-driver previous logs:
{{ csi_driver_previous_logs.stdout | default('n/a') }}
Sidecar previous logs:
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
Recent kube-system events:
{{ csi_recent_events.stdout | default('n/a') }}
when: csi_controller_rollout.rc != 0
- name: Wait for CSI node daemonset rollout
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_node_rollout
until: csi_node_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Fail when CSI node daemonset rollout does not complete
fail:
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
when: csi_node_rollout.rc != 0
- name: Generate CSI smoke test run identifier
set_fact:
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
when: csi_smoke_test_enabled | bool
- name: Generate unique CSI smoke test resource names
set_fact:
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
when: csi_smoke_test_enabled | bool
- name: Cleanup stale CSI smoke test resources before apply
shell: |
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Apply CSI smoke test resources
shell: |
kubectl apply -f - <<'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: {{ csi_smoke_test_storage_class }}
provisioner: csi.hetzner.cloud
reclaimPolicy: Delete
volumeBindingMode: Immediate
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ csi_smoke_test_pvc_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ csi_smoke_test_size }}
storageClassName: {{ csi_smoke_test_storage_class }}
---
apiVersion: batch/v1
kind: Job
metadata:
name: {{ csi_smoke_test_job_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
backoffLimit: 0
template:
spec:
restartPolicy: Never
containers:
- name: write-and-read
image: busybox:1.36
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
volumeMounts:
- name: data
mountPath: /data
volumes:
- name: data
persistentVolumeClaim:
claimName: {{ csi_smoke_test_pvc_name }}
EOF
changed_when: true
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke PVC to bind
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
register: csi_smoke_pvc_wait
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke Job completion
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
register: csi_smoke_job_wait
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc == 0
- name: Show CSI smoke job logs
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
register: csi_smoke_job_logs
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Show CSI smoke PVC on failure
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
register: csi_smoke_pvc_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke Job on failure
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_job_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pods on failure
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_pod_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI smoke PVC on failure
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
register: csi_smoke_pvc_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show storage classes on failure
command: kubectl get storageclass
register: csi_storageclasses
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Get CSI controller pod name on smoke failure
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
register: csi_controller_pod_name
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI controller pod on smoke failure
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
register: csi_controller_pod_smoke_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI controller container logs on smoke failure
shell: |
pod="{{ csi_controller_pod_name.stdout }}"
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
echo "===== ${container}: current ====="
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
echo "===== ${container}: previous ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
register: csi_controller_container_logs
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI driver and node driver objects on smoke failure
shell: |
echo "===== CSIDriver ====="
kubectl get csidriver csi.hetzner.cloud -o yaml || true
echo "===== CSINode ====="
kubectl get csinode -o wide || true
register: csi_driver_objects
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pod describe on failure
shell: |
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_smoke_pod_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Fail when CSI smoke test fails
fail:
msg: |
CSI smoke test failed.
PVC wait:
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait:
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
PVC:
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
Job:
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
Pod list:
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
PVC describe:
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
Storage classes:
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
CSI controller pod:
{{ csi_controller_pod_name.stdout | default('n/a') }}
CSI controller pod describe:
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
CSI controller container logs:
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
CSI driver objects:
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
Pod describe:
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
Job logs:
{{ csi_smoke_job_logs.stdout | default('n/a') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_smoke_test_required | bool
- name: Warn when CSI smoke test fails but is non-blocking
debug:
msg: |
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- not (csi_smoke_test_required | bool)
- name: Cleanup CSI smoke test resources
shell: |
kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
@@ -0,0 +1,50 @@
---
- name: Ensure Doppler service token is provided
assert:
that:
- doppler_hetznerterra_service_token | length > 0
fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
- name: Ensure external-secrets namespace exists
shell: kubectl create namespace external-secrets --dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Apply Doppler service token secret
shell: >-
kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
--from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Check for ClusterSecretStore CRD
command: kubectl get crd clustersecretstores.external-secrets.io
register: doppler_clustersecretstore_crd
changed_when: false
failed_when: false
- name: Apply Doppler ClusterSecretStore
shell: |
cat <<'EOF' | kubectl apply -f -
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
changed_when: true
when: doppler_clustersecretstore_crd.rc == 0
- name: Note pending Doppler ClusterSecretStore bootstrap
debug:
msg: >-
Skipping Doppler ClusterSecretStore bootstrap because the External Secrets CRD
is not available yet. Re-run after External Secrets is installed.
when: doppler_clustersecretstore_crd.rc != 0
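Once the ClusterSecretStore above is ready, workloads consume Doppler values through ExternalSecret objects. A minimal sketch — the Doppler key `HCLOUD_TOKEN` and the target Secret names are illustrative assumptions, not taken from this repo:

```yaml
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: hcloud-token            # hypothetical example name
  namespace: kube-system
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra  # the store bootstrapped above
  target:
    name: hcloud-token          # Kubernetes Secret that gets created
  data:
    - secretKey: token
      remoteRef:
        key: HCLOUD_TOKEN       # hypothetical Doppler config key
```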
@@ -3,3 +3,4 @@ k3s_version: latest
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: true
@@ -12,14 +12,42 @@
when: not k3s_agent_binary.stat.exists
- name: Install k3s agent
when: not k3s_agent_binary.stat.exists
block:
- name: Run k3s agent install
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_URL: "{{ k3s_server_url }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args:
creates: /usr/local/bin/k3s-agent
rescue:
- name: Show k3s-agent service status after failed install
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status_after_install
changed_when: false
failed_when: false
- name: Show recent k3s-agent logs after failed install
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal_after_install
changed_when: false
failed_when: false
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent install failed on {{ inventory_hostname }}.
Service status:
{{ k3s_agent_status_after_install.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal_after_install.stdout | default('n/a') }}
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent
@@ -3,3 +3,14 @@ k3s_version: latest
k3s_token: ""
k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: true
# Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
tailscale_control_plane_names:
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
- "k8s-cluster-cp-3.silverside-gopher.ts.net"
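The `kube_api_endpoint` default above is meant to be overridden per environment. A sketch of an inventory override — the file path and the load-balancer address are illustrative placeholders, not values from this repo:

```yaml
# group_vars/all.yml (illustrative placement)
kube_api_endpoint: "10.0.1.100"   # placeholder: private LB IP fronting the control planes
k3s_disable_embedded_ccm: true    # Hetzner CCM replaces the embedded cloud controller
k3s_disable_servicelb: true       # Hetzner load balancers replace servicelb
```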
@@ -15,9 +15,9 @@
set_fact:
k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
- name: Wait for API endpoint on 6443 (secondary only)
wait_for:
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
port: 6443
state: started
timeout: 120
@@ -28,27 +28,22 @@
stat:
path: /usr/local/bin/k3s-uninstall.sh
register: k3s_uninstall_script
when: k3s_install_needed
- name: Reset broken k3s install before reinstall
command: /usr/local/bin/k3s-uninstall.sh
when:
- k3s_install_needed
- k3s_uninstall_script.stat.exists
- name: Remove stale k3s data
file:
path: "{{ item }}"
state: absent
loop:
- /etc/rancher/k3s
- /var/lib/rancher/k3s
when: k3s_install_needed
- name: Download k3s install script
get_url:
@@ -61,7 +56,19 @@
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }}
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -75,7 +82,15 @@
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install
rescue:
@@ -0,0 +1,9 @@
---
observability_namespace: "observability"
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"
loki_enabled: true
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
grafana_use_prometheus_nodeport_fallback: true
grafana_use_loki_nodeport_fallback: true
@@ -0,0 +1,173 @@
---
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Set default Prometheus datasource URL
set_fact:
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
grafana_loki_effective_url: "{{ grafana_loki_url }}"
- name: Get Grafana pod name
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
register: grafana_pod_name
changed_when: false
- name: Probe Prometheus from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
register: grafana_prometheus_probe
changed_when: false
failed_when: false
- name: Probe Loki from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null'
register: grafana_loki_probe
changed_when: false
failed_when: false
when: loki_enabled
- name: Get Prometheus pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
register: prometheus_host_ip
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Get Prometheus service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
register: prometheus_nodeport
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Enable Prometheus NodePort fallback datasource URL
set_fact:
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- prometheus_host_ip.stdout | length > 0
- prometheus_nodeport.stdout | length > 0
- name: Ensure Loki service uses NodePort for fallback
command: kubectl -n {{ observability_namespace }} patch svc loki -p '{"spec":{"type":"NodePort"}}'
changed_when: false
failed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod loki-0 -o jsonpath='{.status.hostIP}'
register: loki_host_ip
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc loki -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
register: loki_nodeport
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Enable Loki NodePort fallback datasource URL
set_fact:
grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- loki_host_ip.stdout | length > 0
- loki_nodeport.stdout | length > 0
- name: Query Loki labels endpoint from Grafana pod
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels'
register: grafana_loki_labels
changed_when: false
failed_when: false
when: loki_enabled
- name: Fail when Loki is reachable but has zero indexed labels
fail:
msg: >-
Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
This usually means no logs are ingested yet. Check Promtail and tenant configuration.
when:
- loki_enabled
- grafana_loki_labels.rc == 0
- "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
- "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"
- name: Write default Prometheus datasource ConfigMap patch
template:
src: grafana-default-prometheus-datasource.yaml.j2
dest: /tmp/grafana-default-prometheus-datasource.yaml
mode: "0644"
- name: Apply default Prometheus datasource ConfigMap patch
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
changed_when: true
- name: Remove legacy Loki datasource ConfigMap
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
changed_when: false
failed_when: false
- name: Write Grafana datasources ConfigMap
template:
src: grafana-datasources.yaml.j2
dest: /tmp/grafana-datasources.yaml
mode: "0644"
when: loki_enabled
- name: Apply Grafana datasources ConfigMap
command: kubectl apply -f /tmp/grafana-datasources.yaml
changed_when: true
when: loki_enabled
- name: Restart Grafana to load datasource updates deterministically
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
changed_when: true
- name: Wait for Grafana rollout after datasource update
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Write Grafana dashboard ConfigMap
template:
src: grafana-dashboard-k8s-overview.yaml.j2
dest: /tmp/grafana-dashboard-k8s-overview.yaml
mode: "0644"
- name: Apply Grafana dashboard ConfigMap
command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
changed_when: true
- name: Show Grafana content provisioning summary
debug:
msg: |
Grafana content applied.
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
Loki datasource URL: {{ grafana_loki_effective_url }}
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
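The zero-label guard earlier in this file decides pass/fail by substring-matching the whitespace-stripped JSON from `/loki/api/v1/labels`. The same decision can be sketched in Python with proper JSON parsing (the sample responses below are hypothetical, shaped like Loki's documented reply):

```python
import json

def loki_has_labels(body: str) -> bool:
    """Return True when Loki reports success and at least one indexed label."""
    doc = json.loads(body)
    return doc.get("status") == "success" and len(doc.get("data", [])) > 0

# Hypothetical /loki/api/v1/labels responses.
empty = '{"status": "success", "data": []}'
populated = '{"status": "success", "data": ["app", "namespace"]}'

print(loki_has_labels(empty))      # False -> the play would fail here
print(loki_has_labels(populated))  # True
```

Parsing the JSON avoids the brittleness of key-order-dependent substring checks, at the cost of needing a JSON-capable tool inside the task.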
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_dashboard_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_datasource_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
{% if loki_enabled %}
- name: Loki
type: loki
access: proxy
url: "{{ grafana_loki_effective_url }}"
isDefault: false
{% endif %}
@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-prometheus-stack-grafana-datasource
namespace: {{ observability_namespace }}
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: "Prometheus"
type: prometheus
uid: prometheus
url: "{{ grafana_prometheus_effective_url }}/"
access: proxy
isDefault: true
jsonData:
httpMethod: POST
timeInterval: 30s
- name: "Alertmanager"
type: alertmanager
uid: alertmanager
url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus
@@ -0,0 +1,27 @@
---
observability_namespace: "observability"
prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"
grafana_admin_password: ""
prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"
prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"
loki_enabled: true
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""
observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"
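`grafana_admin_password` defaults to empty, and the role later substitutes a random 32-character password via `lookup('password', ...)` when nothing is configured. A minimal Python sketch of that fallback (function name is illustrative, not part of the role):

```python
import secrets
import string

def effective_admin_password(configured: str) -> str:
    """Use the configured password when set; otherwise generate 32 random chars."""
    if configured:
        return configured
    alphabet = string.ascii_letters + string.digits  # matches chars=ascii_letters,digits
    return "".join(secrets.choice(alphabet) for _ in range(32))

print(effective_admin_password("hunter2"))       # hunter2
print(len(effective_admin_password("")))         # 32
```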
@@ -0,0 +1,252 @@
---
- name: Check if Helm is installed
command: helm version --short
register: helm_check
changed_when: false
failed_when: false
- name: Install Helm
shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
when: helm_check.rc != 0
changed_when: true
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Set Grafana admin password
set_fact:
grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
- name: Write kube-prometheus-stack values
template:
src: kube-prometheus-stack-values.yaml.j2
dest: /tmp/kube-prometheus-stack-values.yaml
mode: "0644"
- name: Add Prometheus Helm repo
command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
register: add_prom_repo
failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
changed_when: add_prom_repo.rc == 0
- name: Add Grafana Helm repo
command: helm repo add grafana https://grafana.github.io/helm-charts
register: add_grafana_repo
failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
changed_when: add_grafana_repo.rc == 0
- name: Update Helm repos
command: helm repo update
changed_when: false
- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
shell: >-
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-upgrade -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-install -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-rollback -o name)
--ignore-not-found=true
changed_when: false
failed_when: false
- name: Install kube-prometheus-stack
command: >-
helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
--namespace {{ observability_namespace }}
--version {{ prometheus_chart_version }}
--values /tmp/kube-prometheus-stack-values.yaml
--wait
--timeout 10m
register: kube_prom_install
retries: 12
delay: 15
until: kube_prom_install.rc == 0
changed_when: true
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Reset Grafana admin password in Grafana database
shell: >-
kubectl -n {{ observability_namespace }} exec
"$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
-c grafana -- grafana cli admin reset-admin-password '{{ grafana_password_effective }}'
changed_when: true
- name: Write Loki values
template:
src: loki-values.yaml.j2
dest: /tmp/loki-values.yaml
mode: "0644"
when: loki_enabled
- name: Validate Loki chart produces resources
command: >-
helm template loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_template
changed_when: false
failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
when: loki_enabled
- name: Remove legacy Loki resources
command: >-
kubectl -n {{ observability_namespace }} delete
deployment/loki-gateway
statefulset/loki
statefulset/loki-chunks-cache
statefulset/loki-results-cache
statefulset/loki-backend
statefulset/loki-read
statefulset/loki-write
poddisruptionbudget/loki-memcached-chunks-cache
poddisruptionbudget/loki-memcached-results-cache
--ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Clear stuck Helm lock for Loki
command: kubectl -n {{ observability_namespace }} delete secret sh.helm.release.v1.loki.v1 --ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Uninstall failed Loki release (if stuck)
command: helm uninstall loki -n {{ observability_namespace }}
changed_when: false
failed_when: false
when: loki_enabled
- name: Install Loki
command: >-
helm upgrade --install loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_install
changed_when: true
when: loki_enabled
- name: Wait for Loki StatefulSet
command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
register: loki_rollout
changed_when: false
when: loki_enabled
- name: Show Loki pod status
command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
register: loki_pods
changed_when: false
when: loki_enabled
- name: Debug Loki pods
debug:
msg: "{{ loki_pods.stdout }}"
when: loki_enabled
- name: Write Promtail values
template:
src: promtail-values.yaml.j2
dest: /tmp/promtail-values.yaml
mode: "0644"
when: loki_enabled
- name: Install Promtail
command: >-
helm upgrade --install promtail grafana/promtail
--namespace {{ observability_namespace }}
--version {{ promtail_chart_version }}
--values /tmp/promtail-values.yaml
--wait
--timeout 10m
changed_when: true
when: loki_enabled
- name: Check Tailscale service readiness for Grafana
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: grafana_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale service readiness for Prometheus
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: prometheus_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale endpoint (IP/hostname) for Grafana
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: grafana_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale endpoint (IP/hostname) for Prometheus
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: prometheus_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show Tailscale access details
debug:
msg: |
Observability stack deployed with Tailscale access!
Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}
Login: admin / {{ grafana_password_effective }}
Tailscale readiness:
- Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
- Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}
Access via:
- MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
- Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
- Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show observability access details (fallback)
debug:
msg: |
Observability stack deployed.
Namespace: {{ observability_namespace }}
Grafana (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
Prometheus (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
Grafana admin password: {{ grafana_password_effective }}
{% if loki_enabled %}
Loki: Enabled - logs available in Grafana
{% else %}
Loki: Disabled
{% endif %}
when:
- not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))
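The endpoint checks above hide a small piece of logic behind heavily escaped Go-template braces: for each load-balancer ingress entry, prefer `.ip`, else fall back to `.hostname`. A Python sketch of that extraction, run against hypothetical Service status payloads:

```python
def lb_endpoint(svc_status: dict) -> str:
    """For each loadBalancer ingress entry, emit .ip if present, else .hostname."""
    parts = []
    for ing in svc_status.get("loadBalancer", {}).get("ingress", []) or []:
        parts.append(ing.get("ip") or ing.get("hostname") or "")
    return "".join(parts)

# Hypothetical status blocks for a Tailscale-exposed Service.
print(lb_endpoint({"loadBalancer": {"ingress": [{"ip": "100.64.0.7"}]}}))            # 100.64.0.7
print(lb_endpoint({"loadBalancer": {"ingress": [{"hostname": "grafana.ts.net"}]}}))  # grafana.ts.net
print(repr(lb_endpoint({})))  # '' -> the play reports the endpoint as (pending)
```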
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasource-loki
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
loki-datasource.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
isDefault: false
@@ -0,0 +1,46 @@
grafana:
enabled: true
adminPassword: {{ grafana_password_effective }}
persistence:
enabled: true
storageClassName: {{ grafana_storage_class }}
size: {{ grafana_storage_size }}
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ grafana_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheus:
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheusSpec:
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: {{ prometheus_storage_class }}
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: {{ prometheus_storage_size }}
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
@@ -0,0 +1,75 @@
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: {{ loki_storage_size }}
storageClass: {{ loki_storage_class }}
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false
@@ -0,0 +1,3 @@
config:
clients:
- url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,53 @@
---
- name: Delete stale Tailscale devices with reserved hostnames
block:
- name: Get Tailscale devices from API
uri:
url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
method: GET
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
return_content: true
register: ts_devices
- name: Find stale devices matching reserved hostnames
set_fact:
stale_devices: >-
{{ ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| rejectattr('online', 'defined')
| list
+
ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| selectattr('online', 'defined')
| rejectattr('online', 'equalto', true)
| list }}
- name: Delete stale devices
uri:
url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
method: DELETE
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
status_code: 200
loop: "{{ stale_devices }}"
loop_control:
label: "{{ item.name }} ({{ item.id }})"
when: stale_devices | length > 0
- name: Report cleaned devices
debug:
msg: "Deleted stale Tailscale device: {{ item.name }}"
loop: "{{ stale_devices }}"
when: stale_devices | length > 0
- name: No stale devices found
debug:
msg: "No stale Tailscale devices found."
when: stale_devices | length == 0
when:
- tailscale_api_key is defined
- tailscale_api_key | length > 0
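The `set_fact` filter chain above unions two selections: reserved-hostname devices with no `online` field at all, plus those where `online` is present but not true. The same predicate reads more directly in Python (sample devices are hypothetical):

```python
def stale_devices(devices: list[dict], reserved: set[str]) -> list[dict]:
    """Select devices whose hostname is reserved and which are not reported online."""
    out = []
    for dev in devices:
        if dev.get("hostname") not in reserved:
            continue
        if dev.get("online") is not True:  # missing or False both count as stale
            out.append(dev)
    return out

# Hypothetical Tailscale API payload.
devices = [
    {"id": "1", "hostname": "grafana", "online": False},
    {"id": "2", "hostname": "grafana", "online": True},   # live proxy, kept
    {"id": "3", "hostname": "laptop", "online": False},   # not a reserved name
    {"id": "4", "hostname": "prometheus"},                # no online field
]
reserved = {"rancher", "grafana", "prometheus", "flux"}
print([d["id"] for d in stale_devices(devices, reserved)])  # ['1', '4']
```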
@@ -24,6 +24,7 @@
k3s_primary_public_ip: "{{ ansible_host }}"
k3s_primary_ip: "{{ k3s_private_ip }}"
k3s_node_ip: "{{ k3s_private_ip }}"
# kube_api_endpoint is set in inventory group_vars
roles:
- k3s-server
@@ -49,6 +50,20 @@
dest: ../outputs/kubeconfig
flat: true
- name: Bootstrap addon prerequisite secrets
hosts: control_plane[0]
become: true
roles:
- addon-secrets-bootstrap
- name: Deploy Hetzner CCM (required for workers with external cloud provider)
hosts: control_plane[0]
become: true
roles:
- ccm-deploy
- name: Setup secondary control planes
hosts: control_plane[1:]
become: true
@@ -59,6 +74,8 @@
k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
k3s_node_ip: "{{ k3s_private_ip }}"
# Use Load Balancer for HA - all control planes join via LB endpoint
k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"
roles:
- k3s-server
@@ -69,25 +86,63 @@
vars:
k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
# Use Load Balancer for HA - workers join via LB endpoint
k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
k3s_node_ip: "{{ k3s_private_ip }}"
roles:
- k3s-agent
- name: Deploy observability stack
hosts: control_plane[0]
become: true
roles:
- role: observability
when: not (observability_gitops_enabled | default(true) | bool)
- name: Provision Grafana content
hosts: control_plane[0]
become: true
roles:
- role: observability-content
when: not (observability_gitops_enabled | default(true) | bool)
- name: Bootstrap Doppler access for External Secrets
hosts: control_plane[0]
become: true
roles:
- doppler-bootstrap
- name: Detect existing Tailscale service proxies
hosts: control_plane[0]
become: true
tasks:
- name: Check for current Tailscale service hostnames
command: kubectl get svc -A -o jsonpath='{range .items[*]}{.metadata.annotations.tailscale\.com/hostname}{"\n"}{end}'
register: existing_tailscale_hostnames
changed_when: false
failed_when: false
- name: Clean up stale Tailscale devices
hosts: localhost
connection: local
vars:
tailscale_reserved_hostnames:
- rancher
- grafana
- prometheus
- flux
tasks:
- name: Delete stale devices only before service proxies exist
include_role:
name: tailscale-cleanup
when: >-
hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([])
| intersect(tailscale_reserved_hostnames)
| length == 0
- name: Finalize
hosts: localhost
@@ -95,7 +150,7 @@
tasks:
- name: Update kubeconfig server address
command: |
sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
changed_when: true
- name: Display success message - name: Display success message
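The finalize step's `sed` call rewrites every `127.0.0.1` occurrence in the fetched kubeconfig to the first control plane's public address. A sketch of the equivalent string rewrite (the kubeconfig fragment is hypothetical):

```python
def point_kubeconfig_at(kubeconfig_text: str, host: str) -> str:
    """Replace the local API address with a reachable host, as the sed call does."""
    return kubeconfig_text.replace("127.0.0.1", host)

sample = "server: https://127.0.0.1:6443"  # hypothetical kubeconfig line
print(point_kubeconfig_at(sample, "203.0.113.10"))  # server: https://203.0.113.10:6443
```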
@@ -0,0 +1,3 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
@@ -0,0 +1,12 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: platform
namespace: flux-system
spec:
interval: 1m
ref:
branch: main
url: ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git
secretRef:
name: flux-system
File diff suppressed because it is too large.
@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: source-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kustomize-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: helm-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: notification-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: apps
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./apps
dependsOn:
- name: infrastructure
wait: true
timeout: 5m
suspend: true
@@ -0,0 +1,14 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: infrastructure
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure
wait: false
timeout: 5m
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gotk-components.yaml
- gitrepository-platform.yaml
- kustomization-infrastructure.yaml
- kustomization-apps.yaml
patchesStrategicMerge:
- gotk-controller-cp1-patches.yaml
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- flux-system
@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-cloud-controller-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-cloud-controller-manager
version: 1.30.1
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
selectorLabels:
app: hcloud-cloud-controller-manager
args:
secure-port: "0"
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-ccm.yaml
@@ -0,0 +1,34 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: cert-manager
chart:
spec:
chart: cert-manager
version: "v1.17.2"
sourceRef:
kind: HelmRepository
name: jetstack
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
crds:
enabled: true
replicaCount: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 250m
memory: 256Mi
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: jetstack
namespace: flux-system
spec:
interval: 1h
url: https://charts.jetstack.io
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-cert-manager.yaml
- helmrelease-cert-manager.yaml
@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager
labels:
kustomize.toolkit.fluxcd.io/prune: disabled
@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-csi
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-csi
version: 2.20.0
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
controller:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
hcloudVolumeDefaultLocation: nbg1
storageClasses:
- name: hcloud-volumes
defaultStorageClass: true
reclaimPolicy: Delete
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-csi.yaml
@@ -0,0 +1,13 @@
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 10m
targetNamespace: external-secrets
chart:
spec:
chart: external-secrets
version: 2.1.0
sourceRef:
kind: HelmRepository
name: external-secrets
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
installCRDs: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
webhook:
failurePolicy: Ignore
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
certController:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
serviceMonitor:
enabled: false
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 1h
url: https://charts.external-secrets.io
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-external-secrets.yaml
- helmrelease-external-secrets.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: external-secrets
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-ccm
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/ccm
wait: true
timeout: 10m
suspend: false
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-cert-manager
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/cert-manager
wait: true
timeout: 10m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-csi
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/csi
dependsOn:
- name: addon-ccm
wait: true
timeout: 10m
suspend: false
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-external-secrets
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/external-secrets
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability-content
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability-content
dependsOn:
- name: addon-observability
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,19 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability
dependsOn:
- name: addon-external-secrets
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,16 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-backup-config
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-backup-config
timeout: 5m
suspend: false
dependsOn:
- name: addon-rancher-backup
@@ -0,0 +1,18 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-backup
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-backup
wait: true
timeout: 10m
suspend: false
dependsOn:
- name: addon-external-secrets
- name: addon-rancher
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-config
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-config
dependsOn:
- name: addon-rancher
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,20 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher
wait: true
timeout: 15m
suspend: false
dependsOn:
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
- name: addon-external-secrets
- name: addon-cert-manager
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-operator
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-operator
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-proxyclass
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-proxyclass
dependsOn:
- name: addon-tailscale-operator
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,16 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- kustomization-ccm.yaml
- kustomization-csi.yaml
- kustomization-external-secrets.yaml
- kustomization-cert-manager.yaml
- kustomization-tailscale-operator.yaml
- kustomization-tailscale-proxyclass.yaml
- traefik
- kustomization-observability.yaml
- kustomization-observability-content.yaml
- kustomization-rancher.yaml
- kustomization-rancher-config.yaml
- kustomization-rancher-backup.yaml
- kustomization-rancher-backup-config.yaml
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-k8s-overview
namespace: observability
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources-core
namespace: observability
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: "http://loki.observability.svc.cluster.local:3100"
isDefault: false
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana-datasources-core-configmap.yaml
- grafana-dashboard-k8s-overview-configmap.yaml
@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: grafana-admin
namespace: observability
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: grafana-admin-credentials
creationPolicy: Owner
template:
type: Opaque
data:
admin-user: admin
admin-password: "{{ .grafanaAdminPassword }}"
data:
- secretKey: grafanaAdminPassword
remoteRef:
key: GRAFANA_ADMIN_PASSWORD
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: grafana-tailscale
namespace: observability
annotations:
tailscale.com/hostname: grafana
tailscale.com/tags: "tag:prod,tag:grafana"
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: grafana
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3000
@@ -0,0 +1,75 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: kube-prometheus-stack
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: kube-prometheus-stack
version: 68.4.4
sourceRef:
kind: HelmRepository
name: prometheus-community
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
grafana:
enabled: true
admin:
existingSecret: grafana-admin-credentials
grafana.ini:
server:
root_url: http://grafana.silverside-gopher.ts.net/
serve_from_sub_path: false
persistence:
enabled: true
storageClassName: local-path
size: 5Gi
service:
type: ClusterIP
sidecar:
datasources:
enabled: true
label: grafana_datasource
searchNamespace: observability
dashboards:
enabled: true
label: grafana_dashboard
searchNamespace: observability
prometheus:
service:
type: ClusterIP
prometheusSpec:
externalUrl: http://prometheus.silverside-gopher.ts.net:9090/
routePrefix: /
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: local-path
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
prometheus-node-exporter:
hostNetwork: false
service:
hostPort: false
@@ -0,0 +1,99 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: loki
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: loki
version: 6.10.0
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: 10Gi
storageClass: local-path
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
chunksCache:
enabled: true
allocatedMemory: 128
resultsCache:
enabled: true
allocatedMemory: 128
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false
@@ -0,0 +1,27 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: promtail
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: promtail
version: 6.16.6
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
config:
clients:
- url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: grafana
namespace: flux-system
spec:
interval: 1h
url: https://grafana.github.io/helm-charts
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: prometheus-community
namespace: flux-system
spec:
interval: 1h
url: https://prometheus-community.github.io/helm-charts
@@ -0,0 +1,12 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- grafana-admin-externalsecret.yaml
- helmrepository-prometheus-community.yaml
- helmrepository-grafana.yaml
- helmrelease-kube-prometheus-stack.yaml
- helmrelease-loki.yaml
- helmrelease-promtail.yaml
- grafana-tailscale-service.yaml
- prometheus-tailscale-service.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: observability
@@ -0,0 +1,20 @@
apiVersion: v1
kind: Service
metadata:
name: prometheus-tailscale
namespace: observability
annotations:
tailscale.com/hostname: prometheus
tailscale.com/tags: "tag:prod,tag:prometheus"
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: prometheus
operator.prometheus.io/name: observability-kube-prometh-prometheus
ports:
- name: http
port: 9090
protocol: TCP
targetPort: 9090
@@ -0,0 +1,17 @@
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
name: rancher-b2-recurring
namespace: cattle-resources-system
spec:
resourceSetName: rancher-resource-set-full
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
schedule: "0 3 * * *"
retentionCount: 7
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- backup-recurring.yaml
- restore-from-b2.yaml
@@ -0,0 +1,19 @@
# Uncomment and set backupFilename to restore from a specific backup on rebuild.
# Find the latest backup filename in the B2 bucket under the rancher-backups/ folder.
# After the restore succeeds, Rancher will have all users and settings from the backup.
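# A rough sketch of driving the restore by hand (assumes kubectl access to the
# cluster and that the rancher-backup operator is already running; command names
# below are illustrative, not part of this repo):
#
#   kubectl apply -f restore-from-b2.yaml
#   kubectl get restore restore-from-b2 -w   # wait for the status to report completion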
#
# apiVersion: resources.cattle.io/v1
# kind: Restore
# metadata:
# name: restore-from-b2
# namespace: cattle-resources-system
# spec:
# backupFilename: rancher-b2-manual-test-0a416444-2c8a-4d34-8a07-d9e406750374-2026-03-30T00-08-02Z.tar.gz
# storageLocation:
# s3:
# credentialSecretName: rancher-b2-creds
# credentialSecretNamespace: cattle-resources-system
# bucketName: HetznerTerra
# folder: rancher-backups
# endpoint: s3.us-east-005.backblazeb2.com
# region: us-east-005
@@ -0,0 +1,25 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: rancher-b2-creds
namespace: cattle-resources-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: rancher-b2-creds
creationPolicy: Owner
template:
type: Opaque
data:
accessKey: "{{ .B2_ACCOUNT_ID }}"
secretKey: "{{ .B2_APPLICATION_KEY }}"
data:
- secretKey: B2_ACCOUNT_ID
remoteRef:
key: B2_ACCOUNT_ID
- secretKey: B2_APPLICATION_KEY
remoteRef:
key: B2_APPLICATION_KEY
@@ -0,0 +1,23 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher-backup-crd
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-resources-system
chart:
spec:
chart: rancher-backup-crd
version: "106.0.2+up8.1.0"
sourceRef:
kind: HelmRepository
name: rancher-charts
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
@@ -0,0 +1,42 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher-backup
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-resources-system
dependsOn:
- name: rancher-backup-crd
chart:
spec:
chart: rancher-backup
version: "106.0.2+up8.1.0"
sourceRef:
kind: HelmRepository
name: rancher-charts
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
image:
repository: rancher/backup-restore-operator
kubectl:
image:
repository: rancher/kubectl
tag: "v1.34.0"
postRenderers:
- kustomize:
patches:
- target:
kind: Job
name: rancher-backup-patch-sa
patch: |
- op: replace
path: /spec/template/spec/containers/0/image
value: rancher/kubectl:v1.34.0
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: rancher-charts
namespace: flux-system
spec:
interval: 1h
url: https://charts.rancher.io
@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-rancher-backup.yaml
- helmrelease-rancher-backup-crd.yaml
- helmrelease-rancher-backup.yaml
- b2-credentials-externalsecret.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: cattle-resources-system
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- server-url-setting.yaml
@@ -0,0 +1,5 @@
apiVersion: management.cattle.io/v3
kind: Setting
metadata:
name: server-url
value: https://rancher.silverside-gopher.ts.net
@@ -0,0 +1,48 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-system
chart:
spec:
chart: rancher
version: "2.13.3"
sourceRef:
kind: HelmRepository
name: rancher-stable
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
hostname: rancher.silverside-gopher.ts.net
replicas: 1
extraEnv:
- name: CATTLE_PROMETHEUS_METRICS
value: "true"
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
valuesFrom:
- kind: Secret
name: rancher-bootstrap-password
valuesKey: bootstrapPassword
targetPath: bootstrapPassword
@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: rancher-stable
namespace: flux-system
spec:
interval: 1h
url: https://releases.rancher.com/server-charts/stable
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-rancher.yaml
- helmrelease-rancher.yaml
- rancher-bootstrap-password-flux-externalsecret.yaml
- rancher-bootstrap-password-externalsecret.yaml
- rancher-tailscale-service.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: cattle-system
@@ -0,0 +1,21 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: rancher-bootstrap-password
namespace: cattle-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: rancher-bootstrap-password
creationPolicy: Owner
template:
type: Opaque
data:
bootstrapPassword: "{{ .rancherBootstrapPassword }}"
data:
- secretKey: rancherBootstrapPassword
remoteRef:
key: RANCHER_BOOTSTRAP_PASSWORD