Compare commits

...

84 Commits

Author SHA1 Message Date
8d1f9f4944 fix: add k3s reset logic for primary control plane
Some checks failed
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Failing after 4m19s
2026-03-21 16:10:17 +00:00
d4fd43e2f5 refactor: simplify k3s-server bootstrap for 2026-03-21 15:48:33 +00:00
48a80c362c fix: disable external cloud-provider kubelet arg for stable baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Failing after 4m21s
2026-03-21 14:36:54 +00:00
fcf7f139ff fix: use public api endpoint for flux bootstrap
Some checks failed
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 2m16s
2026-03-21 00:07:51 +00:00
7139ae322d fix: bootstrap flux during cluster deploy
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m21s
2026-03-20 10:37:11 +00:00
528a8dc210 fix: defer doppler store until eso is installed
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 24m34s
2026-03-20 09:30:17 +00:00
349f75729a fix: bootstrap tailscale namespace before secret
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Failing after 3m30s
2026-03-20 09:24:35 +00:00
522626a52b refactor: simplify stable cluster baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m48s
Deploy Cluster / Ansible (push) Failing after 4m7s
2026-03-20 02:24:37 +00:00
5bd4c41c2d fix: restore k3s agent bootstrap
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Failing after 18m16s
2026-03-20 01:50:16 +00:00
3e41f71b1b fix: harden terraform destroy workflow
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m28s
Deploy Cluster / Ansible (push) Failing after 20m4s
2026-03-19 23:26:03 +00:00
9d2f30de32 fix: prepare k3s for external cloud provider
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 4m4s
2026-03-17 01:21:23 +00:00
08a3031276 refactor: retire imperative addon roles
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 4m2s
2026-03-17 01:04:02 +00:00
e3ce91db62 fix: align flux ccm with live deployment
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 3m56s
2026-03-11 18:17:16 +00:00
bed8e4afc8 feat: migrate core addons toward flux
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 4m6s
2026-03-11 17:43:35 +00:00
2d4de6cff8 fix: bootstrap doppler store outside flux
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:58:26 +00:00
4a83d981c8 fix: skip dry-run validation for doppler store sync
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:52:08 +00:00
d188a51ef6 fix: move doppler store manifests out of ignored path
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:45:46 +00:00
646ef16258 fix: stabilize flux and external secrets reconciliation
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:25:27 +00:00
6f2e056b98 feat: sync runtime secrets from doppler
All checks were successful
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Successful in 9m56s
2026-03-09 00:25:41 +00:00
e10a70475f fix: right-size flux observability workloads
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m37s
2026-03-08 05:17:22 +00:00
f95e0051a5 feat: automate private tailnet access on cp1
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m45s
2026-03-08 04:16:06 +00:00
7c15ac5846 feat: add flux ui on shared tailscale endpoint
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m40s
2026-03-07 12:30:17 +00:00
4c104f74e8 feat: route observability through one tailscale endpoint
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 9m33s
2026-03-07 01:04:03 +00:00
be04602bfb fix: make flux bootstrap reachable from cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m59s
2026-03-07 00:38:29 +00:00
06c1356f1e feat: expose flux observability services over tailscale
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m14s
2026-03-05 00:43:29 +00:00
86fb5d5b90 fix: move observability gitops gating to role level
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 9m17s
2026-03-05 00:17:25 +00:00
8b403cd1d6 feat: migrate observability stack to flux gitops
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m11s
2026-03-04 23:38:40 +00:00
480a079dc8 fix: fail fast when loki datasource has no labels
All checks were successful
Deploy Grafana Content / Grafana Content (push) Successful in 1m59s
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m51s
2026-03-04 21:00:01 +00:00
ff8e32daf5 fix: add loki nodeport fallback for grafana datasource reachability
All checks were successful
Deploy Grafana Content / Grafana Content (push) Successful in 2m18s
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 22m59s
2026-03-04 19:39:16 +00:00
eb1ad0bea7 fix: make grafana prometheus datasource resilient with nodeport fallback
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-04 19:22:31 +00:00
9ff9d1e633 fix: clear stale helm pending revisions before kube-prometheus upgrade
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m22s
2026-03-04 18:35:55 +00:00
6177b581e4 fix: correct dashboard verification checks and retry helm upgrade lock
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Grafana Content / Grafana Content (push) Successful in 1m29s
Deploy Cluster / Ansible (push) Failing after 11m11s
2026-03-04 08:48:30 +00:00
b1e21c4a4b fix: speed up dashboards workflow firewall apply and set TF_VAR env
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Grafana Content / Grafana Content (push) Failing after 1m22s
Deploy Cluster / Ansible (push) Failing after 9m2s
2026-03-04 03:54:56 +00:00
2f166ed9e7 feat: manage grafana content as code with fast dashboard workflow
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Grafana Content / Grafana Content (push) Has been cancelled
2026-03-04 03:36:01 +00:00
1c39274df7 feat: stabilize tailscale observability exposure with declarative proxy class
All checks were successful
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Successful in 22m19s
2026-03-04 01:37:00 +00:00
28eaa36ec4 fix: use tag:k8s for tailscale operator default tags
All checks were successful
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 24m25s
2026-03-04 00:57:33 +00:00
02fa71c0aa fix: use tag:k8 for tailscale operator default tag
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 23m16s
2026-03-04 00:27:47 +00:00
2bbf05cdca fix: make tailscale operator non-blocking by default and gate observability patching on readiness
All checks were successful
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m44s
2026-03-03 21:47:16 +00:00
213c1fb4e4 fix: detect tailscale tag permission errors and clean access output
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 14m7s
2026-03-03 08:51:25 +00:00
414ac73c25 fix: fail fast on tailscale oauth 403 with actionable message
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 27m37s
2026-03-02 23:57:53 +00:00
542d7a6be5 fix: align tailscale proxy tags with operator tags
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 23:36:18 +00:00
210b617cc9 fix: pin tailscale operator to control-plane node for DNS stability
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 23:32:36 +00:00
3686249e31 chore: remove blocking Tailscale endpoint retries in observability
All checks were successful
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 22m17s
2026-03-02 22:47:55 +00:00
f56d1447c1 fix: make Tailscale endpoint wait non-blocking in observability
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 31m1s
2026-03-02 22:08:36 +00:00
63247b79a6 fix: harden Tailscale operator rollout with preflight and diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 21:39:47 +00:00
f6e159406a ci: retrigger with correct chart name
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 14m4s
2026-03-02 21:15:44 +00:00
0ae1c9395c fix: use correct chart name tailscale/tailscale-operator
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:15:37 +00:00
272c5ddc6e ci: retrigger with fixed Tailscale operator version
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 21:04:01 +00:00
eb6bf3862a fix: update Tailscale operator chart version to 1.95.91
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:03:53 +00:00
5a3f7550fe docs: add Gitea secrets setup guide for Tailscale operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 20:29:19 +00:00
a0ed6523ec feat: add Tailscale Kubernetes Operator for Grafana/Prometheus access
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 20:28:51 +00:00
4f61a840c7 ci: retrigger with non-blocking Loki install
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m38s
2026-03-02 19:41:55 +00:00
d876430703 fix: remove Helm wait flag, check Loki rollout separately
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:41:46 +00:00
56b6216257 ci: retrigger after Helm lock cleanup
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 19:18:29 +00:00
91fe2e658c fix: clear stuck Helm lock before Loki install
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:18:23 +00:00
13cec1aa28 ci: retrigger with YAML fix
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m26s
2026-03-02 18:29:25 +00:00
bc133e65d3 fix: quote failed_when expression for YAML parsing
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 18:29:16 +00:00
df4fdb5496 ci: retrigger with Loki fixes 2026-03-02 18:21:23 +00:00
cec7c42efb fix: disable Loki caches and canary, use chart v6.10.0
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m22s
2026-03-02 18:21:22 +00:00
ee692620b5 ci: retrigger with Loki v6.10.0
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m22s
2026-03-02 17:59:37 +00:00
a6d327fa1f fix: re-enable Loki with v6.10.0 and minimal working config
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:59:31 +00:00
fe6cb39eaf ci: retrigger with Loki disabled
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 19m38s
2026-03-02 17:07:59 +00:00
feaefd28a1 fix: disable Loki to unblock pipeline - will re-enable separately
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:07:51 +00:00
80ab59e22d ci: retrigger with enhanced Loki diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 14:44:31 +00:00
6c0282e9d5 fix: add more Loki diagnostics - values content and Helm releases
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 14:44:12 +00:00
45aa616741 ci: retrigger for Loki diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:42:42 +00:00
b595c1738a fix: show detailed Loki template and resource diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:41:40 +00:00
1c4dfd7fae ci: retrigger with fixed Loki values
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Failing after 19m42s
2026-03-02 13:58:31 +00:00
6b9fc1f6b8 fix: add all required replica=0 settings for Loki v6 singleBinary
Some checks failed
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:58:15 +00:00
2b5cad9d15 ci: retrigger for Loki template debug
Some checks failed
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:57:03 +00:00
71a1495fbc fix: add Loki template validation and resource debugging
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 13:56:48 +00:00
fe3814e0e3 ci: retrigger to see Loki Helm error
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m33s
2026-03-02 12:45:16 +00:00
5ab3c7a0ac fix: show Loki Helm install output on failure
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:45:03 +00:00
9bc708ea4b ci: retrigger workflow after Loki cleanup fix
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m39s
2026-03-02 12:13:36 +00:00
c0a4275f15 fix: remove legacy Loki PDBs and wait for cleanup
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:13:21 +00:00
3dcf71a84f fix: trim Loki readiness output in failure guard
Some checks failed
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 10:09:15 +00:00
124fe94d0e fix: tolerate Loki rollout false negatives when core pod is ready
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 09:12:47 +00:00
2d3f63424a fix: disable Loki gateway and use direct service endpoints
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m41s
2026-03-02 08:37:37 +00:00
2a583d1bba fix: avoid Helm wait hang and use explicit Loki rollout check
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m27s
2026-03-02 03:35:31 +00:00
27711e0661 fix: increase Loki install timeout and add failure diagnostics
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 02:55:33 +00:00
10ee303995 fix: add Loki schema config and chart preflight validation
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 29m39s
2026-03-02 02:23:18 +00:00
558f34e2b1 fix: set Loki chart to single binary deployment mode
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m49s
2026-03-02 02:01:23 +00:00
58fabf23f8 refactor: move embedded Kubernetes manifests to role templates
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Failing after 9m46s
2026-03-02 01:45:30 +00:00
b30977a158 feat: deploy lightweight observability stack via Ansible
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 01:33:41 +00:00
87 changed files with 8879 additions and 569 deletions

View File

@@ -0,0 +1,99 @@
name: Deploy Grafana Content
on:
push:
branches:
- main
paths:
- "ansible/dashboards.yml"
- "ansible/roles/observability-content/**"
- ".gitea/workflows/dashboards.yml"
workflow_dispatch:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
jobs:
dashboards:
name: Grafana Content
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-refresh=false \
-target=hcloud_firewall.cluster \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
- name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml
- name: Generate Ansible Inventory
working-directory: ansible
run: python3 generate_inventory.py
- name: Apply dashboards and datasources
working-directory: ansible
run: |
ansible-playbook dashboards.yml \
-e "cluster_name=k8s-cluster"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Verify Grafana content resources
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"

View File

@@ -17,6 +17,8 @@ env:
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
+  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
 
 jobs:
   terraform:
@@ -86,12 +88,8 @@ jobs:
           }
           ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
-          ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
-          ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
           ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
           ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
-          ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
-          ensure_import 'hcloud_server.workers[3]' 'k8s-cluster-worker-4'
       - name: Terraform Plan
         id: plan
@@ -226,16 +224,62 @@ jobs:
             -e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
             -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
             -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
+            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
+            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
+            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
+            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
             -e "cluster_name=k8s-cluster"
         env:
           ANSIBLE_HOST_KEY_CHECKING: "False"
+      - name: Install kubectl
+        run: |
+          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          chmod +x /usr/local/bin/kubectl
+      - name: Rewrite kubeconfig for runner-reachable API
+        working-directory: terraform
+        run: |
+          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
+          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
+      - name: Bootstrap Flux source and reconciliation graph
+        env:
+          KUBECONFIG: outputs/kubeconfig
+          FLUX_GIT_HOST: 64.176.189.59
+          FLUX_GIT_PORT: "2222"
+        run: |
+          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
+          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
+          kubectl -n flux-system create secret generic flux-system \
+            --from-file=identity="$HOME/.ssh/id_ed25519" \
+            --from-file=known_hosts=/tmp/flux_known_hosts \
+            --dry-run=client -o yaml | kubectl apply -f -
+          kubectl apply -k clusters/prod/flux-system
+          kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
+          kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
+          kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
+          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
       - name: Post-deploy cluster health checks
         working-directory: ansible
         run: |
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pods -o wide"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pvc"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability describe svc kube-prometheus-stack-grafana"
         env:
           ANSIBLE_HOST_KEY_CHECKING: "False"

View File

@@ -51,11 +51,57 @@ jobs:
           echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
           chmod 644 ~/.ssh/id_ed25519.pub
+      - name: Install jq
+        run: |
+          apt-get update
+          apt-get install -y jq
       - name: Terraform Destroy
+        id: destroy
         working-directory: terraform
         run: |
-          terraform destroy \
-            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -auto-approve
+          set +e
+          for attempt in 1 2 3; do
+            echo "Terraform destroy attempt ${attempt}/3"
+            terraform destroy \
+              -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -auto-approve
+            rc=$?
+            if [ "$rc" -eq 0 ]; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
+              sleep 30
+              terraform refresh \
+                -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
+                -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+                -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
+            fi
+          done
+          exit "$rc"
+      - name: Hetzner destroy diagnostics
+        if: failure() && steps.destroy.outcome == 'failure'
+        env:
+          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
+        run: |
+          set +e
+          echo "== Terraform state list =="
+          terraform -chdir=terraform state list || true
+          network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
+          if [ -z "$network_id" ]; then
+            network_id="11988935"
+          fi
+          echo "== Hetzner network =="
+          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
+          echo "== Hetzner servers attached to network =="
+          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
+          echo "== Hetzner load balancers attached to network =="
+          curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true

160
README.md
View File

@@ -10,7 +10,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
 | **Workers** | 4x CX33 |
 | **Total Cost** | €28.93/mo |
 | **K8s** | k3s (latest, HA) |
-| **Addons** | Hetzner CCM + CSI |
+| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
 | **Access** | SSH/API restricted to Tailnet |
 | **Bootstrap** | Terraform + Ansible |
@@ -152,6 +152,7 @@ This repository includes Gitea workflows for:
- **terraform-plan**: Runs on PRs, shows planned changes - **terraform-plan**: Runs on PRs, shows planned changes
- **terraform-apply**: Runs on main branch after merge - **terraform-apply**: Runs on main branch after merge
- **ansible-deploy**: Runs after terraform apply - **ansible-deploy**: Runs after terraform apply
- **dashboards**: Fast workflow that updates Grafana datasources/dashboards only
### Required Gitea Secrets ### Required Gitea Secrets
@@ -166,10 +167,159 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
 | `S3_BUCKET` | S3 bucket name (e.g., `k8s-terraform-state`) |
 | `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
 | `TAILSCALE_TAILNET` | Tailnet domain (e.g., `yourtailnet.ts.net`) |
+| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for Kubernetes Operator |
+| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
+| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
+| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
 | `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
 | `SSH_PUBLIC_KEY` | SSH public key content |
 | `SSH_PRIVATE_KEY` | SSH private key content |
## GitOps (Flux)
This repo uses Flux for continuous reconciliation after Terraform + Ansible bootstrap.
### Stable private-only baseline
The current default target is a deliberately simplified baseline:
- `1` control plane node
- `2` worker nodes
- private Hetzner network only
- Tailscale for operator access
- Flux-managed core addons only
Detailed phase gates and success criteria live in `STABLE_BASELINE.md`.
This is the default until rebuilds are consistently green. High availability, public ingress, and app-layer expansion come later.
### Runtime secrets
Runtime cluster secrets are moving to Doppler + External Secrets Operator.
- Doppler project: `hetznerterra`
- Initial auth: service token via `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- First synced secrets:
- `GRAFANA_ADMIN_PASSWORD`
- `WEAVE_GITOPS_ADMIN_USERNAME`
- `WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH`
Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed by Doppler.
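As a rough sketch of how this wiring looks with the External Secrets Operator Doppler provider (the resource names, namespace, and token Secret name below are illustrative, not the exact manifests in `infrastructure/addons/`):

```yaml
apiVersion: external-secrets.io/v1beta1
kind: SecretStore
metadata:
  name: doppler-hetznerterra          # illustrative name
  namespace: observability
spec:
  provider:
    doppler:
      auth:
        secretRef:
          dopplerToken:
            name: doppler-token-auth  # Secret created during bootstrap from DOPPLER_HETZNERTERRA_SERVICE_TOKEN
            key: dopplerToken
---
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: observability
spec:
  refreshInterval: 1h
  secretStoreRef:
    kind: SecretStore
    name: doppler-hetznerterra
  target:
    name: grafana-admin               # Kubernetes Secret materialized from Doppler
  data:
    - secretKey: admin-password
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD   # Doppler secret name from the list above
```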
### Repository layout
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
- `infrastructure/`: infrastructure addon reconciliation graph
- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
- `apps/`: application workload layer (currently scaffolded)
### Reconciliation graph
- `infrastructure` (top-level)
- `addon-ccm`
- `addon-csi` depends on `addon-ccm`
- `addon-tailscale-operator`
- `addon-observability`
- `addon-observability-content` depends on `addon-observability`
- `apps` depends on `infrastructure`
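Each edge in this graph is expressed as a Flux `Kustomization` with `dependsOn`. A minimal sketch for the `addon-csi` → `addon-ccm` edge (the path and interval here are assumptions, not the exact values in `clusters/prod/flux-system/`):

```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
  name: addon-csi
  namespace: flux-system
spec:
  interval: 10m
  prune: true
  sourceRef:
    kind: GitRepository
    name: platform                  # the Git source bootstrapped for this repo
  path: ./infrastructure/addons/csi # illustrative path under infrastructure/addons/
  dependsOn:
    - name: addon-ccm               # CSI only reconciles after CCM is Ready
```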
### Bootstrap notes
1. Install Flux controllers in `flux-system`.
2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
3. Apply `clusters/prod/flux-system/` once to establish the source + reconciliation graph (a minimal source manifest is sketched after these notes).
4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
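A minimal sketch of the `GitRepository` source from step 3, assuming SSH access to the Gitea host on port 2222 and the deploy-key Secret from step 2; the URL placeholders and interval are illustrative:

```yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
  name: platform
  namespace: flux-system
spec:
  interval: 1m
  url: ssh://git@<gitea-host>:2222/<org>/<repo>.git  # placeholders, not the real URL
  ref:
    branch: main
  secretRef:
    name: flux-system   # deploy key + known_hosts created in step 2
```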
### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons include `addon-ccm`, `addon-csi`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`, `addon-observability`, and `addon-observability-content`.
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success.
### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention:
- Terraform create succeeds for the default `1` control plane and `2` workers.
- Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`.
- `hcloud-cloud-controller-manager` and `hcloud-csi` are `Ready`.
- Required External Secrets sync successfully.
- Tailscale private access works.
- Grafana and Prometheus are reachable privately.
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
## Observability Stack
Flux deploys a lightweight observability stack in the `observability` namespace:
- `kube-prometheus-stack` (Prometheus + Grafana)
- `loki`
- `promtail`
Grafana content is managed as code via ConfigMaps in `infrastructure/addons/observability-content/`.
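The general shape of such a ConfigMap, assuming the kube-prometheus-stack Grafana sidecar picks up dashboards by label (the ConfigMap name and dashboard JSON below are illustrative stubs; the real content lives in `infrastructure/addons/observability-content/`):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: dashboard-k8s-overview        # illustrative name
  namespace: observability
  labels:
    grafana_dashboard: "1"            # label the Grafana sidecar (and the CI verify step) looks for
data:
  k8s-overview.json: |
    { "title": "Kubernetes Overview", "panels": [] }
```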
Grafana and Prometheus are exposed through a single Tailscale front door backed by Traefik when the Tailscale Kubernetes Operator is healthy.
### Access Grafana and Prometheus
Preferred private access:
- Grafana: `http://k8s-cluster-cp-1.<your-tailnet>:30080/`
- Prometheus: `http://k8s-cluster-cp-1.<your-tailnet>:30990/`
- Flux UI: `http://k8s-cluster-cp-1.<your-tailnet>:30901/`
This access path is bootstrapped automatically by Ansible on `control_plane[0]` using persistent `kubectl port-forward` systemd services plus `tailscale serve`, so it survives cluster rebuilds.
Fallback: run a port-forward from a tailnet-connected machine:
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
```
Then open:
- Grafana: http://127.0.0.1:3000
- Prometheus: http://127.0.0.1:9090
Grafana user: `admin`
Grafana password: the value of the `GRAFANA_ADMIN_PASSWORD` secret (or the generated value shown in the Ansible output)
### Verify Tailscale exposure
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n tailscale-system get pods
kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus
kubectl -n observability describe svc kube-prometheus-stack-grafana | grep TailscaleProxyReady
kubectl -n observability describe svc kube-prometheus-stack-prometheus | grep TailscaleProxyReady
```
If `TailscaleProxyReady=False`, check:
```bash
kubectl -n tailscale-system logs deployment/operator --tail=100
```
Common cause: OAuth client missing tag/scopes permissions.
### Fast dashboard iteration workflow
Use the `Deploy Grafana Content` workflow when changing dashboard/data source templates.
It avoids full cluster provisioning and only applies Grafana content resources:
- `ansible/roles/observability-content/templates/grafana-datasources.yaml.j2`
- `ansible/roles/observability-content/templates/grafana-dashboard-k8s-overview.yaml.j2`
- `ansible/dashboards.yml`
## File Structure ## File Structure
``` ```
@@ -191,13 +341,15 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
 │   │   ├── common/
 │   │   ├── k3s-server/
 │   │   ├── k3s-agent/
-│   │   ├── ccm/
-│   │   └── csi/
+│   │   ├── addon-secrets-bootstrap/
+│   │   ├── observability-content/
+│   │   └── observability/
 │   └── ansible.cfg
 ├── .gitea/
 │   └── workflows/
 │       ├── terraform.yml
-│       └── ansible.yml
+│       ├── ansible.yml
+│       └── dashboards.yml
 ├── outputs/
 ├── terraform.tfvars.example
 └── README.md

93
SECRETS_SETUP.md Normal file
View File

@@ -0,0 +1,93 @@
# Gitea Secrets Setup
This document describes the secrets required for the HetznerTerra deployment workflow.
## Required Secrets
Add these secrets in your Gitea repository settings:
**Settings → Secrets → Actions**
### Infrastructure Secrets
#### `HCLOUD_TOKEN`
- Hetzner Cloud API token
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
- Permissions: Read & Write
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
- Get from: https://secure.backblaze.com/b2_buckets.htm
- Create application key with access to your terraform state bucket
#### `S3_ENDPOINT`
- Backblaze B2 S3 endpoint
- Example: `https://s3.eu-central-003.backblazeb2.com`
#### `S3_BUCKET`
- Backblaze B2 bucket name for Terraform state
- Example: `k8s-terraform-state`
### SSH Secrets
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)
### Tailscale Secrets
#### `TAILSCALE_AUTH_KEY`
- Tailscale auth key for node registration
- Get from: https://login.tailscale.com/admin/settings/keys
- Type: Reusable, Ephemeral
- Scope: `devices:core:write`
#### `TAILSCALE_TAILNET`
- Your Tailscale network name
- Example: `tail7ec33.ts.net` or your custom domain
#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
- OAuth credentials for Tailscale Kubernetes Operator
- Get from: https://login.tailscale.com/admin/settings/oauth
- Create OAuth client with scope: `devices:core:write`
### Application Secrets
#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- Doppler service token for the `hetznerterra` project runtime secrets
- Used by External Secrets Operator bootstrap
- Recommended scope: `hetznerterra` project, `prod` config only
#### `GRAFANA_ADMIN_PASSWORD`
- Transitional fallback only while migrating observability secrets to Doppler
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`
## Setting Up Secrets
1. Go to your Gitea repository
2. Navigate to **Settings → Secrets → Actions**
3. Click **Add Secret**
4. Enter the secret name (exact match from above)
5. Paste the secret value
6. Click **Add Secret**
7. Repeat for all secrets
## Verification
After adding all secrets, trigger a workflow run:
```bash
git commit --allow-empty -m "ci: trigger workflow with new secrets"
git push
```
Check the workflow logs to verify all secrets are being used correctly.
## Security Notes
- Never commit secrets to the repository
- Use strong, unique passwords for Grafana and other services
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment

47
STABLE_BASELINE.md Normal file
View File

@@ -0,0 +1,47 @@
# Stable Private-Only Baseline
This document defines the current engineering target for this repository.
## Topology
- 1 control plane
- 2 workers
- private Hetzner network
- Tailscale operator access
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap
- Flux core reconciliation
- Hetzner CCM
- Hetzner CSI
- External Secrets Operator with Doppler
- Tailscale private access
- Observability stack
## Out of Scope
- HA control plane
- public ingress or DNS
- public TLS
- app workloads
- DR / backup strategy
- upgrade strategy
## Phase Gates
1. Terraform apply completes for the default topology.
2. k3s server bootstrap completes and kubeconfig works.
3. Workers join and all nodes are Ready.
4. Flux source and infrastructure reconciliation are healthy.
5. CCM is Ready.
6. CSI is Ready and a PVC can bind (see the PVC sketch after these gates).
7. External Secrets sync required secrets.
8. Tailscale private access works.
9. Observability is healthy and reachable privately.
10. Terraform destroy succeeds cleanly or via workflow retry.
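For gate 6, a minimal bind check is a throwaway PVC against the Hetzner CSI storage class; the name and namespace here are illustrative, and the gate passes when the claim reaches `Bound`:

```yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: csi-bind-check      # temporary claim, delete after checking
  namespace: default
spec:
  accessModes:
    - ReadWriteOnce
  storageClassName: hcloud-volumes   # default Hetzner CSI storage class
  resources:
    requests:
      storage: 1Gi
```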
## Success Criteria
The baseline is considered stable only after two consecutive fresh rebuilds pass all phase gates with no manual fixes.

7
ansible/dashboards.yml Normal file
View File

@@ -0,0 +1,7 @@
---
- name: Provision Grafana dashboards and datasources
hosts: control_plane[0]
become: true
roles:
- observability-content

View File

@@ -0,0 +1,41 @@
---
- name: Apply Hetzner cloud secret
shell: >-
kubectl -n kube-system create secret generic hcloud
--from-literal=token='{{ hcloud_token }}'
--from-literal=network='{{ cluster_name }}-network'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when: hcloud_token | default('') | length > 0
- name: Ensure Tailscale operator namespace exists
command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
--dry-run=client -o yaml
register: tailscale_namespace_manifest
changed_when: false
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator namespace
command: kubectl apply -f -
args:
stdin: "{{ tailscale_namespace_manifest.stdout }}"
changed_when: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator OAuth secret
shell: >-
kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
--from-literal=client_id='{{ tailscale_oauth_client_id }}'
--from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0

View File

@@ -1,4 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
hcloud_lb_location: "nbg1"

View File

@@ -1,88 +0,0 @@
---
- name: Check if Hetzner CCM is already deployed
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
register: ccm_namespace
failed_when: false
changed_when: false
- name: Create Hetzner cloud secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CCM
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
changed_when: true
- name: Detect CCM workload kind
shell: |
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo deployment
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo daemonset
else
echo missing
fi
register: ccm_workload_kind
changed_when: false
- name: Wait for CCM deployment rollout
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_deploy
until: ccm_rollout_deploy.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "deployment"
- name: Wait for CCM daemonset rollout
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_ds
until: ccm_rollout_ds.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "daemonset"
- name: Set default Hetzner load balancer location for Traefik service
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
register: traefik_annotation
changed_when: true
failed_when: false
- name: Show Traefik service when annotation patch fails
command: kubectl -n kube-system get service traefik -o yaml
register: traefik_service_dump
changed_when: false
failed_when: false
when: traefik_annotation.rc != 0
- name: Fail when Traefik load balancer annotation cannot be set
fail:
msg: |
Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
Command output:
{{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
Service dump:
{{ traefik_service_dump.stdout | default('n/a') }}
when: traefik_annotation.rc != 0
- name: Show CCM namespace objects when workload missing
command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
register: ccm_ns_objects
changed_when: false
when: ccm_workload_kind.stdout == "missing"
- name: Fail when CCM workload is missing
fail:
msg: |
hcloud-cloud-controller-manager workload not found after applying manifest.
Namespace objects:
{{ ccm_ns_objects.stdout | default('n/a') }}
when: ccm_workload_kind.stdout == "missing"

View File

@@ -1,15 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
csi_rollout_timeout_seconds: 30
csi_rollout_retries: 8
csi_rollout_delay_seconds: 5
csi_failure_log_tail_lines: 120
csi_smoke_test_enabled: true
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
csi_smoke_test_base_storage_class: "hcloud-volumes"
csi_smoke_test_size: "1Gi"
csi_smoke_test_pvc_timeout_seconds: 300
csi_smoke_test_job_timeout_seconds: 300
csi_smoke_test_required: false

View File

@@ -1,425 +0,0 @@
---
- name: Create Hetzner CSI secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CSI
command: kubectl apply -f {{ csi_manifest_url }}
changed_when: true
- name: Ensure CSI controller endpoint is set for sidecars
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Ensure CSI node endpoint is set for sidecars
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Restart CSI controller to pick up current secret
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
changed_when: true
- name: Wait for CSI controller deployment generation
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
failed_when: false
changed_when: false
- name: Wait for CSI controller rollout
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_controller_rollout
until: csi_controller_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Show CSI controller status on failure
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
register: csi_controller_deploy_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI controller pods on failure
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
register: csi_controller_pods_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller deployment on failure
command: kubectl -n kube-system describe deployment hcloud-csi-controller
register: csi_controller_deploy_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller pod on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_controller_pod_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver logs on failure
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
register: csi_driver_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
fi
register: csi_driver_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show sidecar previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
for container in csi-attacher csi-resizer csi-provisioner; do
echo "===== $container ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
fi
register: csi_sidecar_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show recent kube-system events on failure
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
register: csi_recent_events
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Fail with CSI controller diagnostics
fail:
msg: |
CSI controller rollout failed.
Deployment status:
{{ csi_controller_deploy_status.stdout | default('n/a') }}
Pods status:
{{ csi_controller_pods_status.stdout | default('n/a') }}
Deployment describe:
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
Pod describe:
{{ csi_controller_pod_describe.stdout | default('n/a') }}
hcloud-csi-driver logs:
{{ csi_driver_logs.stdout | default('n/a') }}
hcloud-csi-driver previous logs:
{{ csi_driver_previous_logs.stdout | default('n/a') }}
Sidecar previous logs:
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
Recent kube-system events:
{{ csi_recent_events.stdout | default('n/a') }}
when: csi_controller_rollout.rc != 0
- name: Wait for CSI node daemonset rollout
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_node_rollout
until: csi_node_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Fail when CSI node daemonset rollout does not complete
fail:
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
when: csi_node_rollout.rc != 0
- name: Generate CSI smoke test run identifier
set_fact:
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
when: csi_smoke_test_enabled | bool
- name: Generate unique CSI smoke test resource names
set_fact:
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
when: csi_smoke_test_enabled | bool
- name: Cleanup stale CSI smoke test resources before apply
shell: |
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Apply CSI smoke test resources
shell: |
kubectl apply -f - <<'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: {{ csi_smoke_test_storage_class }}
provisioner: csi.hetzner.cloud
reclaimPolicy: Delete
volumeBindingMode: Immediate
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ csi_smoke_test_pvc_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ csi_smoke_test_size }}
storageClassName: {{ csi_smoke_test_storage_class }}
---
apiVersion: batch/v1
kind: Job
metadata:
name: {{ csi_smoke_test_job_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
backoffLimit: 0
template:
spec:
restartPolicy: Never
containers:
- name: write-and-read
image: busybox:1.36
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
volumeMounts:
- name: data
mountPath: /data
volumes:
- name: data
persistentVolumeClaim:
claimName: {{ csi_smoke_test_pvc_name }}
EOF
changed_when: true
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke PVC to bind
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
register: csi_smoke_pvc_wait
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke Job completion
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
register: csi_smoke_job_wait
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc == 0
- name: Show CSI smoke job logs
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
register: csi_smoke_job_logs
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Show CSI smoke PVC on failure
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
register: csi_smoke_pvc_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke Job on failure
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_job_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pods on failure
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_pod_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI smoke PVC on failure
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
register: csi_smoke_pvc_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show storage classes on failure
command: kubectl get storageclass
register: csi_storageclasses
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Get CSI controller pod name on smoke failure
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
register: csi_controller_pod_name
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI controller pod on smoke failure
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
register: csi_controller_pod_smoke_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI controller container logs on smoke failure
shell: |
pod="{{ csi_controller_pod_name.stdout }}"
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
echo "===== ${container}: current ====="
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
echo "===== ${container}: previous ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
register: csi_controller_container_logs
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI driver and node driver objects on smoke failure
shell: |
echo "===== CSIDriver ====="
kubectl get csidriver csi.hetzner.cloud -o yaml || true
echo "===== CSINode ====="
kubectl get csinode -o wide || true
register: csi_driver_objects
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pod describe on failure
shell: |
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_smoke_pod_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Fail when CSI smoke test fails
fail:
msg: |
CSI smoke test failed.
PVC wait:
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait:
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
PVC:
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
Job:
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
Pod list:
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
PVC describe:
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
Storage classes:
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
CSI controller pod:
{{ csi_controller_pod_name.stdout | default('n/a') }}
CSI controller pod describe:
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
CSI controller container logs:
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
CSI driver objects:
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
Pod describe:
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
Job logs:
{{ csi_smoke_job_logs.stdout | default('n/a') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_smoke_test_required | bool
- name: Warn when CSI smoke test fails but is non-blocking
debug:
msg: |
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- not (csi_smoke_test_required | bool)
- name: Cleanup CSI smoke test resources
shell: |
kubectl -n kube-system delete job/{{ csi_smoke_test_job_name }} pvc/{{ csi_smoke_test_pvc_name }} --ignore-not-found
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
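For ad-hoc debugging outside Ansible, roughly the same smoke check can be reproduced by hand; a minimal sketch, assuming hypothetical resource names csi-smoke-pvc and csi-smoke-job in kube-system (the role's actual names come from the csi_smoke_test_* variables):

# Wait for the claim to bind, wait for the Job to finish, and read its output.
kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/csi-smoke-pvc --timeout=120s
kubectl -n kube-system wait --for=condition=complete job/csi-smoke-job --timeout=180s
kubectl -n kube-system logs job/csi-smoke-job          # expected output: csi-ok
# Clean up afterwards, mirroring the cleanup task above.
kubectl -n kube-system delete job/csi-smoke-job pvc/csi-smoke-pvc --ignore-not-found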

View File

@@ -0,0 +1,50 @@
---
- name: Ensure Doppler service token is provided
assert:
that:
- doppler_hetznerterra_service_token | length > 0
fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
- name: Ensure external-secrets namespace exists
shell: kubectl create namespace external-secrets --dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Apply Doppler service token secret
shell: >-
kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
--from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Check for ClusterSecretStore CRD
command: kubectl get crd clustersecretstores.external-secrets.io
register: doppler_clustersecretstore_crd
changed_when: false
failed_when: false
- name: Apply Doppler ClusterSecretStore
shell: |
cat <<'EOF' | kubectl apply -f -
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
changed_when: true
when: doppler_clustersecretstore_crd.rc == 0
- name: Note pending Doppler ClusterSecretStore bootstrap
debug:
msg: >-
Skipping Doppler ClusterSecretStore bootstrap because the External Secrets CRD
is not available yet. Re-run after External Secrets is installed.
when: doppler_clustersecretstore_crd.rc != 0
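Once External Secrets is installed and this bootstrap has applied the store, its health can be confirmed from the Ready condition; a minimal sketch using the store name from the manifest above (expected output is True):

# Ask the controller whether the Doppler ClusterSecretStore validated successfully.
kubectl get clustersecretstore doppler-hetznerterra -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}{"\n"}'
# If it is not True, the events and message on the object usually explain why.
kubectl describe clustersecretstore doppler-hetznerterra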

View File

@@ -3,3 +3,4 @@ k3s_version: latest
k3s_server_url: "" k3s_server_url: ""
k3s_token: "" k3s_token: ""
k3s_node_ip: "" k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: false

View File

@@ -12,14 +12,41 @@
when: not k3s_agent_binary.stat.exists
- name: Install k3s agent
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_URL: "{{ k3s_server_url }}"
K3S_TOKEN: "{{ k3s_token }}"
command: /tmp/install-k3s.sh agent --node-ip {{ k3s_node_ip }}
args:
creates: /usr/local/bin/k3s-agent
when: not k3s_agent_binary.stat.exists
block:
- name: Run k3s agent install
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_URL: "{{ k3s_server_url }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args:
creates: /usr/local/bin/k3s-agent
rescue:
- name: Show k3s-agent service status after failed install
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status_after_install
changed_when: false
failed_when: false
- name: Show recent k3s-agent logs after failed install
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal_after_install
changed_when: false
failed_when: false
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent install failed on {{ inventory_hostname }}.
Service status:
{{ k3s_agent_status_after_install.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal_after_install.stdout | default('n/a') }}
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent
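When the rescue path above fires, the same diagnostics can be gathered manually on the agent node, and the join can be confirmed from a control-plane node; a minimal sketch:

# On the failing agent node: service state and the most recent logs.
systemctl status k3s-agent --no-pager
journalctl -u k3s-agent -n 120 --no-pager
# On a control-plane node with kubectl: the agent should eventually report Ready.
kubectl get nodes -o wide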

View File

@@ -3,3 +3,6 @@ k3s_version: latest
k3s_token: "" k3s_token: ""
k3s_node_ip: "" k3s_node_ip: ""
k3s_primary_public_ip: "" k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: false

View File

@@ -28,27 +28,22 @@
stat: stat:
path: /usr/local/bin/k3s-uninstall.sh path: /usr/local/bin/k3s-uninstall.sh
register: k3s_uninstall_script register: k3s_uninstall_script
when: when: k3s_install_needed
- not (k3s_primary | default(false))
- k3s_install_needed
- name: Reset broken secondary k3s install before rejoin - name: Reset broken k3s install before reinstall
command: /usr/local/bin/k3s-uninstall.sh command: /usr/local/bin/k3s-uninstall.sh
when: when:
- not (k3s_primary | default(false))
- k3s_install_needed - k3s_install_needed
- k3s_uninstall_script.stat.exists - k3s_uninstall_script.stat.exists
- name: Remove stale k3s data on secondary - name: Remove stale k3s data
file: file:
path: "{{ item }}" path: "{{ item }}"
state: absent state: absent
loop: loop:
- /etc/rancher/k3s - /etc/rancher/k3s
- /var/lib/rancher/k3s - /var/lib/rancher/k3s
when: when: k3s_install_needed
- not (k3s_primary | default(false))
- k3s_install_needed
- name: Download k3s install script - name: Download k3s install script
get_url: get_url:
@@ -61,7 +56,16 @@
environment: environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}" INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}" K3S_TOKEN: "{{ k3s_token }}"
command: /tmp/install-k3s.sh server --cluster-init --advertise-address={{ k3s_primary_ip }} --node-ip={{ k3s_node_ip }} --tls-san={{ k3s_primary_ip }} --tls-san={{ k3s_primary_public_ip }} command: >-
/tmp/install-k3s.sh server
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
when: when:
- k3s_install_needed - k3s_install_needed
- k3s_primary | default(false) - k3s_primary | default(false)
@@ -75,7 +79,14 @@
environment: environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}" INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}" K3S_TOKEN: "{{ k3s_token }}"
command: /tmp/install-k3s.sh server --server https://{{ k3s_primary_ip }}:6443 --advertise-address={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }} command: >-
/tmp/install-k3s.sh server
--server https://{{ k3s_primary_ip }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install register: secondary_install
rescue: rescue:
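With the defaults introduced above (k3s_disable_embedded_ccm and k3s_disable_servicelb true, k3s_kubelet_cloud_provider_external false), the templated primary-server command roughly expands as follows; a sketch with placeholder addresses, not the literal command Ansible renders:

# Example expansion for the primary control plane node.
K3S_TOKEN=<cluster-token> /tmp/install-k3s.sh server \
  --cluster-init \
  --advertise-address=10.0.0.2 \
  --node-ip=10.0.0.2 \
  --tls-san=10.0.0.2 \
  --tls-san=<public-ip> \
  --disable-cloud-controller \
  --disable=servicelb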

View File

@@ -0,0 +1,9 @@
---
observability_namespace: "observability"
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"
loki_enabled: true
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
grafana_use_prometheus_nodeport_fallback: true
grafana_use_loki_nodeport_fallback: true

View File

@@ -0,0 +1,173 @@
---
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Set default Prometheus and Loki datasource URLs
set_fact:
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
grafana_loki_effective_url: "{{ grafana_loki_url }}"
- name: Get Grafana pod name
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
register: grafana_pod_name
changed_when: false
- name: Probe Prometheus from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
register: grafana_prometheus_probe
changed_when: false
failed_when: false
- name: Probe Loki from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null'
register: grafana_loki_probe
changed_when: false
failed_when: false
when: loki_enabled
- name: Get Prometheus pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
register: prometheus_host_ip
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Get Prometheus service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
register: prometheus_nodeport
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Enable Prometheus NodePort fallback datasource URL
set_fact:
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- prometheus_host_ip.stdout | length > 0
- prometheus_nodeport.stdout | length > 0
- name: Ensure Loki service uses NodePort for fallback
command: kubectl -n {{ observability_namespace }} patch svc loki -p '{"spec":{"type":"NodePort"}}'
changed_when: false
failed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod loki-0 -o jsonpath='{.status.hostIP}'
register: loki_host_ip
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc loki -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
register: loki_nodeport
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Enable Loki NodePort fallback datasource URL
set_fact:
grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- loki_host_ip.stdout | length > 0
- loki_nodeport.stdout | length > 0
- name: Query Loki labels endpoint from Grafana pod
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels'
register: grafana_loki_labels
changed_when: false
failed_when: false
when: loki_enabled
- name: Fail when Loki is reachable but has zero indexed labels
fail:
msg: >-
Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
This usually means no logs are ingested yet. Check Promtail and tenant configuration.
when:
- loki_enabled
- grafana_loki_labels.rc == 0
- "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
- "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"
- name: Write default Prometheus datasource ConfigMap patch
template:
src: grafana-default-prometheus-datasource.yaml.j2
dest: /tmp/grafana-default-prometheus-datasource.yaml
mode: "0644"
- name: Apply default Prometheus datasource ConfigMap patch
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
changed_when: true
- name: Remove legacy Loki datasource ConfigMap
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
changed_when: false
failed_when: false
- name: Write Grafana datasources ConfigMap
template:
src: grafana-datasources.yaml.j2
dest: /tmp/grafana-datasources.yaml
mode: "0644"
when: loki_enabled
- name: Apply Grafana datasources ConfigMap
command: kubectl apply -f /tmp/grafana-datasources.yaml
changed_when: true
when: loki_enabled
- name: Restart Grafana to load datasource updates deterministically
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
changed_when: true
- name: Wait for Grafana rollout after datasource update
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Write Grafana dashboard ConfigMap
template:
src: grafana-dashboard-k8s-overview.yaml.j2
dest: /tmp/grafana-dashboard-k8s-overview.yaml
mode: "0644"
- name: Apply Grafana dashboard ConfigMap
command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
changed_when: true
- name: Show Grafana content provisioning summary
debug:
msg: |
Grafana content applied.
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
Loki datasource URL: {{ grafana_loki_effective_url }}
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
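The fallback above only replaces the in-cluster datasource URL when the probe from the Grafana pod fails; a minimal sketch of how the effective Prometheus URL is assembled, using the same selectors and service name as the tasks above:

# Node IP hosting Prometheus plus the service NodePort give the fallback datasource URL.
host_ip=$(kubectl -n observability get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}')
node_port=$(kubectl -n observability get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}')
echo "fallback datasource URL: http://${host_ip}:${node_port}"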

View File

@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_dashboard_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}

View File

@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_datasource_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
{% if loki_enabled %}
- name: Loki
type: loki
access: proxy
url: "{{ grafana_loki_effective_url }}"
isDefault: false
{% endif %}

View File

@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-prometheus-stack-grafana-datasource
namespace: {{ observability_namespace }}
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: "Prometheus"
type: prometheus
uid: prometheus
url: "{{ grafana_prometheus_effective_url }}/"
access: proxy
isDefault: true
jsonData:
httpMethod: POST
timeInterval: 30s
- name: "Alertmanager"
type: alertmanager
uid: alertmanager
url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus

View File

@@ -0,0 +1,27 @@
---
observability_namespace: "observability"
prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"
grafana_admin_password: ""
prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"
prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"
loki_enabled: true
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""
observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"

View File

@@ -0,0 +1,252 @@
---
- name: Check if Helm is installed
command: helm version --short
register: helm_check
changed_when: false
failed_when: false
- name: Install Helm
shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
when: helm_check.rc != 0
changed_when: true
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Set Grafana admin password
set_fact:
grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
- name: Write kube-prometheus-stack values
template:
src: kube-prometheus-stack-values.yaml.j2
dest: /tmp/kube-prometheus-stack-values.yaml
mode: "0644"
- name: Add Prometheus Helm repo
command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
register: add_prom_repo
failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
changed_when: add_prom_repo.rc == 0
- name: Add Grafana Helm repo
command: helm repo add grafana https://grafana.github.io/helm-charts
register: add_grafana_repo
failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
changed_when: add_grafana_repo.rc == 0
- name: Update Helm repos
command: helm repo update
changed_when: false
- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
shell: >-
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-upgrade -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-install -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-rollback -o name)
--ignore-not-found=true
changed_when: false
failed_when: false
- name: Install kube-prometheus-stack
command: >-
helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
--namespace {{ observability_namespace }}
--version {{ prometheus_chart_version }}
--values /tmp/kube-prometheus-stack-values.yaml
--wait
--timeout 10m
register: kube_prom_install
retries: 12
delay: 15
until: kube_prom_install.rc == 0
changed_when: true
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Reset Grafana admin password in Grafana database
shell: >-
kubectl -n {{ observability_namespace }} exec
"$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
-c grafana -- grafana cli admin reset-admin-password '{{ grafana_password_effective }}'
changed_when: true
- name: Write Loki values
template:
src: loki-values.yaml.j2
dest: /tmp/loki-values.yaml
mode: "0644"
when: loki_enabled
- name: Validate Loki chart produces resources
command: >-
helm template loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_template
changed_when: false
failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
when: loki_enabled
- name: Remove legacy Loki resources
command: >-
kubectl -n {{ observability_namespace }} delete
deployment/loki-gateway
statefulset/loki
statefulset/loki-chunks-cache
statefulset/loki-results-cache
statefulset/loki-backend
statefulset/loki-read
statefulset/loki-write
poddisruptionbudget/loki-memcached-chunks-cache
poddisruptionbudget/loki-memcached-results-cache
--ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Clear stuck Helm lock for Loki
command: kubectl -n {{ observability_namespace }} delete secret sh.helm.release.v1.loki.v1 --ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Uninstall failed Loki release (if stuck)
command: helm uninstall loki -n {{ observability_namespace }}
changed_when: false
failed_when: false
when: loki_enabled
- name: Install Loki
command: >-
helm upgrade --install loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_install
changed_when: true
when: loki_enabled
- name: Wait for Loki StatefulSet
command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
register: loki_rollout
changed_when: false
when: loki_enabled
- name: Show Loki pod status
command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
register: loki_pods
changed_when: false
when: loki_enabled
- name: Debug Loki pods
debug:
msg: "{{ loki_pods.stdout }}"
when: loki_enabled
- name: Write Promtail values
template:
src: promtail-values.yaml.j2
dest: /tmp/promtail-values.yaml
mode: "0644"
when: loki_enabled
- name: Install Promtail
command: >-
helm upgrade --install promtail grafana/promtail
--namespace {{ observability_namespace }}
--version {{ promtail_chart_version }}
--values /tmp/promtail-values.yaml
--wait
--timeout 10m
changed_when: true
when: loki_enabled
- name: Check Tailscale service readiness for Grafana
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: grafana_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale service readiness for Prometheus
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: prometheus_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale endpoint (IP/hostname) for Grafana
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: grafana_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale endpoint (IP/hostname) for Prometheus
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: prometheus_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show Tailscale access details
debug:
msg: |
Observability stack deployed with Tailscale access!
Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}
Login: admin / {{ grafana_password_effective }}
Tailscale readiness:
- Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
- Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}
Access via:
- MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
- Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
- Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show observability access details (fallback)
debug:
msg: |
Observability stack deployed.
Namespace: {{ observability_namespace }}
Grafana (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
Prometheus (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
Grafana admin password: {{ grafana_password_effective }}
{% if loki_enabled %}
Loki: Enabled - logs available in Grafana
{% else %}
Loki: Disabled
{% endif %}
when:
- not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))

View File

@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasource-loki
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
loki-datasource.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
isDefault: false

View File

@@ -0,0 +1,46 @@
grafana:
enabled: true
adminPassword: {{ grafana_password_effective }}
persistence:
enabled: true
storageClassName: {{ grafana_storage_class }}
size: {{ grafana_storage_size }}
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ grafana_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheus:
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheusSpec:
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: {{ prometheus_storage_class }}
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: {{ prometheus_storage_size }}
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false

View File

@@ -0,0 +1,75 @@
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: {{ loki_storage_size }}
storageClass: {{ loki_storage_class }}
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false

View File

@@ -0,0 +1,3 @@
config:
clients:
- url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push

View File

@@ -0,0 +1,58 @@
---
- name: Create systemd unit for Grafana private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-grafana.service
mode: "0644"
vars:
unit_description: Port-forward Grafana for Tailscale access
unit_namespace: observability
unit_target: svc/observability-kube-prometheus-stack-grafana
unit_local_port: 13080
unit_remote_port: 80
- name: Create systemd unit for Prometheus private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-prometheus.service
mode: "0644"
vars:
unit_description: Port-forward Prometheus for Tailscale access
unit_namespace: observability
unit_target: svc/observability-kube-prometh-prometheus
unit_local_port: 19090
unit_remote_port: 9090
- name: Create systemd unit for Flux UI private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-flux-ui.service
mode: "0644"
vars:
unit_description: Port-forward Flux UI for Tailscale access
unit_namespace: flux-system
unit_target: svc/flux-system-weave-gitops
unit_local_port: 19001
unit_remote_port: 9001
- name: Reload systemd
systemd:
daemon_reload: true
- name: Enable and start private access port-forward services
systemd:
name: "{{ item }}"
enabled: true
state: started
loop:
- k8s-portforward-grafana.service
- k8s-portforward-prometheus.service
- k8s-portforward-flux-ui.service
- name: Configure Tailscale Serve for private access endpoints
shell: >-
tailscale serve reset &&
tailscale serve --bg --tcp={{ private_access_grafana_port }} tcp://127.0.0.1:13080 &&
tailscale serve --bg --tcp={{ private_access_prometheus_port }} tcp://127.0.0.1:19090 &&
tailscale serve --bg --tcp={{ private_access_flux_port }} tcp://127.0.0.1:19001
changed_when: true

View File

@@ -0,0 +1,13 @@
[Unit]
Description={{ unit_description }}
After=network-online.target k3s.service
Wants=network-online.target
[Service]
Type=simple
Restart=always
RestartSec=5
ExecStart=/usr/local/bin/kubectl -n {{ unit_namespace }} port-forward --address 127.0.0.1 {{ unit_target }} {{ unit_local_port }}:{{ unit_remote_port }}
[Install]
WantedBy=multi-user.target
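After this role runs, the chain (systemd port-forward on loopback, then Tailscale Serve on the tailnet port) can be spot-checked on the control-plane node; a minimal sketch, assuming the unit names and local ports defined above:

# The port-forward units should be active and answering on loopback.
systemctl is-active k8s-portforward-grafana.service k8s-portforward-prometheus.service k8s-portforward-flux-ui.service
curl -sS -o /dev/null -w '%{http_code}\n' http://127.0.0.1:13080/api/health    # Grafana health endpoint
# Tailscale Serve should list the three TCP forwards configured by the last task.
tailscale serve status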

View File

@@ -75,19 +75,46 @@
roles: roles:
- k3s-agent - k3s-agent
- name: Deploy Hetzner CCM - name: Bootstrap addon prerequisite secrets
hosts: control_plane[0] hosts: control_plane[0]
become: true become: true
roles: roles:
- ccm - addon-secrets-bootstrap
- name: Deploy Hetzner CSI - name: Deploy observability stack
hosts: control_plane[0] hosts: control_plane[0]
become: true become: true
roles: roles:
- csi - role: observability
when: not (observability_gitops_enabled | default(true) | bool)
- name: Provision Grafana content
hosts: control_plane[0]
become: true
roles:
- role: observability-content
when: not (observability_gitops_enabled | default(true) | bool)
- name: Configure private tailnet access
hosts: control_plane[0]
become: true
vars:
private_access_grafana_port: 30080
private_access_prometheus_port: 30990
private_access_flux_port: 30901
roles:
- private-access
- name: Bootstrap Doppler access for External Secrets
hosts: control_plane[0]
become: true
roles:
- doppler-bootstrap
- name: Finalize - name: Finalize
hosts: localhost hosts: localhost

apps/kustomization.yaml
View File

@@ -0,0 +1,3 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []

View File

@@ -0,0 +1,12 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: platform
namespace: flux-system
spec:
interval: 1m
ref:
branch: main
url: ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git
secretRef:
name: flux-system

File diff suppressed because it is too large

View File

@@ -0,0 +1,43 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: source-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kustomize-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: helm-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: notification-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: apps
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./apps
dependsOn:
- name: infrastructure
wait: true
timeout: 5m
suspend: true

View File

@@ -0,0 +1,14 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: infrastructure
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure
wait: false
timeout: 5m

View File

@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gotk-components.yaml
- gitrepository-platform.yaml
- kustomization-infrastructure.yaml
- kustomization-apps.yaml
patchesStrategicMerge:
- gotk-controller-cp1-patches.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- flux-system

View File

@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-cloud-controller-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-cloud-controller-manager
version: 1.30.1
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
selectorLabels:
app: hcloud-cloud-controller-manager
args:
secure-port: "0"
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud

View File

@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-ccm.yaml

View File

@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-csi
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-csi
version: 2.20.0
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
controller:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
hcloudVolumeDefaultLocation: nbg1
storageClasses:
- name: hcloud-volumes
defaultStorageClass: true
reclaimPolicy: Delete
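Once the release is healthy, any workload can request Hetzner block storage through the default hcloud-volumes class; a minimal sketch of a claim, not part of the repository:

# Example PVC bound to the hcloud-volumes StorageClass created by the chart.
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: demo-data
  namespace: default
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: hcloud-volumes
  resources:
    requests:
      storage: 10Gi
EOF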

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud

View File

@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-csi.yaml

View File

@@ -0,0 +1,13 @@
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets

View File

@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 10m
targetNamespace: external-secrets
chart:
spec:
chart: external-secrets
version: 2.1.0
sourceRef:
kind: HelmRepository
name: external-secrets
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
installCRDs: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
webhook:
failurePolicy: Ignore
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
certController:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
serviceMonitor:
enabled: false
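After Flux reconciles this release, the CRDs that the Doppler bootstrap role checks for should be registered; a minimal sketch for verifying that, assuming the flux CLI is installed:

# Release status, ESO CRDs, and controller pods.
flux get helmreleases external-secrets -n flux-system
kubectl get crd clustersecretstores.external-secrets.io externalsecrets.external-secrets.io
kubectl -n external-secrets get pods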

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 1h
url: https://charts.external-secrets.io

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-external-secrets.yaml
- helmrelease-external-secrets.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: external-secrets

View File

@@ -0,0 +1,25 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: cluster-user-auth
namespace: flux-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: cluster-user-auth
creationPolicy: Owner
template:
type: Opaque
data:
username: "{{ .fluxAdminUsername }}"
password: "{{ .fluxAdminPasswordHash }}"
data:
- secretKey: fluxAdminUsername
remoteRef:
key: WEAVE_GITOPS_ADMIN_USERNAME
- secretKey: fluxAdminPasswordHash
remoteRef:
key: WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH

View File

@@ -0,0 +1,10 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 1h
url: https://github.com/weaveworks/weave-gitops
ref:
tag: v0.39.0-rc.2

View File

@@ -0,0 +1,35 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: weave-gitops
namespace: flux-system
spec:
interval: 10m
targetNamespace: flux-system
chart:
spec:
chart: ./charts/gitops-server
sourceRef:
kind: GitRepository
name: weave-gitops
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
service:
type: ClusterIP
port: 9001
adminUser:
create: true
createClusterRole: true
createSecret: false
username: admin
rbac:
create: true
impersonationResourceNames:
- admin

View File

@@ -0,0 +1,19 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: flux-ui
namespace: flux-system
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: flux
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: flux-system-weave-gitops
port:
number: 9001

View File

@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- cluster-user-auth-externalsecret.yaml
- gitrepository-weave-gitops.yaml
- helmrelease-weave-gitops.yaml
- traefik-helmchartconfig-flux-entrypoint.yaml
- ingress-flux-ui.yaml

View File

@@ -0,0 +1,9 @@
apiVersion: helm.cattle.io/v1
kind: HelmChartConfig
metadata:
name: traefik
namespace: kube-system
spec:
valuesContent: |-
additionalArguments:
- "--entryPoints.flux.address=:9001/tcp"

View File

@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-ccm
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/ccm
wait: true
timeout: 5m
suspend: true

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-csi
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/csi
dependsOn:
- name: addon-ccm
wait: true
timeout: 5m
suspend: true

View File

@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-external-secrets
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/external-secrets
wait: true
timeout: 5m
suspend: false

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-flux-ui
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/flux-ui
dependsOn:
- name: addon-external-secrets
wait: true
timeout: 5m
suspend: false

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability-content
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability-content
dependsOn:
- name: addon-observability
wait: true
timeout: 5m
suspend: false

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability
dependsOn:
- name: addon-external-secrets
wait: true
timeout: 5m
suspend: false

View File

@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-operator
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-operator
wait: true
timeout: 5m
suspend: true

View File

@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-proxyclass
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-proxyclass
dependsOn:
- name: addon-tailscale-operator
wait: true
timeout: 5m
suspend: true

View File

@@ -0,0 +1,11 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- kustomization-ccm.yaml
- kustomization-csi.yaml
- kustomization-external-secrets.yaml
- kustomization-flux-ui.yaml
- kustomization-tailscale-operator.yaml
- kustomization-tailscale-proxyclass.yaml
- kustomization-observability.yaml
- kustomization-observability-content.yaml
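Since several of these Kustomizations are created with suspend: true, which addon layers are live versus parked can be read straight from Flux; a minimal sketch:

# Suspended entries (ccm, csi, tailscale-*) report SUSPENDED until explicitly resumed.
flux get kustomizations -n flux-system
# Resume an addon once its prerequisites exist in the cluster, e.g. the Hetzner CCM.
flux resume kustomization addon-ccm -n flux-system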

View File

@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-k8s-overview
namespace: observability
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}

View File

@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources-core
namespace: observability
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: "http://loki.observability.svc.cluster.local:3100"
isDefault: false

View File

@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana-datasources-core-configmap.yaml
- grafana-dashboard-k8s-overview-configmap.yaml

View File

@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: grafana-admin
namespace: observability
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: grafana-admin-credentials
creationPolicy: Owner
template:
type: Opaque
data:
admin-user: admin
admin-password: "{{ .grafanaAdminPassword }}"
data:
- secretKey: grafanaAdminPassword
remoteRef:
key: GRAFANA_ADMIN_PASSWORD

View File

@@ -0,0 +1,17 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana
namespace: observability
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /grafana
pathType: Prefix
backend:
service:
name: observability-kube-prometheus-stack-grafana
port:
number: 80

View File

@@ -0,0 +1,77 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: kube-prometheus-stack
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: kube-prometheus-stack
version: 68.4.4
sourceRef:
kind: HelmRepository
name: prometheus-community
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
grafana:
enabled: true
admin:
existingSecret: grafana-admin-credentials
userKey: admin-user
passwordKey: admin-password
grafana.ini:
server:
root_url: http://observability/grafana/
serve_from_sub_path: true
persistence:
enabled: true
storageClassName: local-path
size: 5Gi
service:
type: ClusterIP
sidecar:
datasources:
enabled: true
label: grafana_datasource
searchNamespace: observability
dashboards:
enabled: true
label: grafana_dashboard
searchNamespace: observability
prometheus:
service:
type: ClusterIP
prometheusSpec:
externalUrl: http://observability/prometheus/
routePrefix: /prometheus/
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: local-path
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
prometheus-node-exporter:
hostNetwork: false
service:
hostPort: false
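Because Grafana and Prometheus are served from sub-paths behind the tailnet-exposed Traefik service, the root_url and routePrefix settings can be sanity-checked with a probe through the ingress; a minimal sketch, assuming the Tailscale hostname observability resolves on the tailnet:

# Grafana health via the /grafana sub-path, Prometheus readiness via /prometheus.
curl -sS http://observability/grafana/api/health
curl -sS -o /dev/null -w '%{http_code}\n' http://observability/prometheus/-/ready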

View File

@@ -0,0 +1,99 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: loki
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: loki
version: 6.10.0
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: 10Gi
storageClass: local-path
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
chunksCache:
enabled: true
allocatedMemory: 128
resultsCache:
enabled: true
allocatedMemory: 128
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false

View File

@@ -0,0 +1,27 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: promtail
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chart:
spec:
chart: promtail
version: 6.16.6
sourceRef:
kind: HelmRepository
name: grafana
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
config:
clients:
- url: http://loki.observability.svc.cluster.local:3100/loki/api/v1/push

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: grafana
namespace: flux-system
spec:
interval: 1h
url: https://grafana.github.io/helm-charts

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: prometheus-community
namespace: flux-system
spec:
interval: 1h
url: https://prometheus-community.github.io/helm-charts

View File

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- grafana-admin-externalsecret.yaml
- traefik-tailscale-service.yaml
- grafana-ingress.yaml
- prometheus-ingress.yaml
- helmrepository-prometheus-community.yaml
- helmrepository-grafana.yaml
- helmrelease-kube-prometheus-stack.yaml
- helmrelease-loki.yaml
- helmrelease-promtail.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: observability

View File

@@ -0,0 +1,17 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: observability
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /prometheus
pathType: Prefix
backend:
service:
name: observability-kube-prometh-prometheus
port:
number: 9090

View File

@@ -0,0 +1,27 @@
apiVersion: v1
kind: Service
metadata:
name: traefik-tailscale
namespace: kube-system
annotations:
tailscale.com/hostname: observability
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
protocol: TCP
targetPort: web
- name: websecure
port: 443
protocol: TCP
targetPort: websecure
- name: flux
port: 9001
protocol: TCP
targetPort: 9001

View File

@@ -0,0 +1,39 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: tailscale-operator
namespace: flux-system
spec:
interval: 10m
targetNamespace: tailscale-system
chart:
spec:
chart: tailscale-operator
version: 1.95.91
sourceRef:
kind: HelmRepository
name: tailscale
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
installCRDs: true
apiServerProxyConfig:
mode: "true"
operatorConfig:
defaultTags:
- tag:k8s
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
proxyConfig:
defaultTags: tag:k8s
defaultProxyClass: infra-stable

View File

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: tailscale
namespace: flux-system
spec:
interval: 1h
url: https://pkgs.tailscale.com/unstable/helmcharts

View File

@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-tailscale.yaml
- helmrelease-tailscale-operator.yaml

View File

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: tailscale-system

View File

@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- proxyclass-infra-stable.yaml

View File

@@ -0,0 +1,13 @@
apiVersion: tailscale.com/v1alpha1
kind: ProxyClass
metadata:
name: infra-stable
spec:
statefulSet:
pod:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule

View File

@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- addons

View File

@@ -25,7 +25,7 @@ variable "cluster_name" {
variable "control_plane_count" { variable "control_plane_count" {
description = "Number of control plane nodes" description = "Number of control plane nodes"
type = number type = number
default = 3 default = 1
} }
variable "control_plane_type" { variable "control_plane_type" {
@@ -37,7 +37,7 @@ variable "control_plane_type" {
variable "worker_count" { variable "worker_count" {
description = "Number of worker nodes" description = "Number of worker nodes"
type = number type = number
default = 4 default = 2
} }
variable "worker_type" { variable "worker_type" {