Compare commits


112 Commits

ceefcc3b29 cleanup: Remove obsolete port-forwarding, deferred Traefik files, and CI workaround
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 13m9s
- Remove ansible/roles/private-access/ (replaced by Tailscale LB services)
- Remove deferred observability ingress/traefik files (replaced by direct Tailscale LBs)
- Remove orphaned kustomization-traefik-config.yaml (no backing directory)
- Simplify CI: remove SA patch + job deletion workaround for rancher-backup
  (now handled by postRenderer in HelmRelease)
- Update AGENTS.md to reflect current architecture
2026-04-02 01:21:23 +00:00
0d339b3163 fix: Use rancher/kubectl image for rancher-backup hook
All checks were successful
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m41s
bitnami/kubectl:1.34 tag doesn't exist. rancher/kubectl is already
available in the cluster's image cache.
2026-04-02 01:00:27 +00:00
30ccf13c82 fix: Use postRenderer to replace broken kuberlr-kubectl image in rancher-backup hook
Some checks failed
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook hardcodes rancher/kuberlr-kubectl which
can't download kubectl. Use Flux postRenderers to patch the job image
to bitnami/kubectl at render time.
2026-04-02 00:51:50 +00:00
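The postRenderer approach described in this commit can be sketched as a Flux HelmRelease fragment. This is a minimal illustration, not the repo's actual manifest; the hook job name and replacement image tag are assumptions:

```yaml
# Sketch: use a Flux postRenderer to patch the image of a chart's hook Job
# at render time. Job name and image tag below are illustrative assumptions.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: cattle-resources-system
spec:
  # ...chart/interval config omitted...
  postRenderers:
    - kustomize:
        patches:
          - target:
              kind: Job
              name: patch-sa                  # assumed hook job name
            patch: |
              - op: replace
                path: /spec/template/spec/containers/0/image
                value: rancher/kubectl:v1.28.0   # assumed tag
```

Flux applies postRenderers after Helm templating, so the patch lands on the rendered Job even though the chart hardcodes the image.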
75e3604f30 fix: Skip post-install hooks for rancher-backup HelmRelease
Some checks failed
Deploy Cluster / Terraform (push) Successful in 57s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook uses rancher/kuberlr-kubectl which fails
to download kubectl. The SA automountServiceAccountToken is managed
manually, so the hook is unnecessary.
2026-04-02 00:45:03 +00:00
e4235a6e58 fix: Correct Flux UI pod selector labels to match deployed weave-gitops labels
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 20m36s
Actual labels are app.kubernetes.io/name=weave-gitops and
app.kubernetes.io/instance=flux-system-weave-gitops.
2026-04-01 02:08:12 +00:00
ea2d534171 fix: Use admin.existingSecret for Grafana creds from Doppler
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 20m42s
Revert to idiomatic Grafana chart approach. ExternalSecret creates the
secret with admin-user/admin-password keys before Grafana's first start
on fresh cluster creation.
2026-04-01 01:41:49 +00:00
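The existingSecret approach from this commit can be sketched as follows. The secret, store, and namespace names are assumptions; the `admin-user`/`admin-password` keys come from the commit message:

```yaml
# Grafana chart values (sketch): read admin creds from a pre-created secret.
grafana:
  admin:
    existingSecret: grafana-admin       # assumed secret name
    userKey: admin-user
    passwordKey: admin-password
---
# ExternalSecret that creates the secret from Doppler before Grafana starts.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: grafana-admin
  namespace: monitoring                 # assumed namespace
spec:
  secretStoreRef:
    name: doppler                       # assumed store name
    kind: ClusterSecretStore
  target:
    name: grafana-admin
  data:
    - secretKey: admin-user
      remoteRef:
        key: GRAFANA_ADMIN_USER        # assumed Doppler key
    - secretKey: admin-password
      remoteRef:
        key: GRAFANA_ADMIN_PASSWORD    # assumed Doppler key
```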
a1b9fe6aa6 fix: Use Flux valuesFrom to inject Doppler Grafana creds as Helm values
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 20m38s
Switch from admin.existingSecret to valuesFrom so Flux reads the
Doppler-managed secret and injects credentials as standard Helm values.
2026-03-31 23:40:54 +00:00
33765657ec fix: Correct pod selectors for Prometheus and Flux Tailscale services, use Doppler for Grafana creds
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 21m0s
Prometheus needs operator.prometheus.io/name label selector. Flux UI pods
are labeled gitops-server not weave-gitops. Grafana now reads admin creds
from Doppler via ExternalSecret instead of hardcoded values.
2026-03-31 22:54:57 +00:00
b8f64fa952 feat: Expose Grafana, Prometheus, and Flux UI via Tailscale LoadBalancer services
All checks were successful
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 20m47s
Replace Ansible port-forwarding + tailscale serve with direct Tailscale LB
services matching the existing Rancher pattern. Each service gets its own
tailnet hostname (grafana/prometheus/flux.silverside-gopher.ts.net).
2026-03-31 08:53:28 +00:00
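A Tailscale LoadBalancer service of the kind this commit describes looks roughly like this (selector labels and ports are assumptions; the hostname pattern comes from the commit message):

```yaml
# Sketch: expose a workload on the tailnet via the Tailscale operator's
# LoadBalancer class. One such Service per app (grafana/prometheus/flux).
apiVersion: v1
kind: Service
metadata:
  name: grafana-tailscale
  namespace: monitoring                  # assumed namespace
  annotations:
    tailscale.com/hostname: grafana      # -> grafana.silverside-gopher.ts.net
spec:
  type: LoadBalancer
  loadBalancerClass: tailscale
  selector:
    app.kubernetes.io/name: grafana      # assumed pod label
  ports:
    - port: 80
      targetPort: 3000                   # Grafana's default container port
```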
569d741751 push
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m37s
Deploy Cluster / Ansible (push) Successful in 25m37s
2026-03-31 02:46:55 +00:00
89e53d9ec9 fix: Handle restricted B2 keys and safe JSON parsing in restore step
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 20m48s
2026-03-31 01:43:04 +00:00
5a2551f40a fix: Fix flux CLI download URL - use correct GitHub URL with v prefix on version
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Failing after 21m52s
2026-03-30 03:11:40 +00:00
8c7b62c024 feat: Automate Rancher backup restore in CI pipeline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Failing after 6m28s
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
2026-03-30 01:56:29 +00:00
a1f07f863a docs: Update restore template with real Backup CR format
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m2s
Include actual restore CR spec and note the latest backup filename for reference.
2026-03-30 00:09:53 +00:00
2c3a49c2e0 fix: Rename B2 secret keys to match rancher-backup operator expectations
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
The operator expects accessKey/secretKey, not aws_access_key_id/aws_secret_access_key.
2026-03-30 00:05:13 +00:00
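The key rename amounts to remapping the Doppler-held values onto the names the operator reads. A sketch, using the Doppler variable names mentioned elsewhere in this log (store name and namespace are assumptions):

```yaml
# Sketch: map Doppler B2 credentials onto the accessKey/secretKey names
# the rancher-backup operator expects.
apiVersion: external-secrets.io/v1beta1
kind: ExternalSecret
metadata:
  name: b2-credentials
  namespace: cattle-resources-system
spec:
  secretStoreRef:
    name: doppler                 # assumed store name
    kind: ClusterSecretStore
  target:
    name: b2-credentials
  data:
    - secretKey: accessKey        # operator-expected key name
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: secretKey
      remoteRef:
        key: B2_APPLICATION_KEY
```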
a7ce3dcc1a fix: Remove s3 block from rancher-backup HelmRelease values
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m12s
The S3 config caused the operator to try downloading kubectl, which fails in the container.
S3 credentials are correctly configured in the Backup CR and ExternalSecret instead.
2026-03-29 23:47:21 +00:00
0ab9418458 fix: Re-add HTTPS port to Tailscale LB for Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 6m6s
Rancher now manages its own TLS (no longer tls:external), so it serves
HTTPS on port 443. The Tailscale LoadBalancer needs to expose both
HTTP (80) and HTTPS (443) targeting the corresponding container ports.
2026-03-29 23:04:49 +00:00
c251672618 fix: Configure S3 bucketName for rancher-backup operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 23:01:18 +00:00
89364e8f37 fix: Add dependsOn for rancher-backup operator to wait for CRDs
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 22:57:22 +00:00
20d7a6f777 fix: Install rancher-backup CRD chart before operator
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
The rancher-backup operator requires CRDs from the rancher-backup-crd
chart to be installed first.
2026-03-29 22:51:34 +00:00
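The CRD-before-operator ordering pairs naturally with the dependsOn fix two commits up. A sketch of the two HelmReleases (repository name and intervals are assumptions):

```yaml
# Sketch: install the CRD chart first, then gate the operator on it.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup-crd
  namespace: cattle-resources-system
spec:
  interval: 10m
  chart:
    spec:
      chart: rancher-backup-crd
      sourceRef:
        kind: HelmRepository
        name: rancher-charts            # assumed repo name
---
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: rancher-backup
  namespace: cattle-resources-system
spec:
  dependsOn:
    - name: rancher-backup-crd          # wait for CRDs before the operator
  interval: 10m
  chart:
    spec:
      chart: rancher-backup
      sourceRef:
        kind: HelmRepository
        name: rancher-charts
```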
22ce5fd6f4 feat: Add cert-manager as dependency for Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m59s
Rancher requires cert-manager when managing its own TLS (not tls:external).
Added cert-manager HelmRelease with CRDs enabled.
2026-03-29 22:36:30 +00:00
afb1782d38 fix: Separate Backup CRs into their own kustomization
All checks were successful
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m57s
The Backup and Restore CRs need the rancher-backup CRDs to exist first.
Moved them to a separate kustomization that depends on the operator being ready.
2026-03-29 22:22:29 +00:00
48870433bf fix: Remove tls:external from Rancher HelmRelease
Some checks failed
Deploy Cluster / Terraform (push) Failing after 55s
Deploy Cluster / Ansible (push) Has been skipped
With Tailscale LoadBalancer, TLS is not actually terminated at the edge.
The Tailscale proxy does TCP passthrough, so Rancher must serve its own
TLS certs. Setting tls: external caused Rancher to listen HTTP-only,
which broke HTTPS access through Tailscale.
2026-03-29 22:19:23 +00:00
f2c506b350 refactor: Replace CNPG external DB with rancher-backup operator
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m5s
Rancher 2.x uses embedded etcd, not an external PostgreSQL database.
The CATTLE_DB_CATTLE_* env vars are Rancher v1 only and were ignored.

- Remove all CNPG (CloudNativePG) cluster, operator, and related configs
- Remove external DB env vars from Rancher HelmRelease
- Remove rancher-db-password ExternalSecret
- Add rancher-backup operator HelmRelease (v106.0.2+up8.1.0)
- Add B2 credentials ExternalSecret for backup storage
- Add recurring Backup CR (daily at 03:00, 7-day retention)
- Add commented-out Restore CR for rebuild recovery
- Update Flux dependency graph accordingly
2026-03-29 21:53:16 +00:00
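The recurring Backup CR added here can be sketched against the rancher-backup operator's API. The schedule and retention follow the commit; the bucket path follows the backup-location note later in this log, while the region and endpoint are assumptions:

```yaml
# Sketch of the recurring Backup CR (daily at 03:00, 7-day retention).
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
  name: rancher-daily
spec:
  schedule: "0 3 * * *"          # daily at 03:00
  retentionCount: 7              # keep one week of backups
  storageLocation:
    s3:
      credentialSecretName: b2-credentials
      credentialSecretNamespace: cattle-resources-system
      bucketName: HetznerTerra
      folder: rancher-backups
      region: us-west-004                          # assumed B2 region
      endpoint: s3.us-west-004.backblazeb2.com     # assumed B2 endpoint
```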
efdf13976a fix: Handle missing 'online' field in Tailscale API response
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m12s
Deploy Cluster / Ansible (push) Successful in 9m19s
2026-03-29 13:52:23 +00:00
5269884408 feat: Auto-cleanup stale Tailscale devices before cluster boot
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m17s
Deploy Cluster / Ansible (push) Failing after 6m35s
Adds tailscale-cleanup Ansible role that uses the Tailscale API to
delete offline devices matching reserved hostnames (e.g. rancher).
Runs during site.yml before Finalize to prevent hostname collisions
like rancher-1 on rebuild.

Requires TAILSCALE_API_KEY (API access token) passed as extra var.
2026-03-29 11:47:53 +00:00
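The cleanup role's core logic can be sketched as two Ansible tasks against the Tailscale API (variable names like `tailscale_api_key` and `reserved_hostnames` are assumptions):

```yaml
# Sketch of the tailscale-cleanup role: list tailnet devices, then delete
# offline ones whose hostname is reserved for the cluster.
- name: List tailnet devices
  ansible.builtin.uri:
    url: "https://api.tailscale.com/api/v2/tailnet/-/devices"
    headers:
      Authorization: "Bearer {{ tailscale_api_key }}"
  register: devices

- name: Delete offline devices with reserved hostnames
  ansible.builtin.uri:
    url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
    method: DELETE
    headers:
      Authorization: "Bearer {{ tailscale_api_key }}"
    status_code: 200
  loop: "{{ devices.json.devices }}"
  when:
    - item.hostname in reserved_hostnames          # e.g. ['rancher']
    - not (item.online | default(false))           # tolerate a missing 'online' field
```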
6e5b0518be feat: Add kubeconfig refresh script and fix Ansible Finalize to use public IP
All checks were successful
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m25s
- scripts/refresh-kubeconfig.sh fetches a fresh kubeconfig from CP1
- Ansible site.yml Finalize step now uses public IP instead of Tailscale
  hostname for the kubeconfig server address
- Updated AGENTS.md with kubeconfig refresh instructions
2026-03-29 03:31:36 +00:00
905d069e91 fix: Add serverName to CNPG externalClusters for B2 recovery
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
CNPG uses the external cluster name (b2-backup) as the barman server
name by default, but the backups were stored under server name rancher-db.
2026-03-29 03:22:19 +00:00
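The serverName fix can be sketched on the CNPG Cluster CR. The external-cluster and server names come from the commit; the destination path and credential key names are assumptions:

```yaml
# Sketch: recovery must read backups under the server name they were
# written with (rancher-db), not the external cluster name (b2-backup).
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: rancher-db
spec:
  # ...bootstrap/recovery config omitted...
  externalClusters:
    - name: b2-backup
      barmanObjectStore:
        serverName: rancher-db          # matches the name used at backup time
        destinationPath: s3://HetznerTerra/rancher-backups/   # assumed path
        s3Credentials:
          accessKeyId:
            name: b2-credentials
            key: accessKey              # assumed key names
          secretAccessKey:
            name: b2-credentials
            key: secretKey
```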
25ba4b7115 fix: Add skipEmptyWalArchiveCheck annotation and B2 secret healthcheck to CNPG
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
- Skip WAL archive emptiness check so recovery works when restoring over
  an existing backup archive in B2
- Add healthCheck for b2-credentials secret in CNPG kustomization to
  prevent recovery from starting before ExternalSecret has synced
2026-03-29 03:15:23 +00:00
6a593fd559 feat: Add B2 recovery bootstrap to CNPG cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m6s
Deploy Cluster / Ansible (push) Successful in 8m16s
2026-03-29 00:22:24 +00:00
936f54a1b5 fix: Restore canonical Rancher tailnet hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-29 00:00:39 +00:00
c9df11e65f fix: Align Rancher tailnet hostname with live proxy
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-28 23:47:09 +00:00
a3c238fda9 fix: Apply Rancher server URL after chart install
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m43s
Deploy Cluster / Ansible (push) Successful in 10m39s
2026-03-28 23:12:59 +00:00
a15fa50302 fix: Use Doppler-backed Rancher bootstrap password
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m43s
2026-03-28 22:51:38 +00:00
0f4f0b09fb fix: Add Rancher DB password ExternalSecret
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m42s
2026-03-28 22:42:05 +00:00
4c002a870c fix: Remove invalid Rancher server-url manifest
Some checks failed
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 22:39:31 +00:00
43d11ac7e6 docs: Add agent guidance and sync Rancher docs
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m33s
Deploy Cluster / Ansible (push) Successful in 9m44s
2026-03-28 22:13:37 +00:00
8c5edcf0a1 fix: Set Rancher server URL to tailnet hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 1m0s
Deploy Cluster / Ansible (push) Successful in 6m27s
2026-03-28 04:07:44 +00:00
a81da0d178 feat: Expose Rancher via Tailscale hostname
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m42s
2026-03-28 03:59:02 +00:00
2a72527c79 fix: Switch Traefik from LoadBalancer to NodePort, remove unused Hetzner LB
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m25s
2026-03-28 03:21:19 +00:00
7cb3b84ecb feat: Replace custom pgdump job with CNPG ScheduledBackup
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m30s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 03:15:39 +00:00
d4930235fa fix: Point CNPG backups at the existing B2 bucket
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m17s
2026-03-26 23:35:19 +00:00
ee8dc4b451 fix: Add Role for B2 credentials access
All checks were successful
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m29s
2026-03-26 23:04:40 +00:00
144d40e7ac feat: Add RBAC for CNP to read B2 credentials secret
All checks were successful
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m38s
2026-03-26 22:56:00 +00:00
cc14e32572 fix: Use gzip instead of lzop for backup compression
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:51:10 +00:00
a207a5a7fd fix: Remove invalid encryption field from CNP backup config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:49:29 +00:00
4e1772c175 feat: Add B2 backup configuration to CNP Cluster
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:47:31 +00:00
ff70b12084 chore: Add HTTP/HTTPS firewall rules for Load Balancer
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m56s
2026-03-26 22:36:13 +00:00
a3963c56e6 cleanup: Remove traefik-config, simplify traefik helmrelease
All checks were successful
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 03:16:56 +00:00
612435c42c fix: Add Hetzner LB health check config to Traefik
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:11:10 +00:00
ac42f671a2 fix: Remove addon-traefik-config dependency from flux-ui
Some checks failed
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:05:58 +00:00
dbe7ec0468 fix: Remove expose boolean from traefik ports config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:01:13 +00:00
816ac8b3c0 fix: Use official Traefik helm repo instead of rancher-stable
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:59:00 +00:00
6f7998639f fix: Use standard kustomize API in traefik addon
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:56:52 +00:00
7a14f89ad1 fix: Correct traefik kustomization path and sourceRef
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:55:37 +00:00
786901c5d7 fix: Correct traefik kustomization reference (directory not file)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:54:29 +00:00
46f3d1130b feat: Add Flux-managed Traefik HelmRelease with Hetzner LB config
Some checks failed
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:52:49 +00:00
2fe5a626d4 fix: Add Hetzner network zone annotation to Traefik LoadBalancer
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 02:30:43 +00:00
2ef68c8087 fix: Remove deprecated enablePodMonitor field in CNP Cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 10m15s
2026-03-26 01:01:53 +00:00
e2cae18f5f fix: Remove backup config for initial deployment - add backup after DB is running
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m56s
2026-03-26 00:46:50 +00:00
e0c1e41ee9 fix: Remove bootstrap recovery - create fresh DB (recovery only needed after first backup)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:43:49 +00:00
63533de901 fix: Fix retentionPolicy format (14d not keep14)
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:41:44 +00:00
1b39710f63 fix: Move retentionPolicy to correct location in backup spec
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:39:25 +00:00
8c034323dc fix: Fix Cluster CR with correct barmanObjectStore schema
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:35:23 +00:00
5fa2b411ee fix: Fix Cluster CR schema - use barmanObjectStore instead of b2
Some checks failed
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:33:04 +00:00
3ea28e525f fix: Fix CNP operator image repository (cloudnative-pg not postgresql)
All checks were successful
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 4m55s
2026-03-26 00:21:09 +00:00
4b95ba113d fix: Remove LPP helm (already installed by k3s), fix CNP chart version to 0.27.1
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 5m7s
2026-03-26 00:13:22 +00:00
13627bf81f fix: Split CNP operator from CNP cluster to fix CRD dependency
All checks were successful
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 5m0s
- Move CNP operator HelmRelease to cnpg-operator folder
- Create addon-cnpg-operator kustomization (deploys operator first)
- Update addon-cnpg to dependOn addon-cnpg-operator
- Add addon-cnpg as dependency for addon-rancher (needs database)
2026-03-26 00:06:34 +00:00
ef3fb2489a fix: Convert kustomization-lpp and kustomization-cnpg to Flux Kustomization CRs
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:03:53 +00:00
7097495d72 fix: Add missing metadata.name to kustomization-lpp and kustomization-cnpg
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m7s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-25 23:39:45 +00:00
9d601dc77c feat: Add CloudNativePG with B2 backups for persistent Rancher database
Some checks failed
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s
- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
2026-03-25 23:06:45 +00:00
f36445d99a Fix CNI: configure flannel to use private network interface (enp7s0) instead of public
All checks were successful
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Successful in 8m42s
2026-03-25 01:44:33 +00:00
89c2c99963 Fix Rancher: remove conflicting LoadBalancer, add HTTPS port-forward, use tailscale serve only
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m2s
2026-03-25 00:59:16 +00:00
4a35cfb549 Fix Rancher: use correct targetPort 444 for HTTPS
Some checks failed
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 18m56s
2026-03-24 23:30:58 +00:00
3d50bfc534 Fix Rancher service selector: use cattle-system-rancher label
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-24 23:25:36 +00:00
ab2f287bfb Fix Rancher: use correct service name cattle-system-rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m23s
2026-03-24 22:30:49 +00:00
dcb2675b67 Upgrade Rancher to 2.13.3 for K8s 1.34 compatibility
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m13s
2026-03-24 21:42:51 +00:00
b40bec7e0e Fix Rancher: use Doppler secret instead of hardcoded password
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m12s
2026-03-24 21:13:23 +00:00
efe0c0cfd5 Fix Rancher: upgrade to 2.10.3 for K8s 1.34 compatibility
All checks were successful
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 4m20s
2026-03-24 20:29:38 +00:00
c61d9f9c1d Remove traefik-config dependency from Rancher
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m5s
Deploy Cluster / Ansible (push) Successful in 8m18s
2026-03-24 20:02:08 +00:00
60ceac4624 Fix Rancher access: add kubectl port-forward + tailscale serve setup
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-24 20:01:57 +00:00
47b384a337 Fix Rancher access: add Tailscale service for Traefik with port 9442, fix deployment order
All checks were successful
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m18s
2026-03-24 19:40:37 +00:00
ecf17113fb Fix Rancher deployment: add cattle-system namespace, fix Traefik config with port 9442
All checks were successful
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 4m27s
2026-03-24 19:09:28 +00:00
4ffbcfa312 Add Rancher management UI
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 8m52s
2026-03-24 01:53:04 +00:00
8745bcda47 Fix Weave GitOps image tag - remove invalid v0.41.0
All checks were successful
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Successful in 4m33s
The version v0.41.0 doesn't exist in the registry. Removing explicit
image tag to let the chart use its default compatible version.
2026-03-24 01:39:48 +00:00
e47ec2a3e7 Update Weave GitOps to v0.41.0 to support HelmRelease v2 API
All checks were successful
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m30s
Fixes error: 'no matches for kind HelmRelease in version v2beta1'

The cluster uses HelmRelease v2 API but Weave GitOps v0.38.0 was looking
for the old v2beta1 API. Updated image tag to v0.41.0 which supports
the newer API version.
2026-03-24 01:33:10 +00:00
45c899d2bd Configure Weave GitOps to use Doppler-managed admin credentials
All checks were successful
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m41s
Changes:
- Enable adminUser creation but disable Helm-managed secret
- Use ExternalSecret (cluster-user-auth) from Doppler instead
- Doppler secrets: WEAVE_GITOPS_ADMIN_USERNAME and WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH
- Added cluster-user-auth to viewSecretsResourceNames for RBAC

Login credentials are now managed via Doppler and External Secrets Operator.
2026-03-24 01:01:30 +00:00
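The values changes in this commit can be sketched as a weave-gitops chart fragment (value paths assumed from the chart's conventions):

```yaml
# Sketch of the Weave GitOps values: create the admin user but let the
# Doppler-backed ExternalSecret provide the cluster-user-auth secret.
adminUser:
  create: true
  createSecret: false          # secret comes from the ExternalSecret instead
rbac:
  viewSecretsResourceNames:
    - cluster-user-auth        # let the UI read its own auth secret
```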
0e52d8f159 Use Tailscale DNS names instead of IPs for TLS SANs
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m0s
Changed from hardcoded Tailscale IPs to DNS names:
- k8s-cluster-cp-1.silverside-gopher.ts.net
- k8s-cluster-cp-2.silverside-gopher.ts.net
- k8s-cluster-cp-3.silverside-gopher.ts.net

This is more robust since Tailscale IPs change on rebuild,
but DNS names remain consistent.

After next rebuild, cluster accessible via:
- kubectl --server=https://k8s-cluster-cp-1.silverside-gopher.ts.net:6443
2026-03-23 23:50:48 +00:00
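The SAN change maps directly onto a k3s server config fragment (the DNS names are from the commit; the file path follows the usual k3s convention of `/etc/rancher/k3s/config.yaml`):

```yaml
# Sketch: add the Tailscale DNS names to the API server's TLS certificate.
tls-san:
  - k8s-cluster-cp-1.silverside-gopher.ts.net
  - k8s-cluster-cp-2.silverside-gopher.ts.net
  - k8s-cluster-cp-3.silverside-gopher.ts.net
```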
4726db2b5b Add Tailscale IPs to k3s TLS SANs for secure tailnet access
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m30s
Deploy Cluster / Ansible (push) Successful in 9m48s
Changes:
- Add tailscale_control_plane_ips list to k3s-server defaults
- Include all 3 control plane Tailscale IPs (100.120.55.97, 100.108.90.123, 100.92.149.85)
- Update primary k3s install to add Tailscale IPs to TLS certificates
- Enables kubectl access via Tailscale without certificate errors

After next deploy, cluster will be accessible via:
- kubectl --server=https://100.120.55.97:6443 (or any CP tailscale IP)
- kubectl --server=https://k8s-cluster-cp-1:6443 (via tailscale DNS)
2026-03-23 23:04:00 +00:00
90d105e5ea Fix kube_api_endpoint variable passing for HA cluster
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Successful in 8m55s
- Remove circular variable reference in site.yml
- Add kube_api_endpoint default to k3s-server role
- Variable is set via inventory group_vars and passed to role
- Primary CP now correctly adds LB IP to TLS SANs

Note: Existing cluster needs destroy/rebuild to regenerate certificates.
2026-03-23 03:01:53 +00:00
952a80a742 Fix HA cluster join via Load Balancer private IP
Some checks failed
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Failing after 3m5s
Changes:
- Use LB private IP (10.0.1.5) instead of public IP for cluster joins
- Add LB private IP to k3s TLS SANs on primary control plane
- This allows secondary CPs and workers to verify certificates when joining via LB

Fixes x509 certificate validation error when joining via LB public IP.
2026-03-23 02:56:41 +00:00
4965017b86 Fix Load Balancer network attachment
Some checks failed
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Failing after 3m44s
Add hcloud_load_balancer_network resource to attach LB to private network.
This is required before targets can use use_private_ip=true.
LB gets IP 10.0.1.5 on the private network.
2026-03-23 02:44:35 +00:00
b2b9c38b91 Fix Load Balancer output attribute - use ipv4 instead of ipv4_address
Some checks failed
Deploy Cluster / Terraform (push) Failing after 1m37s
Deploy Cluster / Ansible (push) Has been skipped
2026-03-23 02:40:50 +00:00
ff31cb4e74 Implement HA control plane with Load Balancer (3-3 topology)
Some checks failed
Deploy Cluster / Terraform (push) Failing after 10s
Deploy Cluster / Ansible (push) Has been skipped
Major changes:
- Terraform: Scale to 3 control planes (cx23) + 3 workers (cx33)
- Terraform: Add Hetzner Load Balancer (lb11) for Kubernetes API
- Terraform: Add kube_api_lb_ip output
- Ansible: Add community.network collection to requirements
- Ansible: Update inventory to include LB endpoint
- Ansible: Configure secondary CPs and workers to join via LB
- Ansible: Add k3s_join_endpoint variable for HA joins
- Workflow: Add imports for cp-2, cp-3, and worker-3
- Docs: Update STABLE_BASELINE.md with HA topology and phase gates

Topology:
- 3 control planes (cx23 - 2 vCPU, 8GB RAM each)
- 3 workers (cx33 - 4 vCPU, 16GB RAM each)
- 1 Load Balancer (lb11) routing to all 3 control planes on port 6443
- Workers and secondary CPs join via LB endpoint for HA

Cost impact: +~€26/month (2 extra CPs + 1 extra worker + LB)
2026-03-23 02:39:39 +00:00
8b4a445b37 Update STABLE_BASELINE.md - CCM/CSI integration achieved
All checks were successful
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 3m36s
Document the successful completion of Hetzner CCM and CSI integration:
- CCM deployed via Ansible before workers join (fixes uninitialized taint)
- CSI provides hcloud-volumes StorageClass for persistent storage
- Two consecutive rebuilds passed all phase gates
- PVC provisioning tested and working

Platform now has full cloud provider integration with persistent volumes.
2026-03-23 02:25:00 +00:00
e447795395 Install helm binary in ccm-deploy role before using it
All checks were successful
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Successful in 6m35s
The kubernetes.core.helm module requires helm CLI to be installed on
the target node. Added check and install step using the official
helm install script.
2026-03-23 00:07:39 +00:00
31b82c9371 Deploy CCM via Ansible before workers join to fix external cloud provider
Some checks failed
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 1m48s
This fixes the chicken-and-egg problem where workers with
--kubelet-arg=cloud-provider=external couldn't join because CCM wasn't
running yet to remove the node.cloudprovider.kubernetes.io/uninitialized taint.

Changes:
- Create ansible/roles/ccm-deploy/ to deploy CCM via Helm during Ansible phase
- Reorder site.yml: CCM deploys after secrets but before workers join
- CCM runs on control_plane[0] with proper tolerations for control plane nodes
- Add 10s pause after CCM ready to ensure it can process new nodes
- Workers can now successfully join with external cloud provider enabled

Flux still manages CCM for updates, but initial install happens in Ansible.
2026-03-22 23:58:03 +00:00
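The ccm-deploy role's central task can be sketched with `kubernetes.core.helm` (release name and chart values are assumptions; the Hetzner chart repo URL is the published one):

```yaml
# Sketch: install Hetzner CCM via Helm from the primary control plane,
# before workers join, so the uninitialized taint gets cleared.
- name: Install Hetzner cloud-controller-manager via Helm
  kubernetes.core.helm:
    name: hcloud-ccm                    # assumed release name
    chart_ref: hcloud-cloud-controller-manager
    chart_repo_url: https://charts.hetzner.cloud
    release_namespace: kube-system
    wait: true
  run_once: true                        # runs on control_plane[0]
```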
cadfedacf1 Fix providerID health check - use shell module for piped grep
Some checks failed
Deploy Cluster / Terraform (push) Successful in 1m47s
Deploy Cluster / Ansible (push) Failing after 18m4s
2026-03-22 22:55:55 +00:00
561cd67b0c Enable Hetzner CCM and CSI for cloud provider integration
Some checks failed
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 3m21s
- Enable --kubelet-arg=cloud-provider=external on all nodes (control planes and workers)
- Activate CCM Kustomization with 10m timeout for Hetzner cloud-controller-manager
- Activate CSI Kustomization with dependsOn CCM and 10m timeout for hcloud-csi
- Update deploy workflow to wait for CCM/CSI readiness (600s timeout)
- Add providerID verification to post-deploy health checks

This enables proper cloud provider integration with Hetzner CCM for node
labeling and Hetzner CSI for persistent volume provisioning.
2026-03-22 22:26:21 +00:00
4eebbca648 docs: update README for deferred observability baseline
All checks were successful
Deploy Cluster / Terraform (push) Successful in 1m41s
Deploy Cluster / Ansible (push) Successful in 5m37s
2026-03-22 01:04:53 +00:00
7b5d794dfc fix: update health checks for deferred observability
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:04:27 +00:00
8643bbfc12 fix: defer observability to get clean baseline
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:03:55 +00:00
84f446c2e6 fix: restore observability timeouts to 5 minutes
Some checks failed
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 8m38s
2026-03-22 00:43:37 +00:00
d446e86ece fix: use static grafana password, remove externalsecret dependency
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 00:43:21 +00:00
90c7f565e0 fix: remove tailscale ingress dependencies from observability
Some checks failed
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-22 00:42:35 +00:00
989848fa89 fix: increase observability timeouts to 10 minutes
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Failing after 13m54s
2026-03-21 19:34:43 +00:00
56e5807474 fix: create doppler ClusterSecretStore after ESO is installed
Some checks failed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 8m31s
2026-03-21 19:19:43 +00:00
df0511148c fix: unsuspend tailscale operator for stable baseline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-21 19:03:39 +00:00
894e6275b1 docs: update stable baseline to defer ccm/csi
Some checks failed
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 8m35s
2026-03-21 18:41:36 +00:00
a01cf435d4 fix: skip ccm/csi waits for stable baseline - using k3s embedded
Some checks failed
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-21 18:40:53 +00:00
84f77c4a68 fix: use kubectl patch instead of apply for flux controller nodeSelector
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 9m41s
2026-03-21 18:05:41 +00:00
2e4196688c fix: bootstrap flux in phases - crds first, then resources
Some checks failed
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m19s
2026-03-21 17:42:39 +00:00
71 changed files with 1284 additions and 240 deletions

View File

@@ -88,8 +88,11 @@ jobs:
}
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
- name: Terraform Plan
id: plan
@@ -227,6 +230,7 @@ jobs:
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
-e "cluster_name=k8s-cluster"
env:
@@ -237,6 +241,12 @@ jobs:
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Install flux CLI
run: |
curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.5.1/flux_2.5.1_linux_amd64.tar.gz | tar xz -C /tmp
mv /tmp/flux /usr/local/bin/flux
chmod +x /usr/local/bin/flux
- name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform
run: |
@@ -255,31 +265,150 @@ jobs:
--from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f -
kubectl apply -k clusters/prod/flux-system
# Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established
kubectl wait --for=condition=Established crd --all --timeout=120s
# Then apply custom resources
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
# Patch Flux controllers to run on cp-1 only
kubectl -n flux-system patch deployment source-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment helm-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system patch deployment notification-controller --type='merge' -p='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"}}}}}'
kubectl -n flux-system rollout status deployment/source-controller --timeout=180s
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=180s
kubectl -n flux-system rollout status deployment/helm-controller --timeout=180s
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=180s
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=300s
# Create Doppler ClusterSecretStore now that ESO CRDs are available
kubectl apply -f - <<'EOF'
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
# Wait for CCM and CSI (Hetzner cloud integration)
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
- name: Wait for Rancher and backup operator
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
echo "Waiting for Rancher..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
kubectl wait --for=condition=Ready helmrelease/rancher -n flux-system --timeout=300s
echo "Waiting for rancher-backup operator..."
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
- name: Restore Rancher from latest B2 backup
env:
KUBECONFIG: outputs/kubeconfig
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
run: |
echo "Finding latest backup in B2..."
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
import json,sys
resp = json.load(sys.stdin)
bid = resp.get('allowed', {}).get('bucketId')
if bid:
print(bid)
else:
print('')
")
if [ -z "$BUCKET_ID" ]; then
echo "Restricted B2 key - resolving bucket ID by name..."
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
fi
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
| python3 -c "
import json,sys
files = json.load(sys.stdin).get('files', [])
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
if not tars:
print('NONE')
else:
tars.sort()
print(tars[-1])
")
if [ "$LATEST" = "NONE" ]; then
echo "No backups found in B2. Skipping restore."
exit 0
fi
BACKUP_FILE=$(basename "$LATEST")
echo "Latest backup: ${BACKUP_FILE}"
echo "Creating Restore CR..."
kubectl apply -f - <<EOF
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
name: restore-from-b2
namespace: cattle-resources-system
spec:
backupFilename: ${BACKUP_FILE}
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
EOF
echo "Waiting for restore to complete..."
for i in $(seq 1 60); do
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
echo " Restore status: ${STATUS} - ${MESSAGE}"
if [ "$STATUS" = "True" ]; then
echo "Restore completed successfully!"
exit 0
fi
sleep 10
done
echo "Restore did not complete within timeout. Continuing anyway."
- name: Post-deploy cluster health checks
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get pvc"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get svc kube-prometheus-stack-grafana kube-prometheus-stack-prometheus"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability describe svc kube-prometheus-stack-grafana"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"

View File

@@ -16,13 +16,101 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNPG is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNPG namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup pod to complete..."
kubectl wait --for=jsonpath='{.status.phase}'=Succeeded pod/pgdump-manual -n cnpg-cluster --timeout=300s || true
kubectl logs pod/pgdump-manual -n cnpg-cluster || true
kubectl delete pod pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4

AGENTS.md (new file, 144 lines)
View File

@@ -0,0 +1,144 @@
# AGENTS.md
Repository guide for agentic contributors working in this repo.
## Scope
- Infrastructure repo for a Hetzner + k3s + Flux stack running Rancher.
- Primary areas: `terraform/`, `ansible/`, `clusters/`, `infrastructure/`, `apps/`, `.gitea/workflows/`.
- Treat `README.md` and `STABLE_BASELINE.md` as user-facing context, but prefer current manifests and workflows as source of truth.
- Keep changes small and reviewable; prefer the narrowest file set that solves the task.
## Architecture
- **Terraform** provisions Hetzner servers, network, firewall, load balancer, SSH keys.
- **Ansible** bootstraps OS, installs k3s (with external cloud provider), deploys Hetzner CCM, Tailscale, Doppler token.
- **Flux** reconciles all cluster addons from this repo after Ansible hands off.
- **Rancher** stores state in embedded etcd (NOT an external DB). Backup/restore uses the `rancher-backup` operator to B2.
- **cert-manager** is required — Tailscale LoadBalancer does L4 TCP passthrough, so Rancher serves its own TLS.
- **Secrets flow**: Doppler → `ClusterSecretStore` (doppler-hetznerterra) → `ExternalSecret` resources → k8s Secrets.
- Rancher is reachable only over Tailscale at `https://rancher.silverside-gopher.ts.net/`.
- Grafana, Prometheus, and Flux UI are also exposed via dedicated Tailscale LoadBalancer services at `http://grafana.silverside-gopher.ts.net/`, `http://prometheus.silverside-gopher.ts.net/`, `http://flux.silverside-gopher.ts.net:9001/`.
## Important Files
- `terraform/main.tf` — provider and version pins
- `terraform/variables.tf` — input surface and defaults
- `terraform/firewall.tf` — firewall rules (tailnet CIDR, internal cluster ports)
- `ansible/site.yml` — ordered bootstrap playbook (roles: common → k3s-server → ccm → k3s-agent → doppler → tailscale-cleanup)
- `ansible/generate_inventory.py` — renders `ansible/inventory.ini` from Terraform outputs via Jinja2
- `clusters/prod/flux-system/` — Flux GitRepository and top-level Kustomization resources
- `infrastructure/addons/kustomization.yaml` — root addon graph with dependency ordering
- `infrastructure/addons/<addon>/` — each addon is a self-contained dir with its own `kustomization.yaml`
- `.gitea/workflows/deploy.yml` — canonical CI: terraform → ansible → flux bootstrap → B2 restore → health checks
## Build / Validate / Test
### Terraform
- Format: `terraform -chdir=terraform fmt -recursive`
- Check formatting: `terraform -chdir=terraform fmt -check -recursive`
- Validate: `terraform -chdir=terraform validate`
- Plan (full): `terraform -chdir=terraform plan -var-file=../terraform.tfvars`
- Plan one resource: `terraform -chdir=terraform plan -var-file=../terraform.tfvars -target=hcloud_server.control_plane[0]`
- Apply: `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
- State inspection: `terraform -chdir=terraform state list` / `terraform state show <address>`
### Ansible
- Install collections: `ansible-galaxy collection install -r ansible/requirements.yml`
- Generate inventory: `cd ansible && python3 generate_inventory.py` (requires Terraform outputs)
- Syntax check: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`
- Dry-run one host: `ansible-playbook -i ansible/inventory.ini ansible/site.yml --check --diff -l control_plane[0]`
- Full bootstrap: `ansible-playbook ansible/site.yml`
- Targeted: `ansible-playbook ansible/site.yml -t upgrade` or `-t reset`
- Dashboards only: `ansible-playbook ansible/dashboards.yml`
### Python
- Syntax check: `python3 -m py_compile ansible/generate_inventory.py`
- Run: `cd ansible && python3 generate_inventory.py`
### Kubernetes / Flux manifests
- Render single addon: `kubectl kustomize infrastructure/addons/<addon>`
- Render cluster bootstrap: `kubectl kustomize clusters/prod/flux-system`
- Validate only the directory you edited, not the whole repo.
### Kubeconfig refresh
- Preferred: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
- Manual: `ssh -i ~/.ssh/infra root@<cp1-ip> "cat /etc/rancher/k3s/k3s.yaml" | sed 's/127.0.0.1/<cp1-ip>/g' > outputs/kubeconfig`
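The manual rewrite above is a plain string substitution: k3s writes its kubeconfig pointing at `127.0.0.1`, and the `sed` swaps in a reachable address. An equivalent sketch (the IP is a placeholder):

```python
def rewrite_kubeconfig(text: str, cp_ip: str) -> str:
    # k3s emits server: https://127.0.0.1:6443; replace the loopback
    # address with the reachable control-plane IP, like the sed one-liner.
    return text.replace("127.0.0.1", cp_ip)

raw = "server: https://127.0.0.1:6443"
print(rewrite_kubeconfig(raw, "203.0.113.10"))  # server: https://203.0.113.10:6443
```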
## Code Style
### General
- Match existing style in adjacent files. No new tools/frameworks unless the repo already uses them.
- Prefer ASCII. Keep diffs minimal. No unrelated cleanup.
- No comments unless the logic is non-obvious.
### Terraform / HCL
- 2-space indent. `terraform {}` block first, then providers, locals, variables, resources, outputs.
- `snake_case` for variables, locals, resources. Descriptions on all variables/outputs.
- `sensitive = true` on secrets. Run `terraform fmt` instead of hand-formatting.
- Use `locals` for reused or non-trivial logic. Explicit `depends_on` only when required.
### Ansible / YAML
- 2-space YAML indent. Descriptive task names in sentence case.
- Idempotent tasks: `changed_when: false` and `failed_when: false` for probes.
- `command`/`shell` only when no dedicated module fits. `shell` only for pipes/redirection/heredocs.
- `when` guards and `default(...)` filters over duplicated tasks.
- Role names and filenames: kebab-case. Variables: snake_case.
- Multi-line shell in workflows: `set -e` or `set -euo pipefail` for fail-fast.
### Kubernetes / Flux YAML
- One object per file. Kebab-case filenames matching repo patterns: `helmrelease-*.yaml`, `kustomization-*.yaml`, `*-externalsecret.yaml`.
- Addon manifests live in `infrastructure/addons/<addon>/` with a `kustomization.yaml`.
- Flux graph objects in `clusters/prod/flux-system/`.
- Each addon gets a `kustomization-<addon>.yaml` entry in `infrastructure/addons/` with `dependsOn` for ordering.
- Quote strings with `:`, `*`, cron expressions, or shell-sensitive chars.
- Preserve existing labels/annotations unless the change specifically needs them.
### Python
- PEP 8. Imports ordered: stdlib, third-party, local. `snake_case` for functions/variables.
- Scripts small and explicit. Exit non-zero on failure. Clear subprocess error handling.
## Known Issues & Workarounds
- **rancher-backup post-install job** (`rancher-backup-patch-sa`) uses a postRenderer in the HelmRelease to replace the broken `rancher/kuberlr-kubectl` image with `rancher/kubectl`. Do NOT set `s3` block in HelmRelease values — put S3 config in the Backup CR instead.
- **B2 ExternalSecret** must use key names `accessKey` and `secretKey` (not `aws_access_key_id`/`aws_secret_access_key`).
- **Stale Tailscale devices**: After cluster rebuild, delete stale offline `rancher` devices before booting. The `tailscale-cleanup` Ansible role handles this via the Tailscale API.
- **Restricted B2 keys**: `b2_authorize_account` may return `allowed.bucketId: null`. CI falls back to `b2_list_buckets` to resolve bucket ID by name.
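For the B2 key-name pitfall above, a hedged sketch of the expected `ExternalSecret` shape; the metadata and Doppler remote key names (`B2_ACCOUNT_ID`, `B2_APPLICATION_KEY`) are illustrative assumptions, while the `secretKey` values are the part that must match exactly:

```yaml
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: rancher-b2-creds            # assumed name/namespace for illustration
  namespace: cattle-resources-system
spec:
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra
  target:
    name: rancher-b2-creds
  data:
    - secretKey: accessKey          # must be exactly "accessKey"
      remoteRef:
        key: B2_ACCOUNT_ID          # Doppler key name is an assumption
    - secretKey: secretKey          # must be exactly "secretKey"
      remoteRef:
        key: B2_APPLICATION_KEY     # Doppler key name is an assumption
```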
## Secrets / Security
- Never commit tokens, passwords, kubeconfigs, private keys, or generated secrets.
- Runtime secrets via Gitea secrets (CI), Doppler, or External Secrets Operator.
- `terraform.tfvars` and `outputs/` are gitignored. Never print secret values in logs or commits.
## CI Pipeline (`.gitea/workflows/deploy.yml`)
1. Terraform: fmt check → init → validate → import existing servers → plan → apply (main only)
2. Ansible: install deps → generate inventory → run site.yml with extra vars (secrets injected from Gitea)
3. Flux bootstrap: install kubectl/flux → rewrite kubeconfig → apply CRDs → apply graph → wait for addons
4. Rancher wait: wait for Rancher and backup operator to be ready
5. B2 restore: authorize B2 → find latest backup → create Restore CR → poll until ready
6. Health checks: nodes, Flux objects, pods, storage class
## Editing Practices
- Read target file and adjacent patterns before editing.
- Run the narrowest validation command after edits.
- If you make a live-cluster workaround, also update the declarative manifests so Flux can own it.
- Changes spanning Terraform + Ansible + Flux: update and verify each layer separately.
- Check `git status` before and after changes.
## Cursor / Copilot Rules
- No `.cursor/rules/`, `.cursorrules`, or `.github/copilot-instructions.md` files exist.
- If added later, mirror their guidance here and treat them as authoritative.

View File

@@ -11,7 +11,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
| **Total Cost** | €28.93/mo |
| **K8s** | k3s (latest, HA) |
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
| **Access** | SSH/API restricted to Tailnet |
| **Access** | SSH/API and Rancher UI restricted to Tailnet |
| **Bootstrap** | Terraform + Ansible |
### Cluster Resources
@@ -234,10 +234,17 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons include `addon-ccm`, `addon-csi`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`, `addon-observability`, and `addon-observability-content`.
- Active Flux addons for stable baseline: `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-external-secrets`.
- Deferred addons: `addon-ccm`, `addon-csi`, `addon-observability`, `addon-observability-content` (to be added after baseline is stable).
- Ansible is limited to cluster bootstrap, private-access setup, and prerequisite secret creation for Flux-managed addons.
- `addon-flux-ui` is optional for the stable-baseline phase and is not a blocker for rebuild success.
### Rancher access
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/dashboard/`.
- The public Hetzner load balancer path is not used for Rancher.
- Rancher uses the CNPG-backed PostgreSQL cluster in `cnpg-cluster`.
### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention:
@@ -245,12 +252,13 @@ A rebuild is considered successful only when all of the following pass without m
- Terraform create succeeds for the default `1` control plane and `2` workers.
- Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`.
- `hcloud-cloud-controller-manager` and `hcloud-csi` are `Ready`.
- Required External Secrets sync successfully.
- Tailscale private access works.
- Grafana and Prometheus are reachable privately.
- Flux core reconciliation is healthy.
- External Secrets Operator is ready.
- Tailscale operator is ready.
- Terraform destroy succeeds cleanly or succeeds after workflow retries.
_Note: Observability stack (Grafana/Prometheus) is deferred and will be added once the core platform baseline is stable._
## Observability Stack
Flux deploys a lightweight observability stack in the `observability` namespace:

View File

@@ -4,25 +4,32 @@ This document defines the current engineering target for this repository.
## Topology
- 1 control plane
- 2 workers
- 3 control planes (HA etcd cluster)
- 3 workers
- Hetzner Load Balancer for Kubernetes API
- private Hetzner network
- Tailscale operator access
- Rancher UI exposed only through Tailscale (`rancher.silverside-gopher.ts.net`)
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap
- Ansible k3s bootstrap with external cloud provider
- **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API**
- **Hetzner CCM deployed via Ansible (before workers join)**
- **Hetzner CSI for persistent volumes (via Flux)**
- Flux core reconciliation
- Hetzner CCM
- Hetzner CSI
- External Secrets Operator with Doppler
- Tailscale private access
- Observability stack
- Persistent volume provisioning validated
## Deferred for Later Phases
- Observability stack (deferred - complex helm release needs separate debugging)
## Out of Scope
- HA control plane
- public ingress or DNS
- public TLS
- app workloads
@@ -31,17 +38,28 @@ This document defines the current engineering target for this repository.
## Phase Gates
1. Terraform apply completes for the default topology.
2. k3s server bootstrap completes and kubeconfig works.
3. Workers join and all nodes are Ready.
4. Flux source and infrastructure reconciliation are healthy.
5. CCM is Ready.
6. CSI is Ready and a PVC can bind.
7. External Secrets sync required secrets.
8. Tailscale private access works.
9. Observability is healthy and reachable privately.
10. Terraform destroy succeeds cleanly or via workflow retry.
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
2. Load Balancer is healthy with all 3 control plane targets.
3. Primary control plane bootstraps with `--cluster-init`.
4. Secondary control planes join via Load Balancer endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets.
12. Tailscale private access works, including Rancher UI access.
13. Terraform destroy succeeds cleanly or via workflow retry.
## Success Criteria
The baseline is considered stable only after two consecutive fresh rebuilds pass all phase gates with no manual fixes.
**ACHIEVED** - HA Cluster with CCM/CSI:
- Build 1: Initial CCM/CSI deployment and validation (2026-03-23)
- Build 2: Full destroy/rebuild cycle successful (2026-03-23)
🔄 **IN PROGRESS** - HA Control Plane Validation:
- Build 3: Deploy 3-3 topology with Load Balancer
- Build 4: Destroy/rebuild to validate HA configuration
Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes.

View File

@@ -32,6 +32,7 @@ def main():
worker_names = outputs["worker_names"]["value"]
worker_ips = outputs["worker_ips"]["value"]
worker_private_ips = outputs["worker_private_ips"]["value"]
kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])
control_planes = [
{
@@ -59,6 +60,7 @@ def main():
"control_planes": control_planes,
"workers": workers,
"private_key_file": outputs["ssh_private_key_path"]["value"],
"kube_api_lb_ip": kube_api_lb_ip,
}
env = Environment(loader=FileSystemLoader("."))

View File

@@ -17,3 +17,4 @@ ansible_user=root
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest
kube_api_endpoint={{ kube_api_lb_ip }}

View File

@@ -3,3 +3,5 @@ collections:
version: ">=2.4.0"
- name: community.general
version: ">=8.0.0"
- name: community.network
version: ">=5.0.0"

View File

@@ -0,0 +1,82 @@
---
- name: Check if hcloud secret exists
command: kubectl -n kube-system get secret hcloud
register: hcloud_secret_check
changed_when: false
failed_when: false
- name: Fail if hcloud secret is missing
fail:
msg: "hcloud secret not found in kube-system namespace. CCM requires it."
when: hcloud_secret_check.rc != 0
- name: Check if helm is installed
command: which helm
register: helm_check
changed_when: false
failed_when: false
- name: Install helm
when: helm_check.rc != 0
block:
- name: Download helm install script
get_url:
url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
dest: /tmp/get-helm-3.sh
mode: "0755"
- name: Run helm install script
command: /tmp/get-helm-3.sh
args:
creates: /usr/local/bin/helm
- name: Add Hetzner Helm repository
kubernetes.core.helm_repository:
name: hcloud
repo_url: https://charts.hetzner.cloud
kubeconfig: /etc/rancher/k3s/k3s.yaml
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Deploy Hetzner Cloud Controller Manager
kubernetes.core.helm:
name: hcloud-cloud-controller-manager
chart_ref: hcloud/hcloud-cloud-controller-manager
release_namespace: kube-system
create_namespace: true
values:
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: "{{ inventory_hostname }}"
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
kubeconfig: /etc/rancher/k3s/k3s.yaml
wait: true
wait_timeout: 300s
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Wait for CCM to be ready
command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
changed_when: false
register: ccm_rollout
until: ccm_rollout.rc == 0
retries: 3
delay: 10
- name: Pause to ensure CCM is fully ready to process new nodes
pause:
seconds: 10
- name: Verify CCM is removing uninitialized taints
command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
register: uninitialized_taints
changed_when: false
failed_when: false
- name: Display taint status
debug:
msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"

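The "Verify CCM is removing uninitialized taints" task above extracts any remaining `node.cloudprovider.kubernetes.io/uninitialized` taints via jsonpath. The same check against `kubectl get nodes -o json` output can be sketched as follows (a minimal sketch; the node names and sample JSON are hypothetical):

```python
import json

# Trimmed sample of the structure `kubectl get nodes -o json` returns (hypothetical nodes).
nodes_json = json.loads("""
{"items": [
  {"metadata": {"name": "cp-1"},
   "spec": {"taints": [{"key": "node.cloudprovider.kubernetes.io/uninitialized",
                        "value": "true", "effect": "NoSchedule"}]}},
  {"metadata": {"name": "worker-1"}, "spec": {}}
]}
""")

UNINIT = "node.cloudprovider.kubernetes.io/uninitialized"

def uninitialized_nodes(nodes):
    """Return names of nodes still carrying the CCM's uninitialized taint."""
    return [
        item["metadata"]["name"]
        for item in nodes["items"]
        if any(t.get("key") == UNINIT for t in item.get("spec", {}).get("taints", []))
    ]

print(uninitialized_nodes(nodes_json))  # -> ['cp-1']
```

An empty result means the CCM has initialized every node; a non-empty list points at nodes the CCM has not yet processed.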

@@ -3,4 +3,4 @@ k3s_version: latest
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: false
k3s_kubelet_cloud_provider_external: true


@@ -22,6 +22,7 @@
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args:
creates: /usr/local/bin/k3s-agent


@@ -5,4 +5,12 @@ k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: false
k3s_kubelet_cloud_provider_external: true
# Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
tailscale_control_plane_names:
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
- "k8s-cluster-cp-3.silverside-gopher.ts.net"


@@ -15,9 +15,9 @@
set_fact:
k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}"
- name: Wait for primary API on 6443 (secondary only)
- name: Wait for API endpoint on 6443 (secondary only)
wait_for:
host: "{{ k3s_primary_ip }}"
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
port: 6443
state: started
timeout: 120
@@ -61,12 +61,15 @@
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }}
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -81,9 +84,10 @@
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--server https://{{ k3s_primary_ip }}:6443
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}


@@ -1,58 +0,0 @@
---
- name: Create systemd unit for Grafana private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-grafana.service
mode: "0644"
vars:
unit_description: Port-forward Grafana for Tailscale access
unit_namespace: observability
unit_target: svc/observability-kube-prometheus-stack-grafana
unit_local_port: 13080
unit_remote_port: 80
- name: Create systemd unit for Prometheus private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-prometheus.service
mode: "0644"
vars:
unit_description: Port-forward Prometheus for Tailscale access
unit_namespace: observability
unit_target: svc/observability-kube-prometh-prometheus
unit_local_port: 19090
unit_remote_port: 9090
- name: Create systemd unit for Flux UI private access
template:
src: kubectl-port-forward.service.j2
dest: /etc/systemd/system/k8s-portforward-flux-ui.service
mode: "0644"
vars:
unit_description: Port-forward Flux UI for Tailscale access
unit_namespace: flux-system
unit_target: svc/flux-system-weave-gitops
unit_local_port: 19001
unit_remote_port: 9001
- name: Reload systemd
systemd:
daemon_reload: true
- name: Enable and start private access port-forward services
systemd:
name: "{{ item }}"
enabled: true
state: started
loop:
- k8s-portforward-grafana.service
- k8s-portforward-prometheus.service
- k8s-portforward-flux-ui.service
- name: Configure Tailscale Serve for private access endpoints
shell: >-
tailscale serve reset &&
tailscale serve --bg --tcp={{ private_access_grafana_port }} tcp://127.0.0.1:13080 &&
tailscale serve --bg --tcp={{ private_access_prometheus_port }} tcp://127.0.0.1:19090 &&
tailscale serve --bg --tcp={{ private_access_flux_port }} tcp://127.0.0.1:19001
changed_when: true


@@ -1,13 +0,0 @@
[Unit]
Description={{ unit_description }}
After=network-online.target k3s.service
Wants=network-online.target
[Service]
Type=simple
Restart=always
RestartSec=5
ExecStart=/usr/local/bin/kubectl -n {{ unit_namespace }} port-forward --address 127.0.0.1 {{ unit_target }} {{ unit_local_port }}:{{ unit_remote_port }}
[Install]
WantedBy=multi-user.target


@@ -0,0 +1,53 @@
---
- name: Delete stale Tailscale devices with reserved hostnames
block:
- name: Get Tailscale devices from API
uri:
url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
method: GET
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
return_content: true
register: ts_devices
- name: Find stale devices matching reserved hostnames
set_fact:
stale_devices: >-
{{ ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| rejectattr('online', 'defined')
| list
+
ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| selectattr('online', 'defined')
| rejectattr('online', 'equalto', true)
| list }}
- name: Delete stale devices
uri:
url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
method: DELETE
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
status_code: 200
loop: "{{ stale_devices }}"
loop_control:
label: "{{ item.name }} ({{ item.id }})"
when: stale_devices | length > 0
- name: Report cleaned devices
debug:
msg: "Deleted stale Tailscale device: {{ item.name }}"
loop: "{{ stale_devices }}"
when: stale_devices | length > 0
- name: No stale devices found
debug:
msg: "No stale Tailscale devices found."
when: stale_devices | length == 0
when:
- tailscale_api_key is defined
- tailscale_api_key | length > 0

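The `set_fact` in the cleanup role concatenates two selections: reserved-hostname devices with no `online` field at all, and those whose `online` field is anything but `true`. The equivalent logic in Python (a sketch; the sample device records and tailnet names are hypothetical):

```python
reserved = {"rancher"}

# Shape of entries in the Tailscale API's /devices response (hypothetical data).
devices = [
    {"id": "1", "name": "rancher.example.ts.net", "hostname": "rancher", "online": False},
    {"id": "2", "name": "rancher-old.example.ts.net", "hostname": "rancher"},  # no "online" field
    {"id": "3", "name": "rancher-live.example.ts.net", "hostname": "rancher", "online": True},
    {"id": "4", "name": "grafana.example.ts.net", "hostname": "grafana", "online": False},
]

def stale_devices(devices, reserved):
    """Reserved-hostname devices that are not known to be online."""
    return [
        d for d in devices
        if d.get("hostname") in reserved and d.get("online") is not True
    ]

print([d["id"] for d in stale_devices(devices, reserved)])  # -> ['1', '2']
```

Only devices that hold a reserved hostname while offline (or of unknown status) are deleted, so the live device keeps its name across rebuilds.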

@@ -24,6 +24,7 @@
k3s_primary_public_ip: "{{ ansible_host }}"
k3s_primary_ip: "{{ k3s_private_ip }}"
k3s_node_ip: "{{ k3s_private_ip }}"
# kube_api_endpoint is set in inventory group_vars
roles:
- k3s-server
@@ -49,6 +50,20 @@
dest: ../outputs/kubeconfig
flat: true
- name: Bootstrap addon prerequisite secrets
hosts: control_plane[0]
become: true
roles:
- addon-secrets-bootstrap
- name: Deploy Hetzner CCM (required for workers with external cloud provider)
hosts: control_plane[0]
become: true
roles:
- ccm-deploy
- name: Setup secondary control planes
hosts: control_plane[1:]
become: true
@@ -59,6 +74,8 @@
k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
k3s_node_ip: "{{ k3s_private_ip }}"
# Use Load Balancer for HA - all control planes join via LB endpoint
k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"
roles:
- k3s-server
@@ -69,19 +86,13 @@
vars:
k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443"
# Use Load Balancer for HA - workers join via LB endpoint
k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
k3s_node_ip: "{{ k3s_private_ip }}"
roles:
- k3s-agent
- name: Bootstrap addon prerequisite secrets
hosts: control_plane[0]
become: true
roles:
- addon-secrets-bootstrap
- name: Deploy observability stack
hosts: control_plane[0]
become: true
@@ -98,17 +109,6 @@
- role: observability-content
when: not (observability_gitops_enabled | default(true) | bool)
- name: Configure private tailnet access
hosts: control_plane[0]
become: true
vars:
private_access_grafana_port: 30080
private_access_prometheus_port: 30990
private_access_flux_port: 30901
roles:
- private-access
- name: Bootstrap Doppler access for External Secrets
hosts: control_plane[0]
become: true
@@ -116,13 +116,23 @@
roles:
- doppler-bootstrap
- name: Clean up stale Tailscale devices
hosts: localhost
connection: local
vars:
tailscale_reserved_hostnames:
- rancher
roles:
- tailscale-cleanup
- name: Finalize
hosts: localhost
connection: local
tasks:
- name: Update kubeconfig server address
command: |
sed -i 's/127.0.0.1/{{ groups["control_plane"][0] }}.{{ tailscale_tailnet }}/g' ../outputs/kubeconfig
sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
changed_when: true
- name: Display success message


@@ -0,0 +1,34 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: cert-manager
chart:
spec:
chart: cert-manager
version: "v1.17.2"
sourceRef:
kind: HelmRepository
name: jetstack
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
crds:
enabled: true
replicaCount: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 250m
memory: 256Mi


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: jetstack
namespace: flux-system
spec:
interval: 1h
url: https://charts.jetstack.io


@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-cert-manager.yaml
- helmrelease-cert-manager.yaml


@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager
labels:
kustomize.toolkit.fluxcd.io/prune: disabled


@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: flux-tailscale
namespace: flux-system
annotations:
tailscale.com/hostname: flux
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: weave-gitops
app.kubernetes.io/instance: flux-system-weave-gitops
ports:
- name: http
port: 9001
protocol: TCP
targetPort: http


@@ -27,9 +27,12 @@ spec:
adminUser:
create: true
createClusterRole: true
createSecret: false
createSecret: false # Secret is managed by External Secret from Doppler
username: admin
rbac:
create: true
impersonationResourceNames:
- admin
viewSecretsResourceNames:
- cluster-user-auth
- oidc-auth


@@ -1,19 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: flux-ui
namespace: flux-system
annotations:
traefik.ingress.kubernetes.io/router.entrypoints: flux
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: flux-system-weave-gitops
port:
number: 9001


@@ -4,5 +4,4 @@ resources:
- cluster-user-auth-externalsecret.yaml
- gitrepository-weave-gitops.yaml
- helmrelease-weave-gitops.yaml
- traefik-helmchartconfig-flux-entrypoint.yaml
- ingress-flux-ui.yaml
- flux-tailscale-service.yaml


@@ -1,9 +0,0 @@
apiVersion: helm.cattle.io/v1
kind: HelmChartConfig
metadata:
name: traefik
namespace: kube-system
spec:
valuesContent: |-
additionalArguments:
- "--entryPoints.flux.address=:9001/tcp"


@@ -11,5 +11,5 @@ spec:
name: platform
path: ./infrastructure/addons/ccm
wait: true
timeout: 5m
suspend: true
timeout: 10m
suspend: false


@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-cert-manager
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/cert-manager
wait: true
timeout: 10m
suspend: false


@@ -13,5 +13,5 @@ spec:
dependsOn:
- name: addon-ccm
wait: true
timeout: 5m
suspend: true
timeout: 10m
suspend: false


@@ -12,6 +12,8 @@ spec:
path: ./infrastructure/addons/flux-ui
dependsOn:
- name: addon-external-secrets
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
wait: true
timeout: 5m
suspend: false


@@ -12,6 +12,8 @@ spec:
path: ./infrastructure/addons/observability
dependsOn:
- name: addon-external-secrets
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
wait: true
timeout: 5m
suspend: false


@@ -0,0 +1,16 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-backup-config
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-backup-config
timeout: 5m
suspend: false
dependsOn:
- name: addon-rancher-backup


@@ -0,0 +1,18 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-backup
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-backup
wait: true
timeout: 10m
suspend: false
dependsOn:
- name: addon-external-secrets
- name: addon-rancher


@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-config
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-config
dependsOn:
- name: addon-rancher
wait: true
timeout: 5m
suspend: false


@@ -0,0 +1,20 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher
wait: true
timeout: 15m
suspend: false
dependsOn:
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
- name: addon-external-secrets
- name: addon-cert-manager


@@ -12,4 +12,4 @@ spec:
path: ./infrastructure/addons/tailscale-operator
wait: true
timeout: 5m
suspend: true
suspend: false


@@ -14,4 +14,4 @@ spec:
- name: addon-tailscale-operator
wait: true
timeout: 5m
suspend: true
suspend: false


@@ -4,8 +4,14 @@ resources:
- kustomization-ccm.yaml
- kustomization-csi.yaml
- kustomization-external-secrets.yaml
- kustomization-flux-ui.yaml
- kustomization-cert-manager.yaml
- kustomization-tailscale-operator.yaml
- kustomization-tailscale-proxyclass.yaml
- traefik
- kustomization-flux-ui.yaml
- kustomization-observability.yaml
- kustomization-observability-content.yaml
- kustomization-rancher.yaml
- kustomization-rancher-config.yaml
- kustomization-rancher-backup.yaml
- kustomization-rancher-backup-config.yaml


@@ -1,17 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: grafana
namespace: observability
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /grafana
pathType: Prefix
backend:
service:
name: observability-kube-prometheus-stack-grafana
port:
number: 80


@@ -0,0 +1,18 @@
apiVersion: v1
kind: Service
metadata:
name: grafana-tailscale
namespace: observability
annotations:
tailscale.com/hostname: grafana
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: grafana
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3000


@@ -26,12 +26,10 @@ spec:
enabled: true
admin:
existingSecret: grafana-admin-credentials
userKey: admin-user
passwordKey: admin-password
grafana.ini:
server:
root_url: http://observability/grafana/
serve_from_sub_path: true
root_url: http://grafana.silverside-gopher.ts.net/
serve_from_sub_path: false
persistence:
enabled: true
storageClassName: local-path
@@ -51,8 +49,8 @@ spec:
service:
type: ClusterIP
prometheusSpec:
externalUrl: http://observability/prometheus/
routePrefix: /prometheus/
externalUrl: http://prometheus.silverside-gopher.ts.net/
routePrefix: /
retention: 7d
storageSpec:
volumeClaimTemplate:


@@ -3,11 +3,10 @@ kind: Kustomization
resources:
- namespace.yaml
- grafana-admin-externalsecret.yaml
- traefik-tailscale-service.yaml
- grafana-ingress.yaml
- prometheus-ingress.yaml
- helmrepository-prometheus-community.yaml
- helmrepository-grafana.yaml
- helmrelease-kube-prometheus-stack.yaml
- helmrelease-loki.yaml
- helmrelease-promtail.yaml
- grafana-tailscale-service.yaml
- prometheus-tailscale-service.yaml


@@ -1,17 +0,0 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: prometheus
namespace: observability
spec:
ingressClassName: traefik
rules:
- http:
paths:
- path: /prometheus
pathType: Prefix
backend:
service:
name: observability-kube-prometh-prometheus
port:
number: 9090


@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: prometheus-tailscale
namespace: observability
annotations:
tailscale.com/hostname: prometheus
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: prometheus
operator.prometheus.io/name: observability-kube-prometh-prometheus
ports:
- name: http
port: 9090
protocol: TCP
targetPort: 9090


@@ -1,27 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: traefik-tailscale
namespace: kube-system
annotations:
tailscale.com/hostname: observability
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/instance: traefik-kube-system
app.kubernetes.io/name: traefik
ports:
- name: web
port: 80
protocol: TCP
targetPort: web
- name: websecure
port: 443
protocol: TCP
targetPort: websecure
- name: flux
port: 9001
protocol: TCP
targetPort: 9001


@@ -0,0 +1,17 @@
apiVersion: resources.cattle.io/v1
kind: Backup
metadata:
name: rancher-b2-recurring
namespace: cattle-resources-system
spec:
resourceSetName: rancher-resource-set-full
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
schedule: "0 3 * * *"
retentionCount: 7


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- backup-recurring.yaml
- restore-from-b2.yaml


@@ -0,0 +1,19 @@
# Uncomment and set backupFilename to restore from a specific backup on rebuild.
# Find the latest backup filename in the rancher-backups/ folder in B2.
# After restore succeeds, Rancher will have all users/settings from the backup.
#
# apiVersion: resources.cattle.io/v1
# kind: Restore
# metadata:
# name: restore-from-b2
# namespace: cattle-resources-system
# spec:
# backupFilename: rancher-b2-manual-test-0a416444-2c8a-4d34-8a07-d9e406750374-2026-03-30T00-08-02Z.tar.gz
# storageLocation:
# s3:
# credentialSecretName: rancher-b2-creds
# credentialSecretNamespace: cattle-resources-system
# bucketName: HetznerTerra
# folder: rancher-backups
# endpoint: s3.us-east-005.backblazeb2.com
# region: us-east-005

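Restoring requires picking the newest backup filename out of the B2 folder. Since the backup names end in an ISO-like timestamp, the newest one can be selected lexicographically by that stamp — a minimal sketch (the object names below are hypothetical examples in the same pattern):

```python
import re

# Hypothetical object names in the pattern the recurring Backup produces.
backups = [
    "rancher-backups/rancher-b2-recurring-2026-03-28T03-00-00Z.tar.gz",
    "rancher-backups/rancher-b2-recurring-2026-03-30T03-00-01Z.tar.gz",
    "rancher-backups/rancher-b2-recurring-2026-03-29T03-00-00Z.tar.gz",
]

STAMP = re.compile(r"(\d{4}-\d{2}-\d{2}T\d{2}-\d{2}-\d{2})Z\.tar\.gz$")

def latest_backup(names):
    """Pick the newest backup; the embedded timestamps sort lexicographically."""
    return max(names, key=lambda n: STAMP.search(n).group(1))

print(latest_backup(backups))
```

The selected name (minus the folder prefix) is what goes into `backupFilename` in the Restore manifest above.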

@@ -0,0 +1,25 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: rancher-b2-creds
namespace: cattle-resources-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: rancher-b2-creds
creationPolicy: Owner
template:
type: Opaque
data:
accessKey: "{{ .B2_ACCOUNT_ID }}"
secretKey: "{{ .B2_APPLICATION_KEY }}"
data:
- secretKey: B2_ACCOUNT_ID
remoteRef:
key: B2_ACCOUNT_ID
- secretKey: B2_APPLICATION_KEY
remoteRef:
key: B2_APPLICATION_KEY


@@ -0,0 +1,23 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher-backup-crd
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-resources-system
chart:
spec:
chart: rancher-backup-crd
version: "106.0.2+up8.1.0"
sourceRef:
kind: HelmRepository
name: rancher-charts
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3


@@ -0,0 +1,42 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher-backup
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-resources-system
dependsOn:
- name: rancher-backup-crd
chart:
spec:
chart: rancher-backup
version: "106.0.2+up8.1.0"
sourceRef:
kind: HelmRepository
name: rancher-charts
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
image:
repository: rancher/backup-restore-operator
kubectl:
image:
repository: rancher/kubectl
tag: "v1.34.0"
postRenderers:
- kustomize:
patches:
- target:
kind: Job
name: rancher-backup-patch-sa
patch: |
- op: replace
path: /spec/template/spec/containers/0/image
value: rancher/kubectl:v1.34.0

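The `postRenderers` entry applies a single RFC 6902 `replace` op to the hook Job's container image at render time, before Flux applies the manifests. The effect of that op can be sketched as follows (the Job document is a trimmed, hypothetical rendering of the chart's hook):

```python
# Trimmed, hypothetical rendering of the chart's post-install hook Job.
job = {
    "kind": "Job",
    "metadata": {"name": "rancher-backup-patch-sa"},
    "spec": {"template": {"spec": {"containers": [
        {"name": "kubectl", "image": "rancher/kuberlr-kubectl"},
    ]}}},
}

def json_patch_replace(doc, path, value):
    """Apply an RFC 6902 'replace': walk the JSON Pointer, set the final element."""
    parts = [p for p in path.split("/") if p]
    node = doc
    for part in parts[:-1]:
        node = node[int(part)] if isinstance(node, list) else node[part]
    last = parts[-1]
    if isinstance(node, list):
        node[int(last)] = value
    else:
        node[last] = value
    return doc

json_patch_replace(job, "/spec/template/spec/containers/0/image", "rancher/kubectl:v1.34.0")
print(job["spec"]["template"]["spec"]["containers"][0]["image"])  # -> rancher/kubectl:v1.34.0
```

Patching at render time means the fix survives chart upgrades without forking the chart or skipping its hooks.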

@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: rancher-charts
namespace: flux-system
spec:
interval: 1h
url: https://charts.rancher.io


@@ -0,0 +1,8 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-rancher-backup.yaml
- helmrelease-rancher-backup-crd.yaml
- helmrelease-rancher-backup.yaml
- b2-credentials-externalsecret.yaml


@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: cattle-resources-system


@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- server-url-setting.yaml


@@ -0,0 +1,5 @@
apiVersion: management.cattle.io/v3
kind: Setting
metadata:
name: server-url
value: https://rancher.silverside-gopher.ts.net


@@ -0,0 +1,48 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: rancher
namespace: flux-system
spec:
interval: 10m
targetNamespace: cattle-system
chart:
spec:
chart: rancher
version: "2.13.3"
sourceRef:
kind: HelmRepository
name: rancher-stable
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
hostname: rancher.silverside-gopher.ts.net
replicas: 1
extraEnv:
- name: CATTLE_PROMETHEUS_METRICS
value: "true"
resources:
requests:
cpu: 500m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: DoesNotExist
valuesFrom:
- kind: Secret
name: rancher-bootstrap-password
valuesKey: bootstrapPassword
targetPath: bootstrapPassword


@@ -0,0 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: rancher-stable
namespace: flux-system
spec:
interval: 1h
url: https://releases.rancher.com/server-charts/stable


@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrepository-rancher.yaml
- helmrelease-rancher.yaml
- rancher-bootstrap-password-flux-externalsecret.yaml
- rancher-bootstrap-password-externalsecret.yaml
- rancher-tailscale-service.yaml


@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: cattle-system


@@ -0,0 +1,21 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: rancher-bootstrap-password
namespace: cattle-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: rancher-bootstrap-password
creationPolicy: Owner
template:
type: Opaque
data:
bootstrapPassword: "{{ .rancherBootstrapPassword }}"
data:
- secretKey: rancherBootstrapPassword
remoteRef:
key: RANCHER_BOOTSTRAP_PASSWORD


@@ -0,0 +1,21 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: rancher-bootstrap-password
namespace: flux-system
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: rancher-bootstrap-password
creationPolicy: Owner
template:
type: Opaque
data:
bootstrapPassword: "{{ .RANCHER_BOOTSTRAP_PASSWORD }}"
data:
- secretKey: RANCHER_BOOTSTRAP_PASSWORD
remoteRef:
key: RANCHER_BOOTSTRAP_PASSWORD


@@ -0,0 +1,22 @@
apiVersion: v1
kind: Service
metadata:
name: rancher-tailscale
namespace: cattle-system
annotations:
tailscale.com/hostname: rancher
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app: cattle-system-rancher
ports:
- name: http
port: 80
protocol: TCP
targetPort: 80
- name: https
port: 443
protocol: TCP
targetPort: 443


@@ -0,0 +1,38 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: traefik
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: traefik
version: "39.0.0"
sourceRef:
kind: HelmRepository
name: traefik
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
additionalArguments:
- "--entryPoints.flux.address=:9001/tcp"
- "--entryPoints.rancher.address=:9442/tcp"
service:
type: NodePort
ports:
web:
nodePort: 31097
websecure:
nodePort: 30193
rancher:
port: 9442
exposedPort: 9442
protocol: TCP


@@ -0,0 +1,9 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: traefik
namespace: flux-system
spec:
interval: 10m
url: https://traefik.github.io/charts
provider: generic


@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-traefik.yaml
- helmrelease-traefik.yaml

scripts/refresh-kubeconfig.sh Executable file

@@ -0,0 +1,33 @@
#!/usr/bin/env bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
KUBECONFIG_PATH="$REPO_ROOT/outputs/kubeconfig"
SSH_KEY="${SSH_KEY:-$HOME/.ssh/infra}"
CP1_PUBLIC_IP="${1:-}"
if [ -z "$CP1_PUBLIC_IP" ]; then
if [ -f "$REPO_ROOT/ansible/inventory.ini" ]; then
CP1_PUBLIC_IP=$(grep -A2 '\[control_plane\]' "$REPO_ROOT/ansible/inventory.ini" | grep -oP '\d+\.\d+\.\d+\.\d+' | head -1)
fi
fi
if [ -z "$CP1_PUBLIC_IP" ]; then
echo "Usage: $0 <control-plane-1-public-ip>"
echo " Or ensure ansible/inventory.ini exists with control plane IPs."
exit 1
fi
echo "Fetching kubeconfig from $CP1_PUBLIC_IP ..."
ssh -i "$SSH_KEY" \
-o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \
"root@$CP1_PUBLIC_IP" "cat /etc/rancher/k3s/k3s.yaml" \
| sed "s/127.0.0.1/$CP1_PUBLIC_IP/g" \
> "$KUBECONFIG_PATH"
chmod 600 "$KUBECONFIG_PATH"
echo "Kubeconfig saved to $KUBECONFIG_PATH"
echo "Run: export KUBECONFIG=$KUBECONFIG_PATH"


@@ -89,6 +89,22 @@ resource "hcloud_firewall" "cluster" {
}
}
rule {
description = "HTTP from Load Balancer"
direction = "in"
protocol = "tcp"
port = "80"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "HTTPS from Load Balancer"
direction = "in"
protocol = "tcp"
port = "443"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "ICMP"
direction = "in"

terraform/loadbalancer.tf Normal file

@@ -0,0 +1,50 @@
# Load Balancer for Kubernetes API High Availability
# Provides a single endpoint for all control planes
resource "hcloud_load_balancer" "kube_api" {
name = "${var.cluster_name}-api"
load_balancer_type = "lb11" # Cheapest tier: €5.39/month
location = var.location
labels = {
cluster = var.cluster_name
role = "kube-api"
}
}
# Attach Load Balancer to private network (required for use_private_ip)
resource "hcloud_load_balancer_network" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 5) # 10.0.1.5
}
# Attach all control plane servers as targets
resource "hcloud_load_balancer_target" "kube_api_targets" {
count = var.control_plane_count
type = "server"
load_balancer_id = hcloud_load_balancer.kube_api.id
server_id = hcloud_server.control_plane[count.index].id
use_private_ip = true
depends_on = [hcloud_load_balancer_network.kube_api, hcloud_server.control_plane]
}
# Kubernetes API service on port 6443
resource "hcloud_load_balancer_service" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
protocol = "tcp"
listen_port = 6443
destination_port = 6443
health_check {
protocol = "tcp"
port = 6443
interval = 15
timeout = 10
retries = 3
}
}
# Firewall rule to allow LB access to control planes on 6443
# This is added to the existing cluster firewall

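In the load balancer attachment above, `cidrhost(var.subnet_cidr, 5)` computes the fifth host address in the subnet. The same calculation with Python's `ipaddress` module (a sketch, assuming a `10.0.1.0/24` subnet as the inline comment suggests):

```python
import ipaddress

def cidrhost(cidr: str, hostnum: int) -> str:
    """Terraform-style cidrhost: host number offset from the network address."""
    net = ipaddress.ip_network(cidr)
    return str(net.network_address + hostnum)

print(cidrhost("10.0.1.0/24", 5))  # -> 10.0.1.5
```

Pinning the LB's private IP this way gives the cluster a stable join endpoint that survives recreating individual control plane servers.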

@@ -63,3 +63,8 @@ output "kubeconfig_command" {
description = "Command to fetch kubeconfig"
value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig"
}
output "kube_api_lb_ip" {
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
value = hcloud_load_balancer_network.kube_api.ip
}


@@ -25,7 +25,7 @@ variable "cluster_name" {
variable "control_plane_count" {
description = "Number of control plane nodes"
type = number
default = 1
default = 3
}
variable "control_plane_type" {
@@ -37,7 +37,7 @@ variable "control_plane_type" {
variable "worker_count" {
description = "Number of worker nodes"
type = number
default = 2
default = 3
}
variable "worker_type" {