320 Commits

Author SHA1 Message Date
micqdf e9327b0c61 fix: stop preloading observability images everywhere
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Failing after 54m12s
2026-05-01 07:52:35 +00:00
micqdf cf49f8bf03 fix: make observability image seeding best effort
Deploy Cluster / Terraform (push) Successful in 33s
Deploy Cluster / Ansible (push) Failing after 1h9m9s
2026-04-30 21:02:20 +00:00
micqdf d57e8c8fe8 fix: reset tailscale helm release directly
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 33m39s
2026-04-30 20:25:48 +00:00
micqdf 93a2a42917 fix: simplify tailscale operator health gate
Deploy Cluster / Terraform (push) Successful in 33s
Deploy Cluster / Ansible (push) Failing after 40m18s
2026-04-30 19:34:33 +00:00
micqdf 5cf68771dd fix: wait longer for flux health reconciles
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 41m42s
2026-04-30 17:26:16 +00:00
micqdf 6d6e3e8371 fix: import runner image archives during prepull
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 46m46s
2026-04-30 09:08:44 +00:00
micqdf 353a408dac fix: support bullseye pip in gitea runner
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 18m1s
2026-04-30 07:57:55 +00:00
micqdf b3612083ad fix: disable terraform wrapper in gitea workflows
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 25s
2026-04-30 07:54:48 +00:00
micqdf 8c0dbd997d fix: simplify terraform deploy job lifecycle
Deploy Cluster / Terraform (push) Failing after 32s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-30 07:52:49 +00:00
micqdf 3a975a323c fix: remove deploy pr comment post hook
Deploy Cluster / Terraform (push) Failing after 33s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-30 07:49:15 +00:00
micqdf d126de4dc4 fix: align terraform ci version with provider lock
Deploy Cluster / Terraform (push) Failing after 37s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-30 07:45:10 +00:00
micqdf a33a993867 fix: harden cluster rebuild determinism
Deploy Grafana Content / Grafana Content (push) Failing after 1m14s
Deploy Cluster / Terraform (push) Failing after 4m59s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-30 07:36:27 +00:00
micqdf f52e657f9f docs 2026-04-30 07:03:21 +00:00
micqdf f49b08f50c fix: reinstall k3s on version drift
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 33m40s
2026-04-30 06:03:53 +00:00
micqdf 327bb860b7 fix: pin k3s below rancher limit
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 35m0s
2026-04-30 05:23:37 +00:00
micqdf fd5451a5ef fix: wait for ssh before gathering facts
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 1h13m38s
2026-04-30 03:44:13 +00:00
micqdf 7333cb2780 test
Deploy Cluster / Terraform (push) Successful in 2m5s
Deploy Cluster / Ansible (push) Failing after 35m41s
2026-04-30 02:59:47 +00:00
micqdf feecf97cd5 test
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Failing after 1m43s
2026-04-30 02:43:30 +00:00
micqdf b5bcec2663 fix: use kubeconfig for observability reset
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 33m59s
2026-04-27 02:28:38 +00:00
micqdf 0ad56405ee fix: seed grafana observability images
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 31m26s
2026-04-27 01:50:41 +00:00
micqdf d050e8962a fix: seed cert-manager images before flux
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 1h25m21s
2026-04-27 00:04:19 +00:00
micqdf d925eeac3f fix: remove Rancher backup workflow
Deploy Cluster / Terraform (push) Successful in 1m33s
Deploy Cluster / Ansible (push) Failing after 54m21s
2026-04-26 22:13:20 +00:00
micqdf 2bde45e106 fix: allow intentional destroy without backup
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-26 22:01:39 +00:00
micqdf 50752ca4b0 fix: allow initial deploy without Rancher backup
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Successful in 22m37s
2026-04-26 21:27:57 +00:00
micqdf a2ed9555c0 fix: vendor critical bootstrap charts
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 20m0s
2026-04-26 21:01:01 +00:00
micqdf 14462dd870 fix: avoid resetting healthy observability
Deploy Cluster / Terraform (push) Successful in 33s
Deploy Cluster / Ansible (push) Successful in 23m12s
2026-04-26 20:25:42 +00:00
micqdf 0625eee297 fix: uninstall failed observability upgrades
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 42m47s
2026-04-26 18:46:07 +00:00
micqdf 2dc4ab6329 fix: make observability image seeding non-fatal
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 46m33s
2026-04-26 12:34:02 +00:00
micqdf bbec0dfff4 fix: skip traefik in observability seeding
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 18m5s
2026-04-26 12:06:41 +00:00
micqdf 6de826e030 fix: allow cached OCI chart artifacts
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 18m47s
2026-04-26 11:44:24 +00:00
micqdf bdba2b7af2 fix: defer observability image seeding
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Failing after 23m53s
2026-04-26 11:13:22 +00:00
micqdf 499a3462e7 fix: seed observability dependencies
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-26 10:32:25 +00:00
micqdf daf6ccd0e4 fix: retry bootstrap image imports
Deploy Cluster / Terraform (push) Successful in 33s
Deploy Cluster / Ansible (push) Failing after 42m31s
2026-04-26 09:43:31 +00:00
micqdf a6a630000a fix: vendor Tailscale operator chart
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Failing after 23m49s
2026-04-26 09:17:44 +00:00
micqdf ff9e58d44f fix: remove NFS chart fetch dependency
Deploy Cluster / Terraform (push) Successful in 1m37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-26 07:48:11 +00:00
micqdf 8b94e4dd06 fix: import bootstrap images from runner
Deploy Cluster / Terraform (push) Successful in 1m40s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-26 06:13:37 +00:00
micqdf 547a29e000 fix: require kube-vip image archive
Deploy Cluster / Terraform (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-26 05:04:39 +00:00
micqdf 760f0482d4 fix: pass Proxmox delete params in query
Deploy Cluster / Terraform (push) Successful in 1m48s
Deploy Cluster / Ansible (push) Failing after 22m31s
2026-04-26 04:32:01 +00:00
micqdf 440e268e4f fix: seed kube-vip image from runner
Deploy Cluster / Terraform (push) Failing after 1m56s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-26 04:28:21 +00:00
micqdf 24851f5a9b fix: retry transient Proxmox apply failures
Deploy Cluster / Terraform (push) Successful in 1m39s
Deploy Cluster / Ansible (push) Failing after 22m17s
2026-04-26 04:02:14 +00:00
micqdf ded8efe7fb test
Deploy Cluster / Terraform (push) Failing after 1m39s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-26 03:52:54 +00:00
micqdf c10646d228 fix: harden tailnet smoke script
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 15m29s
2026-04-26 03:09:18 +00:00
micqdf 50d97209e6 fix: ignore Rancher Turtles cleanup hook pod
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Successful in 14m41s
2026-04-26 02:33:21 +00:00
micqdf 46b2ff7d19 fix: harden final health checks
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 17m50s
2026-04-26 02:14:02 +00:00
micqdf a4f1d179e9 fix: use Rancher registry for webhook image
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 26m36s
2026-04-26 01:35:16 +00:00
micqdf 9879de5a86 fix: stop pre-pulling Rancher child images
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Failing after 11m1s
2026-04-26 00:57:49 +00:00
micqdf 195e9bce25 fix: parallelize Rancher child image warmup
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Failing after 23m46s
2026-04-26 00:02:12 +00:00
micqdf 4796606432 fix: warm Rancher child images on all nodes
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 23:30:20 +00:00
micqdf b1eab6a0fa fix: vendor Rancher chart for bootstrap
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 23:08:26 +00:00
micqdf f3c96b65d2 fix: shorten Rancher chart retry windows
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Failing after 25m40s
2026-04-25 22:30:07 +00:00
micqdf c7a375758f fix: retry Rancher chart pulls during waits
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 22:03:09 +00:00
micqdf d0be48b65c fix: gate Tailscale addon on Helm release
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 36m36s
2026-04-25 21:21:34 +00:00
micqdf 40647318b4 fix: tolerate cached Helm repository artifacts
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 29m36s
2026-04-25 20:44:03 +00:00
micqdf cdb26904d2 fix: retry Tailscale chart pulls during bootstrap
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 27m40s
2026-04-25 20:11:43 +00:00
micqdf 3c06e046c2 fix: warm External Secrets image before install
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 21m10s
2026-04-25 19:46:21 +00:00
micqdf 17f1815e7f fix: use CRI pulls for Flux image warmup
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 15m3s
2026-04-25 19:28:29 +00:00
micqdf 66e86e55ea fix: require Flux image warmup before bootstrap
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 23m13s
2026-04-25 19:02:32 +00:00
micqdf 43df412243 fix: handle missing Proxmox VM config during cleanup
Deploy Cluster / Terraform (push) Successful in 1m41s
Deploy Cluster / Ansible (push) Failing after 44m51s
2026-04-25 17:40:51 +00:00
micqdf 383ef9e9ac fix: clean orphan Proxmox cloud-init volumes
Deploy Cluster / Terraform (push) Failing after 19s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-25 17:38:57 +00:00
micqdf 18abc5073b fix: keep concurrent Terraform apply
Deploy Cluster / Terraform (push) Failing after 1m28s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-25 17:30:59 +00:00
micqdf f8da2594ca fix: serialize Proxmox VM apply
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-04-25 17:27:59 +00:00
micqdf e0359f0097 tes
Deploy Cluster / Terraform (push) Failing after 1m26s
Deploy Cluster / Ansible (push) Has been skipped
2026-04-25 17:22:12 +00:00
micqdf 003333a061 fix: make health checks observe Flux readiness
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Successful in 11m14s
2026-04-25 03:52:43 +00:00
micqdf a6071c504b fix: point Promtail at Loki service
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 03:43:23 +00:00
micqdf 08123457f1 fix: ignore stale install hook pods in health check
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 03:41:00 +00:00
micqdf 757d88ed52 fix: use cached Promtail images when available
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 13m15s
2026-04-25 03:25:44 +00:00
micqdf 15defc686f fix: allow slow Promtail image pulls
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 03:10:47 +00:00
micqdf abb7578328 fix: run post-deploy checks with bash
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 12m17s
2026-04-25 02:42:54 +00:00
micqdf bc87a7ca43 fix: avoid immutable observability PVC changes
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 10m47s
2026-04-25 02:25:40 +00:00
micqdf 045880bdd6 fix: ignore stale Rancher helm operation pods
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 02:23:30 +00:00
micqdf bfcf57bcc5 fix: enforce post-deploy health checks
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-25 02:22:16 +00:00
micqdf 7e3ebec95b fix: wait for Rancher resources before rollout checks
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Successful in 17m31s
2026-04-25 01:54:21 +00:00
micqdf 0c31c3b1d5 fix: fail fast on stalled Flux Helm releases
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 10m33s
2026-04-25 01:40:42 +00:00
micqdf 5523feb563 fix: wait for Rancher Flux resources before rollout
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 39m43s
2026-04-25 00:59:16 +00:00
micqdf cafa2fa0b3 fix: reset stalled bootstrap Helm releases
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 9m5s
2026-04-25 00:48:33 +00:00
micqdf a7fd4c0b97 fix: wait on actual ESO deployment names
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 38m19s
2026-04-25 00:07:48 +00:00
micqdf e56a3a6c38 fix: wait for ESO webhook before ClusterSecretStore
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 10m13s
2026-04-24 23:13:03 +00:00
micqdf 7b2eca07ab fix: pull external-secrets chart from OCI
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 9m41s
2026-04-24 15:24:58 +00:00
micqdf 347ca041ba fix: reduce rerun bootstrap pre-pull delays
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 39m26s
2026-04-24 12:09:34 +00:00
micqdf 3f52bad854 fix: make Ansible reruns faster and idempotent
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-24 11:44:11 +00:00
micqdf c89c31adea fix: clean up Ansible bootstrap warnings
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-24 11:07:13 +00:00
micqdf 68b293efe4 fix: qualify Flux HelmChart bootstrap resources
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Has been cancelled
2026-04-24 10:47:13 +00:00
micqdf 1f465cc0c1 fix: force reconcile bootstrap Helm charts
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 15m37s
2026-04-24 10:17:49 +00:00
micqdf 6e22bd26b3 fix: wait directly on ESO Helm readiness
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 47m9s
2026-04-23 22:09:45 +00:00
micqdf 869880c152 fix: wait for ESO resources before CRD conditions
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 31m14s
2026-04-23 21:17:44 +00:00
micqdf 31e95eb227 fix: pre-pull Flux controllers before bootstrap rollout
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 16m39s
2026-04-23 20:36:57 +00:00
micqdf 12675417bd fix: use correct namespace and deployment name for ESO rollout check
Deploy Cluster / Terraform (push) Successful in 1m36s
Deploy Cluster / Ansible (push) Failing after 40m40s
The ESO deployment is named external-secrets-external-secrets in the
external-secrets namespace, not external-secrets in kube-system.
2026-04-23 19:00:15 +00:00
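A minimal sketch of the direct readiness checks these two entries describe, assuming the bootstrap runs kubectl from Ansible; the CRD name and timeouts are illustrative:

    - name: Wait for the ESO CRDs to be established (sketch)
      ansible.builtin.command: >
        kubectl wait --for=condition=Established
        crd/clustersecretstores.external-secrets.io --timeout=300s
      changed_when: false

    - name: Wait for the ESO deployment rollout (sketch)
      ansible.builtin.command: >
        kubectl -n external-secrets rollout status
        deployment/external-secrets-external-secrets --timeout=600s
      changed_when: false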
micqdf 8e081ddfda fix: wait on ESO deployment directly instead of Flux Kustomization status
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 19m8s
The addon-external-secrets Flux Kustomization was timing out during bootstrap
because image pulls on fresh Proxmox VMs are slow. The critical dependency is
the ESO deployment being available for the Doppler ClusterSecretStore. Replace
the Kustomization readiness check with direct checks for ESO CRD establishment
and deployment rollout, which are the actual prerequisites for the next step.
2026-04-23 07:32:19 +00:00
micqdf 4b7517c9c5 fix: health-check external-secrets addon via HelmRelease only
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 17m22s
The external-secrets Kustomization was still using wait=true, which makes Flux
hold the addon in a failed state when the HelmRepository has transient fetch
errors even though the HelmRelease and runtime controller deployments are
healthy. Switch it to an explicit HelmRelease health check like the other
helm-backed addons.
2026-04-23 07:11:21 +00:00
micqdf f9bc53723f fix: make image pre-pull roles fully best effort
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 22m46s
The pre-pull roles were still blocking the playbook because they retried until
success and exhausted their retry budget during registry TLS timeouts. Keep the
image pulls as opportunistic cache warmers, but never let them fail the
bootstrap; log any missed images instead.
2026-04-23 06:41:21 +00:00
micqdf ee6417c18e fix: pre-pull core bootstrap images on cp1 before Flux bootstrap
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Has been cancelled
Fresh clusters were repeatedly timing out while kubelet pulled the pause image,
k3s packaged component images, and Flux controller images onto the first
control plane. Pre-pull the core control-plane bootstrap images into
containerd on cp-1 so Flux and packaged addons start from a warm cache instead
of racing registry TLS timeouts.
2026-04-23 05:55:14 +00:00
micqdf 1156dc0203 fix: pre-pull kube-vip images before waiting for VIP
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 43m31s
The primary control plane was stalling because kubelet still had to pull both
the Rancher pause image and the kube-vip image before the DaemonSet pod could
become Ready. Pre-pull those images into containerd, extend the readiness wait,
and emit pod diagnostics if kube-vip still does not come up.
2026-04-23 03:55:52 +00:00
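A sketch of what the pre-pull could look like on the primary control plane, assuming k3s's bundled crictl; the image tags are illustrative:

    - name: Pre-pull kube-vip and pause images into containerd (sketch)
      ansible.builtin.command: k3s crictl pull {{ item }}
      loop:
        - ghcr.io/kube-vip/kube-vip:v0.8.9    # tag illustrative
        - rancher/mirrored-pause:3.6          # k3s pause image; tag illustrative
      register: prepull
      until: prepull is succeeded
      retries: 5
      delay: 20
      changed_when: false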
micqdf 4151027e01 fix: clean stale Tailscale node devices before bootstrap
Deploy Cluster / Terraform (push) Successful in 1m40s
Deploy Cluster / Ansible (push) Failing after 14m30s
Run the Tailscale cleanup role against the cluster hostnames before any node
reconnects to the tailnet. This removes stale offline cp/worker devices from
previous rebuilds so replacement VMs can reclaim their original hostnames
instead of getting -1 suffixes.
2026-04-23 03:25:17 +00:00
micqdf 9269e9df1b docs: add guide for deploying app repos to the cluster
Deploy Cluster / Terraform (push) Successful in 1m36s
Deploy Cluster / Ansible (push) Has been cancelled
Document the recommended two-repo model for application delivery, including
Flux attachment objects, Doppler/ExternalSecret wiring, Tailscale service
exposure, and the steps for enabling the suspended apps layer.
2026-04-23 02:43:00 +00:00
micqdf d9374bc209 fix: remove duplicate wait keys from helm addon kustomizations
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled
The repo-only Kustomization healthCheck change accidentally left the original
wait:true keys in the Rancher and Rancher backup Kustomizations, which broke
the infrastructure kustomize build. Remove the duplicate keys so Flux can
apply the HelmRelease-only health checks cleanly.
2026-04-23 02:20:57 +00:00
micqdf c570a476b5 fix: make helm-based addon kustomizations health-check HelmReleases only
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Has been cancelled
These addon Kustomizations were using wait=true, which made Flux treat transient
HelmRepository fetch timeouts as addon failures even when the HelmRelease and
runtime workloads were healthy. Switch the affected Kustomizations to explicit
HelmRelease healthChecks so readiness reflects the actual deployed platform
state instead of repository fetch flakiness.
2026-04-23 02:15:45 +00:00
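A minimal sketch of the Kustomization shape described here, with illustrative addon, path, and release names:

    apiVersion: kustomize.toolkit.fluxcd.io/v1
    kind: Kustomization
    metadata:
      name: addon-rancher              # illustrative addon name
      namespace: flux-system
    spec:
      interval: 10m
      path: ./infrastructure/rancher   # illustrative path
      prune: true
      sourceRef:
        kind: GitRepository
        name: flux-system
      wait: false                      # stop gating on every managed object
      healthChecks:
        - apiVersion: helm.toolkit.fluxcd.io/v2
          kind: HelmRelease
          name: rancher
          namespace: cattle-system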
micqdf a7f11ccf94 fix: give Rancher more time to pass startup probe during upgrades
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Successful in 18m59s
Rancher needs longer than the chart default 2-minute startup probe budget on
this cluster while it restores local catalogs and finishes API startup. Extend
the startup probe failure threshold so Helm upgrades can complete instead of
restarting the new pod before it becomes ready.
2026-04-23 01:44:25 +00:00
micqdf a7d540ca65 fix: stop forcing Flux releases during deploy bootstrap
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Successful in 21m12s
Remove the HelmRelease reset/force annotations from the deploy workflow now
that the cluster can converge on its own. The runtime waits remain, but CI no
longer re-triggers Rancher and NFS churn on every bootstrap attempt.
2026-04-23 00:35:31 +00:00
micqdf 098bd98876 fix: wait on Rancher and storage runtime objects during bootstrap
Deploy Cluster / Terraform (push) Successful in 26s
Deploy Cluster / Ansible (push) Failing after 25m19s
Flux can leave HelmRelease and Kustomization conditions stale after transient
chart fetch or image pull failures even when the underlying workloads recover.
Switch the deploy workflow to wait on the concrete runtime resources we care
about: the NFS provisioner deployment and StorageClass, Rancher deployment,
webhook, cert-manager issuer/certificate, and the rancher-backup deployment.
2026-04-22 18:41:09 +00:00
micqdf 55d7b8201e fix: make Rancher image pre-pull best effort and disable managed SUC
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 32m19s
Docker Hub TLS handshakes are too flaky to make pre-pulling a hard bootstrap
requirement. Treat image pre-pull as opportunistic and disable Rancher's
managed system-upgrade-controller feature so that image is removed from the
critical install path while Rancher and its webhook converge.
2026-04-22 11:33:13 +00:00
micqdf 9c0523e880 fix: pre-pull Rancher images and reset Rancher release during bootstrap
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 27m30s
Rancher installs were stalling on transient Docker Hub TLS handshake timeouts
for rancher shell, webhook, and system-upgrade-controller images. Pre-pull the
required images onto all nodes after k3s comes up, extend the Rancher HelmRelease
timeout, and reset/force the Rancher HelmRelease before waiting on addon-rancher
so bootstrap can recover from stale failed remediation state.
2026-04-22 11:00:54 +00:00
micqdf 8372d562ad fix: reset and force nfs helmrelease during bootstrap
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 20m22s
When the NFS storage HelmRelease has already entered a failed remediation state,
a plain reconcile request is not enough to clear the stale failure counters.
Send requestedAt, resetAt, and forceAt together so helm-controller retries the
release cleanly before the workflow waits on addon-nfs-storage.
2026-04-22 10:35:32 +00:00
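A sketch of setting those annotations together, assuming the Flux v2.2+ reconcile annotation keys and an illustrative release name and namespace:

    - name: Reset and force the stalled NFS HelmRelease (sketch)
      vars:
        reconcile_ts: "{{ ansible_date_time.iso8601 }}"
      ansible.builtin.command: >
        kubectl -n storage annotate --overwrite
        helmrelease/nfs-provisioner
        reconcile.fluxcd.io/requestedAt={{ reconcile_ts }}
        reconcile.fluxcd.io/resetAt={{ reconcile_ts }}
        reconcile.fluxcd.io/forceAt={{ reconcile_ts }}
      changed_when: true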
micqdf 1bb11dfe3a fix: force nfs storage reconcile during flux bootstrap
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 19m0s
The NFS HelmRelease can remain in a failed state from an earlier bootstrap
attempt even after the backing NFS export is corrected and the pod becomes
healthy. Request a fresh reconcile of the HelmRelease and addon kustomization
before waiting on addon-nfs-storage so the bootstrap step can observe the
recovered state.
2026-04-22 10:08:20 +00:00
micqdf 624cd5aab6 fix: point NFS provisioner at active Proxmox host export
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 18m51s
The cluster nodes can reach the exported NFS path on 10.27.27.239, not
10.27.27.22. Update the storage addon and repo note so the NFS provisioner
mounts the live export and Flux health checks can converge.
2026-04-22 09:46:01 +00:00
micqdf 71bdc6a709 fix: extend Flux bootstrap timeouts on fresh clusters
Deploy Cluster / Terraform (push) Successful in 26s
Deploy Cluster / Ansible (push) Failing after 18m44s
Fresh Proxmox clusters need longer for the Flux controller rollouts and first
GitRepository/Kustomization reconciliations, especially while images are still
being pulled onto the control plane. Increase the bootstrap wait windows so CI
does not fail while the controllers are still converging.
2026-04-22 08:36:27 +00:00
micqdf 714f20417b fix: tolerate control-plane taint when pinning Flux to cp1
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 10m19s
Flux bootstrap patches the controllers onto k8s-cluster-cp-1, but the
control-plane node is tainted NoSchedule. Add the matching toleration in both
the checked-in patch manifest and the bootstrap workflow so the controllers can
actually schedule and roll out on cp-1.
2026-04-22 05:05:15 +00:00
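A sketch of the scheduling fields the patch adds to each Flux controller Deployment; the hostname is the one named in this entry, the toleration is standard control-plane scheduling config:

    spec:
      template:
        spec:
          nodeSelector:
            kubernetes.io/hostname: k8s-cluster-cp-1
          tolerations:
            - key: node-role.kubernetes.io/control-plane
              operator: Exists
              effect: NoSchedule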
micqdf c32bec34bc fix: quote kube-vip readiness jsonpath in bootstrap role
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 10m11s
The local kube-vip readiness probe used an unquoted jsonpath predicate,
which made kubectl treat Ready as an identifier instead of a string. Use a
quoted jsonpath via shell so bootstrap can detect the primary kube-vip pod
properly before waiting on the API VIP.
2026-04-22 04:41:48 +00:00
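A sketch of the quoted-jsonpath probe, run via the shell module so the inner quotes survive; the kube-vip label selector is an assumption:

    - name: Check kube-vip readiness on the primary control plane (sketch)
      ansible.builtin.shell: >
        kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip
        --field-selector spec.nodeName=k8s-cluster-cp-1
        -o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}'
      register: kube_vip_ready
      until: kube_vip_ready.stdout == "True"
      retries: 30
      delay: 10
      changed_when: false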
micqdf 6519a7673d fix: wait for kube-vip on primary node during bootstrap
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 9m11s
The kube-vip DaemonSet is applied before the secondary control planes join,
so waiting for a full DaemonSet rollout blocks bootstrap on nodes that do not
exist in the cluster yet. Wait only for the primary node's kube-vip pod and
then verify the VIP is reachable on 6443.
2026-04-22 04:29:29 +00:00
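The follow-on VIP check can be a plain TCP wait; a sketch with an illustrative variable name for the VIP address:

    - name: Verify the API VIP answers on 6443 (sketch)
      ansible.builtin.wait_for:
        host: "{{ kube_vip_address }}"   # cluster VIP, illustrative variable name
        port: 6443
        timeout: 300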
micqdf d1c31cdb91 fix: rely on k3s service readiness instead of installer exit code
Deploy Cluster / Terraform (push) Successful in 27s
Deploy Cluster / Ansible (push) Failing after 8m9s
The k3s install script can return non-zero while systemd is still bringing the
service up, especially on worker agents. Do not fail immediately on the
installer command; wait for the service to become active and only emit
install diagnostics if the later readiness check fails.
2026-04-22 04:14:31 +00:00
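A sketch of the tolerate-installer-exit-code pattern, assuming the install script is already on the node and illustrative variable names:

    - name: Run the k3s install script without failing on its exit code (sketch)
      ansible.builtin.command: /tmp/k3s-install.sh
      environment:
        INSTALL_K3S_EXEC: "{{ k3s_server_args | default('') }}"
      register: k3s_install
      failed_when: false

    - name: Wait for the k3s service to become active
      ansible.builtin.command: systemctl is-active k3s
      register: k3s_active
      until: k3s_active.stdout == "active"
      retries: 30
      delay: 10
      changed_when: false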
micqdf b3e88712bd fix: derive cluster network interface from host facts
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 12m32s
The Proxmox Ubuntu clones are exposing their primary NIC as eth0, not ens18.
Use ansible_default_ipv4.interface for k3s flannel and kube-vip so bootstrap
tracks the actual interface name instead of a guessed template default.
2026-04-22 03:50:03 +00:00
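A sketch of the fact-derived wiring, with illustrative variable names:

    # group_vars sketch: track the detected default interface instead of a template guess
    cluster_interface: "{{ ansible_default_ipv4.interface }}"
    cluster_node_ip: "{{ ansible_default_ipv4.address }}"

    # consumed by the k3s server role and the kube-vip manifest template
    k3s_flannel_iface: "{{ cluster_interface }}"
    kube_vip_interface: "{{ cluster_interface }}"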
micqdf 06366ee5e6 fix: accept cloud-init exit code 2 after first boot
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 6m2s
Ubuntu cloud-init returns exit code 2 for some completed boots even when the
status output is 'done'. Treat that as a successful wait state so Ansible can
continue into the package install phase instead of aborting early.
2026-04-22 03:40:55 +00:00
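The accepted exit codes can be expressed directly in the wait task; a sketch:

    - name: Wait for cloud-init to finish first boot
      ansible.builtin.command: cloud-init status --wait
      register: cloud_init_status
      # cloud-init may exit 2 ("done, with recoverable errors") even when status is done
      failed_when: cloud_init_status.rc not in [0, 2]
      changed_when: false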
micqdf 9a2d213114 fix: wait for cloud-init before package install during bootstrap
Deploy Cluster / Terraform (push) Successful in 29s
Deploy Cluster / Ansible (push) Failing after 2m36s
Fresh Ubuntu cloud-init clones still hold apt and dpkg locks during first boot,
which caused the Ansible common role to fail before the control plane could
finish bootstrap. Wait for cloud-init, increase apt lock timeouts, and skip the
final kubeconfig rewrite when no kubeconfig was fetched yet.
2026-04-22 03:34:53 +00:00
micqdf 9482a0f551 fix: skip clone storage override for linked Proxmox clones
Deploy Cluster / Terraform (push) Successful in 1m43s
Deploy Cluster / Ansible (push) Failing after 6m24s
The bpg/proxmox provider rejects clone.datastore_id when creating linked
clones. Only pass the target datastore when full clones are enabled so the
linked-clone baseline can provision from template 9000 successfully.
2026-04-22 03:22:50 +00:00
micqdf 5c53b8e06e fix: normalize Proxmox endpoint and stop dashboards self-trigger
Deploy Cluster / Terraform (push) Failing after 53s
Deploy Cluster / Ansible (push) Has been skipped
Accept Proxmox API endpoints with or without /api2/json in CI and local
tfvars, and avoid running the dashboards workflow just because its own
workflow file changed during platform migrations.
2026-04-22 03:13:22 +00:00
micqdf b1dae28aa5 feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s
Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox
VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap,
Flux addons, CI workflows, and docs to target the new private Proxmox
baseline while preserving the existing Tailscale, Doppler, Flux, Rancher,
and B2 backup flows.
2026-04-22 03:02:13 +00:00
micqdf 6c6b9d20ca update README
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-04-22 01:14:21 +00:00
micqdf c3a2f25c94 docs: record validated Rancher restore drill
Deploy Cluster / Terraform (push) Successful in 2m11s
Deploy Cluster / Ansible (push) Successful in 10m9s
Update the baseline to treat Rancher backup and restore validation as part
of the accepted platform state, and capture the successful live drill run
performed on 2026-04-18.
2026-04-18 21:27:42 +00:00
micqdf 7385c2263e fix: add tailnet smoke checks and move Tailscale operator to stable
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m55s
Add a post-deploy smoke test that validates Tailscale DNS, proxy readiness,
reachability, and service responses for Rancher, Grafana, and Prometheus.
Move the operator to the stable Helm repo/version and align the baseline docs
with the current HA private-only architecture.
2026-04-18 19:59:13 +00:00
micqdf 60f466ab98 remove Weave GitOps addon
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m37s
Drop the Flux UI addon and its Tailscale exposure because the UI lags the
current Flux APIs and reports misleading HelmRelease errors. Keep Flux managed
through the controllers themselves and use Rancher or the flux CLI for access.
2026-04-18 18:44:55 +00:00
micqdf b20356e9fe fix: only clean stale Tailscale names before proxies exist
Deploy Cluster / Terraform (push) Failing after 51s
Deploy Cluster / Ansible (push) Has been skipped
The Tailscale cleanup role was deleting reserved service hostnames on later
deploy runs, which removed the live Rancher/Grafana/Prometheus/Flux proxy
nodes from the tailnet. Skip cleanup whenever the current cluster already has
those Tailscale services, while still allowing cleanup on fresh rebuilds.
2026-04-18 18:16:27 +00:00
micqdf 2ba6b6a896 fix: remove unused Flux CLI install from deploy workflow
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m40s
The deploy pipeline never uses the flux binary after installation, so the
GitHub release download only adds a flaky failure point. Remove the step and
keep the bootstrap path kubectl-only.
2026-04-18 17:45:59 +00:00
micqdf 9126de1423 fix: Align Prometheus external URL with Tailscale service port
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 4m52s
Prometheus is exposed on port 9090 through the Tailscale LoadBalancer
service, so the configured external URL and repo docs should match the
actual address users reach after rebuilds.
2026-04-18 17:11:16 +00:00
micqdf 4532b9ed74 chore: trigger rebuild
Deploy Cluster / Terraform (push) Successful in 2m8s
Deploy Cluster / Ansible (push) Successful in 12m54s
2026-04-18 06:09:54 +00:00
micqdf 68dbd2e5b7 fix: Reserve Tailscale service hostnames and tag exposed proxies
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 6m3s
Reserve grafana/prometheus/flux alongside rancher during rebuild cleanup so
stale tailnet devices do not force -1 hostnames. Tag the exposed Tailscale
services so operator-managed proxies are provisioned with explicit prod/service
tags from the tailnet policy.
2026-04-18 05:48:26 +00:00
micqdf ceefcc3b29 cleanup: Remove obsolete port-forwarding, deferred Traefik files, and CI workaround
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 13m9s
- Remove ansible/roles/private-access/ (replaced by Tailscale LB services)
- Remove deferred observability ingress/traefik files (replaced by direct Tailscale LBs)
- Remove orphaned kustomization-traefik-config.yaml (no backing directory)
- Simplify CI: remove SA patch + job deletion workaround for rancher-backup
  (now handled by postRenderer in HelmRelease)
- Update AGENTS.md to reflect current architecture
2026-04-02 01:21:23 +00:00
micqdf 0d339b3163 fix: Use rancher/kubectl image for rancher-backup hook
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m41s
bitnami/kubectl:1.34 tag doesn't exist. rancher/kubectl is already
available in the cluster's image cache.
2026-04-02 01:00:27 +00:00
micqdf 30ccf13c82 fix: Use postRenderer to replace broken kuberlr-kubectl image in rancher-backup hook
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook hardcodes rancher/kuberlr-kubectl which
can't download kubectl. Use Flux postRenderers to patch the job image
to bitnami/kubectl at render time.
2026-04-02 00:51:50 +00:00
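A sketch of the postRenderer shape on the rancher-backup HelmRelease; the hook job name and image tag are illustrative, and the entry above later swaps the image to rancher/kubectl:

    apiVersion: helm.toolkit.fluxcd.io/v2
    kind: HelmRelease
    metadata:
      name: rancher-backup
      namespace: cattle-resources-system
    spec:
      # chart/interval omitted
      postRenderers:
        - kustomize:
            patches:
              - target:
                  kind: Job
                  name: rancher-backup-patch-sa     # hook job name, illustrative
                patch: |
                  - op: replace
                    path: /spec/template/spec/containers/0/image
                    value: rancher/kubectl:v1.29.2  # tag illustrative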
micqdf 75e3604f30 fix: Skip post-install hooks for rancher-backup HelmRelease
Deploy Cluster / Terraform (push) Successful in 57s
Deploy Cluster / Ansible (push) Has been cancelled
The chart's post-install hook uses rancher/kuberlr-kubectl which fails
to download kubectl. The SA automountServiceAccountToken is managed
manually, so the hook is unnecessary.
2026-04-02 00:45:03 +00:00
micqdf e4235a6e58 fix: Correct Flux UI pod selector labels to match deployed weave-gitops labels
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 20m36s
Actual labels are app.kubernetes.io/name=weave-gitops and
app.kubernetes.io/instance=flux-system-weave-gitops.
2026-04-01 02:08:12 +00:00
micqdf ea2d534171 fix: Use admin.existingSecret for Grafana creds from Doppler
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 20m42s
Revert to idiomatic Grafana chart approach. ExternalSecret creates the
secret with admin-user/admin-password keys before Grafana's first start
on fresh cluster creation.
2026-04-01 01:41:49 +00:00
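A sketch of the wiring, assuming a Doppler ClusterSecretStore named doppler and illustrative secret and key names; the admin-user/admin-password keys match the Grafana chart defaults noted in this entry:

    apiVersion: external-secrets.io/v1beta1
    kind: ExternalSecret
    metadata:
      name: grafana-admin
      namespace: observability            # illustrative namespace
    spec:
      refreshInterval: 1h
      secretStoreRef:
        kind: ClusterSecretStore
        name: doppler
      target:
        name: grafana-admin
      data:
        - secretKey: admin-user
          remoteRef:
            key: GRAFANA_ADMIN_USER       # Doppler key name, illustrative
        - secretKey: admin-password
          remoteRef:
            key: GRAFANA_ADMIN_PASSWORD   # Doppler key name, illustrative

    # Grafana chart values fragment (nest under grafana: if the chart is kube-prometheus-stack)
    admin:
      existingSecret: grafana-admin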
micqdf a1b9fe6aa6 fix: Use Flux valuesFrom to inject Doppler Grafana creds as Helm values
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 20m38s
Switch from admin.existingSecret to valuesFrom so Flux reads the
Doppler-managed secret and injects credentials as standard Helm values.
2026-03-31 23:40:54 +00:00
micqdf 33765657ec fix: Correct pod selectors for Prometheus and Flux Tailscale services, use Doppler for Grafana creds
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 21m0s
Prometheus needs operator.prometheus.io/name label selector. Flux UI pods
are labeled gitops-server not weave-gitops. Grafana now reads admin creds
from Doppler via ExternalSecret instead of hardcoded values.
2026-03-31 22:54:57 +00:00
micqdf b8f64fa952 feat: Expose Grafana, Prometheus, and Flux UI via Tailscale LoadBalancer services
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 20m47s
Replace Ansible port-forwarding + tailscale serve with direct Tailscale LB
services matching the existing Rancher pattern. Each service gets its own
tailnet hostname (grafana/prometheus/flux.silverside-gopher.ts.net).
2026-03-31 08:53:28 +00:00
micqdf 569d741751 push
Deploy Cluster / Terraform (push) Successful in 2m37s
Deploy Cluster / Ansible (push) Successful in 25m37s
2026-03-31 02:46:55 +00:00
micqdf 89e53d9ec9 fix: Handle restricted B2 keys and safe JSON parsing in restore step
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 20m48s
2026-03-31 01:43:04 +00:00
micqdf 5a2551f40a fix: Fix flux CLI download URL - use correct GitHub URL with v prefix on version
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Failing after 21m52s
2026-03-30 03:11:40 +00:00
micqdf 8c7b62c024 feat: Automate Rancher backup restore in CI pipeline
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Failing after 6m28s
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
2026-03-30 01:56:29 +00:00
micqdf a1f07f863a docs: Update restore template with real Backup CR format
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m2s
Include actual restore CR spec and note the latest backup filename for reference.
2026-03-30 00:09:53 +00:00
micqdf 2c3a49c2e0 fix: Rename B2 secret keys to match rancher-backup operator expectations
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
The operator expects accessKey/secretKey, not aws_access_key_id/aws_secret_access_key.
2026-03-30 00:05:13 +00:00
micqdf a7ce3dcc1a fix: Remove s3 block from rancher-backup HelmRelease values
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m12s
The S3 config caused the operator to try downloading kubectl, which fails in the container.
S3 credentials are correctly configured in the Backup CR and ExternalSecret instead.
2026-03-29 23:47:21 +00:00
micqdf 0ab9418458 fix: Re-add HTTPS port to Tailscale LB for Rancher
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 6m6s
Rancher now manages its own TLS (no longer tls:external), so it serves
HTTPS on port 443. The Tailscale LoadBalancer needs to expose both
HTTP (80) and HTTPS (443) targeting the corresponding container ports.
2026-03-29 23:04:49 +00:00
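A sketch of the Service this entry describes, assuming the Tailscale operator's loadBalancerClass and hostname annotation; the selector assumes Rancher's default app label:

    apiVersion: v1
    kind: Service
    metadata:
      name: rancher-tailscale
      namespace: cattle-system
      annotations:
        tailscale.com/hostname: rancher
    spec:
      type: LoadBalancer
      loadBalancerClass: tailscale
      selector:
        app: rancher
      ports:
        - name: http
          port: 80
          targetPort: 80
        - name: https
          port: 443
          targetPort: 443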
micqdf c251672618 fix: Configure S3 bucketName for rancher-backup operator
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 23:01:18 +00:00
micqdf 89364e8f37 fix: Add dependsOn for rancher-backup operator to wait for CRDs
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-29 22:57:22 +00:00
micqdf 20d7a6f777 fix: Install rancher-backup CRD chart before operator
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
The rancher-backup operator requires CRDs from the rancher-backup-crd
chart to be installed first.
2026-03-29 22:51:34 +00:00
micqdf 22ce5fd6f4 feat: Add cert-manager as dependency for Rancher
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m59s
Rancher requires cert-manager when managing its own TLS (not tls:external).
Added cert-manager HelmRelease with CRDs enabled.
2026-03-29 22:36:30 +00:00
micqdf afb1782d38 fix: Separate Backup CRs into their own kustomization
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 5m57s
The Backup and Restore CRs need the rancher-backup CRDs to exist first.
Moved them to a separate kustomization that depends on the operator being ready.
2026-03-29 22:22:29 +00:00
micqdf 48870433bf fix: Remove tls:external from Rancher HelmRelease
Deploy Cluster / Terraform (push) Failing after 55s
Deploy Cluster / Ansible (push) Has been skipped
With Tailscale LoadBalancer, TLS is not actually terminated at the edge.
The Tailscale proxy does TCP passthrough, so Rancher must serve its own
TLS certs. Setting tls: external caused Rancher to listen HTTP-only,
which broke HTTPS access through Tailscale.
2026-03-29 22:19:23 +00:00
micqdf f2c506b350 refactor: Replace CNPG external DB with rancher-backup operator
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m5s
Rancher 2.x uses embedded etcd, not an external PostgreSQL database.
The CATTLE_DB_CATTLE_* env vars are Rancher v1 only and were ignored.

- Remove all CNPG (CloudNativePG) cluster, operator, and related configs
- Remove external DB env vars from Rancher HelmRelease
- Remove rancher-db-password ExternalSecret
- Add rancher-backup operator HelmRelease (v106.0.2+up8.1.0)
- Add B2 credentials ExternalSecret for backup storage
- Add recurring Backup CR (daily at 03:00, 7 day retention)
- Add commented-out Restore CR for rebuild recovery
- Update Flux dependency graph accordingly
2026-03-29 21:53:16 +00:00
micqdf efdf13976a fix: Handle missing 'online' field in Tailscale API response
Deploy Cluster / Terraform (push) Successful in 2m12s
Deploy Cluster / Ansible (push) Successful in 9m19s
2026-03-29 13:52:23 +00:00
micqdf 5269884408 feat: Auto-cleanup stale Tailscale devices before cluster boot
Deploy Cluster / Terraform (push) Successful in 2m17s
Deploy Cluster / Ansible (push) Failing after 6m35s
Adds tailscale-cleanup Ansible role that uses the Tailscale API to
delete offline devices matching reserved hostnames (e.g. rancher).
Runs during site.yml before Finalize to prevent hostname collisions
like rancher-1 on rebuild.

Requires TAILSCALE_API_KEY (API access token) passed as extra var.
2026-03-29 11:47:53 +00:00
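A sketch of the API calls such a role could make, assuming the Tailscale v2 device API and a bearer token; the reserved-name list and the handling of a missing online field are assumptions:

    - name: List devices in the tailnet (sketch)
      ansible.builtin.uri:
        url: "https://api.tailscale.com/api/v2/tailnet/-/devices"
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        return_content: true
      register: tailnet_devices

    - name: Delete stale offline devices holding reserved hostnames (sketch)
      ansible.builtin.uri:
        url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
        method: DELETE
        headers:
          Authorization: "Bearer {{ tailscale_api_key }}"
        status_code: [200, 204]
      loop: "{{ tailnet_devices.json.devices }}"
      when:
        - item.hostname in ['rancher']              # reserved names, illustrative
        - not (item.online | default(false))        # missing field treated as offline (assumption)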
micqdf 6e5b0518be feat: Add kubeconfig refresh script and fix Ansible Finalize to use public IP
Deploy Cluster / Terraform (push) Successful in 53s
Deploy Cluster / Ansible (push) Successful in 5m25s
- scripts/refresh-kubeconfig.sh fetches a fresh kubeconfig from CP1
- Ansible site.yml Finalize step now uses public IP instead of Tailscale
  hostname for the kubeconfig server address
- Updated AGENTS.md with kubeconfig refresh instructions
2026-03-29 03:31:36 +00:00
micqdf 905d069e91 fix: Add serverName to CNPG externalClusters for B2 recovery
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
CNPG uses the external cluster name (b2-backup) as the barman server
name by default, but the backups were stored under server name rancher-db.
2026-03-29 03:22:19 +00:00
micqdf 25ba4b7115 fix: Add skipEmptyWalArchiveCheck annotation and B2 secret healthcheck to CNPG
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m22s
- Skip WAL archive emptiness check so recovery works when restoring over
  an existing backup archive in B2
- Add healthCheck for b2-credentials secret in CNPG kustomization to
  prevent recovery from starting before ExternalSecret has synced
2026-03-29 03:15:23 +00:00
micqdf 6a593fd559 feat: Add B2 recovery bootstrap to CNPG cluster
Deploy Cluster / Terraform (push) Successful in 2m6s
Deploy Cluster / Ansible (push) Successful in 8m16s
2026-03-29 00:22:24 +00:00
micqdf 936f54a1b5 fix: Restore canonical Rancher tailnet hostname
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-29 00:00:39 +00:00
micqdf c9df11e65f fix: Align Rancher tailnet hostname with live proxy
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m1s
2026-03-28 23:47:09 +00:00
micqdf a3c238fda9 fix: Apply Rancher server URL after chart install
Deploy Cluster / Terraform (push) Successful in 2m43s
Deploy Cluster / Ansible (push) Successful in 10m39s
2026-03-28 23:12:59 +00:00
micqdf a15fa50302 fix: Use Doppler-backed Rancher bootstrap password
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m43s
2026-03-28 22:51:38 +00:00
micqdf 0f4f0b09fb fix: Add Rancher DB password ExternalSecret
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 5m42s
2026-03-28 22:42:05 +00:00
micqdf 4c002a870c fix: Remove invalid Rancher server-url manifest
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 22:39:31 +00:00
micqdf 43d11ac7e6 docs: Add agent guidance and sync Rancher docs
Deploy Cluster / Terraform (push) Successful in 2m33s
Deploy Cluster / Ansible (push) Successful in 9m44s
2026-03-28 22:13:37 +00:00
micqdf 8c5edcf0a1 fix: Set Rancher server URL to tailnet hostname
Deploy Cluster / Terraform (push) Successful in 1m0s
Deploy Cluster / Ansible (push) Successful in 6m27s
2026-03-28 04:07:44 +00:00
micqdf a81da0d178 feat: Expose Rancher via Tailscale hostname
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m42s
2026-03-28 03:59:02 +00:00
micqdf 2a72527c79 fix: Switch Traefik from LoadBalancer to NodePort, remove unused Hetzner LB
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m25s
2026-03-28 03:21:19 +00:00
micqdf 7cb3b84ecb feat: Replace custom pgdump job with CNPG ScheduledBackup
Deploy Cluster / Terraform (push) Successful in 1m30s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-28 03:15:39 +00:00
micqdf d4930235fa fix: Point CNPG backups at the existing B2 bucket
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m17s
2026-03-26 23:35:19 +00:00
micqdf ee8dc4b451 fix: Add Role for B2 credentials access
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 6m29s
2026-03-26 23:04:40 +00:00
micqdf 144d40e7ac feat: Add RBAC for CNP to read B2 credentials secret
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 6m38s
2026-03-26 22:56:00 +00:00
micqdf cc14e32572 fix: Use gzip instead of lzop for backup compression
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:51:10 +00:00
micqdf a207a5a7fd fix: Remove invalid encryption field from CNP backup config
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:49:29 +00:00
micqdf 4e1772c175 feat: Add B2 backup configuration to CNP Cluster
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 22:47:31 +00:00
micqdf ff70b12084 chore: Add HTTP/HTTPS firewall rules for Load Balancer
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m56s
2026-03-26 22:36:13 +00:00
micqdf a3963c56e6 cleanup: Remove traefik-config, simplify traefik helmrelease
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 03:16:56 +00:00
micqdf 612435c42c fix: Add Hetzner LB health check config to Traefik
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:11:10 +00:00
micqdf ac42f671a2 fix: Remove addon-traefik-config dependency from flux-ui
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:05:58 +00:00
micqdf dbe7ec0468 fix: Remove expose boolean from traefik ports config
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 03:01:13 +00:00
micqdf 816ac8b3c0 fix: Use official Traefik helm repo instead of rancher-stable
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:59:00 +00:00
micqdf 6f7998639f fix: Use standard kustomize API in traefik addon
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:56:52 +00:00
micqdf 7a14f89ad1 fix: Correct traefik kustomization path and sourceRef
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:55:37 +00:00
micqdf 786901c5d7 fix: Correct traefik kustomization reference (directory not file)
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:54:29 +00:00
micqdf 46f3d1130b feat: Add Flux-managed Traefik HelmRelease with Hetzner LB config
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 02:52:49 +00:00
micqdf 2fe5a626d4 fix: Add Hetzner network zone annotation to Traefik LoadBalancer
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 6m20s
2026-03-26 02:30:43 +00:00
micqdf 2ef68c8087 fix: Remove deprecated enablePodMonitor field in CNP Cluster
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 10m15s
2026-03-26 01:01:53 +00:00
micqdf e2cae18f5f fix: Remove backup config for initial deployment - add backup after DB is running
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m56s
2026-03-26 00:46:50 +00:00
micqdf e0c1e41ee9 fix: Remove bootstrap recovery - create fresh DB (recovery only needed after first backup)
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:43:49 +00:00
micqdf 63533de901 fix: Fix retentionPolicy format (14d not keep14)
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:41:44 +00:00
micqdf 1b39710f63 fix: Move retentionPolicy to correct location in backup spec
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:39:25 +00:00
micqdf 8c034323dc fix: Fix Cluster CR with correct barmanObjectStore schema
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:35:23 +00:00
micqdf 5fa2b411ee fix: Fix Cluster CR schema - use barmanObjectStore instead of b2
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:33:04 +00:00
micqdf 3ea28e525f fix: Fix CNP operator image repository (cloudnative-pg not postgresql)
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 4m55s
2026-03-26 00:21:09 +00:00
micqdf 4b95ba113d fix: Remove LPP helm (already installed by k3s), fix CNP chart version to 0.27.1
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 5m7s
2026-03-26 00:13:22 +00:00
micqdf 13627bf81f fix: Split CNP operator from CNP cluster to fix CRD dependency
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 5m0s
- Move CNP operator HelmRelease to cnpg-operator folder
- Create addon-cnpg-operator kustomization (deploys operator first)
- Update addon-cnpg to dependOn addon-cnpg-operator
- Add addon-cnpg as dependency for addon-rancher (needs database)
2026-03-26 00:06:34 +00:00
micqdf ef3fb2489a fix: Convert kustomization-lpp and kustomization-cnpg to Flux Kustomization CRs
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-26 00:03:53 +00:00
micqdf 7097495d72 fix: Add missing metadata.name to kustomization-lpp and kustomization-cnpg
Deploy Cluster / Terraform (push) Successful in 1m7s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-25 23:39:45 +00:00
micqdf 9d601dc77c feat: Add CloudNativePG with B2 backups for persistent Rancher database
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s
- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
2026-03-25 23:06:45 +00:00
micqdf f36445d99a Fix CNI: configure flannel to use private network interface (enp7s0) instead of public
Deploy Cluster / Terraform (push) Successful in 34s
Deploy Cluster / Ansible (push) Successful in 8m42s
2026-03-25 01:44:33 +00:00
micqdf 89c2c99963 Fix Rancher: remove conflicting LoadBalancer, add HTTPS port-forward, use tailscale serve only
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m2s
2026-03-25 00:59:16 +00:00
micqdf 4a35cfb549 Fix Rancher: use correct targetPort 444 for HTTPS
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 18m56s
2026-03-24 23:30:58 +00:00
micqdf 3d50bfc534 Fix Rancher service selector: use cattle-system-rancher label
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-24 23:25:36 +00:00
micqdf ab2f287bfb Fix Rancher: use correct service name cattle-system-rancher
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m23s
2026-03-24 22:30:49 +00:00
micqdf dcb2675b67 Upgrade Rancher to 2.13.3 for K8s 1.34 compatibility
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m13s
2026-03-24 21:42:51 +00:00
micqdf b40bec7e0e Fix Rancher: use Doppler secret instead of hardcoded password
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m12s
2026-03-24 21:13:23 +00:00
micqdf efe0c0cfd5 Fix Rancher: upgrade to 2.10.3 for K8s 1.34 compatibility
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Successful in 4m20s
2026-03-24 20:29:38 +00:00
micqdf c61d9f9c1d Remove traefik-config dependency from Rancher
Deploy Cluster / Terraform (push) Successful in 2m5s
Deploy Cluster / Ansible (push) Successful in 8m18s
2026-03-24 20:02:08 +00:00
micqdf 60ceac4624 Fix Rancher access: add kubectl port-forward + tailscale serve setup
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-24 20:01:57 +00:00
micqdf 47b384a337 Fix Rancher access: add Tailscale service for Traefik with port 9442, fix deployment order
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Successful in 4m18s
2026-03-24 19:40:37 +00:00
micqdf ecf17113fb Fix Rancher deployment: add cattle-system namespace, fix Traefik config with port 9442
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 4m27s
2026-03-24 19:09:28 +00:00
micqdf 4ffbcfa312 Add Rancher management UI
Deploy Cluster / Terraform (push) Successful in 2m13s
Deploy Cluster / Ansible (push) Successful in 8m52s
2026-03-24 01:53:04 +00:00
micqdf 8745bcda47 Fix Weave GitOps image tag - remove invalid v0.41.0
Deploy Cluster / Terraform (push) Successful in 40s
Deploy Cluster / Ansible (push) Successful in 4m33s
The version v0.41.0 doesn't exist in the registry. Removing explicit
image tag to let the chart use its default compatible version.
2026-03-24 01:39:48 +00:00
micqdf e47ec2a3e7 Update Weave GitOps to v0.41.0 to support HelmRelease v2 API
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Successful in 4m30s
Fixes error: 'no matches for kind HelmRelease in version v2beta1'

The cluster uses HelmRelease v2 API but Weave GitOps v0.38.0 was looking
for the old v2beta1 API. Updated image tag to v0.41.0 which supports
the newer API version.
2026-03-24 01:33:10 +00:00
micqdf 45c899d2bd Configure Weave GitOps to use Doppler-managed admin credentials
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Successful in 4m41s
Changes:
- Enable adminUser creation but disable Helm-managed secret
- Use ExternalSecret (cluster-user-auth) from Doppler instead
- Doppler secrets: WEAVE_GITOPS_ADMIN_USERNAME and WEAVE_GITOPS_ADMIN_PASSWORD_BCRYPT_HASH
- Added cluster-user-auth to viewSecretsResourceNames for RBAC

Login credentials are now managed via Doppler and External Secrets Operator.
2026-03-24 01:01:30 +00:00
micqdf 0e52d8f159 Use Tailscale DNS names instead of IPs for TLS SANs
Deploy Cluster / Terraform (push) Successful in 2m21s
Deploy Cluster / Ansible (push) Successful in 9m0s
Changed from hardcoded Tailscale IPs to DNS names:
- k8s-cluster-cp-1.silverside-gopher.ts.net
- k8s-cluster-cp-2.silverside-gopher.ts.net
- k8s-cluster-cp-3.silverside-gopher.ts.net

This is more robust since Tailscale IPs change on rebuild,
but DNS names remain consistent.

After next rebuild, cluster accessible via:
- kubectl --server=https://k8s-cluster-cp-1.silverside-gopher.ts.net:6443
2026-03-23 23:50:48 +00:00
micqdf 4726db2b5b Add Tailscale IPs to k3s TLS SANs for secure tailnet access
Deploy Cluster / Terraform (push) Successful in 2m30s
Deploy Cluster / Ansible (push) Successful in 9m48s
Changes:
- Add tailscale_control_plane_ips list to k3s-server defaults
- Include all 3 control plane Tailscale IPs (100.120.55.97, 100.108.90.123, 100.92.149.85)
- Update primary k3s install to add Tailscale IPs to TLS certificates
- Enables kubectl access via Tailscale without certificate errors

After next deploy, cluster will be accessible via:
- kubectl --server=https://100.120.55.97:6443 (or any CP tailscale IP)
- kubectl --server=https://k8s-cluster-cp-1:6443 (via tailscale DNS)
2026-03-23 23:04:00 +00:00
micqdf 90d105e5ea Fix kube_api_endpoint variable passing for HA cluster
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Successful in 8m55s
- Remove circular variable reference in site.yml
- Add kube_api_endpoint default to k3s-server role
- Variable is set via inventory group_vars and passed to role
- Primary CP now correctly adds LB IP to TLS SANs

Note: Existing cluster needs destroy/rebuild to regenerate certificates.
2026-03-23 03:01:53 +00:00
micqdf 952a80a742 Fix HA cluster join via Load Balancer private IP
Deploy Cluster / Terraform (push) Successful in 36s
Deploy Cluster / Ansible (push) Failing after 3m5s
Changes:
- Use LB private IP (10.0.1.5) instead of public IP for cluster joins
- Add LB private IP to k3s TLS SANs on primary control plane
- This allows secondary CPs and workers to verify certificates when joining via LB

Fixes x509 certificate validation error when joining via LB public IP.
2026-03-23 02:56:41 +00:00
micqdf 4965017b86 Fix Load Balancer network attachment
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Failing after 3m44s
Add hcloud_load_balancer_network resource to attach LB to private network.
This is required before targets can use use_private_ip=true.
LB gets IP 10.0.1.5 on the private network.
2026-03-23 02:44:35 +00:00
micqdf b2b9c38b91 Fix Load Balancer output attribute - use ipv4 instead of ipv4_address
Deploy Cluster / Terraform (push) Failing after 1m37s
Deploy Cluster / Ansible (push) Has been skipped
2026-03-23 02:40:50 +00:00
micqdf ff31cb4e74 Implement HA control plane with Load Balancer (3-3 topology)
Deploy Cluster / Terraform (push) Failing after 10s
Deploy Cluster / Ansible (push) Has been skipped
Major changes:
- Terraform: Scale to 3 control planes (cx23) + 3 workers (cx33)
- Terraform: Add Hetzner Load Balancer (lb11) for Kubernetes API
- Terraform: Add kube_api_lb_ip output
- Ansible: Add community.network collection to requirements
- Ansible: Update inventory to include LB endpoint
- Ansible: Configure secondary CPs and workers to join via LB
- Ansible: Add k3s_join_endpoint variable for HA joins
- Workflow: Add imports for cp-2, cp-3, and worker-3
- Docs: Update STABLE_BASELINE.md with HA topology and phase gates

Topology:
- 3 control planes (cx23 - 2 vCPU, 8GB RAM each)
- 3 workers (cx33 - 4 vCPU, 16GB RAM each)
- 1 Load Balancer (lb11) routing to all 3 control planes on port 6443
- Workers and secondary CPs join via LB endpoint for HA

Cost impact: +~€26/month (2 extra CPs + 1 extra worker + LB)
2026-03-23 02:39:39 +00:00
micqdf 8b4a445b37 Update STABLE_BASELINE.md - CCM/CSI integration achieved
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Successful in 3m36s
Document the successful completion of Hetzner CCM and CSI integration:
- CCM deployed via Ansible before workers join (fixes uninitialized taint)
- CSI provides hcloud-volumes StorageClass for persistent storage
- Two consecutive rebuilds passed all phase gates
- PVC provisioning tested and working

Platform now has full cloud provider integration with persistent volumes.
2026-03-23 02:25:00 +00:00
micqdf e447795395 Install helm binary in ccm-deploy role before using it
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Successful in 6m35s
The kubernetes.core.helm module requires helm CLI to be installed on
the target node. Added check and install step using the official
helm install script.
2026-03-23 00:07:39 +00:00
micqdf 31b82c9371 Deploy CCM via Ansible before workers join to fix external cloud provider
Deploy Cluster / Terraform (push) Successful in 31s
Deploy Cluster / Ansible (push) Failing after 1m48s
This fixes the chicken-and-egg problem where workers with
--kubelet-arg=cloud-provider=external couldn't join because CCM wasn't
running yet to remove the node.cloudprovider.kubernetes.io/uninitialized taint.

Changes:
- Create ansible/roles/ccm-deploy/ to deploy CCM via Helm during Ansible phase
- Reorder site.yml: CCM deploys after secrets but before workers join
- CCM runs on control_plane[0] with proper tolerations for control plane nodes
- Add 10s pause after CCM ready to ensure it can process new nodes
- Workers can now successfully join with external cloud provider enabled

Flux still manages CCM for updates, but initial install happens in Ansible.
2026-03-22 23:58:03 +00:00
micqdf cadfedacf1 Fix providerID health check - use shell module for piped grep
Deploy Cluster / Terraform (push) Successful in 1m47s
Deploy Cluster / Ansible (push) Failing after 18m4s
2026-03-22 22:55:55 +00:00
micqdf 561cd67b0c Enable Hetzner CCM and CSI for cloud provider integration
Deploy Cluster / Terraform (push) Successful in 30s
Deploy Cluster / Ansible (push) Failing after 3m21s
- Enable --kubelet-arg=cloud-provider=external on all nodes (control planes and workers)
- Activate CCM Kustomization with 10m timeout for Hetzner cloud-controller-manager
- Activate CSI Kustomization with dependsOn CCM and 10m timeout for hcloud-csi
- Update deploy workflow to wait for CCM/CSI readiness (600s timeout)
- Add providerID verification to post-deploy health checks

This enables proper cloud provider integration with Hetzner CCM for node
labeling and Hetzner CSI for persistent volume provisioning.
2026-03-22 22:26:21 +00:00
micqdf 4eebbca648 docs: update README for deferred observability baseline
Deploy Cluster / Terraform (push) Successful in 1m41s
Deploy Cluster / Ansible (push) Successful in 5m37s
2026-03-22 01:04:53 +00:00
micqdf 7b5d794dfc fix: update health checks for deferred observability
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:04:27 +00:00
micqdf 8643bbfc12 fix: defer observability to get clean baseline
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 01:03:55 +00:00
micqdf 84f446c2e6 fix: restore observability timeouts to 5 minutes
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 8m38s
2026-03-22 00:43:37 +00:00
micqdf d446e86ece fix: use static grafana password, remove externalsecret dependency
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-22 00:43:21 +00:00
micqdf 90c7f565e0 fix: remove tailscale ingress dependencies from observability
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-22 00:42:35 +00:00
micqdf 989848fa89 fix: increase observability timeouts to 10 minutes
Deploy Cluster / Terraform (push) Successful in 2m1s
Deploy Cluster / Ansible (push) Failing after 13m54s
2026-03-21 19:34:43 +00:00
micqdf 56e5807474 fix: create doppler ClusterSecretStore after ESO is installed
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 8m31s
2026-03-21 19:19:43 +00:00
micqdf df0511148c fix: unsuspend tailscale operator for stable baseline
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-21 19:03:39 +00:00
micqdf 894e6275b1 docs: update stable baseline to defer ccm/csi
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 8m35s
2026-03-21 18:41:36 +00:00
micqdf a01cf435d4 fix: skip ccm/csi waits for stable baseline - using k3s embedded
Deploy Cluster / Terraform (push) Successful in 37s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-21 18:40:53 +00:00
micqdf 84f77c4a68 fix: use kubectl patch instead of apply for flux controller nodeSelector
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 9m41s
2026-03-21 18:05:41 +00:00
micqdf 2e4196688c fix: bootstrap flux in phases - crds first, then resources
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m19s
2026-03-21 17:42:39 +00:00
micqdf 8d1f9f4944 fix: add k3s reset logic for primary control plane
Deploy Cluster / Terraform (push) Successful in 39s
Deploy Cluster / Ansible (push) Failing after 4m19s
2026-03-21 16:10:17 +00:00
micqdf d4fd43e2f5 refactor: simplify k3s-server bootstrap for 2026-03-21 15:48:33 +00:00
micqdf 48a80c362c fix: disable external cloud-provider kubelet arg for stable baseline
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Failing after 4m21s
2026-03-21 14:36:54 +00:00
micqdf fcf7f139ff fix: use public api endpoint for flux bootstrap
Deploy Cluster / Terraform (push) Successful in 41s
Deploy Cluster / Ansible (push) Failing after 2m16s
2026-03-21 00:07:51 +00:00
micqdf 7139ae322d fix: bootstrap flux during cluster deploy
Deploy Cluster / Terraform (push) Successful in 38s
Deploy Cluster / Ansible (push) Failing after 3m21s
2026-03-20 10:37:11 +00:00
micqdf 528a8dc210 fix: defer doppler store until eso is installed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 24m34s
2026-03-20 09:30:17 +00:00
micqdf 349f75729a fix: bootstrap tailscale namespace before secret
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Failing after 3m30s
2026-03-20 09:24:35 +00:00
micqdf 522626a52b refactor: simplify stable cluster baseline
Deploy Cluster / Terraform (push) Successful in 1m48s
Deploy Cluster / Ansible (push) Failing after 4m7s
2026-03-20 02:24:37 +00:00
micqdf 5bd4c41c2d fix: restore k3s agent bootstrap
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Failing after 18m16s
2026-03-20 01:50:16 +00:00
micqdf 3e41f71b1b fix: harden terraform destroy workflow
Deploy Cluster / Terraform (push) Successful in 2m28s
Deploy Cluster / Ansible (push) Failing after 20m4s
2026-03-19 23:26:03 +00:00
micqdf 9d2f30de32 fix: prepare k3s for external cloud provider
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 4m4s
2026-03-17 01:21:23 +00:00
micqdf 08a3031276 refactor: retire imperative addon roles
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 4m2s
2026-03-17 01:04:02 +00:00
micqdf e3ce91db62 fix: align flux ccm with live deployment
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 3m56s
2026-03-11 18:17:16 +00:00
micqdf bed8e4afc8 feat: migrate core addons toward flux
Deploy Cluster / Terraform (push) Successful in 49s
Deploy Cluster / Ansible (push) Successful in 4m6s
2026-03-11 17:43:35 +00:00
micqdf 2d4de6cff8 fix: bootstrap doppler store outside flux
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:58:26 +00:00
micqdf 4a83d981c8 fix: skip dry-run validation for doppler store sync
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:52:08 +00:00
micqdf d188a51ef6 fix: move doppler store manifests out of ignored path
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-09 02:45:46 +00:00
micqdf 646ef16258 fix: stabilize flux and external secrets reconciliation
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 9m42s
2026-03-09 02:25:27 +00:00
micqdf 6f2e056b98 feat: sync runtime secrets from doppler
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Successful in 9m56s
2026-03-09 00:25:41 +00:00
micqdf e10a70475f fix: right-size flux observability workloads
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m37s
2026-03-08 05:17:22 +00:00
micqdf f95e0051a5 feat: automate private tailnet access on cp1
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m45s
2026-03-08 04:16:06 +00:00
micqdf 7c15ac5846 feat: add flux ui on shared tailscale endpoint
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m40s
2026-03-07 12:30:17 +00:00
micqdf 4c104f74e8 feat: route observability through one tailscale endpoint
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Successful in 9m33s
2026-03-07 01:04:03 +00:00
micqdf be04602bfb fix: make flux bootstrap reachable from cluster
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Successful in 9m59s
2026-03-07 00:38:29 +00:00
micqdf 06c1356f1e feat: expose flux observability services over tailscale
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 9m14s
2026-03-05 00:43:29 +00:00
micqdf 86fb5d5b90 fix: move observability gitops gating to role level
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 9m17s
2026-03-05 00:17:25 +00:00
micqdf 8b403cd1d6 feat: migrate observability stack to flux gitops
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m11s
2026-03-04 23:38:40 +00:00
micqdf 480a079dc8 fix: fail fast when loki datasource has no labels
Deploy Grafana Content / Grafana Content (push) Successful in 1m59s
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m51s
2026-03-04 21:00:01 +00:00
micqdf ff8e32daf5 fix: add loki nodeport fallback for grafana datasource reachability
Deploy Grafana Content / Grafana Content (push) Successful in 2m18s
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Successful in 22m59s
2026-03-04 19:39:16 +00:00
micqdf eb1ad0bea7 fix: make grafana prometheus datasource resilient with nodeport fallback
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-04 19:22:31 +00:00
micqdf 9ff9d1e633 fix: clear stale helm pending revisions before kube-prometheus upgrade
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m22s
2026-03-04 18:35:55 +00:00
micqdf 6177b581e4 fix: correct dashboard verification checks and retry helm upgrade lock
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Grafana Content / Grafana Content (push) Successful in 1m29s
Deploy Cluster / Ansible (push) Failing after 11m11s
2026-03-04 08:48:30 +00:00
micqdf b1e21c4a4b fix: speed up dashboards workflow firewall apply and set TF_VAR env
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Grafana Content / Grafana Content (push) Failing after 1m22s
Deploy Cluster / Ansible (push) Failing after 9m2s
2026-03-04 03:54:56 +00:00
micqdf 2f166ed9e7 feat: manage grafana content as code with fast dashboard workflow
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Grafana Content / Grafana Content (push) Has been cancelled
2026-03-04 03:36:01 +00:00
micqdf 1c39274df7 feat: stabilize tailscale observability exposure with declarative proxy class
Deploy Cluster / Terraform (push) Successful in 54s
Deploy Cluster / Ansible (push) Successful in 22m19s
2026-03-04 01:37:00 +00:00
micqdf 28eaa36ec4 fix: use tag:k8s for tailscale operator default tags
Deploy Cluster / Terraform (push) Successful in 55s
Deploy Cluster / Ansible (push) Successful in 24m25s
2026-03-04 00:57:33 +00:00
micqdf 02fa71c0aa fix: use tag:k8 for tailscale operator default tag
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 23m16s
2026-03-04 00:27:47 +00:00
micqdf 2bbf05cdca fix: make tailscale operator non-blocking by default and gate observability patching on readiness
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Successful in 22m44s
2026-03-03 21:47:16 +00:00
micqdf 213c1fb4e4 fix: detect tailscale tag permission errors and clean access output
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 14m7s
2026-03-03 08:51:25 +00:00
micqdf 414ac73c25 fix: fail fast on tailscale oauth 403 with actionable message
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 27m37s
2026-03-02 23:57:53 +00:00
micqdf 542d7a6be5 fix: align tailscale proxy tags with operator tags
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 23:36:18 +00:00
micqdf 210b617cc9 fix: pin tailscale operator to control-plane node for DNS stability
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 23:32:36 +00:00
micqdf 3686249e31 chore: remove blocking Tailscale endpoint retries in observability
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Successful in 22m17s
2026-03-02 22:47:55 +00:00
micqdf f56d1447c1 fix: make Tailscale endpoint wait non-blocking in observability
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 31m1s
2026-03-02 22:08:36 +00:00
micqdf 63247b79a6 fix: harden Tailscale operator rollout with preflight and diagnostics
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 21:39:47 +00:00
micqdf f6e159406a ci: retrigger with correct chart name
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 14m4s
2026-03-02 21:15:44 +00:00
micqdf 0ae1c9395c fix: use correct chart name tailscale/tailscale-operator
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:15:37 +00:00
micqdf 272c5ddc6e ci: retrigger with fixed Tailscale operator version
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 21:04:01 +00:00
micqdf eb6bf3862a fix: update Tailscale operator chart version to 1.95.91
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 21:03:53 +00:00
micqdf 5a3f7550fe docs: add Gitea secrets setup guide for Tailscale operator
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 8m44s
2026-03-02 20:29:19 +00:00
micqdf a0ed6523ec feat: add Tailscale Kubernetes Operator for Grafana/Prometheus access
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 20:28:51 +00:00
micqdf 4f61a840c7 ci: retrigger with non-blocking Loki install
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Successful in 22m38s
2026-03-02 19:41:55 +00:00
micqdf d876430703 fix: remove Helm wait flag, check Loki rollout separately
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:41:46 +00:00
micqdf 56b6216257 ci: retrigger after Helm lock cleanup
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 19:18:29 +00:00
micqdf 91fe2e658c fix: clear stuck Helm lock before Loki install
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 19:18:23 +00:00
micqdf 13cec1aa28 ci: retrigger with YAML fix
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 19m26s
2026-03-02 18:29:25 +00:00
micqdf bc133e65d3 fix: quote failed_when expression for YAML parsing
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 18:29:16 +00:00
micqdf df4fdb5496 ci: retrigger with Loki fixes 2026-03-02 18:21:23 +00:00
micqdf cec7c42efb fix: disable Loki caches and canary, use chart v6.10.0
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Failing after 1m22s
2026-03-02 18:21:22 +00:00
micqdf ee692620b5 ci: retrigger with Loki v6.10.0
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m22s
2026-03-02 17:59:37 +00:00
micqdf a6d327fa1f fix: re-enable Loki with v6.10.0 and minimal working config
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:59:31 +00:00
micqdf fe6cb39eaf ci: retrigger with Loki disabled
Deploy Cluster / Terraform (push) Successful in 50s
Deploy Cluster / Ansible (push) Successful in 19m38s
2026-03-02 17:07:59 +00:00
micqdf feaefd28a1 fix: disable Loki to unblock pipeline - will re-enable separately
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 17:07:51 +00:00
micqdf 80ab59e22d ci: retrigger with enhanced Loki diagnostics
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m38s
2026-03-02 14:44:31 +00:00
micqdf 6c0282e9d5 fix: add more Loki diagnostics - values content and Helm releases
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 14:44:12 +00:00
micqdf 45aa616741 ci: retrigger for Loki diagnostics
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:42:42 +00:00
micqdf b595c1738a fix: show detailed Loki template and resource diagnostics
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 14:41:40 +00:00
micqdf 1c4dfd7fae ci: retrigger with fixed Loki values
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Failing after 19m42s
2026-03-02 13:58:31 +00:00
micqdf 6b9fc1f6b8 fix: add all required replica=0 settings for Loki v6 singleBinary
Deploy Cluster / Terraform (push) Has been cancelled
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:58:15 +00:00
micqdf 2b5cad9d15 ci: retrigger for Loki template debug
Deploy Cluster / Terraform (push) Successful in 42s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 13:57:03 +00:00
micqdf 71a1495fbc fix: add Loki template validation and resource debugging
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 13:56:48 +00:00
micqdf fe3814e0e3 ci: retrigger to see Loki Helm error
Deploy Cluster / Terraform (push) Successful in 43s
Deploy Cluster / Ansible (push) Failing after 19m33s
2026-03-02 12:45:16 +00:00
micqdf 5ab3c7a0ac fix: show Loki Helm install output on failure
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:45:03 +00:00
micqdf 9bc708ea4b ci: retrigger workflow after Loki cleanup fix
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m39s
2026-03-02 12:13:36 +00:00
micqdf c0a4275f15 fix: remove legacy Loki PDBs and wait for cleanup
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
2026-03-02 12:13:21 +00:00
micqdf 3dcf71a84f fix: trim Loki readiness output in failure guard
Deploy Cluster / Terraform (push) Successful in 46s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 10:09:15 +00:00
micqdf 124fe94d0e fix: tolerate Loki rollout false negatives when core pod is ready
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m31s
2026-03-02 09:12:47 +00:00
micqdf 2d3f63424a fix: disable Loki gateway and use direct service endpoints
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m41s
2026-03-02 08:37:37 +00:00
micqdf 2a583d1bba fix: avoid Helm wait hang and use explicit Loki rollout check
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 19m27s
2026-03-02 03:35:31 +00:00
micqdf 27711e0661 fix: increase Loki install timeout and add failure diagnostics
Deploy Cluster / Terraform (push) Successful in 51s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 02:55:33 +00:00
micqdf 10ee303995 fix: add Loki schema config and chart preflight validation
Deploy Cluster / Terraform (push) Successful in 48s
Deploy Cluster / Ansible (push) Failing after 29m39s
2026-03-02 02:23:18 +00:00
micqdf 558f34e2b1 fix: set Loki chart to single binary deployment mode
Deploy Cluster / Terraform (push) Successful in 47s
Deploy Cluster / Ansible (push) Failing after 19m49s
2026-03-02 02:01:23 +00:00
micqdf 58fabf23f8 refactor: move embedded Kubernetes manifests to role templates
Deploy Cluster / Terraform (push) Successful in 1m38s
Deploy Cluster / Ansible (push) Failing after 9m46s
2026-03-02 01:45:30 +00:00
micqdf b30977a158 feat: deploy lightweight observability stack via Ansible
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Cluster / Ansible (push) Has been cancelled
2026-03-02 01:33:41 +00:00
615 changed files with 179237 additions and 1128 deletions
+88
.gitea/workflows/dashboards.yml (View File)
@@ -0,0 +1,88 @@
name: Deploy Grafana Content
on:
push:
branches:
- main
paths:
- "ansible/dashboards.yml"
- "ansible/roles/observability-content/**"
workflow_dispatch:
concurrency:
group: prod-cluster
cancel-in-progress: false
env:
TF_VERSION: "1.14.9"
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
dashboards:
name: Grafana Content
runs-on: ubuntu-22.04
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
terraform_wrapper: false
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-lockfile=readonly \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
pip3 install ansible==8.7.0 kubernetes==26.1.0 jinja2==3.1.5 pyyaml==6.0.2
- name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml
- name: Generate Ansible Inventory
working-directory: ansible
run: python3 generate_inventory.py
- name: Apply dashboards and datasources
working-directory: ansible
run: |
ansible-playbook dashboards.yml \
-e "cluster_name=k8s-cluster"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Verify Grafana content resources
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_datasource=1"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n observability get configmap -l grafana_dashboard=1"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
File diff suppressed because it is too large
+72 -20
.gitea/workflows/destroy.yml (View File)
@@ -8,19 +8,26 @@ on:
         required: true
         default: ''
+concurrency:
+  group: prod-cluster
+  cancel-in-progress: false
 env:
-  TF_VERSION: "1.7.0"
-  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
+  TF_VERSION: "1.14.9"
   TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
   TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"
 jobs:
   destroy:
     name: Destroy Cluster
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     if: github.event.inputs.confirm == 'destroy'
     environment: destroy
     steps:
@@ -31,17 +38,7 @@ jobs:
         uses: hashicorp/setup-terraform@v3
         with:
           terraform_version: ${{ env.TF_VERSION }}
+          terraform_wrapper: false
-      - name: Terraform Init
-        working-directory: terraform
-        run: |
-          terraform init \
-            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-            -backend-config="region=auto" \
-            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-            -backend-config="skip_requesting_account_id=true"
       - name: Setup SSH Keys
         run: |
@@ -51,11 +48,66 @@ jobs:
           echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
           chmod 644 ~/.ssh/id_ed25519.pub
-      - name: Terraform Destroy
+      - name: Terraform Init
         working-directory: terraform
         run: |
-          terraform destroy \
-            -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -auto-approve
+          terraform init \
+            -lockfile=readonly \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+      - name: Save Proxmox target list
+        run: |
+          mkdir -p outputs
+          if ! terraform -chdir=terraform output -json proxmox_target_vms > outputs/proxmox_target_vms.json; then
+            terraform -chdir=terraform plan \
+              -refresh=false \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -out=cleanup.tfplan \
+              -no-color || true
+            printf '[]' > outputs/proxmox_target_vms.json
+          fi
+      - name: Terraform Destroy
+        id: destroy
+        working-directory: terraform
+        run: |
+          set +e
+          for attempt in 1 2 3; do
+            echo "Terraform destroy attempt ${attempt}/3"
+            terraform destroy \
+              -parallelism=2 \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -auto-approve
+            rc=$?
+            if [ "$rc" -eq 0 ]; then
+              exit 0
+            fi
+            if [ "$attempt" -lt 3 ]; then
+              echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
+              sleep 30
+              terraform refresh \
+                -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+                -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
+            fi
+          done
+          exit "$rc"
+      - name: Verify Proxmox target VMs removed
+        if: success()
+        run: |
+          python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --targets-file outputs/proxmox_target_vms.json
+          if [ -f terraform/cleanup.tfplan ]; then
+            python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --terraform-dir terraform --plan cleanup.tfplan
+          fi
+      - name: Terraform state diagnostics
+        if: failure() && steps.destroy.outcome == 'failure'
+        run: |
+          terraform -chdir=terraform state list || true
-1
View File
@@ -3,7 +3,6 @@
 *.tfstate.*
 *.tfstate.backup
 .terraform/
-.terraform.lock.hcl
 terraform.tfvars
 crash.log
 override.tf
+57
AGENTS.md (View File)
@@ -0,0 +1,57 @@
# AGENTS.md
Compact repo guidance for OpenCode sessions. Trust executable sources over docs when they conflict.
## Read First
- Highest-value sources: `.gitea/workflows/deploy.yml`, `.gitea/workflows/destroy.yml`, `terraform/main.tf`, `terraform/variables.tf`, `terraform/servers.tf`, `ansible/site.yml`, `ansible/inventory.tmpl`, `clusters/prod/flux-system/`, `infrastructure/addons/kustomization.yaml`.
- `STABLE_BASELINE.md` still contains stale Rancher backup/restore references; current workflows and addon manifests do not deploy or restore `rancher-backup`.
## Baseline
- Proxmox HA K3s cluster: 3 control planes, 5 workers, VMIDs `200-202` and `210-214`, node `flex`, template VMID `9000`, datastore `Flash`.
- API HA is kube-vip at `10.27.27.40`; control planes are `10.27.27.30-32`, workers are `10.27.27.41-45`.
- SSH user is `ubuntu`; Ansible derives the flannel iface from `ansible_default_ipv4.interface` with `eth0` fallback, so do not hard-code `ens18`.
- Storage is raw-manifest `nfs-subdir-external-provisioner` using `10.27.27.239:/TheFlash/k8s-nfs` and default StorageClass `flash-nfs`.
- Tailscale is the private access path. Rancher, Grafana, and Prometheus are exposed only through Tailscale services.
- `apps` is intentionally suspended in `clusters/prod/flux-system/kustomization-apps.yaml`.
## Commands
- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`.
- Ansible setup: `ansible-galaxy collection install -r ansible/requirements.yml`, then from `ansible/` run `python3 generate_inventory.py` and `ansible-playbook site.yml --syntax-check`.
- Flux/Kustomize checks: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize infrastructure/addons`, `kubectl kustomize clusters/prod/flux-system`.
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-ip>`; use this if local `kubectl` falls back to `localhost:8080` after rebuilds.
- Tailnet smoke check from cp1: `ssh ubuntu@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`.
- Fast Grafana content iteration uses `.gitea/workflows/dashboards.yml` and `ansible/dashboards.yml`, not a full cluster rebuild.
## Deploy Flow
- Pushes to `main` run Gitea CI: Terraform fmt/init/validate/plan/apply, Proxmox cleanup/retry, Ansible bootstrap, Flux bootstrap, addon gates, Rancher gate, observability image seeding, health checks, tailnet smoke checks.
- Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate.
- Keep `set -euo pipefail` in workflow shell blocks.
- Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs.
- Fresh VMs have unreliable registry/chart egress, so critical images are prepared by `skopeo` on the runner and imported with `k3s ctr`; update the workflow archive lists when adding bootstrap-time images (see the sketch after this list).
- CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap.
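The image-seeding item above is easier to follow with a concrete sketch. This is a minimal, hypothetical example of the pattern (pull with `skopeo` on the runner, import with `k3s ctr` on the nodes); the real image list, archive names, and copy steps live in `.gitea/workflows/deploy.yml`.
```bash
# Hypothetical sketch: pull images once on the CI runner, then import the
# archives into containerd on the nodes. Image names and paths are examples.
set -euo pipefail

IMAGES=(
  "docker.io/grafana/grafana:10.4.1"
  "quay.io/prometheus/prometheus:v2.51.0"
)

mkdir -p image-archives
for img in "${IMAGES[@]}"; do
  archive="image-archives/$(echo "$img" | tr '/:' '__').tar"
  # Pull once where egress is reliable and save as a docker-archive tarball.
  skopeo copy "docker://$img" "docker-archive:$archive:$img"
done

# Later, on each node (for example via an Ansible task after copying the
# archives over), import them into the K3s containerd image store:
#   k3s ctr images import /tmp/image-archives/<archive>.tar
```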
## GitOps Addons
- Vendored charts are intentional: `infrastructure/charts/{cert-manager,traefik,kube-prometheus-stack,tailscale-operator,rancher}`. Do not restore remote `HelmRepository` objects unless cluster-side chart fetch reliability is intentionally changed.
- External Secrets and Loki/Promtail use Flux `OCIRepository`; Rancher, Tailscale, cert-manager, Traefik, and kube-prometheus-stack use `GitRepository` chart paths.
- Use fully qualified `helmchart.source.toolkit.fluxcd.io/...` in scripts; K3s also has `helmcharts.helm.cattle.io`, so `helmchart/...` can target the wrong resource.
- `doppler-bootstrap` only creates the `external-secrets` namespace and Doppler token secret. The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist.
- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by that addon kustomization; do not assume Flux applies it.
- Keep Kubernetes manifests one object per file with kebab-case filenames.
## Gotchas
- Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`; K3s `latest` can break Rancher. Role defaults pin `v1.34.6+k3s1`; do not reintroduce a generated-inventory `k3s_version=latest` override.
- The repo no longer uses a cloud controller manager. `providerID`, Hetzner CCM/CSI, or Hetzner firewall/load-balancer logic is stale.
- Tailscale cleanup must only remove stale offline reserved hostnames before live service proxies exist; do not delete active `rancher`, `grafana`, `prometheus`, or `flux` devices.
- Proxmox endpoint should be the base URL, for example `https://100.105.0.115:8006/`; provider/workflow code strips `/api2/json` when needed.
- Current private URLs: Rancher `https://rancher.silverside-gopher.ts.net/`, Grafana `http://grafana.silverside-gopher.ts.net/`, Prometheus `http://prometheus.silverside-gopher.ts.net:9090/`.
## Secrets
- Runtime secrets are Doppler + External Secrets; Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
- Never commit secrets, kubeconfigs, private keys, `terraform.tfvars`, or generated `outputs/` artifacts.
+287
View File
@@ -0,0 +1,287 @@
# App Repo Deployment Guide
This guide explains the recommended way to deploy an application to this cluster.
## Recommended Model
Use two repos:
- `HetznerTerra` (this repo): cluster, addons, shared infrastructure, Flux wiring
- `your-app-repo`: application source, Dockerfile, CI, Kubernetes manifests or Helm chart
Why:
- cluster lifecycle stays separate from app code
- app CI can build and tag images independently
- this repo remains the source of truth for what the cluster is allowed to deploy
## Current Cluster Assumptions
- Flux is already installed and reconciles this repo from `main`
- `clusters/prod/flux-system/kustomization-apps.yaml` points at `./apps`
- `apps` is suspended by default
- private access is through Tailscale
- runtime secrets should come from Doppler via External Secrets
## Deployment Options
### Option A: Separate app repo
Recommended for most real applications.
Flow:
1. App repo builds and pushes an image.
2. This repo defines a `GitRepository` pointing at the app repo.
3. This repo defines a `Kustomization` pointing at a path in the app repo.
4. Flux pulls the app repo and applies the manifests.
### Option B: In-repo app manifests
Only use this when the application is tiny or tightly coupled to the platform.
Flow:
1. Put Kubernetes manifests directly under `apps/` in this repo.
2. Unsuspend the top-level `apps` Kustomization.
This is simpler, but mixes platform and app changes together.
## App Repo Structure
Suggested layout:
```text
your-app-repo/
├── src/
├── Dockerfile
├── .gitea/workflows/
└── deploy/
├── base/
│ ├── namespace.yaml
│ ├── deployment.yaml
│ ├── service.yaml
│ ├── externalsecret.yaml
│ └── kustomization.yaml
└── prod/
├── kustomization.yaml
└── patch-*.yaml
```
If you prefer Helm, replace `deploy/base` and `deploy/prod` with a chart path and point Flux at that instead.
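For the Helm route, the cluster-side attachment swaps the path-based `Kustomization` for a `HelmRelease` that points at the chart inside the app repo. A minimal sketch, assuming a hypothetical `charts/my-app` path and reusing the `my-app` GitRepository shown later in this guide; the values are illustrative:
```yaml
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: my-app
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: my-app
  chart:
    spec:
      chart: charts/my-app          # path to the chart inside the app repo
      sourceRef:
        kind: GitRepository
        name: my-app
        namespace: flux-system
  values:
    image:
      tag: "1.0.0"                  # illustrative value
```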
## What the App Repo Should Own
- application source code
- image build pipeline
- image tag strategy
- Deployment / Service / Ingress or Tailscale-facing Service manifests
- app-specific `ExternalSecret` manifests
- app-specific namespace
## What This Repo Should Own
- cluster-level permission to deploy the app
- the `GitRepository` and top-level `Kustomization` that attach the app repo to the cluster
- whether the `apps` layer is suspended or active
## Recommended First App Integration
In this repo, add Flux objects under `apps/` that point to the app repo.
Example files to add:
- `apps/gitrepository-my-app.yaml`
- `apps/kustomization-my-app.yaml`
- update `apps/kustomization.yaml`
Example `apps/gitrepository-my-app.yaml`:
```yaml
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: my-app
namespace: flux-system
spec:
interval: 1m
ref:
branch: main
secretRef:
name: flux-system
url: ssh://git@<your-git-host>:<port>/<org>/<your-app-repo>.git
```
Example `apps/kustomization-my-app.yaml`:
```yaml
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: my-app
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: my-app
path: ./deploy/prod
wait: true
timeout: 5m
dependsOn:
- name: infrastructure
```
Then update `apps/kustomization.yaml`:
```yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gitrepository-my-app.yaml
- kustomization-my-app.yaml
```
## App Secrets
Recommended path:
1. Put runtime values in Doppler.
2. In the app manifests, create an `ExternalSecret` that reads from `doppler-hetznerterra`.
3. Reference the resulting Kubernetes Secret from the Deployment.
Example app-side `ExternalSecret`:
```yaml
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: my-app-env
namespace: my-app
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: my-app-env
creationPolicy: Owner
data:
- secretKey: DATABASE_URL
remoteRef:
key: MY_APP_DATABASE_URL
```
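Step 3 above (referencing the resulting Kubernetes Secret from the Deployment) is usually just an `envFrom` entry. A minimal sketch; the image and port are placeholders:
```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-app
  namespace: my-app
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: my-app
  template:
    metadata:
      labels:
        app.kubernetes.io/name: my-app
    spec:
      containers:
        - name: my-app
          image: <registry>/<org>/my-app:1.0.0
          envFrom:
            - secretRef:
                name: my-app-env   # Secret created by the ExternalSecret above
          ports:
            - containerPort: 3000
```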
## Image Delivery
Recommended flow:
1. App repo CI builds a container image.
2. CI pushes it to a registry.
3. The app repo updates the Kubernetes image tag in `deploy/prod`.
4. Flux notices the Git change and deploys it.
Keep the first version simple. Do not add image automation until the basic deploy path is proven.
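One low-effort way to implement step 3 (updating the image tag in `deploy/prod`) is a kustomize `images` override that CI rewrites on each release. A hedged sketch; the image name and tag are placeholders:
```yaml
# deploy/prod/kustomization.yaml
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - ../base
images:
  - name: <registry>/<org>/my-app
    newTag: "1.0.1"   # CI bumps this on each release
```
CI can update the tag with `kustomize edit set image '<registry>/<org>/my-app:<new-tag>'` and commit the result; that commit is the Git change Flux picks up and deploys.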
## Exposing the App
Pick one:
### Private app over Tailscale
Best fit for this cluster right now.
Create a Service like the existing Rancher/Grafana/Prometheus pattern:
```yaml
apiVersion: v1
kind: Service
metadata:
name: my-app-tailscale
namespace: my-app
annotations:
tailscale.com/hostname: my-app
tailscale.com/tags: "tag:prod"
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: my-app
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3000
```
Use `http://my-app.<your-tailnet>` or your chosen hostname.
### Cluster-internal only
Create only a `ClusterIP` Service.
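A minimal internal-only Service sketch for comparison (selector and ports are placeholders):
```yaml
apiVersion: v1
kind: Service
metadata:
  name: my-app
  namespace: my-app
spec:
  type: ClusterIP
  selector:
    app.kubernetes.io/name: my-app
  ports:
    - name: http
      port: 80
      targetPort: 3000
```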
### Public ingress
Not recommended as the first app path in this repo. Get the private path working first.
## Enabling the Apps Layer
The cluster-wide `apps` Kustomization is suspended by default.
When you are ready to let Flux deploy app attachments from `apps/`, unsuspend it:
```bash
kubectl -n flux-system patch kustomization apps --type=merge -p '{"spec":{"suspend":false}}'
```
Or commit a change to `clusters/prod/flux-system/kustomization-apps.yaml` changing:
```yaml
suspend: true
```
to:
```yaml
suspend: false
```
## First Deploy Checklist
Before deploying the first app, make sure:
1. app image builds successfully
2. app repo contains valid `deploy/prod` manifests
3. this repo contains the `GitRepository` + `Kustomization` attachment objects
4. required Doppler secrets exist
5. `apps` is unsuspended if you are using the top-level `apps` layer
## Verification Commands
From a machine with cluster access:
```bash
kubectl -n flux-system get gitrepositories,kustomizations
kubectl get ns
kubectl -n my-app get deploy,svc,pods,externalsecret,secret
```
If private over Tailscale:
```bash
kubectl -n my-app get svc my-app-tailscale -o wide
```
## Minimal Recommendation
If you want the simplest, lowest-risk first deploy:
1. create a separate app repo
2. add `deploy/base` + `deploy/prod`
3. add a `GitRepository` + `Kustomization` in this repo under `apps/`
4. keep the app private with a Tailscale `LoadBalancer` Service
5. use Doppler + `ExternalSecret` for runtime config
That matches the current cluster design with the least surprise.
+190 -187
README.md (View File)
@@ -1,281 +1,284 @@
# Hetzner Kubernetes Cluster # Proxmox Kubernetes Cluster
Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible. Private HA K3s cluster on Proxmox, provisioned by Terraform, bootstrapped by Ansible, and reconciled by Flux.
## Architecture ## Architecture
| Component | Details | | Component | Current Baseline |
|-----------|---------| |-----------|------------------|
| **Control Plane** | 3x CX23 (HA) | | **Control plane** | 3 Proxmox VMs, VMIDs `200-202`, IPs `10.27.27.30-32`, 2 vCPU / 4 GiB / 32 GiB |
| **Workers** | 4x CX33 | | **Workers** | 5 Proxmox VMs, VMIDs `210-214`, IPs `10.27.27.41-45`, 4 vCPU / 8 GiB / 64 GiB |
| **Total Cost** | €28.93/mo | | **Kubernetes** | K3s `v1.34.6+k3s1`, HA embedded etcd, kube-vip API VIP `10.27.27.40` |
| **K8s** | k3s (latest, HA) | | **Proxmox** | Node `flex`, template VMID `9000`, datastore `Flash`, bridge `vmbr0` |
| **Addons** | Hetzner CCM + CSI | | **Storage** | Raw-manifest `nfs-subdir-external-provisioner`, `10.27.27.239:/TheFlash/k8s-nfs`, default StorageClass `flash-nfs` |
| **Access** | SSH/API restricted to Tailnet | | **GitOps** | Flux source `platform` on branch `main`; `apps` Kustomization is intentionally suspended |
| **Bootstrap** | Terraform + Ansible | | **Private access** | Tailscale operator exposes Rancher, Grafana, and Prometheus; no public ingress baseline |
| **Runtime secrets** | Doppler service token bootstraps External Secrets Operator |
### Cluster Resources K3s is pinned because Rancher chart `2.13.3` requires Kubernetes `<1.35.0-0`.
- 22 vCPU total (6 CP + 16 workers)
- 44 GB RAM total (12 CP + 32 workers)
- 440 GB SSD storage
- 140 TB bandwidth allocation
## Prerequisites ## Prerequisites
### 1. Hetzner Cloud API Token - Terraform `>= 1.0`.
- Ansible with Python `jinja2` and `pyyaml`.
- `kubectl` for local verification.
- Proxmox API token for the `bpg/proxmox` provider.
- S3-compatible bucket for Terraform state, currently Backblaze B2.
- SSH key pair available to Terraform and Ansible, defaulting to `~/.ssh/infra` and `~/.ssh/infra.pub`.
1. Go to [Hetzner Cloud Console](https://console.hetzner.com/) Expected Proxmox inputs:
2. Select your project (or create a new one)
3. Navigate to **Security****API Tokens**
4. Click **Generate API Token**
5. Set description: `k8s-cluster-terraform`
6. Select permissions: **Read & Write**
7. Click **Generate API Token**
8. **Copy the token immediately** - it won't be shown again!
### 2. Backblaze B2 Bucket (for Terraform State) | Setting | Value |
|---------|-------|
| Endpoint | `https://100.105.0.115:8006/` |
| Node | `flex` |
| Clone source | Template VMID `9000` (`ubuntu-2404-k8s-template`) |
| Storage | `Flash` |
1. Go to [Backblaze B2](https://secure.backblaze.com/b2_buckets.htm) ## Local Setup
2. Click **Create a Bucket**
3. Set bucket name: `k8s-terraform-state` (must be globally unique)
4. Choose **Private** access
5. Click **Create Bucket**
6. Create application key:
- Go to **App Keys****Add a New Application Key**
- Name: `terraform-state`
- Allow access to: `k8s-terraform-state` bucket only
- Type: **Read and Write**
- Copy **keyID** (access key) and **applicationKey** (secret key)
7. Note your bucket's S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`)
### 3. SSH Key Pair Create local variables from the example:
```bash
ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s
```
### 4. Local Tools
- [Terraform](https://terraform.io/downloads) >= 1.0
- [Ansible](https://docs.ansible.com/ansible/latest/installation_guide/intro_installation.html) >= 2.9
- Python 3 with `jinja2` and `pyyaml`
## Setup
### 1. Clone Repository
```bash
git clone <your-gitea-repo>/HetznerTerra.git
cd HetznerTerra
```
### 2. Configure Variables
```bash ```bash
cp terraform.tfvars.example terraform.tfvars cp terraform.tfvars.example terraform.tfvars
``` ```
Edit `terraform.tfvars`: Important defaults in `terraform.tfvars.example`:
```hcl ```hcl
hcloud_token = "your-hetzner-api-token" proxmox_endpoint = "https://100.105.0.115:8006/"
proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
proxmox_api_token_secret = "your-proxmox-api-token-secret"
ssh_public_key = "~/.ssh/hetzner_k8s.pub" ssh_public_key = "~/.ssh/infra.pub"
ssh_private_key = "~/.ssh/hetzner_k8s" ssh_private_key = "~/.ssh/infra"
s3_access_key = "your-backblaze-key-id" s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key" s3_secret_key = "your-backblaze-application-key"
s3_endpoint = "https://s3.eu-central-003.backblazeb2.com" s3_endpoint = "https://s3.eu-central-003.backblazeb2.com"
s3_bucket = "k8s-terraform-state" s3_bucket = "k8s-terraform-state"
tailscale_auth_key = "tskey-auth-..." tailscale_tailnet = "yourtailnet.ts.net"
tailscale_tailnet = "yourtailnet.ts.net" kube_api_vip = "10.27.27.40"
restrict_api_ssh_to_tailnet = true
tailnet_cidr = "100.64.0.0/10"
enable_nodeport_public = false
allowed_ssh_ips = []
allowed_api_ips = []
``` ```
### 3. Initialize Terraform Initialize Terraform with backend credentials:
```bash ```bash
cd terraform terraform -chdir=terraform init \
-backend-config="endpoint=<s3-endpoint>" \
# Create backend config file (or use CLI args) -backend-config="bucket=<s3-bucket>" \
cat > backend.hcl << EOF -backend-config="region=auto" \
endpoint = "https://s3.eu-central-003.backblazeb2.com" -backend-config="access_key=<s3-access-key>" \
bucket = "k8s-terraform-state" -backend-config="secret_key=<s3-secret-key>" \
access_key = "your-backblaze-key-id" -backend-config="skip_requesting_account_id=true"
secret_key = "your-backblaze-application-key"
skip_requesting_account_id = true
EOF
terraform init -backend-config=backend.hcl
``` ```
### 4. Plan and Apply ## Common Commands
Terraform:
```bash ```bash
terraform plan -var-file=../terraform.tfvars terraform -chdir=terraform fmt -recursive
terraform apply -var-file=../terraform.tfvars terraform -chdir=terraform validate
terraform -chdir=terraform plan -var-file=../terraform.tfvars
terraform -chdir=terraform apply -var-file=../terraform.tfvars
``` ```
### 5. Generate Ansible Inventory Ansible setup:
```bash ```bash
cd ../ansible ansible-galaxy collection install -r ansible/requirements.yml
cd ansible
python3 generate_inventory.py python3 generate_inventory.py
ansible-playbook site.yml --syntax-check
``` ```
### 6. Bootstrap Cluster Manual Ansible bootstrap uses the same extra vars as the deploy workflow:
```bash ```bash
ansible-playbook site.yml cd ansible
ansible-playbook site.yml \
-e "tailscale_auth_key=$TAILSCALE_AUTH_KEY" \
-e "tailscale_tailnet=$TAILSCALE_TAILNET" \
-e "tailscale_oauth_client_id=$TAILSCALE_OAUTH_CLIENT_ID" \
-e "tailscale_oauth_client_secret=$TAILSCALE_OAUTH_CLIENT_SECRET" \
-e "doppler_hetznerterra_service_token=$DOPPLER_HETZNERTERRA_SERVICE_TOKEN" \
-e "tailscale_api_key=${TAILSCALE_API_KEY:-}" \
-e "grafana_admin_password=${GRAFANA_ADMIN_PASSWORD:-}" \
-e "cluster_name=k8s-cluster"
``` ```
### 7. Get Kubeconfig Flux/Kustomize verification:
```bash ```bash
kubectl kustomize infrastructure/addons/<addon>
kubectl kustomize infrastructure/addons
kubectl kustomize clusters/prod/flux-system
```
Refresh kubeconfig after rebuilds:
```bash
scripts/refresh-kubeconfig.sh 10.27.27.30
export KUBECONFIG=$(pwd)/outputs/kubeconfig export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl get nodes kubectl get nodes
``` ```
Kubeconfig endpoint is rewritten to the primary control-plane tailnet hostname (`k8s-cluster-cp-1.<your-tailnet>`). Run the tailnet smoke check from cp1:
```bash
ssh ubuntu@10.27.27.30 'bash -s' < scripts/smoke-check-tailnet-services.sh
```
## Gitea CI/CD ## Gitea CI/CD
This repository includes Gitea workflows for: The supported full rebuild path is the Gitea deploy workflow.
- **terraform-plan**: Runs on PRs, shows planned changes | Workflow | Trigger | Purpose |
- **terraform-apply**: Runs on main branch after merge |----------|---------|---------|
- **ansible-deploy**: Runs after terraform apply | `.gitea/workflows/deploy.yml` | PR to `main`, push to `main`, manual dispatch | PRs run Terraform plan; pushes run Terraform apply, Ansible bootstrap, Flux bootstrap, addon gates, health checks, and tailnet smoke checks |
| `.gitea/workflows/destroy.yml` | Manual dispatch with `confirm: destroy` | Terraform destroy with retries; no Rancher backup gate |
| `.gitea/workflows/dashboards.yml` | Grafana content changes or manual dispatch | Fast Grafana datasource/dashboard update through `ansible/dashboards.yml` |
### Required Gitea Secrets Deploy and destroy share `concurrency.group: prod-cluster` so they do not run at the same time.
Set these in your Gitea repository settings (**Settings** → **Secrets****Actions**): Deploy sequence on push to `main`:
1. Terraform fmt/init/validate/plan/apply.
2. Cleanup/retry around known transient Proxmox clone and disk-update failures.
3. Generate Ansible inventory from Terraform outputs.
4. Prepare critical image archives with `skopeo` on the runner.
5. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig.
6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph.
7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability.
8. Run post-deploy health checks and Tailscale service smoke checks.
Required Gitea secrets:
| Secret | Description |
|--------|-------------|
| `PROXMOX_ENDPOINT` | Proxmox API endpoint, for example `https://100.105.0.115:8006/` |
| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID |
| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret |
| `S3_ACCESS_KEY` | S3/Backblaze access key for Terraform state |
| `S3_SECRET_KEY` | S3/Backblaze secret key for Terraform state |
| `S3_ENDPOINT` | S3 endpoint, for example `https://s3.eu-central-003.backblazeb2.com` |
| `S3_BUCKET` | Terraform state bucket, for example `k8s-terraform-state` |
| `TAILSCALE_AUTH_KEY` | Tailscale auth key for node bootstrap |
| `TAILSCALE_TAILNET` | Tailnet domain, for example `silverside-gopher.ts.net` |
| `TAILSCALE_OAUTH_CLIENT_ID` | Tailscale OAuth client ID for the Kubernetes operator |
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for the Kubernetes operator |
| `TAILSCALE_API_KEY` | Optional API key used to delete stale offline reserved devices before service proxies exist |
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for runtime cluster secrets |
| `GRAFANA_ADMIN_PASSWORD` | Optional Grafana admin password |
| `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content |
## GitOps Graph
Flux entrypoint:
```text
clusters/prod/flux-system/
├── gotk-components.yaml
├── gitrepository-platform.yaml
├── kustomization-infrastructure.yaml
└── kustomization-apps.yaml   # suspend: true
```
Active infrastructure addons from `infrastructure/addons/kustomization.yaml`:
- `addon-nfs-storage`
- `addon-external-secrets`
- `addon-cert-manager`
- `addon-tailscale-operator`
- `addon-tailscale-proxyclass`
- `traefik` HelmRelease manifests applied directly by the top-level infrastructure Kustomization
- `addon-observability`
- `addon-observability-content`
- `addon-rancher`
- `addon-rancher-config`
Chart/source strategy:
- Vendored charts are intentional: `cert-manager`, `traefik`, `kube-prometheus-stack`, `tailscale-operator`, and `rancher` live under `infrastructure/charts/`.
- External Secrets, Loki, and Promtail use Flux `OCIRepository` sources.
- NFS storage is raw Kubernetes manifests, not a Helm chart.
- Rancher backup/restore is not part of the current live graph.
Doppler bootstrap details:
- `ansible/roles/doppler-bootstrap` creates the `external-secrets` namespace and the Doppler token secret only.
- The deploy workflow creates `ClusterSecretStore/doppler-hetznerterra` after ESO CRDs and webhook endpoints exist.
- The checked-in `infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml` is not included by the addon kustomization.
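For reference, a minimal sketch of the ClusterSecretStore the workflow applies, following the External Secrets Doppler provider schema; the workflow's actual manifest may carry extra fields.
```bash
# Sketch of the workflow-applied store (assumes ESO CRDs and webhook are already healthy).
kubectl apply -f - <<'EOF'
apiVersion: external-secrets.io/v1beta1
kind: ClusterSecretStore
metadata:
  name: doppler-hetznerterra
spec:
  provider:
    doppler:
      auth:
        secretRef:
          dopplerToken:
            name: doppler-hetznerterra-service-token
            key: dopplerToken
            namespace: external-secrets
EOF
kubectl get clustersecretstore doppler-hetznerterra
```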
## Access URLs
| Service | URL |
|---------|-----|
| Rancher | `https://rancher.silverside-gopher.ts.net/` |
| Grafana | `http://grafana.silverside-gopher.ts.net/` |
| Prometheus | `http://prometheus.silverside-gopher.ts.net:9090/` |
Fallback port-forward from a tailnet-connected machine:
```bash
export KUBECONFIG=$(pwd)/outputs/kubeconfig
kubectl -n observability port-forward svc/kube-prometheus-stack-grafana 3000:80
kubectl -n observability port-forward svc/kube-prometheus-stack-prometheus 9090:9090
```
Grafana user is `admin`; password comes from the `GRAFANA_ADMIN_PASSWORD` Doppler secret or the workflow-provided fallback.
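If the effective password is unclear, it can usually be read back from the rendered Grafana secret; the secret and key names below assume kube-prometheus-stack defaults.
```bash
# Assumes the chart-default secret name; adjust if the release overrides it.
kubectl -n observability get secret kube-prometheus-stack-grafana \
  -o jsonpath='{.data.admin-password}' | base64 -d; echo
```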
## Operations
Scale workers by updating `terraform.tfvars` counts, IP lists, and VMID lists together. If node names or VMIDs change, also update the hard-coded retry cleanup target map in `.gitea/workflows/deploy.yml`.
Upgrade K3s by changing the role defaults in `ansible/roles/k3s-server/defaults/main.yml` and `ansible/roles/k3s-agent/defaults/main.yml`. Check Rancher chart compatibility before moving to a Kubernetes minor version outside Rancher's supported range (`< 1.35.0-0`).
Destroy through the Gitea `Destroy` workflow with `confirm: destroy`, or locally with:
```bash
terraform -chdir=terraform destroy -var-file=../terraform.tfvars
```
## Troubleshooting
Check K3s from cp1:
```bash
ssh ubuntu@10.27.27.30 'sudo k3s kubectl get nodes -o wide'
ssh ubuntu@10.27.27.30 'sudo journalctl -u k3s -n 120 --no-pager'
```
Check Flux and Rancher:
```bash
kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
kubectl -n flux-system describe helmrelease rancher
kubectl -n cattle-system get pods,deploy -o wide
```
Check Tailscale services:
```bash
kubectl -n tailscale-system get pods
kubectl -n cattle-system get svc rancher-tailscale
kubectl -n observability get svc grafana-tailscale prometheus-tailscale
kubectl -n cattle-system describe svc rancher-tailscale | grep TailscaleProxyReady
kubectl -n observability describe svc grafana-tailscale | grep TailscaleProxyReady
kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxyReady
```
If local `kubectl` falls back to `localhost:8080`, refresh `outputs/kubeconfig` with `scripts/refresh-kubeconfig.sh 10.27.27.30`.
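To see which endpoint the current kubeconfig actually targets before refreshing:
```bash
echo "KUBECONFIG=${KUBECONFIG:-unset}"
kubectl config view --minify -o jsonpath='{.clusters[0].cluster.server}'; echo
```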
## Security Notes
- Never commit `terraform.tfvars`, kubeconfigs, private keys, `outputs/`, or real secret values.
- Terraform/bootstrap/CI secrets stay in Gitea Actions secrets.
- Runtime cluster secrets are sourced from Doppler through External Secrets.
- This repo does not manage Proxmox/LAN firewalls or public ingress.
## License
+100
@@ -0,0 +1,100 @@
# Gitea Secrets Setup
This document describes the secrets required for the Proxmox-based deployment workflow.
## Required Secrets
Add these secrets in your Gitea repository settings:
**Settings → Secrets → Actions**
### Infrastructure Secrets
#### `PROXMOX_ENDPOINT`
- Proxmox VE API endpoint
- Example: `https://100.105.0.115:8006/`
#### `PROXMOX_API_TOKEN_ID`
- Proxmox API token ID
- Example: `terraform-prov@pve!k8s-cluster`
#### `PROXMOX_API_TOKEN_SECRET`
- Proxmox API token secret
- Create with `pveum user token add terraform-prov@pve k8s-cluster`
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
- Get from: https://secure.backblaze.com/b2_buckets.htm
- Create application key with access to your terraform state bucket
#### `S3_ENDPOINT`
- Backblaze B2 S3 endpoint
- Example: `https://s3.eu-central-003.backblazeb2.com`
#### `S3_BUCKET`
- Backblaze B2 bucket name for Terraform state
- Example: `k8s-terraform-state`
### SSH Secrets
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)
### Tailscale Secrets
#### `TAILSCALE_AUTH_KEY`
- Tailscale auth key for node registration
- Get from: https://login.tailscale.com/admin/settings/keys
- Type: Reusable, Ephemeral
- Scope: `devices:core:write`
#### `TAILSCALE_TAILNET`
- Your Tailscale network name
- Example: `tail7ec33.ts.net` or your custom domain
#### `TAILSCALE_OAUTH_CLIENT_ID` & `TAILSCALE_OAUTH_CLIENT_SECRET`
- OAuth credentials for Tailscale Kubernetes Operator
- Get from: https://login.tailscale.com/admin/settings/oauth
- Create OAuth client with scope: `devices:core:write`
### Application Secrets
#### `DOPPLER_HETZNERTERRA_SERVICE_TOKEN`
- Doppler service token for the `hetznerterra` project runtime secrets
- Used by External Secrets Operator bootstrap
- Recommended scope: `hetznerterra` project, `prod` config only
#### `GRAFANA_ADMIN_PASSWORD`
- Transitional fallback only while migrating observability secrets to Doppler
- In steady state, store this in Doppler as `GRAFANA_ADMIN_PASSWORD`
## Setting Up Secrets
1. Go to your Gitea repository
2. Navigate to **Settings → Secrets → Actions**
3. Click **Add Secret**
4. Enter the secret name (exact match from above)
5. Paste the secret value
6. Click **Add Secret**
7. Repeat for all secrets
## Verification
After adding all secrets, trigger a workflow run:
```bash
git commit --allow-empty -m "ci: trigger workflow with new secrets"
git push
```
Check the workflow logs to verify all secrets are being used correctly.
## Security Notes
- Never commit secrets to the repository
- Use strong, unique passwords for Grafana and other services
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access
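A quick runner-side pre-flight (illustrative, using the example endpoint and token ID above) confirms both access paths before a deploy run:
```bash
# SSH reachability to a cluster VM and authenticated Proxmox API access.
ssh -i ~/.ssh/infra -o BatchMode=yes -o ConnectTimeout=5 ubuntu@10.27.27.30 true && echo "ssh ok"
curl -fsk \
  -H "Authorization: PVEAPIToken=terraform-prov@pve!k8s-cluster=${PROXMOX_API_TOKEN_SECRET}" \
  https://100.105.0.115:8006/api2/json/version && echo "proxmox api ok"
```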
+73
@@ -0,0 +1,73 @@
# Stable Private-Only Baseline
This document defines the current engineering target for this repository.
## Topology
- 3 control planes (HA etcd cluster)
- 5 workers
- kube-vip API VIP (`10.27.27.40`)
- private Proxmox/LAN network (`10.27.27.0/24`)
- Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
- Prometheus exposed through Tailscale (`prometheus.silverside-gopher.ts.net:9090`)
- `apps` Kustomization suspended by default
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap on Ubuntu cloud-init VMs
- **HA control plane (3 nodes with etcd quorum)**
- **kube-vip for Kubernetes API HA**
- **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation
- cert-manager
- Rancher and rancher-backup
- Rancher backup/restore validation
- Observability stack (Grafana, Prometheus, Loki, Promtail)
- Persistent volume provisioning validated
## Deferred for Later Phases
- app workloads in `apps/`
## Out of Scope
- public ingress or DNS
- public TLS
- app workloads
- cross-region / multi-cluster disaster recovery strategy
- upgrade strategy
## Phase Gates
1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP).
2. Primary control plane bootstraps with `--cluster-init`.
3. kube-vip advertises `10.27.27.40:6443` from the control-plane set.
4. Secondary control planes join via the kube-vip endpoint.
5. Workers join successfully via the kube-vip endpoint.
6. etcd reports 3 healthy members.
7. Flux source and infrastructure reconciliation are healthy.
8. **NFS provisioner deploys and creates `flash-nfs` StorageClass**.
9. **PVC provisioning tested and working** (see the spot-check sketch after this list).
10. External Secrets syncs the required secrets.
11. Tailscale private access works for Rancher, Grafana, and Prometheus.
12. CI smoke checks pass for Tailscale DNS resolution, `tailscale ping`, and HTTP reachability.
13. A fresh Rancher backup can be created and restored successfully.
14. Terraform destroy succeeds cleanly or via workflow retry.
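A few of these gates can be spot-checked by hand; the commands below are a sketch using the values defined in this document (VIP `10.27.27.40`, StorageClass `flash-nfs`).
```bash
nc -vz 10.27.27.40 6443                 # gate 3: kube-vip answers on the API VIP
kubectl get nodes -o wide               # gates 4-5: control planes and workers joined and Ready
kubectl get storageclass flash-nfs      # gate 8: NFS StorageClass exists
kubectl create -f - <<'EOF'             # gate 9: a PVC binds against flash-nfs
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: gate-check-pvc
spec:
  accessModes: ["ReadWriteOnce"]
  storageClassName: flash-nfs
  resources:
    requests:
      storage: 1Gi
EOF
kubectl wait pvc/gate-check-pvc --for=jsonpath='{.status.phase}'=Bound --timeout=120s
kubectl delete pvc gate-check-pvc
```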
## Success Criteria
Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes, no manual `kubectl` patching, and no manual Tailscale proxy recreation.
## Validated Drills
- 2026-04-18: live Rancher backup/restore drill succeeded on the current cluster.
- A fresh one-time backup was created, restored back onto the same cluster, and post-restore validation confirmed:
- all nodes remained `Ready`
- Flux infrastructure stayed healthy
- Rancher backup/restore resources reported `Completed`
- Rancher, Grafana, and Prometheus remained reachable through the Tailscale smoke checks
+2 -1
@@ -3,7 +3,8 @@ inventory = inventory.ini
host_key_checking = False
retry_files_enabled = False
roles_path = roles
stdout_callback = default
result_format = yaml
interpreter_python = auto_silent
[privilege_escalation]
+7
@@ -0,0 +1,7 @@
---
- name: Provision Grafana dashboards and datasources
hosts: control_plane[0]
become: true
roles:
- observability-content
+2
@@ -32,6 +32,7 @@ def main():
worker_names = outputs["worker_names"]["value"]
worker_ips = outputs["worker_ips"]["value"]
worker_private_ips = outputs["worker_private_ips"]["value"]
kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0])
control_planes = [
{
@@ -59,6 +60,7 @@
"control_planes": control_planes,
"workers": workers,
"private_key_file": outputs["ssh_private_key_path"]["value"],
"kube_api_lb_ip": kube_api_lb_ip,
}
env = Environment(loader=FileSystemLoader("."))
+2 -2
@@ -13,7 +13,7 @@ control_plane
workers
[cluster:vars]
ansible_user=ubuntu
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
kube_api_endpoint={{ kube_api_lb_ip }}
+2
@@ -3,3 +3,5 @@ collections:
version: ">=2.4.0"
- name: community.general
version: ">=8.0.0"
- name: community.network
version: ">=5.0.0"
@@ -0,0 +1,31 @@
---
- name: Ensure Tailscale operator namespace exists
command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
--dry-run=client -o yaml
register: tailscale_namespace_manifest
changed_when: false
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator namespace
command: kubectl apply -f -
args:
stdin: "{{ tailscale_namespace_manifest.stdout }}"
changed_when: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
- name: Apply Tailscale operator OAuth secret
shell: >-
kubectl -n {{ tailscale_operator_namespace | default('tailscale-system') }} create secret generic operator-oauth
--from-literal=client_id='{{ tailscale_oauth_client_id }}'
--from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when:
- tailscale_oauth_client_id | default('') | length > 0
- tailscale_oauth_client_secret | default('') | length > 0
@@ -0,0 +1,12 @@
---
bootstrap_prepull_images:
- docker.io/rancher/mirrored-pause:3.6
- docker.io/rancher/mirrored-coredns-coredns:1.14.2
- docker.io/rancher/mirrored-metrics-server:v0.8.1
- docker.io/rancher/local-path-provisioner:v0.0.35
- docker.io/rancher/mirrored-library-traefik:3.6.10
- docker.io/rancher/klipper-helm:v0.9.14-build20260309
- ghcr.io/fluxcd/source-controller:v1.8.0
- ghcr.io/fluxcd/kustomize-controller:v1.8.1
- ghcr.io/fluxcd/helm-controller:v1.5.1
- ghcr.io/fluxcd/notification-controller:v1.8.1
@@ -0,0 +1,59 @@
---
- name: Check for runner-provided bootstrap image archives
stat:
path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
delegate_to: localhost
become: false
register: bootstrap_image_archive_stats
loop: "{{ bootstrap_prepull_images }}"
- name: Ensure remote bootstrap image archive directory exists
file:
path: /tmp/bootstrap-image-archives
state: directory
mode: "0755"
- name: Copy runner-provided bootstrap image archives
copy:
src: "{{ item.stat.path }}"
dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
mode: "0644"
loop: "{{ bootstrap_image_archive_stats.results }}"
loop_control:
label: "{{ item.item }}"
when: item.stat.exists
- name: Import or pull bootstrap images into containerd
shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "already present"
exit 0
fi
archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
if [ -s "${archive}" ]; then
for attempt in 1 2 3; do
if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "imported image"
exit 0
fi
sleep 10
done
fi
for attempt in 1 2 3 4 5; do
if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
echo "pulled image"
exit 0
fi
sleep 10
done
exit 1
args:
executable: /bin/bash
register: bootstrap_image_pull
loop: "{{ bootstrap_prepull_images }}"
changed_when: "'imported image' in bootstrap_image_pull.stdout or 'pulled image' in bootstrap_image_pull.stdout"
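The archives consumed above are produced on the runner; a minimal sketch with `skopeo`, matching the `[/:]` to `_` naming used by this task (image list shortened for illustration):
```bash
# Sketch: write one docker-archive tarball per image into outputs/bootstrap-image-archives/.
mkdir -p outputs/bootstrap-image-archives
while read -r image; do
  archive="outputs/bootstrap-image-archives/$(echo "$image" | tr '/:' '__').tar"
  skopeo copy "docker://${image}" "docker-archive:${archive}:${image}"
done <<'EOF'
docker.io/rancher/mirrored-pause:3.6
ghcr.io/fluxcd/source-controller:v1.8.0
EOF
```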
-4
@@ -1,4 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
hcloud_lb_location: "nbg1"
-88
@@ -1,88 +0,0 @@
---
- name: Check if Hetzner CCM is already deployed
command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
register: ccm_namespace
failed_when: false
changed_when: false
- name: Create Hetzner cloud secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CCM
command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
changed_when: true
- name: Detect CCM workload kind
shell: |
if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo deployment
elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
echo daemonset
else
echo missing
fi
register: ccm_workload_kind
changed_when: false
- name: Wait for CCM deployment rollout
command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_deploy
until: ccm_rollout_deploy.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "deployment"
- name: Wait for CCM daemonset rollout
command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
register: ccm_rollout_ds
until: ccm_rollout_ds.rc == 0
changed_when: false
retries: 30
delay: 10
when: ccm_workload_kind.stdout == "daemonset"
- name: Set default Hetzner load balancer location for Traefik service
command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
register: traefik_annotation
changed_when: true
failed_when: false
- name: Show Traefik service when annotation patch fails
command: kubectl -n kube-system get service traefik -o yaml
register: traefik_service_dump
changed_when: false
failed_when: false
when: traefik_annotation.rc != 0
- name: Fail when Traefik load balancer annotation cannot be set
fail:
msg: |
Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
Command output:
{{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
Service dump:
{{ traefik_service_dump.stdout | default('n/a') }}
when: traefik_annotation.rc != 0
- name: Show CCM namespace objects when workload missing
command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
register: ccm_ns_objects
changed_when: false
when: ccm_workload_kind.stdout == "missing"
- name: Fail when CCM workload is missing
fail:
msg: |
hcloud-cloud-controller-manager workload not found after applying manifest.
Namespace objects:
{{ ccm_ns_objects.stdout | default('n/a') }}
when: ccm_workload_kind.stdout == "missing"
+52 -6
@@ -1,12 +1,32 @@
---
- name: Check if cloud-init is installed
command: which cloud-init
register: cloud_init_binary
changed_when: false
failed_when: false
- name: Wait for cloud-init to finish first-boot tasks
command: cloud-init status --wait
register: cloud_init_wait
changed_when: false
failed_when: >-
cloud_init_wait.rc not in [0, 2] or
(
'status: done' not in cloud_init_wait.stdout and
'status: disabled' not in cloud_init_wait.stdout
)
when: cloud_init_binary.rc == 0
- name: Update apt cache
apt:
update_cache: true
cache_valid_time: 3600
lock_timeout: 600
- name: Upgrade packages
apt:
upgrade: dist
lock_timeout: 600
when: common_upgrade_packages | default(false)
- name: Install required packages
@@ -19,18 +39,27 @@
- lsb-release
- software-properties-common
- jq
- nfs-common
- htop
- vim
state: present
lock_timeout: 600
- name: Check active swap
command: swapon --noheadings
register: active_swap
changed_when: false
failed_when: false
- name: Disable swap
command: swapoff -a
changed_when: true
when: active_swap.stdout | trim | length > 0
- name: Remove swap from fstab
lineinfile:
path: /etc/fstab
regexp: '^\s*[^#]\S+\s+\S+\s+swap\s+.*$'
state: absent
- name: Load br_netfilter module
@@ -66,6 +95,10 @@
- name: Install tailscale
shell: curl -fsSL https://tailscale.com/install.sh | sh
register: tailscale_install
until: tailscale_install.rc == 0
retries: 5
delay: 15
when:
- tailscale_auth_key | length > 0
- tailscale_binary.rc != 0
@@ -78,9 +111,22 @@
failed_when: false
when: tailscale_auth_key | length > 0
- name: Parse tailscale connection state
set_fact:
tailscale_backend_state: "{{ (tailscale_status.stdout | from_json).BackendState | default('') }}"
when:
- tailscale_auth_key | length > 0
- tailscale_status.rc == 0
- tailscale_status.stdout | length > 0
- name: Connect node to tailnet
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
register: tailscale_up
until: tailscale_up.rc == 0
retries: 5
delay: 15
no_log: true
when:
- tailscale_auth_key | length > 0
- tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
changed_when: true
-15
@@ -1,15 +0,0 @@
---
hcloud_token: ""
cluster_name: "k8s-cluster"
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
csi_rollout_timeout_seconds: 30
csi_rollout_retries: 8
csi_rollout_delay_seconds: 5
csi_failure_log_tail_lines: 120
csi_smoke_test_enabled: true
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
csi_smoke_test_base_storage_class: "hcloud-volumes"
csi_smoke_test_size: "1Gi"
csi_smoke_test_pvc_timeout_seconds: 300
csi_smoke_test_job_timeout_seconds: 300
csi_smoke_test_required: false
-425
@@ -1,425 +0,0 @@
---
- name: Create Hetzner CSI secret
shell: |
kubectl -n kube-system create secret generic hcloud \
--from-literal=token='{{ hcloud_token }}' \
--from-literal=network='{{ cluster_name }}-network' \
--dry-run=client -o yaml | kubectl apply -f -
no_log: true
when: hcloud_token is defined
changed_when: true
- name: Deploy Hetzner CSI
command: kubectl apply -f {{ csi_manifest_url }}
changed_when: true
- name: Ensure CSI controller endpoint is set for sidecars
command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Ensure CSI node endpoint is set for sidecars
command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
changed_when: true
- name: Restart CSI controller to pick up current secret
command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
changed_when: true
- name: Wait for CSI controller deployment generation
command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
failed_when: false
changed_when: false
- name: Wait for CSI controller rollout
command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_controller_rollout
until: csi_controller_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Show CSI controller status on failure
command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
register: csi_controller_deploy_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI controller pods on failure
command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
register: csi_controller_pods_status
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller deployment on failure
command: kubectl -n kube-system describe deployment hcloud-csi-controller
register: csi_controller_deploy_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Describe CSI controller pod on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_controller_pod_describe
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver logs on failure
command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
register: csi_driver_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show CSI driver previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
fi
register: csi_driver_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show sidecar previous logs on failure
shell: |
pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
for container in csi-attacher csi-resizer csi-provisioner; do
echo "===== $container ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
fi
register: csi_sidecar_previous_logs
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Show recent kube-system events on failure
command: kubectl -n kube-system get events --sort-by=.lastTimestamp
register: csi_recent_events
changed_when: false
failed_when: false
when: csi_controller_rollout.rc != 0
- name: Fail with CSI controller diagnostics
fail:
msg: |
CSI controller rollout failed.
Deployment status:
{{ csi_controller_deploy_status.stdout | default('n/a') }}
Pods status:
{{ csi_controller_pods_status.stdout | default('n/a') }}
Deployment describe:
{{ csi_controller_deploy_describe.stdout | default('n/a') }}
Pod describe:
{{ csi_controller_pod_describe.stdout | default('n/a') }}
hcloud-csi-driver logs:
{{ csi_driver_logs.stdout | default('n/a') }}
hcloud-csi-driver previous logs:
{{ csi_driver_previous_logs.stdout | default('n/a') }}
Sidecar previous logs:
{{ csi_sidecar_previous_logs.stdout | default('n/a') }}
Recent kube-system events:
{{ csi_recent_events.stdout | default('n/a') }}
when: csi_controller_rollout.rc != 0
- name: Wait for CSI node daemonset rollout
command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
register: csi_node_rollout
until: csi_node_rollout.rc == 0
retries: "{{ csi_rollout_retries | int }}"
delay: "{{ csi_rollout_delay_seconds | int }}"
failed_when: false
changed_when: false
- name: Fail when CSI node daemonset rollout does not complete
fail:
msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
when: csi_node_rollout.rc != 0
- name: Generate CSI smoke test run identifier
set_fact:
csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
when: csi_smoke_test_enabled | bool
- name: Generate unique CSI smoke test resource names
set_fact:
csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
when: csi_smoke_test_enabled | bool
- name: Cleanup stale CSI smoke test resources before apply
shell: |
kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Apply CSI smoke test resources
shell: |
kubectl apply -f - <<'EOF'
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: {{ csi_smoke_test_storage_class }}
provisioner: csi.hetzner.cloud
reclaimPolicy: Delete
volumeBindingMode: Immediate
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ csi_smoke_test_pvc_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ csi_smoke_test_size }}
storageClassName: {{ csi_smoke_test_storage_class }}
---
apiVersion: batch/v1
kind: Job
metadata:
name: {{ csi_smoke_test_job_name }}
namespace: kube-system
labels:
app.kubernetes.io/name: csi-smoke
spec:
backoffLimit: 0
template:
spec:
restartPolicy: Never
containers:
- name: write-and-read
image: busybox:1.36
command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
volumeMounts:
- name: data
mountPath: /data
volumes:
- name: data
persistentVolumeClaim:
claimName: {{ csi_smoke_test_pvc_name }}
EOF
changed_when: true
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke PVC to bind
command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
register: csi_smoke_pvc_wait
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Wait for CSI smoke Job completion
command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
register: csi_smoke_job_wait
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc == 0
- name: Show CSI smoke job logs
command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
register: csi_smoke_job_logs
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
- name: Show CSI smoke PVC on failure
command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
register: csi_smoke_pvc_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke Job on failure
command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_job_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pods on failure
command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
register: csi_smoke_pod_status
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI smoke PVC on failure
command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
register: csi_smoke_pvc_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show storage classes on failure
command: kubectl get storageclass
register: csi_storageclasses
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Get CSI controller pod name on smoke failure
shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
register: csi_controller_pod_name
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Describe CSI controller pod on smoke failure
command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
register: csi_controller_pod_smoke_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI controller container logs on smoke failure
shell: |
pod="{{ csi_controller_pod_name.stdout }}"
for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
echo "===== ${container}: current ====="
kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
echo "===== ${container}: previous ====="
kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
done
register: csi_controller_container_logs
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_controller_pod_name.stdout | length > 0
- name: Show CSI driver and node driver objects on smoke failure
shell: |
echo "===== CSIDriver ====="
kubectl get csidriver csi.hetzner.cloud -o yaml || true
echo "===== CSINode ====="
kubectl get csinode -o wide || true
register: csi_driver_objects
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Show CSI smoke pod describe on failure
shell: |
pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
if [ -n "$pod" ]; then
kubectl -n kube-system describe pod "$pod"
fi
register: csi_smoke_pod_describe
failed_when: false
changed_when: false
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- name: Fail when CSI smoke test fails
fail:
msg: |
CSI smoke test failed.
PVC wait:
stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait:
stdout: {{ csi_smoke_job_wait.stdout | default('') }}
stderr: {{ csi_smoke_job_wait.stderr | default('') }}
PVC:
{{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
Job:
{{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
Pod list:
{{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
PVC describe:
{{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
Storage classes:
{{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
CSI controller pod:
{{ csi_controller_pod_name.stdout | default('n/a') }}
CSI controller pod describe:
{{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
CSI controller container logs:
{{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
CSI driver objects:
{{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
Pod describe:
{{ csi_smoke_pod_describe.stdout | default('n/a') }}
Job logs:
{{ csi_smoke_job_logs.stdout | default('n/a') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- csi_smoke_test_required | bool
- name: Warn when CSI smoke test fails but is non-blocking
debug:
msg: |
CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
when:
- csi_smoke_test_enabled | bool
- csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
- not (csi_smoke_test_required | bool)
- name: Cleanup CSI smoke test resources
shell: |
kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found
kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
failed_when: false
changed_when: false
when: csi_smoke_test_enabled | bool
@@ -0,0 +1,24 @@
---
- name: Ensure Doppler service token is provided
assert:
that:
- doppler_hetznerterra_service_token | length > 0
fail_msg: doppler_hetznerterra_service_token must be provided for External Secrets bootstrap.
- name: Ensure external-secrets namespace exists
shell: kubectl create namespace external-secrets --dry-run=client -o yaml | kubectl apply -f -
changed_when: true
- name: Apply Doppler service token secret
shell: >-
kubectl -n external-secrets create secret generic doppler-hetznerterra-service-token
--from-literal=dopplerToken='{{ doppler_hetznerterra_service_token }}'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
- name: Note pending Doppler ClusterSecretStore bootstrap
debug:
msg: >-
Doppler service token secret is bootstrapped. The deploy workflow creates the
ClusterSecretStore after External Secrets CRDs and webhook endpoints are ready.
+3 -1
@@ -1,5 +1,7 @@
---
k3s_version: v1.34.6+k3s1
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
+85 -12
@@ -1,25 +1,67 @@
---
- name: Check if k3s agent service exists
stat:
path: /etc/systemd/system/k3s-agent.service
register: k3s_agent_service
- name: Check k3s agent service state
command: systemctl is-active k3s-agent
register: k3s_agent_service_state
changed_when: false
failed_when: false
when: k3s_agent_service.stat.exists
- name: Check installed k3s version
command: k3s --version
register: installed_k3s_version
changed_when: false
failed_when: false
when: k3s_agent_service.stat.exists
- name: Determine whether k3s agent install is needed
set_fact:
k3s_agent_install_needed: >-
{{
(not k3s_agent_service.stat.exists)
or ((k3s_agent_service_state.stdout | default('')) != 'active')
or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
}}
- name: Download k3s install script
get_url:
url: https://get.k3s.io
dest: /tmp/install-k3s.sh
mode: "0755"
register: k3s_agent_install_script
until: k3s_agent_install_script is succeeded
retries: 5
delay: 10
when: k3s_agent_install_needed
- name: Install k3s agent
when: k3s_agent_install_needed
block:
- name: Wait for Kubernetes API endpoint before agent join
wait_for:
host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
port: 6443
state: started
timeout: 180
- name: Run k3s agent install
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_URL: "{{ k3s_server_url }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: k3s_agent_install
until: k3s_agent_install.rc == 0
retries: 3
delay: 20
- name: Wait for k3s agent to be ready
command: systemctl is-active k3s-agent
@@ -28,3 +70,34 @@
retries: 30
delay: 10
changed_when: false
- name: Show k3s-agent service status on failure
command: systemctl status k3s-agent --no-pager
register: k3s_agent_status
changed_when: false
failed_when: false
when: agent_status is failed
- name: Show recent k3s-agent logs on failure
command: journalctl -u k3s-agent -n 120 --no-pager
register: k3s_agent_journal
changed_when: false
failed_when: false
when: agent_status is failed
- name: Fail with k3s-agent diagnostics
fail:
msg: |
k3s agent failed to become ready on {{ inventory_hostname }}.
Install stdout:
{{ k3s_agent_install.stdout | default('n/a') }}
Install stderr:
{{ k3s_agent_install.stderr | default('n/a') }}
Service status:
{{ k3s_agent_status.stdout | default('n/a') }}
Recent logs:
{{ k3s_agent_journal.stdout | default('n/a') }}
when: agent_status is failed
+13 -1
@@ -1,5 +1,17 @@
---
k3s_version: v1.34.6+k3s1
k3s_token: ""
k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: false
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
# Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)
# Using DNS names instead of IPs since Tailscale IPs change on rebuild
tailscale_control_plane_names:
- "k8s-cluster-cp-1.silverside-gopher.ts.net"
- "k8s-cluster-cp-2.silverside-gopher.ts.net"
- "k8s-cluster-cp-3.silverside-gopher.ts.net"
+53 -44
@@ -11,13 +11,25 @@
failed_when: false
when: k3s_service.stat.exists
- name: Check installed k3s version
command: k3s --version
register: installed_k3s_version
changed_when: false
failed_when: false
when: k3s_service.stat.exists
- name: Determine whether k3s install is needed
set_fact:
k3s_install_needed: >-
{{
(not k3s_service.stat.exists)
or ((k3s_service_state.stdout | default('')) != 'active')
or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
}}
- name: Wait for API endpoint on 6443 (secondary only)
wait_for:
host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}"
port: 6443
state: started
timeout: 120
@@ -28,41 +40,56 @@
stat:
path: /usr/local/bin/k3s-uninstall.sh
register: k3s_uninstall_script
when: k3s_install_needed
- name: Reset broken k3s install before reinstall
command: /usr/local/bin/k3s-uninstall.sh
when:
- k3s_install_needed
- k3s_uninstall_script.stat.exists
- name: Remove stale k3s data
file:
path: "{{ item }}"
state: absent
loop:
- /etc/rancher/k3s
- /var/lib/rancher/k3s
when: k3s_install_needed
- name: Download k3s install script
get_url:
url: https://get.k3s.io
dest: /tmp/install-k3s.sh
mode: "0755"
register: k3s_install_script
until: k3s_install_script is succeeded
retries: 5
delay: 10
when: k3s_install_needed
- name: Install k3s server (primary)
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface={{ k3s_flannel_iface }}
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }}
{% for name in tailscale_control_plane_names %}--tls-san={{ name }} {% endfor %}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: primary_install
until: primary_install.rc == 0
retries: 3
delay: 20
when:
- k3s_install_needed
- k3s_primary | default(false)
@@ -75,37 +102,19 @@
environment:
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
K3S_TOKEN: "{{ k3s_token }}"
command: >-
/tmp/install-k3s.sh server
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
register: secondary_install
until: secondary_install.rc == 0
retries: 3
delay: 20
- name: Wait for k3s to be ready
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
@@ -0,0 +1,7 @@
---
kube_vip_version: v1.1.2
kube_vip_interface: "{{ ansible_default_ipv4.interface | default('eth0') }}"
kube_vip_address: "{{ kube_api_endpoint }}"
kube_vip_prepull_images:
- docker.io/rancher/mirrored-pause:3.6
- ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
@@ -0,0 +1,102 @@
---
- name: Check for runner-provided kube-vip image archive
stat:
path: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
delegate_to: localhost
become: false
register: kube_vip_bootstrap_archive
- name: Copy runner-provided kube-vip image archive
copy:
src: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
dest: /tmp/kube-vip-bootstrap.tar
mode: "0644"
when: kube_vip_bootstrap_archive.stat.exists
- name: Import runner-provided kube-vip image archive
command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
changed_when: false
when: kube_vip_bootstrap_archive.stat.exists
- name: Pre-pull kube-vip bootstrap images into containerd
shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "already present"
exit 0
fi
for attempt in 1 2 3; do
if timeout 120s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
echo "pulled image"
exit 0
fi
sleep 10
done
exit 1
args:
executable: /bin/bash
register: kube_vip_image_pull
loop: "{{ kube_vip_prepull_images }}"
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
- name: Render kube-vip control plane manifest
template:
src: kube-vip-control-plane.yaml.j2
dest: /tmp/kube-vip-control-plane.yaml
mode: "0644"
- name: Apply kube-vip control plane manifest
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
register: kube_vip_apply
until: kube_vip_apply.rc == 0
retries: 3
delay: 10
changed_when: true
- name: Wait for local kube-vip pod to be ready
shell: >-
kubectl -n kube-system get pods
-l app.kubernetes.io/name=kube-vip
--field-selector spec.nodeName={{ inventory_hostname }}
-o jsonpath='{.items[0].status.conditions[?(@.type=="Ready")].status}'
register: kube_vip_pod_ready
changed_when: false
until: kube_vip_pod_ready.stdout == "True"
retries: 30
delay: 10
- name: Show kube-vip pod status on failure
command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
register: kube_vip_pods
changed_when: false
failed_when: false
when: kube_vip_pod_ready is failed
- name: Describe kube-vip pod on failure
shell: >-
kubectl -n kube-system describe pod
$(kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip --field-selector spec.nodeName={{ inventory_hostname }} -o jsonpath='{.items[0].metadata.name}')
register: kube_vip_pod_describe
changed_when: false
failed_when: false
when: kube_vip_pod_ready is failed
- name: Fail with kube-vip diagnostics
fail:
msg: |
kube-vip failed to become ready on {{ inventory_hostname }}.
Pods:
{{ kube_vip_pods.stdout | default('n/a') }}
Describe:
{{ kube_vip_pod_describe.stdout | default('n/a') }}
when: kube_vip_pod_ready is failed
- name: Wait for API VIP on 6443
wait_for:
host: "{{ kube_vip_address }}"
port: 6443
state: started
timeout: 180
@@ -0,0 +1,110 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-vip
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: system:kube-vip-role
rules:
- apiGroups: [""]
resources: ["services/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["services", "endpoints"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list", "get", "watch", "update", "patch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["list", "get", "watch", "update", "create"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: system:kube-vip-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:kube-vip-role
subjects:
- kind: ServiceAccount
name: kube-vip
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kube-vip
namespace: kube-system
spec:
selector:
matchLabels:
app.kubernetes.io/name: kube-vip
template:
metadata:
labels:
app.kubernetes.io/name: kube-vip
spec:
serviceAccountName: kube-vip
hostNetwork: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: kube-vip
image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
imagePullPolicy: IfNotPresent
args:
- manager
env:
- name: vip_arp
value: "true"
- name: port
value: "6443"
- name: vip_interface
value: {{ kube_vip_interface | quote }}
- name: vip_subnet
value: "32"
- name: cp_enable
value: "true"
- name: cp_namespace
value: kube-system
- name: vip_ddns
value: "false"
- name: vip_leaderelection
value: "true"
- name: vip_leaseduration
value: "5"
- name: vip_renewdeadline
value: "3"
- name: vip_retryperiod
value: "1"
- name: address
value: {{ kube_vip_address | quote }}
securityContext:
capabilities:
add:
- NET_ADMIN
- NET_RAW
- SYS_TIME
@@ -0,0 +1,9 @@
---
observability_namespace: "observability"
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"
loki_enabled: true
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
grafana_use_prometheus_nodeport_fallback: true
grafana_use_loki_nodeport_fallback: true
@@ -0,0 +1,178 @@
---
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Set default Prometheus datasource URL
set_fact:
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
grafana_loki_effective_url: "{{ grafana_loki_url }}"
- name: Get Grafana pod name
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
register: grafana_pod_name
changed_when: false
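# Probe the in-cluster service URLs from inside the Grafana pod first; if a probe fails,
# the tasks below fall back to a NodePort URL built from the pod's host IP.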
- name: Probe Prometheus from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
register: grafana_prometheus_probe
changed_when: false
failed_when: false
- name: Probe Loki from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_loki_url }}/ready >/dev/null'
register: grafana_loki_probe
changed_when: false
failed_when: false
when: loki_enabled
- name: Get Prometheus pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
register: prometheus_host_ip
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Get Prometheus service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
register: prometheus_nodeport
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Enable Prometheus NodePort fallback datasource URL
set_fact:
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- prometheus_host_ip.stdout | length > 0
- prometheus_nodeport.stdout | length > 0
- name: Ensure Loki service uses NodePort for fallback
command: kubectl -n {{ observability_namespace }} patch svc loki -p '{"spec":{"type":"NodePort"}}'
changed_when: false
failed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod loki-0 -o jsonpath='{.status.hostIP}'
register: loki_host_ip
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Get Loki service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc loki -o jsonpath='{.spec.ports[?(@.name=="http-metrics")].nodePort}'
register: loki_nodeport
changed_when: false
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- name: Enable Loki NodePort fallback datasource URL
set_fact:
grafana_loki_effective_url: "http://{{ loki_host_ip.stdout }}:{{ loki_nodeport.stdout }}"
when:
- loki_enabled
- grafana_use_loki_nodeport_fallback | bool
- grafana_loki_probe.rc != 0
- loki_host_ip.stdout | length > 0
- loki_nodeport.stdout | length > 0
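# Retry while Loki responds but reports no indexed labels yet (i.e. Promtail has not shipped logs).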
- name: Query Loki labels endpoint from Grafana pod
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=10 {{ grafana_loki_effective_url }}/loki/api/v1/labels'
register: grafana_loki_labels
changed_when: false
failed_when: false
until: >-
grafana_loki_labels.rc != 0 or
'"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
retries: 30
delay: 10
when: loki_enabled
- name: Fail when Loki is reachable but has zero indexed labels
fail:
msg: >-
Loki is reachable from Grafana at {{ grafana_loki_effective_url }} but /loki/api/v1/labels returned no labels.
This usually means no logs are ingested yet. Check Promtail and tenant configuration.
when:
- loki_enabled
- grafana_loki_labels.rc == 0
- "'\"status\":\"success\"' in (grafana_loki_labels.stdout | replace(' ', ''))"
- "'\"data\":[]' in (grafana_loki_labels.stdout | replace(' ', ''))"
- name: Write default Prometheus datasource ConfigMap patch
template:
src: grafana-default-prometheus-datasource.yaml.j2
dest: /tmp/grafana-default-prometheus-datasource.yaml
mode: "0644"
- name: Apply default Prometheus datasource ConfigMap patch
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
changed_when: true
- name: Remove legacy Loki datasource ConfigMap
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
changed_when: false
failed_when: false
- name: Write Grafana datasources ConfigMap
template:
src: grafana-datasources.yaml.j2
dest: /tmp/grafana-datasources.yaml
mode: "0644"
when: loki_enabled
- name: Apply Grafana datasources ConfigMap
command: kubectl apply -f /tmp/grafana-datasources.yaml
changed_when: true
when: loki_enabled
- name: Restart Grafana to load datasource updates deterministically
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
changed_when: true
- name: Wait for Grafana rollout after datasource update
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Write Grafana dashboard ConfigMap
template:
src: grafana-dashboard-k8s-overview.yaml.j2
dest: /tmp/grafana-dashboard-k8s-overview.yaml
mode: "0644"
- name: Apply Grafana dashboard ConfigMap
command: kubectl apply -f /tmp/grafana-dashboard-k8s-overview.yaml
changed_when: true
- name: Show Grafana content provisioning summary
debug:
msg: |
Grafana content applied.
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
Loki datasource URL: {{ grafana_loki_effective_url }}
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_dashboard_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}
@@ -0,0 +1,18 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ grafana_datasource_configmap_name }}
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
{% if loki_enabled %}
- name: Loki
type: loki
access: proxy
url: "{{ grafana_loki_effective_url }}"
isDefault: false
{% endif %}
@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-prometheus-stack-grafana-datasource
namespace: {{ observability_namespace }}
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: "Prometheus"
type: prometheus
uid: prometheus
url: "{{ grafana_prometheus_effective_url }}/"
access: proxy
isDefault: true
jsonData:
httpMethod: POST
timeInterval: 30s
- name: "Alertmanager"
type: alertmanager
uid: alertmanager
url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus
@@ -0,0 +1,27 @@
---
observability_namespace: "observability"
prometheus_chart_version: "68.4.4"
loki_chart_version: "6.10.0"
promtail_chart_version: "6.16.6"
grafana_admin_password: ""
prometheus_storage_size: "10Gi"
grafana_storage_size: "5Gi"
loki_storage_size: "10Gi"
prometheus_storage_class: "local-path"
grafana_storage_class: "local-path"
loki_storage_class: "local-path"
loki_enabled: true
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
tailscale_tailnet: ""
observability_tailscale_expose: true
grafana_tailscale_hostname: "grafana"
prometheus_tailscale_hostname: "prometheus"
tailscale_proxyclass_name: "infra-stable"
@@ -0,0 +1,252 @@
---
- name: Check if Helm is installed
command: helm version --short
register: helm_check
changed_when: false
failed_when: false
- name: Install Helm
shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
when: helm_check.rc != 0
changed_when: true
- name: Ensure observability namespace exists
command: kubectl create namespace {{ observability_namespace }}
register: create_observability_ns
failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
changed_when: create_observability_ns.rc == 0
- name: Set Grafana admin password
set_fact:
grafana_password_effective: "{{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
- name: Write kube-prometheus-stack values
template:
src: kube-prometheus-stack-values.yaml.j2
dest: /tmp/kube-prometheus-stack-values.yaml
mode: "0644"
- name: Add Prometheus Helm repo
command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
register: add_prom_repo
failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
changed_when: add_prom_repo.rc == 0
- name: Add Grafana Helm repo
command: helm repo add grafana https://grafana.github.io/helm-charts
register: add_grafana_repo
failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
changed_when: add_grafana_repo.rc == 0
- name: Update Helm repos
command: helm repo update
changed_when: false
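# Helm refuses to upgrade a release that still has a pending-* revision secret from an
# interrupted run, so any such secrets are removed before the upgrade --install below.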
- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
shell: >-
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-upgrade -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-install -o name)
--ignore-not-found=true;
kubectl -n {{ observability_namespace }} delete
$(kubectl -n {{ observability_namespace }} get secret -l owner=helm,name=kube-prometheus-stack,status=pending-rollback -o name)
--ignore-not-found=true
changed_when: false
failed_when: false
- name: Install kube-prometheus-stack
command: >-
helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
--namespace {{ observability_namespace }}
--version {{ prometheus_chart_version }}
--values /tmp/kube-prometheus-stack-values.yaml
--wait
--timeout 10m
register: kube_prom_install
retries: 12
delay: 15
until: kube_prom_install.rc == 0
changed_when: true
- name: Wait for Grafana deployment rollout
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Reset Grafana admin password in Grafana database
shell: >-
kubectl -n {{ observability_namespace }} exec
"$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
-c grafana -- grafana cli admin reset-admin-password '{{ grafana_password_effective }}'
changed_when: true
- name: Write Loki values
template:
src: loki-values.yaml.j2
dest: /tmp/loki-values.yaml
mode: "0644"
when: loki_enabled
- name: Validate Loki chart produces resources
command: >-
helm template loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_template
changed_when: false
failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
when: loki_enabled
- name: Remove legacy Loki resources
command: >-
kubectl -n {{ observability_namespace }} delete
deployment/loki-gateway
statefulset/loki
statefulset/loki-chunks-cache
statefulset/loki-results-cache
statefulset/loki-backend
statefulset/loki-read
statefulset/loki-write
poddisruptionbudget/loki-memcached-chunks-cache
poddisruptionbudget/loki-memcached-results-cache
--ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Clear stuck Helm lock for Loki
command: kubectl -n {{ observability_namespace }} delete secret sh.helm.release.v1.loki.v1 --ignore-not-found=true
changed_when: false
failed_when: false
when: loki_enabled
- name: Uninstall failed Loki release (if stuck)
command: helm uninstall loki -n {{ observability_namespace }}
changed_when: false
failed_when: false
when: loki_enabled
- name: Install Loki
command: >-
helm upgrade --install loki grafana/loki
--namespace {{ observability_namespace }}
--version {{ loki_chart_version }}
--values /tmp/loki-values.yaml
register: loki_install
changed_when: true
when: loki_enabled
- name: Wait for Loki StatefulSet
command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
register: loki_rollout
changed_when: false
when: loki_enabled
- name: Show Loki pod status
command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
register: loki_pods
changed_when: false
when: loki_enabled
- name: Debug Loki pods
debug:
msg: "{{ loki_pods.stdout }}"
when: loki_enabled
- name: Write Promtail values
template:
src: promtail-values.yaml.j2
dest: /tmp/promtail-values.yaml
mode: "0644"
when: loki_enabled
- name: Install Promtail
command: >-
helm upgrade --install promtail grafana/promtail
--namespace {{ observability_namespace }}
--version {{ promtail_chart_version }}
--values /tmp/promtail-values.yaml
--wait
--timeout 10m
changed_when: true
when: loki_enabled
- name: Check Tailscale service readiness for Grafana
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: grafana_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale service readiness for Prometheus
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
register: prometheus_tailscale_ready
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
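# The doubled braces ({{"{{"}} / {{"}}"}}) are Jinja expressions that emit literal Go-template
# markers, so kubectl receives a plain go-template once Ansible has rendered the task.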
- name: Check Tailscale endpoint (IP/hostname) for Grafana
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: grafana_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Check Tailscale endpoint (IP/hostname) for Prometheus
shell: >-
kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
-o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
register: prometheus_lb_ip
changed_when: false
failed_when: false
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show Tailscale access details
debug:
msg: |
Observability stack deployed with Tailscale access!
Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}
Login: admin / {{ grafana_password_effective }}
Tailscale readiness:
- Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
- Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}
Access via:
- MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
- Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
- Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
when:
- observability_tailscale_expose | bool
- tailscale_operator_ready | default(false) | bool
- name: Show observability access details (fallback)
debug:
msg: |
Observability stack deployed.
Namespace: {{ observability_namespace }}
      Grafana (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (port-forward): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
Grafana admin password: {{ grafana_password_effective }}
{% if loki_enabled %}
Loki: Enabled - logs available in Grafana
{% else %}
Loki: Disabled
{% endif %}
when:
- not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasource-loki
namespace: {{ observability_namespace }}
labels:
grafana_datasource: "1"
data:
loki-datasource.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
isDefault: false
@@ -0,0 +1,46 @@
grafana:
enabled: true
adminPassword: {{ grafana_password_effective }}
persistence:
enabled: true
storageClassName: {{ grafana_storage_class }}
size: {{ grafana_storage_size }}
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ grafana_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheus:
service:
{% if observability_tailscale_expose and (tailscale_operator_ready | default(false)) %}
type: LoadBalancer
loadBalancerClass: tailscale
annotations:
tailscale.com/hostname: {{ prometheus_tailscale_hostname }}
tailscale.com/proxy-class: {{ tailscale_proxyclass_name }}
{% else %}
type: ClusterIP
{% endif %}
prometheusSpec:
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: {{ prometheus_storage_class }}
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: {{ prometheus_storage_size }}
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
@@ -0,0 +1,75 @@
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: {{ loki_storage_size }}
storageClass: {{ loki_storage_class }}
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
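# SingleBinary mode: every microservice target below is pinned to zero replicas so the chart
# renders only the single loki StatefulSet.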
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false
@@ -0,0 +1,3 @@
config:
clients:
- url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,6 @@
---
rancher_images_to_prepull:
- docker.io/rancher/rancher:v2.13.3
- docker.io/rancher/rancher-webhook:v0.9.3
- docker.io/rancher/system-upgrade-controller:v0.17.0
- docker.io/rancher/shell:v0.6.2
@@ -0,0 +1,59 @@
---
- name: Check for runner-provided Rancher image archives
stat:
path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
delegate_to: localhost
become: false
register: rancher_image_archive_stats
loop: "{{ rancher_images_to_prepull }}"
- name: Ensure remote Rancher image archive directory exists
file:
path: /tmp/bootstrap-image-archives
state: directory
mode: "0755"
- name: Copy runner-provided Rancher image archives
copy:
src: "{{ item.stat.path }}"
dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
mode: "0644"
loop: "{{ rancher_image_archive_stats.results }}"
loop_control:
label: "{{ item.item }}"
when: item.stat.exists
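# Prefer a locally staged archive; fall back to pulling from the registry with retries.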
- name: Import or pull Rancher images into containerd
shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "already present"
exit 0
fi
archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
if [ -s "${archive}" ]; then
for attempt in 1 2 3; do
if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "imported image"
exit 0
fi
sleep 10
done
fi
for attempt in 1 2 3 4 5; do
if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
echo "pulled image"
exit 0
fi
sleep 10
done
exit 1
args:
executable: /bin/bash
register: rancher_image_pull
loop: "{{ rancher_images_to_prepull }}"
changed_when: "'imported image' in rancher_image_pull.stdout or 'pulled image' in rancher_image_pull.stdout"
@@ -0,0 +1,61 @@
---
- name: Delete stale Tailscale devices with reserved hostnames
block:
- name: Get Tailscale devices from API
uri:
url: "https://api.tailscale.com/api/v2/tailnet/{{ tailscale_tailnet }}/devices"
method: GET
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
return_content: true
register: ts_devices
until: ts_devices.status == 200
retries: 5
delay: 10
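    # A device counts as stale when the API reports it as not connected to control or not online;
    # both selections are merged and de-duplicated by device id.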
- name: Find stale devices matching reserved hostnames
set_fact:
stale_devices: >-
{{ (ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| selectattr('connectedToControl', 'defined')
| rejectattr('connectedToControl', 'equalto', true)
| list
+
ts_devices.json.devices | default([])
| selectattr('hostname', 'defined')
| selectattr('hostname', 'in', tailscale_reserved_hostnames)
| selectattr('online', 'defined')
| rejectattr('online', 'equalto', true)
| list) | unique(attribute='id') | list }}
- name: Delete stale devices
uri:
url: "https://api.tailscale.com/api/v2/device/{{ item.id }}"
method: DELETE
headers:
Authorization: "Bearer {{ tailscale_api_key }}"
status_code: 200
register: ts_delete_device
until: ts_delete_device.status == 200
retries: 3
delay: 5
loop: "{{ stale_devices }}"
loop_control:
label: "{{ item.name }} ({{ item.id }})"
when: stale_devices | length > 0
- name: Report cleaned devices
debug:
msg: "Deleted stale Tailscale device: {{ item.name }}"
loop: "{{ stale_devices }}"
when: stale_devices | length > 0
- name: No stale devices found
debug:
msg: "No stale Tailscale devices found."
when: stale_devices | length == 0
when:
- tailscale_api_key is defined
- tailscale_api_key | length > 0
@@ -1,14 +1,26 @@
 ---
+- name: Clean up stale Tailscale cluster node devices
+  hosts: localhost
+  connection: local
+  vars:
+    tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}"
+  roles:
+    - tailscale-cleanup
 - name: Bootstrap Kubernetes cluster
   hosts: cluster
   become: true
-  gather_facts: true
+  gather_facts: false
   pre_tasks:
     - name: Wait for SSH
       wait_for_connection:
         delay: 10
-        timeout: 300
+        timeout: 600
+    - name: Gather facts after SSH is reachable
+      setup:
   roles:
     - common
@@ -24,6 +36,7 @@
     k3s_primary_public_ip: "{{ ansible_host }}"
     k3s_primary_ip: "{{ k3s_private_ip }}"
     k3s_node_ip: "{{ k3s_private_ip }}"
+    # kube_api_endpoint is set in inventory group_vars
   roles:
     - k3s-server
@@ -49,6 +62,32 @@
       dest: ../outputs/kubeconfig
       flat: true
+- name: Bootstrap addon prerequisite secrets
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - addon-secrets-bootstrap
+- name: Deploy kube-vip for API HA
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - kube-vip-deploy
+- name: Wait for Kubernetes API VIP readiness
+  hosts: control_plane[0]
+  become: true
+  tasks:
+    - name: Wait for Kubernetes readyz through the VIP
+      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
+      register: api_readyz
+      until: api_readyz.rc == 0
+      retries: 30
+      delay: 10
+      changed_when: false
 - name: Setup secondary control planes
   hosts: control_plane[1:]
   become: true
@@ -59,44 +98,163 @@
     k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
     k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
     k3s_node_ip: "{{ k3s_private_ip }}"
+    # Use Load Balancer for HA - all control planes join via LB endpoint
+    k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"
   roles:
     - k3s-server
+- name: Export kube-vip image from primary control plane
+  hosts: control_plane[0]
+  become: true
+  tasks:
+    - name: Export kube-vip image for secondary control planes
+      command: >-
+        /usr/local/bin/ctr -n k8s.io images export
+        /tmp/kube-vip-bootstrap.tar
+        ghcr.io/kube-vip/kube-vip:v1.1.2
+      changed_when: false
+    - name: Fetch kube-vip image archive
+      fetch:
+        src: /tmp/kube-vip-bootstrap.tar
+        dest: ../outputs/kube-vip-bootstrap.tar
+        flat: true
+- name: Seed kube-vip image on secondary control planes
+  hosts: control_plane[1:]
+  become: true
+  tasks:
+    - name: Copy kube-vip image archive
+      copy:
+        src: ../outputs/kube-vip-bootstrap.tar
+        dest: /tmp/kube-vip-bootstrap.tar
+        mode: "0644"
+    - name: Import kube-vip image into containerd
+      command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
+      register: kube_vip_secondary_import
+      until: kube_vip_secondary_import.rc == 0
+      retries: 3
+      delay: 10
+      changed_when: false
+- name: Wait for all control plane nodes to be Ready
+  hosts: control_plane[0]
+  become: true
+  tasks:
+    - name: Wait for control plane node readiness
+      command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
+      register: control_plane_ready
+      until: control_plane_ready.rc == 0
+      retries: 20
+      delay: 15
+      changed_when: false
+      loop: "{{ groups['control_plane'] }}"
+    - name: Wait for Kubernetes readyz before worker joins
+      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
+      register: api_readyz_before_workers
+      until: api_readyz_before_workers.rc == 0
+      retries: 30
+      delay: 10
+      changed_when: false
 - name: Setup workers
   hosts: workers
   become: true
   vars:
     k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
-    k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443"
+    # Use Load Balancer for HA - workers join via LB endpoint
+    k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
     k3s_node_ip: "{{ k3s_private_ip }}"
   roles:
     - k3s-agent
-- name: Deploy Hetzner CCM
+- name: Pre-pull bootstrap control-plane images
   hosts: control_plane[0]
   become: true
   roles:
-    - ccm
-- name: Deploy Hetzner CSI
+    - bootstrap-image-prepull
+- name: Pre-pull Rancher bootstrap images
+  hosts: workers
+  become: true
+  roles:
+    - role: rancher-image-prepull
+      when: rancher_image_prepull_enabled | default(false) | bool
+- name: Deploy observability stack
   hosts: control_plane[0]
   become: true
   roles:
-    - csi
+    - role: observability
+      when: not (observability_gitops_enabled | default(true) | bool)
+- name: Provision Grafana content
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - role: observability-content
+      when: not (observability_gitops_enabled | default(true) | bool)
+- name: Bootstrap Doppler access for External Secrets
+  hosts: control_plane[0]
+  become: true
+  roles:
+    - doppler-bootstrap
+- name: Detect existing Tailscale service proxies
+  hosts: control_plane[0]
+  become: true
+  tasks:
+    - name: Check for current Tailscale service hostnames
+      command: kubectl get svc -A -o jsonpath='{range .items[*]}{.metadata.annotations.tailscale\.com/hostname}{"\n"}{end}'
+      register: existing_tailscale_hostnames
+      changed_when: false
+      failed_when: false
+- name: Clean up stale Tailscale devices
+  hosts: localhost
+  connection: local
+  vars:
+    tailscale_reserved_hostnames:
+      - rancher
+      - grafana
+      - prometheus
+      - flux
+  tasks:
+    - name: Delete stale devices only before service proxies exist
+      include_role:
+        name: tailscale-cleanup
+      when: >-
+        hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([])
+        | intersect(tailscale_reserved_hostnames)
+        | length == 0
 - name: Finalize
   hosts: localhost
   connection: local
   tasks:
+    - name: Check whether kubeconfig was fetched
+      stat:
+        path: ../outputs/kubeconfig
+      register: kubeconfig_file
     - name: Update kubeconfig server address
       command: |
-        sed -i 's/127.0.0.1/{{ groups["control_plane"][0] }}.{{ tailscale_tailnet }}/g' ../outputs/kubeconfig
+        sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
       changed_when: true
+      when: kubeconfig_file.stat.exists
     - name: Display success message
       debug:
@@ -0,0 +1,3 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources: []
@@ -0,0 +1,12 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: GitRepository
metadata:
name: platform
namespace: flux-system
spec:
interval: 1m
ref:
branch: main
url: ssh://git@64.176.189.59:2222/HomeInfra/HetznerTerra.git
secretRef:
name: flux-system
File diff suppressed because it is too large
@@ -0,0 +1,59 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: source-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: kustomize-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: helm-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: notification-controller
namespace: flux-system
spec:
template:
spec:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: apps
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./apps
dependsOn:
- name: infrastructure
wait: true
timeout: 5m
suspend: true
@@ -0,0 +1,14 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: infrastructure
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure
wait: false
timeout: 5m
@@ -0,0 +1,9 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- gotk-components.yaml
- gitrepository-platform.yaml
- kustomization-infrastructure.yaml
- kustomization-apps.yaml
patchesStrategicMerge:
- gotk-controller-cp1-patches.yaml
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- flux-system
@@ -0,0 +1,34 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: cert-manager
namespace: flux-system
spec:
interval: 10m
timeout: 15m
targetNamespace: cert-manager
chart:
spec:
chart: ./infrastructure/charts/cert-manager
sourceRef:
kind: GitRepository
name: platform
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
crds:
enabled: true
replicaCount: 1
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 250m
memory: 256Mi
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- helmrelease-cert-manager.yaml
@@ -0,0 +1,6 @@
apiVersion: v1
kind: Namespace
metadata:
name: cert-manager
labels:
kustomize.toolkit.fluxcd.io/prune: disabled
@@ -0,0 +1,13 @@
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- clustersecretstore-doppler-hetznerterra.yaml
@@ -0,0 +1,44 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 10m
targetNamespace: external-secrets
chartRef:
kind: OCIRepository
name: external-secrets
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
installCRDs: true
image:
repository: oci.external-secrets.io/external-secrets/external-secrets
tag: v2.1.0
pullPolicy: IfNotPresent
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
webhook:
failurePolicy: Ignore
image:
repository: oci.external-secrets.io/external-secrets/external-secrets
tag: v2.1.0
pullPolicy: IfNotPresent
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
certController:
image:
repository: oci.external-secrets.io/external-secrets/external-secrets
tag: v2.1.0
pullPolicy: IfNotPresent
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
serviceMonitor:
enabled: false
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- ocirepository-external-secrets.yaml
- helmrelease-external-secrets.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: external-secrets
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
name: external-secrets
namespace: flux-system
spec:
interval: 10m
url: oci://ghcr.io/external-secrets/charts/external-secrets
ref:
tag: 2.1.0
layerSelector:
mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
operation: copy
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-cert-manager
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/cert-manager
wait: true
timeout: 20m
suspend: false
@@ -0,0 +1,21 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-external-secrets-store
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/external-secrets-store
dependsOn:
- name: addon-external-secrets
wait: false
healthChecks:
- apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
name: doppler-hetznerterra
timeout: 5m
suspend: false
@@ -0,0 +1,28 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-external-secrets
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/external-secrets
wait: false
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: external-secrets
namespace: flux-system
- apiVersion: apps/v1
kind: Deployment
name: external-secrets-external-secrets
namespace: external-secrets
- apiVersion: apps/v1
kind: Deployment
name: external-secrets-external-secrets-webhook
namespace: external-secrets
timeout: 10m
suspend: false
@@ -0,0 +1,20 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-nfs-storage
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/nfs-storage
wait: true
healthChecks:
- apiVersion: apps/v1
kind: Deployment
name: nfs-subdir-external-provisioner
namespace: kube-system
timeout: 10m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability-content
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability-content
dependsOn:
- name: addon-observability
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,26 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability-secrets
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability-secrets
dependsOn:
- name: addon-external-secrets-store
wait: false
healthChecks:
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
name: grafana-admin
namespace: observability
- apiVersion: v1
kind: Secret
name: grafana-admin-credentials
namespace: observability
timeout: 5m
suspend: false
@@ -0,0 +1,33 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-observability
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/observability
dependsOn:
- name: addon-observability-secrets
- name: addon-nfs-storage
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
wait: false
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: kube-prometheus-stack
namespace: flux-system
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: loki
namespace: flux-system
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: promtail
namespace: flux-system
timeout: 30m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-config
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-config
dependsOn:
- name: addon-rancher
wait: true
timeout: 10m
suspend: false
@@ -0,0 +1,34 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher-secrets
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher-secrets
dependsOn:
- name: addon-external-secrets-store
wait: false
healthChecks:
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
name: rancher-bootstrap-password
namespace: flux-system
- apiVersion: v1
kind: Secret
name: rancher-bootstrap-password
namespace: flux-system
- apiVersion: external-secrets.io/v1
kind: ExternalSecret
name: rancher-bootstrap-password
namespace: cattle-system
- apiVersion: v1
kind: Secret
name: rancher-bootstrap-password
namespace: cattle-system
timeout: 5m
suspend: false
@@ -0,0 +1,41 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-rancher
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/rancher
timeout: 30m
suspend: false
dependsOn:
- name: addon-tailscale-operator
- name: addon-tailscale-proxyclass
- name: addon-rancher-secrets
- name: addon-cert-manager
wait: false
healthChecks:
- apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
name: rancher
namespace: flux-system
- apiVersion: apps/v1
kind: Deployment
name: cattle-system-rancher
namespace: cattle-system
- apiVersion: apps/v1
kind: Deployment
name: rancher-webhook
namespace: cattle-system
- apiVersion: cert-manager.io/v1
kind: Issuer
name: cattle-system-rancher
namespace: cattle-system
- apiVersion: cert-manager.io/v1
kind: Certificate
name: tls-rancher-ingress
namespace: cattle-system
@@ -0,0 +1,15 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-operator
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-operator
wait: false
timeout: 10m
suspend: false
@@ -0,0 +1,17 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-tailscale-proxyclass
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/tailscale-proxyclass
dependsOn:
- name: addon-tailscale-operator
wait: true
timeout: 5m
suspend: false
@@ -0,0 +1,16 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- kustomization-nfs-storage.yaml
- kustomization-external-secrets.yaml
- kustomization-external-secrets-store.yaml
- kustomization-cert-manager.yaml
- kustomization-tailscale-operator.yaml
- kustomization-tailscale-proxyclass.yaml
- traefik
- kustomization-observability-secrets.yaml
- kustomization-observability.yaml
- kustomization-observability-content.yaml
- kustomization-rancher-secrets.yaml
- kustomization-rancher.yaml
- kustomization-rancher-config.yaml
@@ -0,0 +1,20 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nfs-subdir-external-provisioner-runner
rules:
- apiGroups: [""]
resources: ["nodes"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["persistentvolumes"]
verbs: ["get", "list", "watch", "create", "delete"]
- apiGroups: [""]
resources: ["persistentvolumeclaims"]
verbs: ["get", "list", "watch", "update"]
- apiGroups: ["storage.k8s.io"]
resources: ["storageclasses"]
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources: ["events"]
verbs: ["create", "update", "patch"]
@@ -0,0 +1,12 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: run-nfs-subdir-external-provisioner
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nfs-subdir-external-provisioner-runner
subjects:
- kind: ServiceAccount
name: nfs-subdir-external-provisioner
namespace: kube-system
@@ -0,0 +1,41 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: nfs-subdir-external-provisioner
namespace: kube-system
spec:
replicas: 1
selector:
matchLabels:
app: nfs-subdir-external-provisioner
template:
metadata:
labels:
app: nfs-subdir-external-provisioner
spec:
serviceAccountName: nfs-subdir-external-provisioner
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
containers:
- name: nfs-subdir-external-provisioner
image: registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2
imagePullPolicy: IfNotPresent
env:
- name: PROVISIONER_NAME
value: flash-nfs
- name: NFS_SERVER
value: 10.27.27.239
- name: NFS_PATH
value: /TheFlash/k8s-nfs
volumeMounts:
- name: nfs-subdir-external-provisioner-root
mountPath: /persistentvolumes
volumes:
- name: nfs-subdir-external-provisioner-root
nfs:
server: 10.27.27.239
path: /TheFlash/k8s-nfs
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- serviceaccount-nfs-subdir-external-provisioner.yaml
- clusterrole-nfs-subdir-external-provisioner.yaml
- clusterrolebinding-nfs-subdir-external-provisioner.yaml
- role-nfs-subdir-external-provisioner.yaml
- rolebinding-nfs-subdir-external-provisioner.yaml
- storageclass-flash-nfs.yaml
- deployment-nfs-subdir-external-provisioner.yaml
@@ -0,0 +1,9 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: leader-locking-nfs-subdir-external-provisioner
namespace: kube-system
rules:
- apiGroups: [""]
resources: ["endpoints"]
verbs: ["get", "list", "watch", "create", "update", "patch"]
@@ -0,0 +1,13 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: leader-locking-nfs-subdir-external-provisioner
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: leader-locking-nfs-subdir-external-provisioner
subjects:
- kind: ServiceAccount
name: nfs-subdir-external-provisioner
namespace: kube-system
@@ -0,0 +1,5 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: nfs-subdir-external-provisioner
namespace: kube-system
@@ -0,0 +1,12 @@
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
name: flash-nfs
annotations:
storageclass.kubernetes.io/is-default-class: "true"
provisioner: flash-nfs
parameters:
archiveOnDelete: "true"
reclaimPolicy: Delete
allowVolumeExpansion: true
volumeBindingMode: Immediate
@@ -0,0 +1,60 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboard-k8s-overview
namespace: observability
labels:
grafana_dashboard: "1"
data:
k8s-overview.json: |
{
"annotations": {"list": []},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"panels": [
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "none"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
"id": 1,
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, "textMode": "auto"},
"targets": [
{
"expr": "count(kube_node_status_condition{condition=\"Ready\",status=\"true\"})",
"legendFormat": "ready",
"refId": "A"
}
],
"title": "Ready Nodes",
"type": "stat"
},
{
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
"fieldConfig": {"defaults": {"unit": "percentunit"}, "overrides": []},
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
"id": 2,
"targets": [
{
"expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m]))",
"legendFormat": "cpu",
"refId": "A"
}
],
"title": "Cluster CPU Usage",
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 39,
"style": "dark",
"tags": ["kubernetes", "infrastructure"],
"templating": {"list": []},
"time": {"from": "now-1h", "to": "now"},
"timezone": "browser",
"title": "K8s Cluster Overview",
"uid": "k8s-cluster-overview",
"version": 1
}
@@ -0,0 +1,16 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources-core
namespace: observability
labels:
grafana_datasource: "1"
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Loki
type: loki
access: proxy
url: "http://loki.observability.svc.cluster.local:3100"
isDefault: false
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- grafana-datasources-core-configmap.yaml
- grafana-dashboard-k8s-overview-configmap.yaml
@@ -0,0 +1,22 @@
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
name: grafana-admin
namespace: observability
spec:
refreshInterval: 1h
secretStoreRef:
name: doppler-hetznerterra
kind: ClusterSecretStore
target:
name: grafana-admin-credentials
creationPolicy: Owner
template:
type: Opaque
data:
admin-user: admin
admin-password: "{{ .grafanaAdminPassword }}"
data:
- secretKey: grafanaAdminPassword
remoteRef:
key: GRAFANA_ADMIN_PASSWORD
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- grafana-admin-externalsecret.yaml
@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
name: observability
@@ -0,0 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: grafana-tailscale
namespace: observability
annotations:
tailscale.com/hostname: grafana
tailscale.com/tags: "tag:prod,tag:grafana"
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: grafana
ports:
- name: http
port: 80
protocol: TCP
targetPort: 3000
@@ -0,0 +1,76 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: kube-prometheus-stack
namespace: flux-system
spec:
interval: 10m
timeout: 15m
targetNamespace: observability
chart:
spec:
chart: ./infrastructure/charts/kube-prometheus-stack
sourceRef:
kind: GitRepository
name: platform
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
strategy: uninstall
values:
grafana:
enabled: true
admin:
existingSecret: grafana-admin-credentials
grafana.ini:
server:
root_url: http://grafana.silverside-gopher.ts.net/
serve_from_sub_path: false
persistence:
enabled: true
storageClassName: local-path
size: 5Gi
service:
type: ClusterIP
sidecar:
datasources:
enabled: true
label: grafana_datasource
searchNamespace: observability
dashboards:
enabled: true
label: grafana_dashboard
searchNamespace: observability
prometheus:
service:
type: ClusterIP
prometheusSpec:
externalUrl: http://prometheus.silverside-gopher.ts.net:9090/
routePrefix: /
retention: 7d
storageSpec:
volumeClaimTemplate:
spec:
storageClassName: local-path
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 10Gi
alertmanager:
enabled: false
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
prometheus-node-exporter:
hostNetwork: false
service:
hostPort: false
@@ -0,0 +1,95 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: loki
namespace: flux-system
spec:
interval: 10m
targetNamespace: observability
chartRef:
kind: OCIRepository
name: loki
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
deploymentMode: SingleBinary
loki:
auth_enabled: false
commonConfig:
replication_factor: 1
schemaConfig:
configs:
- from: "2024-04-01"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: loki_index_
period: 24h
storage:
type: filesystem
limits_config:
allow_structured_metadata: true
volume_enabled: true
retention_period: 168h
pattern_ingester:
enabled: true
ruler:
enable_api: true
singleBinary:
replicas: 1
persistence:
size: 10Gi
storageClass: flash-nfs
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 1Gi
backend:
replicas: 0
read:
replicas: 0
write:
replicas: 0
ingester:
replicas: 0
querier:
replicas: 0
queryFrontend:
replicas: 0
queryScheduler:
replicas: 0
distributor:
replicas: 0
compactor:
replicas: 0
indexGateway:
replicas: 0
bloomCompactor:
replicas: 0
bloomGateway:
replicas: 0
gateway:
enabled: false
test:
enabled: false
chunksCache:
enabled: false
resultsCache:
enabled: false
lokiCanary:
enabled: false
monitoring:
selfMonitoring:
enabled: false
lokiCanary:
enabled: false
@@ -0,0 +1,26 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: promtail
namespace: flux-system
spec:
interval: 10m
timeout: 20m
targetNamespace: observability
chartRef:
kind: OCIRepository
name: promtail
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
image:
pullPolicy: IfNotPresent
config:
clients:
- url: http://observability-loki.observability.svc.cluster.local:3100/loki/api/v1/push
@@ -0,0 +1,10 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- ocirepository-loki.yaml
- ocirepository-promtail.yaml
- helmrelease-kube-prometheus-stack.yaml
- helmrelease-loki.yaml
- helmrelease-promtail.yaml
- grafana-tailscale-service.yaml
- prometheus-tailscale-service.yaml
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
name: loki
namespace: flux-system
spec:
interval: 10m
url: oci://ghcr.io/grafana/helm-charts/loki
ref:
tag: 6.46.0
layerSelector:
mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
operation: copy
@@ -0,0 +1,13 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: OCIRepository
metadata:
name: promtail
namespace: flux-system
spec:
interval: 10m
url: oci://ghcr.io/grafana/helm-charts/promtail
ref:
tag: 6.16.6
layerSelector:
mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip
operation: copy
@@ -0,0 +1,20 @@
apiVersion: v1
kind: Service
metadata:
name: prometheus-tailscale
namespace: observability
annotations:
tailscale.com/hostname: prometheus
tailscale.com/tags: "tag:prod,tag:prometheus"
tailscale.com/proxy-class: infra-stable
spec:
type: LoadBalancer
loadBalancerClass: tailscale
selector:
app.kubernetes.io/name: prometheus
operator.prometheus.io/name: observability-kube-prometh-prometheus
ports:
- name: http
port: 9090
protocol: TCP
targetPort: 9090
@@ -0,0 +1,4 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- server-url-setting.yaml
@@ -0,0 +1,5 @@
apiVersion: management.cattle.io/v3
kind: Setting
metadata:
name: server-url
value: https://rancher.silverside-gopher.ts.net
@@ -0,0 +1,6 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- namespace.yaml
- rancher-bootstrap-password-flux-externalsecret.yaml
- rancher-bootstrap-password-externalsecret.yaml

Some files were not shown because too many files have changed in this diff.