refactor: retire imperative addon roles
All checks were successful
Deploy Cluster / Terraform (push) Successful in 52s
Deploy Cluster / Ansible (push) Successful in 4m2s

This commit is contained in:
2026-03-17 01:04:02 +00:00
parent e3ce91db62
commit 08a3031276
11 changed files with 3 additions and 799 deletions

View File

@@ -197,7 +197,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
- `clusters/prod/`: cluster entrypoint and Flux reconciliation objects
- `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph
- `infrastructure/`: infrastructure addon reconciliation graph
- `infrastructure/addons/*`: per-addon manifests (observability + observability-content migrated)
- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons
- `apps/`: application workload layer (currently scaffolded)
### Reconciliation graph
@@ -215,7 +215,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
1. Install Flux controllers in `flux-system`.
2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace.
3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph.
4. Unsuspend addon `Kustomization` objects one-by-one as each addon is migrated from Ansible.
4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap.
### Current migration status
@@ -319,9 +319,7 @@ It avoids full cluster provisioning and only applies Grafana content resources:
│ │ ├── common/
│ │ ├── k3s-server/
│ │ ├── k3s-agent/
│ │ ├── ccm/
│ │ ├── csi/
│ │ ├── tailscale-operator/
│ │ ├── addon-secrets-bootstrap/
│ │ ├── observability-content/
│ │ └── observability/
│ └── ansible.cfg

View File

@@ -1,4 +0,0 @@
---
# Defaults for the (retired) imperative Hetzner CCM role.
# hcloud_token: Hetzner Cloud API token; empty by default, injected at runtime.
hcloud_token: ""
# cluster_name: used to derive the Hetzner network name ("<cluster_name>-network").
cluster_name: "k8s-cluster"
# hcloud_lb_location: default Hetzner location annotated onto the Traefik LB service.
hcloud_lb_location: "nbg1"

View File

@@ -1,88 +0,0 @@
---
# Tasks for the (retired) imperative Hetzner CCM role: create the hcloud
# secret, apply the upstream CCM manifest, wait for its workload to roll
# out, and pin a default load-balancer location on the Traefik service.
# NOTE(review): leading indentation was stripped in extraction and has been
# reconstructed here; confirm against the original file.

# Probe for an existing CCM deployment. The result is registered but never
# referenced by later tasks in this file — presumably kept for debugging.
- name: Check if Hetzner CCM is already deployed
  command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager
  register: ccm_namespace
  failed_when: false
  changed_when: false

# Idempotent create-or-update via client-side dry-run piped into apply;
# no_log hides the hcloud token from Ansible output.
- name: Create Hetzner cloud secret
  shell: |
    kubectl -n kube-system create secret generic hcloud \
    --from-literal=token='{{ hcloud_token }}' \
    --from-literal=network='{{ cluster_name }}-network' \
    --dry-run=client -o yaml | kubectl apply -f -
  no_log: true
  when: hcloud_token is defined
  changed_when: true

- name: Deploy Hetzner CCM
  command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml
  changed_when: true

# The upstream manifest may ship the CCM as either a Deployment or a
# DaemonSet; detect which one landed so the right rollout wait runs.
- name: Detect CCM workload kind
  shell: |
    if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then
    echo deployment
    elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then
    echo daemonset
    else
    echo missing
    fi
  register: ccm_workload_kind
  changed_when: false

# `until` re-runs the rollout wait up to 30 times, 10s apart.
- name: Wait for CCM deployment rollout
  command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system
  register: ccm_rollout_deploy
  until: ccm_rollout_deploy.rc == 0
  changed_when: false
  retries: 30
  delay: 10
  when: ccm_workload_kind.stdout == "deployment"

- name: Wait for CCM daemonset rollout
  command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system
  register: ccm_rollout_ds
  until: ccm_rollout_ds.rc == 0
  changed_when: false
  retries: 30
  delay: 10
  when: ccm_workload_kind.stdout == "daemonset"

# Annotation failures are tolerated here and converted into a diagnostic
# `fail` below, after dumping the service for context.
- name: Set default Hetzner load balancer location for Traefik service
  command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite
  register: traefik_annotation
  changed_when: true
  failed_when: false

- name: Show Traefik service when annotation patch fails
  command: kubectl -n kube-system get service traefik -o yaml
  register: traefik_service_dump
  changed_when: false
  failed_when: false
  when: traefik_annotation.rc != 0

- name: Fail when Traefik load balancer annotation cannot be set
  fail:
    msg: |
      Failed to set Hetzner load balancer location annotation on kube-system/traefik service.
      Command output:
      {{ traefik_annotation.stderr | default(traefik_annotation.stdout) }}
      Service dump:
      {{ traefik_service_dump.stdout | default('n/a') }}
  when: traefik_annotation.rc != 0

# NOTE(review): `command` does not invoke a shell, so the `|` and `|| true`
# here are passed to kubectl as literal arguments — this task likely needs
# the `shell` module to behave as intended; confirm before reuse.
- name: Show CCM namespace objects when workload missing
  command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true
  register: ccm_ns_objects
  changed_when: false
  when: ccm_workload_kind.stdout == "missing"

- name: Fail when CCM workload is missing
  fail:
    msg: |
      hcloud-cloud-controller-manager workload not found after applying manifest.
      Namespace objects:
      {{ ccm_ns_objects.stdout | default('n/a') }}
  when: ccm_workload_kind.stdout == "missing"

View File

@@ -1,15 +0,0 @@
---
# Defaults for the (retired) imperative Hetzner CSI role.
# hcloud_token: Hetzner Cloud API token; empty by default, injected at runtime.
hcloud_token: ""
# cluster_name: used to derive the Hetzner network name ("<cluster_name>-network").
cluster_name: "k8s-cluster"
# Upstream manifest applied by the role.
csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml"
# Rollout wait tuning: per-attempt kubectl timeout, retry count, and delay.
csi_rollout_timeout_seconds: 30
csi_rollout_retries: 8
csi_rollout_delay_seconds: 5
# Number of log lines dumped per container when collecting failure diagnostics.
csi_failure_log_tail_lines: 120
# Smoke test: provision a PVC and run a write/read Job against it.
csi_smoke_test_enabled: true
# Throwaway StorageClass created for the test (Immediate binding), derived
# conceptually from the base hcloud-volumes class.
csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate"
csi_smoke_test_base_storage_class: "hcloud-volumes"
csi_smoke_test_size: "1Gi"
csi_smoke_test_pvc_timeout_seconds: 300
csi_smoke_test_job_timeout_seconds: 300
# When false, a failed smoke test only warns instead of failing the play.
csi_smoke_test_required: false

View File

@@ -1,383 +0,0 @@
---
# Tasks for the (retired) imperative Hetzner CSI role: install the CSI
# driver, verify controller/node rollout with extensive failure
# diagnostics, then optionally run a PVC + Job smoke test.
# NOTE(review): leading indentation was stripped in extraction and has been
# reconstructed here; confirm against the original file.

# Idempotent create-or-update via client-side dry-run piped into apply;
# no_log hides the hcloud token from Ansible output.
- name: Create Hetzner CSI secret
  shell: |
    kubectl -n kube-system create secret generic hcloud \
    --from-literal=token='{{ hcloud_token }}' \
    --from-literal=network='{{ cluster_name }}-network' \
    --dry-run=client -o yaml | kubectl apply -f -
  no_log: true
  when: hcloud_token is defined
  changed_when: true

- name: Deploy Hetzner CSI
  command: kubectl apply -f {{ csi_manifest_url }}
  changed_when: true

# Force the CSI socket path so driver and sidecars agree on it.
- name: Ensure CSI controller endpoint is set for sidecars
  command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket
  changed_when: true

- name: Ensure CSI node endpoint is set for sidecars
  command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket
  changed_when: true

- name: Restart CSI controller to pick up current secret
  command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller
  changed_when: true

# First, a short non-fatal wait to let the new generation register.
- name: Wait for CSI controller deployment generation
  command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s
  failed_when: false
  changed_when: false

# Retried rollout wait; failure is handled explicitly below via rc checks
# rather than by failing this task.
- name: Wait for CSI controller rollout
  command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
  register: csi_controller_rollout
  until: csi_controller_rollout.rc == 0
  retries: "{{ csi_rollout_retries | int }}"
  delay: "{{ csi_rollout_delay_seconds | int }}"
  failed_when: false
  changed_when: false

# ---- Controller-rollout failure diagnostics (all best-effort) ----

- name: Show CSI controller status on failure
  command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide
  register: csi_controller_deploy_status
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Show CSI controller pods on failure
  command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide
  register: csi_controller_pods_status
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Describe CSI controller deployment on failure
  command: kubectl -n kube-system describe deployment hcloud-csi-controller
  register: csi_controller_deploy_describe
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Describe CSI controller pod on failure
  shell: |
    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
    if [ -n "$pod" ]; then
    kubectl -n kube-system describe pod "$pod"
    fi
  register: csi_controller_pod_describe
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Show CSI driver logs on failure
  command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }}
  register: csi_driver_logs
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Show CSI driver previous logs on failure
  shell: |
    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
    if [ -n "$pod" ]; then
    kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }}
    fi
  register: csi_driver_previous_logs
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Show sidecar previous logs on failure
  shell: |
    pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
    if [ -n "$pod" ]; then
    for container in csi-attacher csi-resizer csi-provisioner; do
    echo "===== $container ====="
    kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
    done
    fi
  register: csi_sidecar_previous_logs
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

- name: Show recent kube-system events on failure
  command: kubectl -n kube-system get events --sort-by=.lastTimestamp
  register: csi_recent_events
  changed_when: false
  failed_when: false
  when: csi_controller_rollout.rc != 0

# Aggregate everything collected above into one fatal error message.
- name: Fail with CSI controller diagnostics
  fail:
    msg: |
      CSI controller rollout failed.
      Deployment status:
      {{ csi_controller_deploy_status.stdout | default('n/a') }}
      Pods status:
      {{ csi_controller_pods_status.stdout | default('n/a') }}
      Deployment describe:
      {{ csi_controller_deploy_describe.stdout | default('n/a') }}
      Pod describe:
      {{ csi_controller_pod_describe.stdout | default('n/a') }}
      hcloud-csi-driver logs:
      {{ csi_driver_logs.stdout | default('n/a') }}
      hcloud-csi-driver previous logs:
      {{ csi_driver_previous_logs.stdout | default('n/a') }}
      Sidecar previous logs:
      {{ csi_sidecar_previous_logs.stdout | default('n/a') }}
      Recent kube-system events:
      {{ csi_recent_events.stdout | default('n/a') }}
  when: csi_controller_rollout.rc != 0

- name: Wait for CSI node daemonset rollout
  command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s
  register: csi_node_rollout
  until: csi_node_rollout.rc == 0
  retries: "{{ csi_rollout_retries | int }}"
  delay: "{{ csi_rollout_delay_seconds | int }}"
  failed_when: false
  changed_when: false

- name: Fail when CSI node daemonset rollout does not complete
  fail:
    msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}"
  when: csi_node_rollout.rc != 0

# ---- Smoke test: unique names per run, so reruns never collide ----

- name: Generate CSI smoke test run identifier
  set_fact:
    csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}"
  when: csi_smoke_test_enabled | bool

- name: Generate unique CSI smoke test resource names
  set_fact:
    csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}"
    csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}"
  when: csi_smoke_test_enabled | bool

# Remove leftovers from earlier runs by label before applying new ones.
- name: Cleanup stale CSI smoke test resources before apply
  shell: |
    kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true
    kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
  failed_when: false
  changed_when: false
  when: csi_smoke_test_enabled | bool

# Renders csi-smoke.yaml.j2 (StorageClass + PVC + Job) to a temp file.
- name: Apply CSI smoke test resources
  template:
    src: csi-smoke.yaml.j2
    dest: /tmp/csi-smoke.yaml
    mode: "0644"
  when: csi_smoke_test_enabled | bool

- name: Apply CSI smoke test manifests
  command: kubectl apply -f /tmp/csi-smoke.yaml
  changed_when: true
  when: csi_smoke_test_enabled | bool

- name: Wait for CSI smoke PVC to bind
  command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s
  register: csi_smoke_pvc_wait
  failed_when: false
  changed_when: false
  when: csi_smoke_test_enabled | bool

# Only wait on the Job if the PVC actually bound.
- name: Wait for CSI smoke Job completion
  command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s
  register: csi_smoke_job_wait
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc == 0

- name: Show CSI smoke job logs
  command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }}
  register: csi_smoke_job_logs
  failed_when: false
  changed_when: false
  when: csi_smoke_test_enabled | bool

# ---- Smoke-test failure diagnostics; `csi_smoke_job_wait.rc | default(1)`
# treats a skipped Job wait (PVC never bound) as a failure ----

- name: Show CSI smoke PVC on failure
  command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide
  register: csi_smoke_pvc_status
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Show CSI smoke Job on failure
  command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide
  register: csi_smoke_job_status
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Show CSI smoke pods on failure
  command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide
  register: csi_smoke_pod_status
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Describe CSI smoke PVC on failure
  command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }}
  register: csi_smoke_pvc_describe
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Show storage classes on failure
  command: kubectl get storageclass
  register: csi_storageclasses
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Get CSI controller pod name on smoke failure
  shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}'
  register: csi_controller_pod_name
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Describe CSI controller pod on smoke failure
  command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }}
  register: csi_controller_pod_smoke_describe
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
    - csi_controller_pod_name.stdout | length > 0

- name: Show CSI controller container logs on smoke failure
  shell: |
    pod="{{ csi_controller_pod_name.stdout }}"
    for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do
    echo "===== ${container}: current ====="
    kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true
    echo "===== ${container}: previous ====="
    kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true
    done
  register: csi_controller_container_logs
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
    - csi_controller_pod_name.stdout | length > 0

- name: Show CSI driver and node driver objects on smoke failure
  shell: |
    echo "===== CSIDriver ====="
    kubectl get csidriver csi.hetzner.cloud -o yaml || true
    echo "===== CSINode ====="
    kubectl get csinode -o wide || true
  register: csi_driver_objects
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

- name: Show CSI smoke pod describe on failure
  shell: |
    pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)"
    if [ -n "$pod" ]; then
    kubectl -n kube-system describe pod "$pod"
    fi
  register: csi_smoke_pod_describe
  failed_when: false
  changed_when: false
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0

# Hard failure only when csi_smoke_test_required is true.
- name: Fail when CSI smoke test fails
  fail:
    msg: |
      CSI smoke test failed.
      PVC wait:
      stdout: {{ csi_smoke_pvc_wait.stdout | default('') }}
      stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
      Job wait:
      stdout: {{ csi_smoke_job_wait.stdout | default('') }}
      stderr: {{ csi_smoke_job_wait.stderr | default('') }}
      PVC:
      {{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }}
      Job:
      {{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }}
      Pod list:
      {{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }}
      PVC describe:
      {{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }}
      Storage classes:
      {{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }}
      CSI controller pod:
      {{ csi_controller_pod_name.stdout | default('n/a') }}
      CSI controller pod describe:
      {{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }}
      CSI controller container logs:
      {{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }}
      CSI driver objects:
      {{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }}
      Pod describe:
      {{ csi_smoke_pod_describe.stdout | default('n/a') }}
      Job logs:
      {{ csi_smoke_job_logs.stdout | default('n/a') }}
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
    - csi_smoke_test_required | bool

- name: Warn when CSI smoke test fails but is non-blocking
  debug:
    msg: |
      CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue.
      PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }}
      Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }}
  when:
    - csi_smoke_test_enabled | bool
    - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0
    - not (csi_smoke_test_required | bool)

# Best-effort teardown of the per-run resources and throwaway StorageClass.
- name: Cleanup CSI smoke test resources
  shell: |
    kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found
    kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found
  failed_when: false
  changed_when: false
  when: csi_smoke_test_enabled | bool

View File

@@ -1,47 +0,0 @@
{# CSI smoke-test manifests rendered by the (retired) csi role: a throwaway
   Immediate-binding StorageClass, a PVC on it, and a one-shot Job that
   writes and reads a file on the provisioned volume.
   NOTE(review): indentation reconstructed; extraction stripped it. #}
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: {{ csi_smoke_test_storage_class }}
provisioner: csi.hetzner.cloud
reclaimPolicy: Delete
volumeBindingMode: Immediate
allowVolumeExpansion: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ csi_smoke_test_pvc_name }}
  namespace: kube-system
  labels:
    {# Shared label so stale runs can be cleaned up with one selector. #}
    app.kubernetes.io/name: csi-smoke
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ csi_smoke_test_size }}
  storageClassName: {{ csi_smoke_test_storage_class }}
---
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ csi_smoke_test_job_name }}
  namespace: kube-system
  labels:
    app.kubernetes.io/name: csi-smoke
spec:
  {# No retries: a single failed attempt is a definitive smoke-test failure. #}
  backoffLimit: 0
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: write-and-read
          image: busybox:1.36
          command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"]
          volumeMounts:
            - name: data
              mountPath: /data
      volumes:
        - name: data
          persistentVolumeClaim:
            claimName: {{ csi_smoke_test_pvc_name }}

View File

@@ -1,21 +0,0 @@
---
# Defaults for the (retired) imperative Tailscale operator role.
# NOTE(review): indentation of the nested structures reconstructed;
# extraction stripped it.
tailscale_operator_namespace: "tailscale-system"
# Helm chart version pinned for tailscale/tailscale-operator.
tailscale_operator_version: "1.95.91"
# OAuth credentials; role is skipped entirely when either is empty.
tailscale_oauth_client_id: ""
tailscale_oauth_client_secret: ""
# Default device tags applied via the operator's Helm values.
tailscale_operator_default_tags:
  - "tag:k8s"
# Name of the default ProxyClass applied after the operator is ready.
tailscale_proxyclass_name: "infra-stable"
# When false, OAuth/tag permission problems only warn instead of failing.
tailscale_operator_required: false
# Pin operator/proxy pods to this node and tolerate the control-plane taint.
tailscale_operator_node_selector:
  kubernetes.io/hostname: "k8s-cluster-cp-1"
tailscale_operator_tolerations:
  - key: "node-role.kubernetes.io/control-plane"
    operator: "Exists"
    effect: "NoSchedule"

View File

@@ -1,171 +0,0 @@
---
# Tasks for the (retired) imperative Tailscale operator role: run only when
# OAuth credentials are present, install the operator via Helm, inspect its
# logs for OAuth/tag permission errors, and apply a default ProxyClass once
# the operator is ready.
# NOTE(review): leading indentation was stripped in extraction and has been
# reconstructed here; confirm against the original file.

- name: Determine if Tailscale operator is enabled
  set_fact:
    tailscale_operator_enabled: "{{ (tailscale_oauth_client_id | default('') | length) > 0 and (tailscale_oauth_client_secret | default('') | length) > 0 }}"
    tailscale_operator_ready: false
  changed_when: false

- name: Skip Tailscale operator when OAuth credentials are missing
  debug:
    msg: "Skipping Tailscale Kubernetes Operator: set TAILSCALE_OAUTH_CLIENT_ID and TAILSCALE_OAUTH_CLIENT_SECRET to enable it."
  when: not tailscale_operator_enabled

# end_host stops all remaining tasks for this host without failing the play.
- name: End Tailscale operator role when disabled
  meta: end_host
  when: not tailscale_operator_enabled

- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

# NOTE(review): pipes curl into bash from the network — flagged as a supply
# chain risk; the upstream installer script is not pinned or checksummed.
- name: Install Helm
  shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  when: helm_check.rc != 0
  changed_when: true

# Tolerate "AlreadyExists" so reruns stay idempotent.
- name: Create Tailscale operator namespace
  command: kubectl create namespace {{ tailscale_operator_namespace }}
  register: create_ns
  failed_when: create_ns.rc != 0 and "AlreadyExists" not in create_ns.stderr
  changed_when: create_ns.rc == 0

- name: Add Tailscale Helm repo
  command: helm repo add tailscale https://pkgs.tailscale.com/unstable/helmcharts
  register: add_repo
  failed_when: add_repo.rc != 0 and "already exists" not in add_repo.stderr
  changed_when: add_repo.rc == 0

- name: Update Helm repos
  command: helm repo update
  changed_when: false

- name: Write Tailscale operator values
  template:
    src: operator-values.yaml.j2
    dest: /tmp/tailscale-operator-values.yaml
    mode: "0644"

# Idempotent create-or-update via client-side dry-run piped into apply.
- name: Create or update Tailscale operator OAuth secret
  shell: >-
    kubectl -n {{ tailscale_operator_namespace }} create secret generic operator-oauth
    --from-literal=client_id='{{ tailscale_oauth_client_id }}'
    --from-literal=client_secret='{{ tailscale_oauth_client_secret }}'
    --dry-run=client -o yaml | kubectl apply -f -
  register: oauth_secret_result
  changed_when: "'created' in oauth_secret_result.stdout or 'configured' in oauth_secret_result.stdout"

# Install failure is tolerated here and converted into a diagnostic `fail`
# below after dumping pods and events.
- name: Install Tailscale Kubernetes Operator
  command: >-
    helm upgrade --install tailscale-operator tailscale/tailscale-operator
    --namespace {{ tailscale_operator_namespace }}
    --version {{ tailscale_operator_version }}
    --values /tmp/tailscale-operator-values.yaml
    --timeout 5m
  register: tailscale_install
  failed_when: false
  changed_when: true

- name: Show Tailscale operator pods on install failure
  command: kubectl -n {{ tailscale_operator_namespace }} get pods -o wide
  register: tailscale_pods
  changed_when: false
  failed_when: false
  when: tailscale_install.rc != 0

- name: Show Tailscale operator events on install failure
  command: kubectl -n {{ tailscale_operator_namespace }} get events --sort-by=.lastTimestamp
  register: tailscale_events
  changed_when: false
  failed_when: false
  when: tailscale_install.rc != 0

- name: Fail with Tailscale operator diagnostics
  fail:
    msg: |
      Tailscale operator install failed.
      Helm stderr:
      {{ tailscale_install.stderr | default('') }}
      Pods:
      {{ tailscale_pods.stdout | default('n/a') }}
      Events:
      {{ tailscale_events.stdout | default('n/a') }}
  when: tailscale_install.rc != 0

- name: Wait for Tailscale operator to be ready
  command: kubectl -n {{ tailscale_operator_namespace }} rollout status deployment/operator --timeout=5m
  register: tailscale_rollout
  failed_when: false
  changed_when: false

- name: Show Tailscale operator deployment status
  command: kubectl -n {{ tailscale_operator_namespace }} get deployment operator -o wide
  register: tailscale_deploy
  changed_when: false
  failed_when: false

# Logs are scanned below for OAuth/tag permission errors (403/400 etc.).
- name: Get Tailscale operator logs
  command: kubectl -n {{ tailscale_operator_namespace }} logs deployment/operator --tail=200
  register: tailscale_operator_logs
  changed_when: false
  failed_when: false

- name: Fail when Tailscale OAuth permissions are insufficient
  fail:
    msg: |
      Tailscale operator started but cannot create auth keys (OAuth/tag permission error).
      Fix your Tailscale OAuth client/tag permissions.
      Required checks in Tailscale admin:
      - OAuth client has devices:core, auth_keys, and services write access
      - OAuth client can create tagged devices for: {{ tailscale_operator_default_tags | join(', ') }}
      - ACL/tag ownership allows those tags for this OAuth client
      Operator log excerpt:
      {{ tailscale_operator_logs.stdout | default('n/a') }}
  when:
    - tailscale_operator_required | bool
    - "tailscale_operator_logs.stdout is defined and ('does not have enough permissions' in tailscale_operator_logs.stdout or 'Status: 403' in tailscale_operator_logs.stdout or 'invalid or not permitted' in tailscale_operator_logs.stdout or 'Status: 400' in tailscale_operator_logs.stdout)"

- name: Warn when Tailscale OAuth permissions are insufficient (non-blocking)
  debug:
    msg: |
      Tailscale operator is not ready due to OAuth/tag permissions.
      Continuing deployment because tailscale_operator_required=false.
      Operator log excerpt:
      {{ tailscale_operator_logs.stdout | default('n/a') }}
  when:
    - not (tailscale_operator_required | bool)
    - "tailscale_operator_logs.stdout is defined and ('does not have enough permissions' in tailscale_operator_logs.stdout or 'Status: 403' in tailscale_operator_logs.stdout or 'invalid or not permitted' in tailscale_operator_logs.stdout or 'Status: 400' in tailscale_operator_logs.stdout)"

# Ready = rollout succeeded AND no permission-error strings in the logs;
# gates the ProxyClass apply at the end of this file.
- name: Mark Tailscale operator ready when rollout succeeds and no auth errors
  set_fact:
    tailscale_operator_ready: true
  when:
    - tailscale_rollout.rc == 0
    - "tailscale_operator_logs.stdout is not defined or (('does not have enough permissions' not in tailscale_operator_logs.stdout) and ('Status: 403' not in tailscale_operator_logs.stdout) and ('invalid or not permitted' not in tailscale_operator_logs.stdout) and ('Status: 400' not in tailscale_operator_logs.stdout))"

- name: Warn if Tailscale operator is not ready yet
  debug:
    msg: |
      Tailscale operator deployment is still converging.
      This is non-blocking for CI; service endpoints may appear shortly.
      Rollout output:
      {{ tailscale_rollout.stdout | default('') }}
      {{ tailscale_deploy.stdout | default('') }}
  when: tailscale_rollout.rc != 0

- name: Write Tailscale default ProxyClass manifest
  template:
    src: proxyclass.yaml.j2
    dest: /tmp/tailscale-proxyclass.yaml
    mode: "0644"
  when: tailscale_operator_ready | default(false) | bool

- name: Apply Tailscale default ProxyClass
  command: kubectl apply -f /tmp/tailscale-proxyclass.yaml
  changed_when: true
  when: tailscale_operator_ready | default(false) | bool

View File

@@ -1,24 +0,0 @@
{# Helm values for tailscale/tailscale-operator rendered by the (retired)
   tailscale-operator role. Fills in default tags, node pinning, and the
   default ProxyClass from role variables.
   NOTE(review): indentation reconstructed; extraction stripped it. Verify
   the key layout against the pinned chart version's values schema. #}
apiServerProxyConfig:
  mode: "true"
operatorConfig:
  defaultTags:
{% for tag in tailscale_operator_default_tags %}
    - "{{ tag }}"
{% endfor %}
  nodeSelector:
{% for key, value in tailscale_operator_node_selector.items() %}
    {{ key }}: "{{ value }}"
{% endfor %}
  tolerations:
{% for tol in tailscale_operator_tolerations %}
    - key: "{{ tol.key }}"
      operator: "{{ tol.operator }}"
      effect: "{{ tol.effect }}"
{% endfor %}
installCRDs: true
proxyConfig:
  defaultTags: "{{ tailscale_operator_default_tags | join(',') }}"
  defaultProxyClass: "{{ tailscale_proxyclass_name }}"

View File

@@ -1,17 +0,0 @@
{# Default Tailscale ProxyClass rendered by the (retired) tailscale-operator
   role: pins proxy StatefulSet pods via the role's node selector and
   tolerations.
   NOTE(review): indentation reconstructed; extraction stripped it. #}
apiVersion: tailscale.com/v1alpha1
kind: ProxyClass
metadata:
  name: {{ tailscale_proxyclass_name }}
spec:
  statefulSet:
    pod:
      nodeSelector:
{% for key, value in tailscale_operator_node_selector.items() %}
        {{ key }}: "{{ value }}"
{% endfor %}
      tolerations:
{% for tol in tailscale_operator_tolerations %}
        - key: "{{ tol.key }}"
          operator: "{{ tol.operator }}"
          effect: "{{ tol.effect }}"
{% endfor %}

View File

@@ -75,30 +75,6 @@
roles:
- k3s-agent
# Plays removed by this commit: each addon role ran only when its
# *_gitops_enabled flag was explicitly false (GitOps is the default).
# NOTE(review): indentation reconstructed; extraction stripped it.
- name: Deploy Hetzner CCM
  hosts: control_plane[0]
  become: true
  roles:
    - role: ccm
      when: not (ccm_gitops_enabled | default(true) | bool)

- name: Deploy Hetzner CSI
  hosts: control_plane[0]
  become: true
  roles:
    - role: csi
      when: not (csi_gitops_enabled | default(true) | bool)

- name: Deploy Tailscale Kubernetes Operator
  hosts: control_plane[0]
  become: true
  roles:
    - role: tailscale-operator
      when: not (tailscale_operator_gitops_enabled | default(true) | bool)
- name: Bootstrap addon prerequisite secrets
hosts: control_plane[0]
become: true