From 08a30312762a720f811f3ed267fe406f8476d40c Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Tue, 17 Mar 2026 01:04:02 +0000 Subject: [PATCH] refactor: retire imperative addon roles --- README.md | 8 +- ansible/roles/ccm/defaults/main.yml | 4 - ansible/roles/ccm/tasks/main.yml | 88 ---- ansible/roles/csi/defaults/main.yml | 15 - ansible/roles/csi/tasks/main.yml | 383 ------------------ ansible/roles/csi/templates/csi-smoke.yaml.j2 | 47 --- .../tailscale-operator/defaults/main.yml | 21 - .../roles/tailscale-operator/tasks/main.yml | 171 -------- .../templates/operator-values.yaml.j2 | 24 -- .../templates/proxyclass.yaml.j2 | 17 - ansible/site.yml | 24 -- 11 files changed, 3 insertions(+), 799 deletions(-) delete mode 100644 ansible/roles/ccm/defaults/main.yml delete mode 100644 ansible/roles/ccm/tasks/main.yml delete mode 100644 ansible/roles/csi/defaults/main.yml delete mode 100644 ansible/roles/csi/tasks/main.yml delete mode 100644 ansible/roles/csi/templates/csi-smoke.yaml.j2 delete mode 100644 ansible/roles/tailscale-operator/defaults/main.yml delete mode 100644 ansible/roles/tailscale-operator/tasks/main.yml delete mode 100644 ansible/roles/tailscale-operator/templates/operator-values.yaml.j2 delete mode 100644 ansible/roles/tailscale-operator/templates/proxyclass.yaml.j2 diff --git a/README.md b/README.md index dc7cfb1..ebe6404 100644 --- a/README.md +++ b/README.md @@ -197,7 +197,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed - `clusters/prod/`: cluster entrypoint and Flux reconciliation objects - `clusters/prod/flux-system/`: `GitRepository` source and top-level `Kustomization` graph - `infrastructure/`: infrastructure addon reconciliation graph -- `infrastructure/addons/*`: per-addon manifests (observability + observability-content migrated) +- `infrastructure/addons/*`: per-addon manifests for Flux-managed cluster addons - `apps/`: application workload layer (currently scaffolded) ### Reconciliation graph @@ -215,7 +215,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed 1. Install Flux controllers in `flux-system`. 2. Create the Flux deploy key/secret named `flux-system` in `flux-system` namespace. 3. Apply `clusters/prod/flux-system/` once to establish source + reconciliation graph. -4. Unsuspend addon `Kustomization` objects one-by-one as each addon is migrated from Ansible. +4. Bootstrap-only Ansible creates prerequisite secrets; Flux manages addon lifecycle after bootstrap. ### Current migration status @@ -319,9 +319,7 @@ It avoids full cluster provisioning and only applies Grafana content resources: │ │ ├── common/ │ │ ├── k3s-server/ │ │ ├── k3s-agent/ -│ │ ├── ccm/ -│ │ ├── csi/ -│ │ ├── tailscale-operator/ +│ │ ├── addon-secrets-bootstrap/ │ │ ├── observability-content/ │ │ └── observability/ │ └── ansible.cfg diff --git a/ansible/roles/ccm/defaults/main.yml b/ansible/roles/ccm/defaults/main.yml deleted file mode 100644 index 26d72ce..0000000 --- a/ansible/roles/ccm/defaults/main.yml +++ /dev/null @@ -1,4 +0,0 @@ ---- -hcloud_token: "" -cluster_name: "k8s-cluster" -hcloud_lb_location: "nbg1" diff --git a/ansible/roles/ccm/tasks/main.yml b/ansible/roles/ccm/tasks/main.yml deleted file mode 100644 index 2fcb60f..0000000 --- a/ansible/roles/ccm/tasks/main.yml +++ /dev/null @@ -1,88 +0,0 @@ ---- -- name: Check if Hetzner CCM is already deployed - command: kubectl -n kube-system get deployment hcloud-cloud-controller-manager - register: ccm_namespace - failed_when: false - changed_when: false - -- name: Create Hetzner cloud secret - shell: | - kubectl -n kube-system create secret generic hcloud \ - --from-literal=token='{{ hcloud_token }}' \ - --from-literal=network='{{ cluster_name }}-network' \ - --dry-run=client -o yaml | kubectl apply -f - - no_log: true - when: hcloud_token is defined - changed_when: true - -- name: Deploy Hetzner CCM - command: kubectl apply -f https://raw.githubusercontent.com/hetznercloud/hcloud-cloud-controller-manager/main/deploy/ccm-networks.yaml - changed_when: true - -- name: Detect CCM workload kind - shell: | - if kubectl -n kube-system get deployment hcloud-cloud-controller-manager >/dev/null 2>&1; then - echo deployment - elif kubectl -n kube-system get daemonset hcloud-cloud-controller-manager >/dev/null 2>&1; then - echo daemonset - else - echo missing - fi - register: ccm_workload_kind - changed_when: false - -- name: Wait for CCM deployment rollout - command: kubectl rollout status deployment/hcloud-cloud-controller-manager -n kube-system - register: ccm_rollout_deploy - until: ccm_rollout_deploy.rc == 0 - changed_when: false - retries: 30 - delay: 10 - when: ccm_workload_kind.stdout == "deployment" - -- name: Wait for CCM daemonset rollout - command: kubectl rollout status daemonset/hcloud-cloud-controller-manager -n kube-system - register: ccm_rollout_ds - until: ccm_rollout_ds.rc == 0 - changed_when: false - retries: 30 - delay: 10 - when: ccm_workload_kind.stdout == "daemonset" - -- name: Set default Hetzner load balancer location for Traefik service - command: kubectl -n kube-system annotate service traefik load-balancer.hetzner.cloud/location={{ hcloud_lb_location }} --overwrite - register: traefik_annotation - changed_when: true - failed_when: false - -- name: Show Traefik service when annotation patch fails - command: kubectl -n kube-system get service traefik -o yaml - register: traefik_service_dump - changed_when: false - failed_when: false - when: traefik_annotation.rc != 0 - -- name: Fail when Traefik load balancer annotation cannot be set - fail: - msg: | - Failed to set Hetzner load balancer location annotation on kube-system/traefik service. - Command output: - {{ traefik_annotation.stderr | default(traefik_annotation.stdout) }} - - Service dump: - {{ traefik_service_dump.stdout | default('n/a') }} - when: traefik_annotation.rc != 0 - -- name: Show CCM namespace objects when workload missing - command: kubectl -n kube-system get all | grep hcloud-cloud-controller-manager || true - register: ccm_ns_objects - changed_when: false - when: ccm_workload_kind.stdout == "missing" - -- name: Fail when CCM workload is missing - fail: - msg: | - hcloud-cloud-controller-manager workload not found after applying manifest. - Namespace objects: - {{ ccm_ns_objects.stdout | default('n/a') }} - when: ccm_workload_kind.stdout == "missing" diff --git a/ansible/roles/csi/defaults/main.yml b/ansible/roles/csi/defaults/main.yml deleted file mode 100644 index 5c05040..0000000 --- a/ansible/roles/csi/defaults/main.yml +++ /dev/null @@ -1,15 +0,0 @@ ---- -hcloud_token: "" -cluster_name: "k8s-cluster" -csi_manifest_url: "https://raw.githubusercontent.com/hetznercloud/csi-driver/main/deploy/kubernetes/hcloud-csi.yml" -csi_rollout_timeout_seconds: 30 -csi_rollout_retries: 8 -csi_rollout_delay_seconds: 5 -csi_failure_log_tail_lines: 120 -csi_smoke_test_enabled: true -csi_smoke_test_storage_class: "csi-smoke-hcloud-immediate" -csi_smoke_test_base_storage_class: "hcloud-volumes" -csi_smoke_test_size: "1Gi" -csi_smoke_test_pvc_timeout_seconds: 300 -csi_smoke_test_job_timeout_seconds: 300 -csi_smoke_test_required: false diff --git a/ansible/roles/csi/tasks/main.yml b/ansible/roles/csi/tasks/main.yml deleted file mode 100644 index f29ec77..0000000 --- a/ansible/roles/csi/tasks/main.yml +++ /dev/null @@ -1,383 +0,0 @@ ---- -- name: Create Hetzner CSI secret - shell: | - kubectl -n kube-system create secret generic hcloud \ - --from-literal=token='{{ hcloud_token }}' \ - --from-literal=network='{{ cluster_name }}-network' \ - --dry-run=client -o yaml | kubectl apply -f - - no_log: true - when: hcloud_token is defined - changed_when: true - -- name: Deploy Hetzner CSI - command: kubectl apply -f {{ csi_manifest_url }} - changed_when: true - -- name: Ensure CSI controller endpoint is set for sidecars - command: kubectl -n kube-system set env deployment/hcloud-csi-controller CSI_ENDPOINT=unix:///run/csi/socket - changed_when: true - -- name: Ensure CSI node endpoint is set for sidecars - command: kubectl -n kube-system set env daemonset/hcloud-csi-node CSI_ENDPOINT=unix:///run/csi/socket - changed_when: true - -- name: Restart CSI controller to pick up current secret - command: kubectl -n kube-system rollout restart deployment/hcloud-csi-controller - changed_when: true - -- name: Wait for CSI controller deployment generation - command: kubectl -n kube-system rollout status deployment/hcloud-csi-controller --timeout=30s - failed_when: false - changed_when: false - -- name: Wait for CSI controller rollout - command: kubectl rollout status deployment/hcloud-csi-controller -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s - register: csi_controller_rollout - until: csi_controller_rollout.rc == 0 - retries: "{{ csi_rollout_retries | int }}" - delay: "{{ csi_rollout_delay_seconds | int }}" - failed_when: false - changed_when: false - -- name: Show CSI controller status on failure - command: kubectl -n kube-system get deployment hcloud-csi-controller -o wide - register: csi_controller_deploy_status - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Show CSI controller pods on failure - command: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o wide - register: csi_controller_pods_status - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Describe CSI controller deployment on failure - command: kubectl -n kube-system describe deployment hcloud-csi-controller - register: csi_controller_deploy_describe - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Describe CSI controller pod on failure - shell: | - pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)" - if [ -n "$pod" ]; then - kubectl -n kube-system describe pod "$pod" - fi - register: csi_controller_pod_describe - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Show CSI driver logs on failure - command: kubectl -n kube-system logs deployment/hcloud-csi-controller -c hcloud-csi-driver --tail={{ csi_failure_log_tail_lines }} - register: csi_driver_logs - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Show CSI driver previous logs on failure - shell: | - pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)" - if [ -n "$pod" ]; then - kubectl -n kube-system logs "$pod" -c hcloud-csi-driver --previous --tail={{ csi_failure_log_tail_lines }} - fi - register: csi_driver_previous_logs - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Show sidecar previous logs on failure - shell: | - pod="$(kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)" - if [ -n "$pod" ]; then - for container in csi-attacher csi-resizer csi-provisioner; do - echo "===== $container =====" - kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true - done - fi - register: csi_sidecar_previous_logs - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Show recent kube-system events on failure - command: kubectl -n kube-system get events --sort-by=.lastTimestamp - register: csi_recent_events - changed_when: false - failed_when: false - when: csi_controller_rollout.rc != 0 - -- name: Fail with CSI controller diagnostics - fail: - msg: | - CSI controller rollout failed. - Deployment status: - {{ csi_controller_deploy_status.stdout | default('n/a') }} - - Pods status: - {{ csi_controller_pods_status.stdout | default('n/a') }} - - Deployment describe: - {{ csi_controller_deploy_describe.stdout | default('n/a') }} - - Pod describe: - {{ csi_controller_pod_describe.stdout | default('n/a') }} - - hcloud-csi-driver logs: - {{ csi_driver_logs.stdout | default('n/a') }} - - hcloud-csi-driver previous logs: - {{ csi_driver_previous_logs.stdout | default('n/a') }} - - Sidecar previous logs: - {{ csi_sidecar_previous_logs.stdout | default('n/a') }} - - Recent kube-system events: - {{ csi_recent_events.stdout | default('n/a') }} - when: csi_controller_rollout.rc != 0 - -- name: Wait for CSI node daemonset rollout - command: kubectl rollout status daemonset/hcloud-csi-node -n kube-system --timeout={{ csi_rollout_timeout_seconds }}s - register: csi_node_rollout - until: csi_node_rollout.rc == 0 - retries: "{{ csi_rollout_retries | int }}" - delay: "{{ csi_rollout_delay_seconds | int }}" - failed_when: false - changed_when: false - -- name: Fail when CSI node daemonset rollout does not complete - fail: - msg: "CSI node daemonset rollout failed: {{ csi_node_rollout.stdout | default('') }} {{ csi_node_rollout.stderr | default('') }}" - when: csi_node_rollout.rc != 0 - -- name: Generate CSI smoke test run identifier - set_fact: - csi_smoke_test_run_id: "{{ lookup('pipe', 'date +%s') }}" - when: csi_smoke_test_enabled | bool - -- name: Generate unique CSI smoke test resource names - set_fact: - csi_smoke_test_pvc_name: "csi-smoke-pvc-{{ csi_smoke_test_run_id }}" - csi_smoke_test_job_name: "csi-smoke-job-{{ csi_smoke_test_run_id }}" - when: csi_smoke_test_enabled | bool - -- name: Cleanup stale CSI smoke test resources before apply - shell: | - kubectl -n kube-system delete job,pvc -l app.kubernetes.io/name=csi-smoke --ignore-not-found --wait=true - kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found - failed_when: false - changed_when: false - when: csi_smoke_test_enabled | bool - -- name: Apply CSI smoke test resources - template: - src: csi-smoke.yaml.j2 - dest: /tmp/csi-smoke.yaml - mode: "0644" - when: csi_smoke_test_enabled | bool - -- name: Apply CSI smoke test manifests - command: kubectl apply -f /tmp/csi-smoke.yaml - changed_when: true - when: csi_smoke_test_enabled | bool - -- name: Wait for CSI smoke PVC to bind - command: kubectl -n kube-system wait --for=jsonpath='{.status.phase}'=Bound pvc/{{ csi_smoke_test_pvc_name }} --timeout={{ csi_smoke_test_pvc_timeout_seconds }}s - register: csi_smoke_pvc_wait - failed_when: false - changed_when: false - when: csi_smoke_test_enabled | bool - -- name: Wait for CSI smoke Job completion - command: kubectl -n kube-system wait --for=condition=complete job/{{ csi_smoke_test_job_name }} --timeout={{ csi_smoke_test_job_timeout_seconds }}s - register: csi_smoke_job_wait - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc == 0 - -- name: Show CSI smoke job logs - command: kubectl -n kube-system logs job/{{ csi_smoke_test_job_name }} - register: csi_smoke_job_logs - failed_when: false - changed_when: false - when: csi_smoke_test_enabled | bool - -- name: Show CSI smoke PVC on failure - command: kubectl -n kube-system get pvc {{ csi_smoke_test_pvc_name }} -o wide - register: csi_smoke_pvc_status - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Show CSI smoke Job on failure - command: kubectl -n kube-system get job {{ csi_smoke_test_job_name }} -o wide - register: csi_smoke_job_status - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Show CSI smoke pods on failure - command: kubectl -n kube-system get pod -l job-name={{ csi_smoke_test_job_name }} -o wide - register: csi_smoke_pod_status - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Describe CSI smoke PVC on failure - command: kubectl -n kube-system describe pvc {{ csi_smoke_test_pvc_name }} - register: csi_smoke_pvc_describe - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Show storage classes on failure - command: kubectl get storageclass - register: csi_storageclasses - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Get CSI controller pod name on smoke failure - shell: kubectl -n kube-system get pods -l app=hcloud-csi-controller -o jsonpath='{.items[0].metadata.name}' - register: csi_controller_pod_name - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Describe CSI controller pod on smoke failure - command: kubectl -n kube-system describe pod {{ csi_controller_pod_name.stdout }} - register: csi_controller_pod_smoke_describe - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - - csi_controller_pod_name.stdout | length > 0 - -- name: Show CSI controller container logs on smoke failure - shell: | - pod="{{ csi_controller_pod_name.stdout }}" - for container in hcloud-csi-driver csi-provisioner csi-attacher csi-resizer liveness-probe; do - echo "===== ${container}: current =====" - kubectl -n kube-system logs "$pod" -c "$container" --tail={{ csi_failure_log_tail_lines }} || true - echo "===== ${container}: previous =====" - kubectl -n kube-system logs "$pod" -c "$container" --previous --tail={{ csi_failure_log_tail_lines }} || true - done - register: csi_controller_container_logs - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - - csi_controller_pod_name.stdout | length > 0 - -- name: Show CSI driver and node driver objects on smoke failure - shell: | - echo "===== CSIDriver =====" - kubectl get csidriver csi.hetzner.cloud -o yaml || true - echo "===== CSINode =====" - kubectl get csinode -o wide || true - register: csi_driver_objects - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Show CSI smoke pod describe on failure - shell: | - pod="$(kubectl -n kube-system get pods -l job-name={{ csi_smoke_test_job_name }} -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)" - if [ -n "$pod" ]; then - kubectl -n kube-system describe pod "$pod" - fi - register: csi_smoke_pod_describe - failed_when: false - changed_when: false - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - -- name: Fail when CSI smoke test fails - fail: - msg: | - CSI smoke test failed. - PVC wait: - stdout: {{ csi_smoke_pvc_wait.stdout | default('') }} - stderr: {{ csi_smoke_pvc_wait.stderr | default('') }} - - Job wait: - stdout: {{ csi_smoke_job_wait.stdout | default('') }} - stderr: {{ csi_smoke_job_wait.stderr | default('') }} - - PVC: - {{ csi_smoke_pvc_status.stdout | default(csi_smoke_pvc_status.stderr | default('n/a')) }} - - Job: - {{ csi_smoke_job_status.stdout | default(csi_smoke_job_status.stderr | default('n/a')) }} - - Pod list: - {{ csi_smoke_pod_status.stdout | default(csi_smoke_pod_status.stderr | default('n/a')) }} - - PVC describe: - {{ csi_smoke_pvc_describe.stdout | default(csi_smoke_pvc_describe.stderr | default('n/a')) }} - - Storage classes: - {{ csi_storageclasses.stdout | default(csi_storageclasses.stderr | default('n/a')) }} - - CSI controller pod: - {{ csi_controller_pod_name.stdout | default('n/a') }} - - CSI controller pod describe: - {{ csi_controller_pod_smoke_describe.stdout | default(csi_controller_pod_smoke_describe.stderr | default('n/a')) }} - - CSI controller container logs: - {{ csi_controller_container_logs.stdout | default(csi_controller_container_logs.stderr | default('n/a')) }} - - CSI driver objects: - {{ csi_driver_objects.stdout | default(csi_driver_objects.stderr | default('n/a')) }} - - Pod describe: - {{ csi_smoke_pod_describe.stdout | default('n/a') }} - - Job logs: - {{ csi_smoke_job_logs.stdout | default('n/a') }} - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - - csi_smoke_test_required | bool - -- name: Warn when CSI smoke test fails but is non-blocking - debug: - msg: | - CSI smoke test failed but csi_smoke_test_required is false, so deployment will continue. - PVC wait stderr: {{ csi_smoke_pvc_wait.stderr | default('') }} - Job wait stderr: {{ csi_smoke_job_wait.stderr | default('') }} - when: - - csi_smoke_test_enabled | bool - - csi_smoke_pvc_wait.rc != 0 or (csi_smoke_job_wait.rc | default(1)) != 0 - - not (csi_smoke_test_required | bool) - -- name: Cleanup CSI smoke test resources - shell: | - kubectl -n kube-system delete job {{ csi_smoke_test_job_name }} pvc {{ csi_smoke_test_pvc_name }} --ignore-not-found - kubectl delete storageclass {{ csi_smoke_test_storage_class }} --ignore-not-found - failed_when: false - changed_when: false - when: csi_smoke_test_enabled | bool diff --git a/ansible/roles/csi/templates/csi-smoke.yaml.j2 b/ansible/roles/csi/templates/csi-smoke.yaml.j2 deleted file mode 100644 index 7e41c98..0000000 --- a/ansible/roles/csi/templates/csi-smoke.yaml.j2 +++ /dev/null @@ -1,47 +0,0 @@ -apiVersion: storage.k8s.io/v1 -kind: StorageClass -metadata: - name: {{ csi_smoke_test_storage_class }} -provisioner: csi.hetzner.cloud -reclaimPolicy: Delete -volumeBindingMode: Immediate -allowVolumeExpansion: true ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: {{ csi_smoke_test_pvc_name }} - namespace: kube-system - labels: - app.kubernetes.io/name: csi-smoke -spec: - accessModes: - - ReadWriteOnce - resources: - requests: - storage: {{ csi_smoke_test_size }} - storageClassName: {{ csi_smoke_test_storage_class }} ---- -apiVersion: batch/v1 -kind: Job -metadata: - name: {{ csi_smoke_test_job_name }} - namespace: kube-system - labels: - app.kubernetes.io/name: csi-smoke -spec: - backoffLimit: 0 - template: - spec: - restartPolicy: Never - containers: - - name: write-and-read - image: busybox:1.36 - command: ["/bin/sh", "-c", "echo csi-ok > /data/health && cat /data/health"] - volumeMounts: - - name: data - mountPath: /data - volumes: - - name: data - persistentVolumeClaim: - claimName: {{ csi_smoke_test_pvc_name }} diff --git a/ansible/roles/tailscale-operator/defaults/main.yml b/ansible/roles/tailscale-operator/defaults/main.yml deleted file mode 100644 index 13dd1a9..0000000 --- a/ansible/roles/tailscale-operator/defaults/main.yml +++ /dev/null @@ -1,21 +0,0 @@ ---- -tailscale_operator_namespace: "tailscale-system" -tailscale_operator_version: "1.95.91" - -tailscale_oauth_client_id: "" -tailscale_oauth_client_secret: "" - -tailscale_operator_default_tags: - - "tag:k8s" - -tailscale_proxyclass_name: "infra-stable" - -tailscale_operator_required: false - -tailscale_operator_node_selector: - kubernetes.io/hostname: "k8s-cluster-cp-1" - -tailscale_operator_tolerations: - - key: "node-role.kubernetes.io/control-plane" - operator: "Exists" - effect: "NoSchedule" diff --git a/ansible/roles/tailscale-operator/tasks/main.yml b/ansible/roles/tailscale-operator/tasks/main.yml deleted file mode 100644 index 61225d7..0000000 --- a/ansible/roles/tailscale-operator/tasks/main.yml +++ /dev/null @@ -1,171 +0,0 @@ ---- -- name: Determine if Tailscale operator is enabled - set_fact: - tailscale_operator_enabled: "{{ (tailscale_oauth_client_id | default('') | length) > 0 and (tailscale_oauth_client_secret | default('') | length) > 0 }}" - tailscale_operator_ready: false - changed_when: false - -- name: Skip Tailscale operator when OAuth credentials are missing - debug: - msg: "Skipping Tailscale Kubernetes Operator: set TAILSCALE_OAUTH_CLIENT_ID and TAILSCALE_OAUTH_CLIENT_SECRET to enable it." - when: not tailscale_operator_enabled - -- name: End Tailscale operator role when disabled - meta: end_host - when: not tailscale_operator_enabled - -- name: Check if Helm is installed - command: helm version --short - register: helm_check - changed_when: false - failed_when: false - -- name: Install Helm - shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash - when: helm_check.rc != 0 - changed_when: true - -- name: Create Tailscale operator namespace - command: kubectl create namespace {{ tailscale_operator_namespace }} - register: create_ns - failed_when: create_ns.rc != 0 and "AlreadyExists" not in create_ns.stderr - changed_when: create_ns.rc == 0 - -- name: Add Tailscale Helm repo - command: helm repo add tailscale https://pkgs.tailscale.com/unstable/helmcharts - register: add_repo - failed_when: add_repo.rc != 0 and "already exists" not in add_repo.stderr - changed_when: add_repo.rc == 0 - -- name: Update Helm repos - command: helm repo update - changed_when: false - -- name: Write Tailscale operator values - template: - src: operator-values.yaml.j2 - dest: /tmp/tailscale-operator-values.yaml - mode: "0644" - -- name: Create or update Tailscale operator OAuth secret - shell: >- - kubectl -n {{ tailscale_operator_namespace }} create secret generic operator-oauth - --from-literal=client_id='{{ tailscale_oauth_client_id }}' - --from-literal=client_secret='{{ tailscale_oauth_client_secret }}' - --dry-run=client -o yaml | kubectl apply -f - - register: oauth_secret_result - changed_when: "'created' in oauth_secret_result.stdout or 'configured' in oauth_secret_result.stdout" - -- name: Install Tailscale Kubernetes Operator - command: >- - helm upgrade --install tailscale-operator tailscale/tailscale-operator - --namespace {{ tailscale_operator_namespace }} - --version {{ tailscale_operator_version }} - --values /tmp/tailscale-operator-values.yaml - --timeout 5m - register: tailscale_install - failed_when: false - changed_when: true - -- name: Show Tailscale operator pods on install failure - command: kubectl -n {{ tailscale_operator_namespace }} get pods -o wide - register: tailscale_pods - changed_when: false - failed_when: false - when: tailscale_install.rc != 0 - -- name: Show Tailscale operator events on install failure - command: kubectl -n {{ tailscale_operator_namespace }} get events --sort-by=.lastTimestamp - register: tailscale_events - changed_when: false - failed_when: false - when: tailscale_install.rc != 0 - -- name: Fail with Tailscale operator diagnostics - fail: - msg: | - Tailscale operator install failed. - Helm stderr: - {{ tailscale_install.stderr | default('') }} - - Pods: - {{ tailscale_pods.stdout | default('n/a') }} - - Events: - {{ tailscale_events.stdout | default('n/a') }} - when: tailscale_install.rc != 0 - -- name: Wait for Tailscale operator to be ready - command: kubectl -n {{ tailscale_operator_namespace }} rollout status deployment/operator --timeout=5m - register: tailscale_rollout - failed_when: false - changed_when: false - -- name: Show Tailscale operator deployment status - command: kubectl -n {{ tailscale_operator_namespace }} get deployment operator -o wide - register: tailscale_deploy - changed_when: false - failed_when: false - -- name: Get Tailscale operator logs - command: kubectl -n {{ tailscale_operator_namespace }} logs deployment/operator --tail=200 - register: tailscale_operator_logs - changed_when: false - failed_when: false - -- name: Fail when Tailscale OAuth permissions are insufficient - fail: - msg: | - Tailscale operator started but cannot create auth keys (OAuth/tag permission error). - Fix your Tailscale OAuth client/tag permissions. - - Required checks in Tailscale admin: - - OAuth client has devices:core, auth_keys, and services write access - - OAuth client can create tagged devices for: {{ tailscale_operator_default_tags | join(', ') }} - - ACL/tag ownership allows those tags for this OAuth client - - Operator log excerpt: - {{ tailscale_operator_logs.stdout | default('n/a') }} - when: - - tailscale_operator_required | bool - - "tailscale_operator_logs.stdout is defined and ('does not have enough permissions' in tailscale_operator_logs.stdout or 'Status: 403' in tailscale_operator_logs.stdout or 'invalid or not permitted' in tailscale_operator_logs.stdout or 'Status: 400' in tailscale_operator_logs.stdout)" - -- name: Warn when Tailscale OAuth permissions are insufficient (non-blocking) - debug: - msg: | - Tailscale operator is not ready due to OAuth/tag permissions. - Continuing deployment because tailscale_operator_required=false. - Operator log excerpt: - {{ tailscale_operator_logs.stdout | default('n/a') }} - when: - - not (tailscale_operator_required | bool) - - "tailscale_operator_logs.stdout is defined and ('does not have enough permissions' in tailscale_operator_logs.stdout or 'Status: 403' in tailscale_operator_logs.stdout or 'invalid or not permitted' in tailscale_operator_logs.stdout or 'Status: 400' in tailscale_operator_logs.stdout)" - -- name: Mark Tailscale operator ready when rollout succeeds and no auth errors - set_fact: - tailscale_operator_ready: true - when: - - tailscale_rollout.rc == 0 - - "tailscale_operator_logs.stdout is not defined or (('does not have enough permissions' not in tailscale_operator_logs.stdout) and ('Status: 403' not in tailscale_operator_logs.stdout) and ('invalid or not permitted' not in tailscale_operator_logs.stdout) and ('Status: 400' not in tailscale_operator_logs.stdout))" - -- name: Warn if Tailscale operator is not ready yet - debug: - msg: | - Tailscale operator deployment is still converging. - This is non-blocking for CI; service endpoints may appear shortly. - Rollout output: - {{ tailscale_rollout.stdout | default('') }} - {{ tailscale_deploy.stdout | default('') }} - when: tailscale_rollout.rc != 0 - -- name: Write Tailscale default ProxyClass manifest - template: - src: proxyclass.yaml.j2 - dest: /tmp/tailscale-proxyclass.yaml - mode: "0644" - when: tailscale_operator_ready | default(false) | bool - -- name: Apply Tailscale default ProxyClass - command: kubectl apply -f /tmp/tailscale-proxyclass.yaml - changed_when: true - when: tailscale_operator_ready | default(false) | bool diff --git a/ansible/roles/tailscale-operator/templates/operator-values.yaml.j2 b/ansible/roles/tailscale-operator/templates/operator-values.yaml.j2 deleted file mode 100644 index 53dd362..0000000 --- a/ansible/roles/tailscale-operator/templates/operator-values.yaml.j2 +++ /dev/null @@ -1,24 +0,0 @@ -apiServerProxyConfig: - mode: "true" - -operatorConfig: - defaultTags: -{% for tag in tailscale_operator_default_tags %} - - "{{ tag }}" -{% endfor %} - nodeSelector: -{% for key, value in tailscale_operator_node_selector.items() %} - {{ key }}: "{{ value }}" -{% endfor %} - tolerations: -{% for tol in tailscale_operator_tolerations %} - - key: "{{ tol.key }}" - operator: "{{ tol.operator }}" - effect: "{{ tol.effect }}" -{% endfor %} - -installCRDs: true - -proxyConfig: - defaultTags: "{{ tailscale_operator_default_tags | join(',') }}" - defaultProxyClass: "{{ tailscale_proxyclass_name }}" diff --git a/ansible/roles/tailscale-operator/templates/proxyclass.yaml.j2 b/ansible/roles/tailscale-operator/templates/proxyclass.yaml.j2 deleted file mode 100644 index 530f9e1..0000000 --- a/ansible/roles/tailscale-operator/templates/proxyclass.yaml.j2 +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: tailscale.com/v1alpha1 -kind: ProxyClass -metadata: - name: {{ tailscale_proxyclass_name }} -spec: - statefulSet: - pod: - nodeSelector: -{% for key, value in tailscale_operator_node_selector.items() %} - {{ key }}: "{{ value }}" -{% endfor %} - tolerations: -{% for tol in tailscale_operator_tolerations %} - - key: "{{ tol.key }}" - operator: "{{ tol.operator }}" - effect: "{{ tol.effect }}" -{% endfor %} diff --git a/ansible/site.yml b/ansible/site.yml index 25eee6b..2eefa56 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -75,30 +75,6 @@ roles: - k3s-agent -- name: Deploy Hetzner CCM - hosts: control_plane[0] - become: true - - roles: - - role: ccm - when: not (ccm_gitops_enabled | default(true) | bool) - -- name: Deploy Hetzner CSI - hosts: control_plane[0] - become: true - - roles: - - role: csi - when: not (csi_gitops_enabled | default(true) | bool) - -- name: Deploy Tailscale Kubernetes Operator - hosts: control_plane[0] - become: true - - roles: - - role: tailscale-operator - when: not (tailscale_operator_gitops_enabled | default(true) | bool) - - name: Bootstrap addon prerequisite secrets hosts: control_plane[0] become: true