From 31b82c937172369dc9d501187663bdcf9315f709 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Sun, 22 Mar 2026 23:58:03 +0000
Subject: [PATCH] Deploy CCM via Ansible before workers join to fix external cloud provider

This fixes the chicken-and-egg problem where workers with
--kubelet-arg=cloud-provider=external couldn't join because CCM wasn't
running yet to remove the node.cloudprovider.kubernetes.io/uninitialized taint.

Changes:
- Create ansible/roles/ccm-deploy/ to deploy CCM via Helm during Ansible phase
- Reorder site.yml: CCM deploys after secrets but before workers join
- CCM runs on control_plane[0] with proper tolerations for control plane nodes
- Add 10s pause after CCM ready to ensure it can process new nodes
- Workers can now successfully join with external cloud provider enabled

Flux still manages CCM for updates, but initial install happens in Ansible.
---
Review notes (this section is ignored by `git am`; the diff payload below is
unchanged apart from restored line breaks and indentation):

* Line structure and YAML indentation were restored from a whitespace-collapsed
  copy. The indentation of the unchanged context lines in ansible/site.yml is a
  reconstruction -- verify it against the target file before applying.
* "Verify CCM is removing uninitialized taints" sets `failed_when: false` and
  is followed only by a `debug` task, so the play continues even when the taint
  is still present. If the intent is to gate the worker join, consider an
  `until:` loop on an empty result instead of the fixed 10s pause.
* The jsonpath in that task selects the `.key` of each matching taint, so the
  "Nodes with uninitialized taint" message prints taint keys, not node names.
* `nodeSelector` pins the CCM pod to
  `kubernetes.io/hostname: "{{ inventory_hostname }}"`. This presumably relies
  on the k3s node name being identical to the inventory hostname -- TODO
  confirm, otherwise the pod stays Pending until `wait_timeout` expires.
* The helm task sets no `chart_version`. Since the commit message says Flux
  manages CCM afterwards, confirm both install the same chart version.
* `create_namespace: true` targets `kube-system`, which presumably always
  exists already; it looks redundant.

 ansible/roles/ccm-deploy/tasks/main.yml | 62 +++++++++++++++++++++++++
 ansible/site.yml                        | 21 ++++++---
 2 files changed, 76 insertions(+), 7 deletions(-)
 create mode 100644 ansible/roles/ccm-deploy/tasks/main.yml

diff --git a/ansible/roles/ccm-deploy/tasks/main.yml b/ansible/roles/ccm-deploy/tasks/main.yml
new file mode 100644
index 0000000..a0cb42a
--- /dev/null
+++ b/ansible/roles/ccm-deploy/tasks/main.yml
@@ -0,0 +1,62 @@
+---
+- name: Check if hcloud secret exists
+  command: kubectl -n kube-system get secret hcloud
+  register: hcloud_secret_check
+  changed_when: false
+  failed_when: false
+
+- name: Fail if hcloud secret is missing
+  fail:
+    msg: "hcloud secret not found in kube-system namespace. CCM requires it."
+  when: hcloud_secret_check.rc != 0
+
+- name: Add Hetzner Helm repository
+  kubernetes.core.helm_repository:
+    name: hcloud
+    repo_url: https://charts.hetzner.cloud
+    kubeconfig: /etc/rancher/k3s/k3s.yaml
+  environment:
+    KUBECONFIG: /etc/rancher/k3s/k3s.yaml
+
+- name: Deploy Hetzner Cloud Controller Manager
+  kubernetes.core.helm:
+    name: hcloud-cloud-controller-manager
+    chart_ref: hcloud/hcloud-cloud-controller-manager
+    release_namespace: kube-system
+    create_namespace: true
+    values:
+      networking:
+        enabled: true
+      nodeSelector:
+        kubernetes.io/hostname: "{{ inventory_hostname }}"
+      additionalTolerations:
+        - key: node-role.kubernetes.io/control-plane
+          operator: Exists
+          effect: NoSchedule
+    kubeconfig: /etc/rancher/k3s/k3s.yaml
+    wait: true
+    wait_timeout: 300s
+  environment:
+    KUBECONFIG: /etc/rancher/k3s/k3s.yaml
+
+- name: Wait for CCM to be ready
+  command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
+  changed_when: false
+  register: ccm_rollout
+  until: ccm_rollout.rc == 0
+  retries: 3
+  delay: 10
+
+- name: Pause to ensure CCM is fully ready to process new nodes
+  pause:
+    seconds: 10
+
+- name: Verify CCM is removing uninitialized taints
+  command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
+  register: uninitialized_taints
+  changed_when: false
+  failed_when: false
+
+- name: Display taint status
+  debug:
+    msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"
diff --git a/ansible/site.yml b/ansible/site.yml
index 2eefa56..01cbbe1 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -49,6 +49,20 @@
         dest: ../outputs/kubeconfig
         flat: true
 
+- name: Bootstrap addon prerequisite secrets
+  hosts: control_plane[0]
+  become: true
+
+  roles:
+    - addon-secrets-bootstrap
+
+- name: Deploy Hetzner CCM (required for workers with external cloud provider)
+  hosts: control_plane[0]
+  become: true
+
+  roles:
+    - ccm-deploy
+
 - name: Setup secondary control planes
   hosts: control_plane[1:]
   become: true
@@ -75,13 +89,6 @@
   roles:
     - k3s-agent
 
-- name: Bootstrap addon prerequisite secrets
-  hosts: control_plane[0]
-  become: true
-
-  roles:
-    - addon-secrets-bootstrap
-
 - name: Deploy observability stack
   hosts: control_plane[0]
   become: true