# Ansible playbook: bootstrap a k3s Kubernetes cluster with a kube-vip API VIP.
#
# NOTE(review): in this copy the original line breaks and indentation are
# collapsed, and the text is cut off inside the "Collect worker network
# diagnostics when VIP is unreachable" shell script (right after
# `timeout 5 bash -c '`). The text then resumes in the middle of a condition
# list of a task that is not visible here. Restore the original layout and the
# missing section before running or restructuring this file.
#
# Plays visible in this copy, in order:
#   1. localhost - runs role tailscale-cleanup with
#      tailscale_reserved_hostnames set to the members of the 'cluster' group
#      (empty list when that group is undefined).
#   2. cluster - waits for SSH (timeout 600), gathers facts while tolerating
#      an unreachable/failed first attempt, clears host errors if the host was
#      unreachable, waits again (timeout 300), re-gathers facts, then runs
#      role common.
#   3. control_plane[0] - runs role k3s-server with k3s_primary: true.
#   4. control_plane[0] - reads /var/lib/rancher/k3s/server/node-token into
#      the k3s_token fact, records the primary's private/public IP facts, and
#      fetches /etc/rancher/k3s/k3s.yaml to ../outputs/kubeconfig.
#   5. control_plane[0] - runs role addon-secrets-bootstrap.
#   6. control_plane[0] - runs role kube-vip-deploy.
#   7. control_plane[0] - polls /readyz on https://<kube_api_endpoint>:6443
#      (30 retries, 10 s apart).
#   8. control_plane[1:] - runs role k3s-server with k3s_primary: false; the
#      join endpoint is kube_api_endpoint, falling back to the primary's
#      private IP fact when kube_api_endpoint is undefined.
#   9. control_plane[0] - checks the kube-vip DaemonSet rollout (collecting
#      pod listings/descriptions and failing with them when rc != 0), waits
#      for every control_plane node to be Ready, then polls /readyz again.
#  10. workers - waits up to 180 s for port 6443 on kube_api_endpoint without
#      failing the task, then gathers network diagnostics (truncated here).
#  11. localhost ("Finalize") - if ../outputs/kubeconfig exists, replaces
#      127.0.0.1 in it with control_plane[0]'s ansible_host and prints a
#      summary message.
#
# NOTE(review): k3s_token in play 3 is a play var built from
# lookup('password', '/dev/null ...'). Play vars are templated on each use and
# a /dev/null password file is never persisted, so every reference inside the
# k3s-server role presumably yields a different random value - confirm the
# role reads it only once, or pin it with set_fact first.
# NOTE(review): the Finalize play uses the relative path ../outputs/kubeconfig
# for stat and sed, while the success message uses
# {{ playbook_dir }}/../outputs/kubeconfig - presumably the same file; confirm
# these resolve identically when the playbook is launched from another
# directory.
# NOTE(review): the sed pattern 127.0.0.1 leaves the dots unescaped, so each
# dot matches any character.
# NOTE(review): the worker VIP wait sets failed_when: false and registers
# worker_vip_wait; whatever consumes that result is in the truncated section -
# verify an unreachable VIP still stops the run.
# NOTE(review): groups['workers'] in the final message has no default, unlike
# groups['cluster'] in play 1 - TODO confirm a workers group always exists.
# NOTE(review): the inline comment on k3s_join_endpoint says "Load Balancer",
# but the endpoint shown is kube_api_endpoint, which other task names here
# call the kube-vip VIP - confirm the wording.
--- - name: Clean up stale Tailscale cluster node devices hosts: localhost connection: local vars: tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}" roles: - tailscale-cleanup - name: Bootstrap Kubernetes cluster hosts: cluster become: true gather_facts: false pre_tasks: - name: Wait for SSH wait_for_connection: delay: 10 timeout: 600 - name: Gather facts after SSH is reachable setup: register: initial_setup ignore_errors: true ignore_unreachable: true - name: Clear transient SSH unreachable state after first fact gather meta: clear_host_errors when: initial_setup.unreachable | default(false) - name: Wait for SSH after transient first-boot disconnect wait_for_connection: delay: 10 timeout: 300 when: initial_setup.unreachable | default(false) or initial_setup.failed | default(false) - name: Gather facts after transient first-boot disconnect setup: when: initial_setup.unreachable | default(false) or initial_setup.failed | default(false) roles: - common - name: Setup primary control plane hosts: control_plane[0] become: true vars: k3s_primary: true k3s_token: "{{ lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}" k3s_primary_private_ip: "{{ k3s_private_ip }}" k3s_primary_public_ip: "{{ ansible_host }}" k3s_primary_ip: "{{ k3s_private_ip }}" k3s_node_ip: "{{ k3s_private_ip }}" # kube_api_endpoint is set in inventory group_vars roles: - k3s-server - name: Get join info from primary hosts: control_plane[0] become: true tasks: - name: Fetch node token command: cat /var/lib/rancher/k3s/server/node-token register: node_token changed_when: false - name: Set join token fact set_fact: k3s_token: "{{ node_token.stdout }}" k3s_primary_private_ip: "{{ k3s_private_ip }}" k3s_primary_public_ip: "{{ ansible_host }}" - name: Fetch kubeconfig fetch: src: /etc/rancher/k3s/k3s.yaml dest: ../outputs/kubeconfig flat: true - name: Bootstrap addon prerequisite secrets hosts: control_plane[0] become: true roles: - addon-secrets-bootstrap - 
name: Deploy kube-vip for API HA hosts: control_plane[0] become: true roles: - kube-vip-deploy - name: Wait for Kubernetes API VIP readiness hosts: control_plane[0] become: true tasks: - name: Wait for Kubernetes readyz through the VIP command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz register: api_readyz until: api_readyz.rc == 0 retries: 30 delay: 10 changed_when: false - name: Setup secondary control planes hosts: control_plane[1:] become: true vars: k3s_primary: false k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}" k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}" k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}" k3s_node_ip: "{{ k3s_private_ip }}" # Use Load Balancer for HA - all control planes join via LB endpoint k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}" roles: - k3s-server - name: Wait for all control plane nodes to be Ready hosts: control_plane[0] become: true tasks: - name: Wait for kube-vip DaemonSet across control planes command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=300s register: kube_vip_rollout changed_when: false failed_when: false - name: Show kube-vip pod status on rollout failure command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide register: kube_vip_pods_after_join changed_when: false failed_when: false when: kube_vip_rollout.rc != 0 - name: Describe kube-vip pods on rollout failure command: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip register: kube_vip_describe_after_join changed_when: false failed_when: false when: kube_vip_rollout.rc != 0 - name: Fail when kube-vip is not healthy on all control planes fail: msg: | kube-vip DaemonSet did not become healthy after secondary control planes joined. 
Rollout: {{ kube_vip_rollout.stdout | default('') }} {{ kube_vip_rollout.stderr | default('') }} Pods: {{ kube_vip_pods_after_join.stdout | default('n/a') }} Describe: {{ kube_vip_describe_after_join.stdout | default('n/a') }} when: kube_vip_rollout.rc != 0 - name: Wait for control plane node readiness command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s register: control_plane_ready until: control_plane_ready.rc == 0 retries: 20 delay: 15 changed_when: false loop: "{{ groups['control_plane'] }}" - name: Wait for Kubernetes readyz before worker joins command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz register: api_readyz_before_workers until: api_readyz_before_workers.rc == 0 retries: 30 delay: 10 changed_when: false - name: Verify worker reachability to Kubernetes API VIP hosts: workers become: true tasks: - name: Wait for Kubernetes API VIP from worker wait_for: host: "{{ kube_api_endpoint }}" port: 6443 state: started timeout: 180 register: worker_vip_wait failed_when: false - name: Collect worker network diagnostics when VIP is unreachable shell: | set -euo pipefail echo "== ip addr ==" ip addr echo "== ip route ==" ip route echo "== ip neigh ==" ip neigh || true echo "== vip route ==" ip route get {{ kube_api_endpoint }} || true echo "== tcp probe ==" timeout 5 bash -c '- hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([]) | intersect(tailscale_reserved_hostnames) | length == 0 - name: Finalize hosts: localhost connection: local tasks: - name: Check whether kubeconfig was fetched stat: path: ../outputs/kubeconfig register: kubeconfig_file - name: Update kubeconfig server address command: | sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig changed_when: true when: kubeconfig_file.stat.exists - name: Display success message debug: msg: | Cluster setup complete! 
Control planes: {{ groups['control_plane'] | length }} Workers: {{ groups['workers'] | length }} To access the cluster: export KUBECONFIG={{ playbook_dir }}/../outputs/kubeconfig kubectl get nodes