---
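# Pre-provisioning pass: remove Tailscale devices left over from previous
# cluster nodes so re-created machines can reuse their hostnames. Without this,
# Tailscale may register the new devices under suffixed names (e.g. node-1).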
- name: Clean up stale Tailscale cluster node devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}"

  roles:
    - tailscale-cleanup

- name: Bootstrap Kubernetes cluster
  hosts: cluster
  become: true
  gather_facts: false

  pre_tasks:
    - name: Wait for SSH
      wait_for_connection:
        delay: 10
        timeout: 600

    - name: Gather facts after SSH is reachable
      setup:
      register: initial_setup
      ignore_errors: true
      ignore_unreachable: true

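    # First boot can drop SSH mid-play (e.g. cloud-init triggering a reboot),
    # which leaves the host marked unreachable. clear_host_errors re-admits it
    # so the recovery tasks below get a second chance instead of losing the host.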
    - name: Clear transient SSH unreachable state after first fact gather
      meta: clear_host_errors
      when: initial_setup.unreachable | default(false)

    - name: Wait for SSH after transient first-boot disconnect
      wait_for_connection:
        delay: 10
        timeout: 300
      when: initial_setup.unreachable | default(false) or initial_setup.failed | default(false)

    - name: Gather facts after transient first-boot disconnect
      setup:
      when: initial_setup.unreachable | default(false) or initial_setup.failed | default(false)

  roles:
    - common

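# Note: lookup('password', '/dev/null', ...) yields a fresh random value on
# every evaluation (nothing is persisted to /dev/null), so this token only
# seeds the initial server. The authoritative join token is read back from
# /var/lib/rancher/k3s/server/node-token in the next play.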
- name: Setup primary control plane
  hosts: control_plane[0]
  become: true

  vars:
    k3s_primary: true
    k3s_token: "{{ lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
    k3s_primary_private_ip: "{{ k3s_private_ip }}"
    k3s_primary_public_ip: "{{ ansible_host }}"
    k3s_primary_ip: "{{ k3s_private_ip }}"
    k3s_node_ip: "{{ k3s_private_ip }}"
    # kube_api_endpoint is set in inventory group_vars

  roles:
    - k3s-server

- name: Get join info from primary
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Fetch node token
      command: cat /var/lib/rancher/k3s/server/node-token
      register: node_token
      changed_when: false

    - name: Set join token fact
      set_fact:
        k3s_token: "{{ node_token.stdout }}"
        k3s_primary_private_ip: "{{ k3s_private_ip }}"
        k3s_primary_public_ip: "{{ ansible_host }}"

    - name: Fetch kubeconfig
      fetch:
        src: /etc/rancher/k3s/k3s.yaml
        dest: ../outputs/kubeconfig
        flat: true

- name: Bootstrap addon prerequisite secrets
  hosts: control_plane[0]
  become: true

  roles:
    - addon-secrets-bootstrap

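# kube-vip runs as a DaemonSet on the control planes and advertises
# kube_api_endpoint as a floating VIP (L2/ARP reachability, per the worker
# diagnostics below), so the API survives loss of any single control-plane node.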
- name: Deploy kube-vip for API HA
  hosts: control_plane[0]
  become: true

  roles:
    - kube-vip-deploy

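# Probe /readyz via the VIP rather than a node IP: this proves kube-vip has
# actually claimed the address before anything downstream depends on it.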
- name: Wait for Kubernetes API VIP readiness
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for Kubernetes readyz through the VIP
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz
      until: api_readyz.rc == 0
      retries: 30
      delay: 10
      changed_when: false

- name: Setup secondary control planes
  hosts: control_plane[1:]
  become: true

  vars:
    k3s_primary: false
    k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
    k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
    k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
    k3s_node_ip: "{{ k3s_private_ip }}"
    # Use Load Balancer for HA - all control planes join via LB endpoint
    k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"

  roles:
    - k3s-server

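# The rollout check below uses failed_when: false so pod status and describe
# output can be captured for the failure message before the play aborts.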
- name: Wait for all control plane nodes to be Ready
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Wait for kube-vip DaemonSet across control planes
      command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=300s
      register: kube_vip_rollout
      changed_when: false
      failed_when: false

    - name: Show kube-vip pod status on rollout failure
      command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
      register: kube_vip_pods_after_join
      changed_when: false
      failed_when: false
      when: kube_vip_rollout.rc != 0

    - name: Describe kube-vip pods on rollout failure
      command: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip
      register: kube_vip_describe_after_join
      changed_when: false
      failed_when: false
      when: kube_vip_rollout.rc != 0

    - name: Fail when kube-vip is not healthy on all control planes
      fail:
        msg: |
          kube-vip DaemonSet did not become healthy after secondary control planes joined.
          Rollout:
          {{ kube_vip_rollout.stdout | default('') }}
          {{ kube_vip_rollout.stderr | default('') }}

          Pods:
          {{ kube_vip_pods_after_join.stdout | default('n/a') }}

          Describe:
          {{ kube_vip_describe_after_join.stdout | default('n/a') }}
      when: kube_vip_rollout.rc != 0

    - name: Wait for control plane node readiness
      command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
      register: control_plane_ready
      until: control_plane_ready.rc == 0
      retries: 20
      delay: 15
      changed_when: false
      loop: "{{ groups['control_plane'] }}"

    - name: Wait for Kubernetes readyz before worker joins
      command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      register: api_readyz_before_workers
      until: api_readyz_before_workers.rc == 0
      retries: 30
      delay: 10
      changed_when: false

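# Fail fast before installing agents: if a worker cannot open a TCP connection
# to the VIP, the join would fail anyway, and the diagnostics distinguish a
# kube-vip/L2/routing problem from an agent install problem.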
- name: Verify worker reachability to Kubernetes API VIP
  hosts: workers
  become: true
  tasks:
    - name: Wait for Kubernetes API VIP from worker
      wait_for:
        host: "{{ kube_api_endpoint }}"
        port: 6443
        state: started
        timeout: 180
      register: worker_vip_wait
      failed_when: false

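    # The probe below uses bash's /dev/tcp/<host>/<port> pseudo-path, which
    # opens a TCP socket without needing netcat on the image.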
    - name: Collect worker network diagnostics when VIP is unreachable
      shell: |
        set -euo pipefail
        echo "== ip addr =="
        ip addr
        echo "== ip route =="
        ip route
        echo "== ip neigh =="
        ip neigh || true
        echo "== vip route =="
        ip route get {{ kube_api_endpoint }} || true
        echo "== tcp probe =="
        timeout 5 bash -c '</dev/tcp/{{ kube_api_endpoint }}/6443' && echo connected || echo failed
      args:
        executable: /bin/bash
      register: worker_vip_diagnostics
      changed_when: false
      failed_when: false
      when: worker_vip_wait.msg is defined

    - name: Fail when worker cannot reach Kubernetes API VIP
      fail:
        msg: |
          Worker {{ inventory_hostname }} cannot reach Kubernetes API VIP {{ kube_api_endpoint }}:6443.
          This blocks k3s agent join and points to kube-vip/L2/routing reachability, not agent install.

          Diagnostics:
          {{ worker_vip_diagnostics.stdout | default('n/a') }}
      when: worker_vip_wait.msg is defined

- name: Setup workers
  hosts: workers
  become: true

  vars:
    k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
    # Use Load Balancer for HA - workers join via LB endpoint
    k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
    k3s_node_ip: "{{ k3s_private_ip }}"

  roles:
    - k3s-agent

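# Pre-pulling warms each node's image cache so the first rollouts don't block
# on large image downloads.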
- name: Pre-pull bootstrap control-plane images
  hosts: control_plane[0]
  become: true

  roles:
    - bootstrap-image-prepull

- name: Pre-pull Rancher bootstrap images
  hosts: workers
  become: true

  roles:
    - role: rancher-image-prepull
      when: rancher_image_prepull_enabled | default(false) | bool

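# The two observability plays run only when GitOps management is disabled;
# observability_gitops_enabled defaults to true, so they are skipped by default.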
- name: Deploy observability stack
  hosts: control_plane[0]
  become: true

  roles:
    - role: observability
      when: not (observability_gitops_enabled | default(true) | bool)

- name: Provision Grafana content
  hosts: control_plane[0]
  become: true

  roles:
    - role: observability-content
      when: not (observability_gitops_enabled | default(true) | bool)

- name: Bootstrap Doppler access for External Secrets
  hosts: control_plane[0]
  become: true

  roles:
    - doppler-bootstrap

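# The Tailscale operator exposes Services through proxy devices named by the
# tailscale.com/hostname annotation; collect the hostnames currently in use.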
- name: Detect existing Tailscale service proxies
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Check for current Tailscale service hostnames
      command: kubectl get svc -A -o jsonpath='{range .items[*]}{.metadata.annotations.tailscale\.com/hostname}{"\n"}{end}'
      register: existing_tailscale_hostnames
      changed_when: false
      failed_when: false

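# Second cleanup pass for the shared service hostnames. The intersect() guard
# skips deletion once any of these names is already backed by a live service
# proxy, so only truly stale devices are removed.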
- name: Clean up stale Tailscale devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames:
      - rancher
      - grafana
      - prometheus
      - flux
  tasks:
    - name: Delete stale devices only before service proxies exist
      include_role:
        name: tailscale-cleanup
      when: >-
        hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([])
        | intersect(tailscale_reserved_hostnames)
        | length == 0

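# k3s writes its kubeconfig with server https://127.0.0.1:6443; rewrite it to
# the primary's public address so the fetched copy works from outside the cluster.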
- name: Finalize
  hosts: localhost
  connection: local
  tasks:
    - name: Check whether kubeconfig was fetched
      stat:
        path: ../outputs/kubeconfig
      register: kubeconfig_file

    - name: Update kubeconfig server address
      command: |
        sed -i 's/127.0.0.1/{{ hostvars[groups["control_plane"][0]]["ansible_host"] }}/g' ../outputs/kubeconfig
      changed_when: true
      when: kubeconfig_file.stat.exists

    - name: Display success message
      debug:
        msg: |
          Cluster setup complete!
          Control planes: {{ groups['control_plane'] | length }}
          Workers: {{ groups['workers'] | length }}
          To access the cluster:
            export KUBECONFIG={{ playbook_dir }}/../outputs/kubeconfig
            kubectl get nodes