# HetznerTerra/ansible/site.yml
---
# Runs on the controller with a local connection and applies the
# tailscale-cleanup role, exposing the hostnames of the `cluster` inventory
# group to it as tailscale_reserved_hostnames.
- name: Clean up stale Tailscale cluster node devices
  hosts: localhost
  connection: local
  roles:
    - role: tailscale-cleanup
  vars:
    # Falls back to an empty list when the inventory has no `cluster` group.
    tailscale_reserved_hostnames: "{{ groups['cluster'] | default([]) | list }}"
# Applies the `common` role to every host in the `cluster` group.
# Automatic fact gathering is disabled; facts are collected explicitly once
# the connection check below has succeeded.
- name: Bootstrap Kubernetes cluster
  hosts: cluster
  gather_facts: false
  become: true
  pre_tasks:
    # Starts polling after 10 s and gives up after 600 s.
    - name: Wait for SSH
      wait_for_connection:
        timeout: 600
        delay: 10

    - name: Gather facts after SSH is reachable
      setup:
  roles:
    - role: common
# Installs the first control-plane node through the k3s-server role.
#
# k3s_token is generated in pre_tasks with set_fact rather than declared as a
# play var: play vars are templated lazily, and a password lookup against
# /dev/null is never stored, so a play-var definition could hand the role a
# different random token on every reference. set_fact evaluates the lookup
# exactly once and keeps the resulting string for the rest of the play.
- name: Setup primary control plane
  hosts: control_plane[0]
  become: true
  vars:
    k3s_primary: true
    k3s_primary_private_ip: "{{ k3s_private_ip }}"
    k3s_primary_public_ip: "{{ ansible_host }}"
    k3s_primary_ip: "{{ k3s_private_ip }}"
    k3s_node_ip: "{{ k3s_private_ip }}"
    # kube_api_endpoint is set in inventory group_vars
  pre_tasks:
    - name: Generate the bootstrap k3s token once
      set_fact:
        k3s_token: "{{ lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
      # The token is a secret; keep it out of task output.
      no_log: true
      # Keep the token defined even when the play is run with --tags.
      tags:
        - always
  roles:
    - role: k3s-server
# Reads the join token from the first control-plane node, stores it (and the
# node's addresses) as host facts that later plays read through hostvars, and
# copies the cluster kubeconfig back to the controller.
- name: Get join info from primary
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Fetch node token
      command: cat /var/lib/rancher/k3s/server/node-token
      changed_when: false
      register: node_token

    - name: Set join token fact
      set_fact:
        k3s_primary_public_ip: "{{ ansible_host }}"
        k3s_primary_private_ip: "{{ k3s_private_ip }}"
        k3s_token: "{{ node_token.stdout }}"

    # flat: true writes the file to exactly `dest` instead of a per-host
    # directory tree.
    - name: Fetch kubeconfig
      fetch:
        flat: true
        src: /etc/rancher/k3s/k3s.yaml
        dest: ../outputs/kubeconfig
# Applies the addon-secrets-bootstrap role on the first control-plane node.
- name: Bootstrap addon prerequisite secrets
  hosts: control_plane[0]
  become: true
  roles:
    - role: addon-secrets-bootstrap
# Applies the kube-vip-deploy role on the first control-plane node.
- name: Deploy kube-vip for API HA
  hosts: control_plane[0]
  become: true
  roles:
    - role: kube-vip-deploy
# Blocks until the API server answers /readyz when addressed through
# kube_api_endpoint on port 6443, run from the first control-plane node.
- name: Wait for Kubernetes API VIP readiness
  hosts: control_plane[0]
  become: true
  tasks:
    # Retried up to 30 times, 10 s apart, until kubectl exits with 0.
    - name: Wait for Kubernetes readyz through the VIP
      command: >-
        kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      changed_when: false
      register: api_readyz
      until: api_readyz.rc == 0
      delay: 10
      retries: 30
# Joins every control-plane node except the first through the k3s-server
# role, using the token and addresses recorded as facts on the first node.
- name: Setup secondary control planes
  hosts: control_plane[1:]
  become: true
  roles:
    - role: k3s-server
  vars:
    k3s_primary: false
    k3s_node_ip: "{{ k3s_private_ip }}"
    k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
    k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}"
    k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}"
    # Use Load Balancer for HA - all control planes join via LB endpoint.
    # Falls back to the primary's private IP when kube_api_endpoint is unset.
    k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}"
# Gate before workers join, executed from the first control-plane node:
#   1. the kube-vip DaemonSet rollout must complete,
#   2. every control-plane node must report the Ready condition,
#   3. /readyz must answer through kube_api_endpoint.
- name: Wait for all control plane nodes to be Ready
  hosts: control_plane[0]
  become: true
  tasks:
    # The result is only recorded here (failed_when: false) so the two
    # diagnostic tasks can run before the explicit `fail` task below.
    - name: Wait for kube-vip DaemonSet across control planes
      command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=300s
      failed_when: false
      changed_when: false
      register: kube_vip_rollout

    - name: Show kube-vip pod status on rollout failure
      when: kube_vip_rollout.rc != 0
      command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
      failed_when: false
      changed_when: false
      register: kube_vip_pods_after_join

    - name: Describe kube-vip pods on rollout failure
      when: kube_vip_rollout.rc != 0
      command: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip
      failed_when: false
      changed_when: false
      register: kube_vip_describe_after_join

    # Surfaces the rollout output plus both diagnostics in one error message.
    - name: Fail when kube-vip is not healthy on all control planes
      when: kube_vip_rollout.rc != 0
      fail:
        msg: |
          kube-vip DaemonSet did not become healthy after secondary control planes joined.
          Rollout:
          {{ kube_vip_rollout.stdout | default('') }}
          {{ kube_vip_rollout.stderr | default('') }}
          Pods:
          {{ kube_vip_pods_after_join.stdout | default('n/a') }}
          Describe:
          {{ kube_vip_describe_after_join.stdout | default('n/a') }}

    # One `kubectl wait` per inventory hostname in the control_plane group;
    # each is retried up to 20 times, 15 s apart.
    # NOTE(review): assumes Kubernetes node names equal inventory hostnames - confirm.
    - name: Wait for control plane node readiness
      command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
      loop: "{{ groups['control_plane'] }}"
      changed_when: false
      register: control_plane_ready
      until: control_plane_ready.rc == 0
      delay: 15
      retries: 20

    # Retried up to 30 times, 10 s apart, until kubectl exits with 0.
    - name: Wait for Kubernetes readyz before worker joins
      command: >-
        kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
      changed_when: false
      register: api_readyz_before_workers
      until: api_readyz_before_workers.rc == 0
      delay: 10
      retries: 30
# Pre-flight check on each worker: confirm TCP port 6443 on kube_api_endpoint
# is reachable, and if not, collect network diagnostics and fail with them.
- name: Verify worker reachability to Kubernetes API VIP
  hosts: workers
  become: true
  tasks:
    # The outcome is only recorded (failed_when: false); the follow-up tasks
    # treat the presence of `msg` in the result as the failure signal.
    # NOTE(review): relies on wait_for returning `msg` only on timeout - confirm.
    - name: Wait for Kubernetes API VIP from worker
      wait_for:
        host: "{{ kube_api_endpoint }}"
        port: 6443
        timeout: 180
        state: started
      failed_when: false
      register: worker_vip_wait

    # Best-effort dump of addresses, routes, neighbours and a raw TCP probe.
    - name: Collect worker network diagnostics when VIP is unreachable
      when: worker_vip_wait.msg is defined
      args:
        executable: /bin/bash
      shell: |
        set -euo pipefail
        echo "== ip addr =="
        ip addr
        echo "== ip route =="
        ip route
        echo "== ip neigh =="
        ip neigh || true
        echo "== vip route =="
        ip route get {{ kube_api_endpoint }} || true
        echo "== tcp probe =="
        timeout 5 bash -c '</dev/tcp/{{ kube_api_endpoint }}/6443' && echo connected || echo failed
      failed_when: false
      changed_when: false
      register: worker_vip_diagnostics

    - name: Fail when worker cannot reach Kubernetes API VIP
      when: worker_vip_wait.msg is defined
      fail:
        msg: |
          Worker {{ inventory_hostname }} cannot reach Kubernetes API VIP {{ kube_api_endpoint }}:6443.
          This blocks k3s agent join and points to kube-vip/L2/routing reachability, not agent install.
          Diagnostics:
          {{ worker_vip_diagnostics.stdout | default('n/a') }}
# Joins every host in the `workers` group through the k3s-agent role, using
# the join token recorded as a fact on the first control-plane node.
- name: Setup workers
  hosts: workers
  become: true
  roles:
    - role: k3s-agent
  vars:
    k3s_node_ip: "{{ k3s_private_ip }}"
    k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}"
    # Use Load Balancer for HA - workers join via LB endpoint.
    # Falls back to the primary's private IP when kube_api_endpoint is unset.
    k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443"
# Applies the bootstrap-image-prepull role on the first control-plane node.
- name: Pre-pull bootstrap control-plane images
  hosts: control_plane[0]
  become: true
  roles:
    - role: bootstrap-image-prepull
# Optional step on the workers: the rancher-image-prepull role only runs when
# rancher_image_prepull_enabled is truthy (it is treated as false when unset).
- name: Pre-pull Rancher bootstrap images
  hosts: workers
  become: true
  roles:
    - role: rancher-image-prepull
      when: rancher_image_prepull_enabled | default(false) | bool
# Applies the observability role from the first control-plane node, but only
# when observability_gitops_enabled is false; the flag is treated as true
# when unset, so the role is skipped by default.
- name: Deploy observability stack
  hosts: control_plane[0]
  become: true
  roles:
    - role: observability
      when: not (observability_gitops_enabled | default(true) | bool)
# Applies the observability-content role from the first control-plane node
# under the same condition as the observability role: only when
# observability_gitops_enabled is false (treated as true when unset).
- name: Provision Grafana content
  hosts: control_plane[0]
  become: true
  roles:
    - role: observability-content
      when: not (observability_gitops_enabled | default(true) | bool)
# Applies the doppler-bootstrap role on the first control-plane node.
- name: Bootstrap Doppler access for External Secrets
  hosts: control_plane[0]
  become: true
  roles:
    - role: doppler-bootstrap
# Records, one per line, the tailscale.com/hostname annotation of every
# Service in all namespaces. The result is registered on the first
# control-plane node so a later play can read it through hostvars.
- name: Detect existing Tailscale service proxies
  hosts: control_plane[0]
  become: true
  tasks:
    # Never fails the play (failed_when: false); a kubectl error simply
    # leaves the registered output without hostnames.
    - name: Check for current Tailscale service hostnames
      command: kubectl get svc -A -o jsonpath='{range .items[*]}{.metadata.annotations.tailscale\.com/hostname}{"\n"}{end}'
      failed_when: false
      changed_when: false
      register: existing_tailscale_hostnames
# Runs the tailscale-cleanup role on the controller for the reserved service
# hostnames, but only while none of them is already in use by a Service
# annotation recorded on the first control-plane node.
- name: Clean up stale Tailscale devices
  hosts: localhost
  connection: local
  vars:
    tailscale_reserved_hostnames:
      - rancher
      - grafana
      - prometheus
      - flux
  tasks:
    # Skipped as soon as any reserved hostname appears in the recorded
    # annotation list; a missing list is treated as empty.
    - name: Delete stale devices only before service proxies exist
      when: >-
        hostvars[groups['control_plane'][0]].existing_tailscale_hostnames.stdout_lines | default([])
        | intersect(tailscale_reserved_hostnames)
        | length == 0
      include_role:
        name: tailscale-cleanup
# Post-processing on the controller: point the fetched kubeconfig at the
# first control-plane node's ansible_host and print usage instructions.
#
# The kubeconfig path is anchored to playbook_dir. The `fetch` module resolves
# a relative `dest` against the playbook directory, and the success message
# below prints the playbook_dir-based path, so a bare `../outputs/kubeconfig`
# here would only find the file when ansible-playbook is started from the
# playbook directory.
- name: Finalize
  hosts: localhost
  connection: local
  vars:
    local_kubeconfig_path: "{{ playbook_dir }}/../outputs/kubeconfig"
  tasks:
    - name: Check whether kubeconfig was fetched
      stat:
        path: "{{ local_kubeconfig_path }}"
      register: kubeconfig_file

    # `replace` rewrites every match like `sed -i ...g` did, but matches the
    # dots literally and only reports "changed" when the file was modified.
    - name: Update kubeconfig server address
      replace:
        path: "{{ local_kubeconfig_path }}"
        regexp: '127\.0\.0\.1'
        replace: "{{ hostvars[groups['control_plane'][0]]['ansible_host'] }}"
      when: kubeconfig_file.stat.exists

    # `default([])` keeps the summary working for inventories that define no
    # `workers` group.
    - name: Display success message
      debug:
        msg: |
          Cluster setup complete!
          Control planes: {{ groups['control_plane'] | length }}
          Workers: {{ groups['workers'] | default([]) | length }}
          To access the cluster:
          export KUBECONFIG={{ playbook_dir }}/../outputs/kubeconfig
          kubectl get nodes