fix: gate workers on kube-vip reachability
Deploy Cluster / Terraform (push) Successful in 33s
Deploy Cluster / Ansible (push) Failing after 15m7s

This commit is contained in:
2026-05-02 01:04:06 +00:00
parent 17182f84a9
commit 0aba186d8b
+78
View File
@@ -108,6 +108,41 @@
hosts: control_plane[0] hosts: control_plane[0]
become: true become: true
tasks: tasks:
# Runs `kubectl rollout status` against the kube-vip DaemonSet in kube-system
# with a 300s timeout. The outcome is only recorded here (failed_when: false);
# the follow-up tasks decide what to do based on kube_vip_rollout.rc.
- name: Wait for kube-vip DaemonSet across control planes
  command:
    cmd: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=300s
  # Read-only query: never report "changed", never fail the host at this point.
  changed_when: false
  failed_when: false
  register: kube_vip_rollout
# Diagnostic step: lists pods labelled app.kubernetes.io/name=kube-vip in
# kube-system (wide output). Runs only when the rollout check exited non-zero.
- name: Show kube-vip pod status on rollout failure
  when: kube_vip_rollout.rc != 0
  command:
    cmd: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
  # Best-effort: a kubectl error here must not mask the rollout failure itself.
  changed_when: false
  failed_when: false
  register: kube_vip_pods_after_join
# Diagnostic step: captures `kubectl describe` for the kube-vip pods in
# kube-system. Runs only when the rollout check exited non-zero.
- name: Describe kube-vip pods on rollout failure
  when: kube_vip_rollout.rc != 0
  command:
    cmd: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip
  # Best-effort: a kubectl error here must not mask the rollout failure itself.
  changed_when: false
  failed_when: false
  register: kube_vip_describe_after_join
# Hard stop for the rollout gate: when the rollout check exited non-zero, fail
# with the rollout stdout/stderr plus the pod listing and describe output
# registered by the preceding diagnostic tasks ('n/a' when stdout is absent).
- name: Fail when kube-vip is not healthy on all control planes
  when: kube_vip_rollout.rc != 0
  fail:
    msg: |
      kube-vip DaemonSet did not become healthy after secondary control planes joined.
      Rollout:
      {{ kube_vip_rollout.stdout | default('') }}
      {{ kube_vip_rollout.stderr | default('') }}
      Pods:
      {{ kube_vip_pods_after_join.stdout | default('n/a') }}
      Describe:
      {{ kube_vip_describe_after_join.stdout | default('n/a') }}
- name: Wait for control plane node readiness - name: Wait for control plane node readiness
command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
register: control_plane_ready register: control_plane_ready
@@ -125,6 +160,49 @@
delay: 10 delay: 10
changed_when: false changed_when: false
# Pre-flight gate for each worker: confirm the worker itself can open a TCP
# connection to the Kubernetes API VIP ({{ kube_api_endpoint }}:6443). If it
# cannot, gather local network diagnostics and fail that worker with them.
#
# Failure handling uses block/rescue so the rescue path is driven by the real
# failed state of wait_for, rather than by inferring failure from the presence
# of a `msg` key in a result whose failure was suppressed.
- name: Verify worker reachability to Kubernetes API VIP
  hosts: workers
  become: true
  tasks:
    - name: Gate worker on Kubernetes API VIP reachability
      block:
        # Waits up to 180s for the VIP port to accept connections from this
        # worker. The result stays registered as worker_vip_wait.
        - name: Wait for Kubernetes API VIP from worker
          wait_for:
            host: "{{ kube_api_endpoint }}"
            port: 6443
            state: started
            timeout: 180
          register: worker_vip_wait

      rescue:
        # Best-effort diagnostics. Deliberately no `set -e`: every section
        # should run even if an earlier command fails, otherwise the most
        # relevant output (VIP route, TCP probe) would be lost. stderr is
        # merged into stdout because only stdout is reported below.
        - name: Collect worker network diagnostics when VIP is unreachable
          shell: |
            set -uo pipefail
            exec 2>&1
            echo "== ip addr =="
            ip addr
            echo "== ip route =="
            ip route
            echo "== ip neigh =="
            ip neigh
            echo "== vip route =="
            ip route get {{ kube_api_endpoint }}
            echo "== tcp probe =="
            timeout 5 bash -c '</dev/tcp/{{ kube_api_endpoint }}/6443' && echo connected || echo failed
          args:
            executable: /bin/bash
          register: worker_vip_diagnostics
          changed_when: false
          # A diagnostics error must not replace the explicit failure below.
          failed_when: false

        # Fails this worker with the collected diagnostics.
        - name: Fail when worker cannot reach Kubernetes API VIP
          fail:
            msg: |
              Worker {{ inventory_hostname }} cannot reach Kubernetes API VIP {{ kube_api_endpoint }}:6443.
              This blocks k3s agent join and points to kube-vip/L2/routing reachability, not agent install.
              Diagnostics:
              {{ worker_vip_diagnostics.stdout | default('n/a') }}
- name: Setup workers - name: Setup workers
hosts: workers hosts: workers
become: true become: true