From 0aba186d8b8ad38d9c73d9b2733007868828527a Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 2 May 2026 01:04:06 +0000 Subject: [PATCH] fix: gate workers on kube-vip reachability --- ansible/site.yml | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/ansible/site.yml b/ansible/site.yml index bb416f8..8e1bc50 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -108,6 +108,41 @@ hosts: control_plane[0] become: true tasks: + - name: Wait for kube-vip DaemonSet across control planes + command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=300s + register: kube_vip_rollout + changed_when: false + failed_when: false + + - name: Show kube-vip pod status on rollout failure + command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide + register: kube_vip_pods_after_join + changed_when: false + failed_when: false + when: kube_vip_rollout.rc != 0 + + - name: Describe kube-vip pods on rollout failure + command: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip + register: kube_vip_describe_after_join + changed_when: false + failed_when: false + when: kube_vip_rollout.rc != 0 + + - name: Fail when kube-vip is not healthy on all control planes + fail: + msg: | + kube-vip DaemonSet did not become healthy after secondary control planes joined. + Rollout: + {{ kube_vip_rollout.stdout | default('') }} + {{ kube_vip_rollout.stderr | default('') }} + + Pods: + {{ kube_vip_pods_after_join.stdout | default('n/a') }} + + Describe: + {{ kube_vip_describe_after_join.stdout | default('n/a') }} + when: kube_vip_rollout.rc != 0 + - name: Wait for control plane node readiness command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s register: control_plane_ready @@ -125,6 +160,49 @@ delay: 10 changed_when: false +- name: Verify worker reachability to Kubernetes API VIP + hosts: workers + become: true + tasks: + - name: Wait for Kubernetes API VIP from worker + wait_for: + host: "{{ kube_api_endpoint }}" + port: 6443 + state: started + timeout: 180 + register: worker_vip_wait + failed_when: false + + - name: Collect worker network diagnostics when VIP is unreachable + shell: | + set -euo pipefail + echo "== ip addr ==" + ip addr + echo "== ip route ==" + ip route + echo "== ip neigh ==" + ip neigh || true + echo "== vip route ==" + ip route get {{ kube_api_endpoint }} || true + echo "== tcp probe ==" + timeout 5 bash -c '