fix: gate workers on kube-vip reachability
This commit is contained in:
@@ -108,6 +108,41 @@
|
|||||||
hosts: control_plane[0]
|
hosts: control_plane[0]
|
||||||
become: true
|
become: true
|
||||||
tasks:
|
tasks:
|
||||||
|
# Run `kubectl rollout status` for the kube-vip DaemonSet in kube-system,
# giving it up to 300s to report a completed rollout.
# The task never fails or reports "changed" by itself: the command's exit
# code is kept in `kube_vip_rollout` so it can be inspected afterwards.
- name: Wait for kube-vip DaemonSet across control planes
  command:
    argv:
      - kubectl
      - '-n'
      - kube-system
      - rollout
      - status
      - daemonset/kube-vip
      - '--timeout=300s'
  register: kube_vip_rollout
  failed_when: false
  changed_when: false
|
||||||
|
|
||||||
|
# Failure path for the kube-vip rollout check. Everything in this block is
# skipped unless the exit code recorded in `kube_vip_rollout` is non-zero.
- when: kube_vip_rollout.rc != 0
  block:
    # Best-effort snapshot of the kube-vip pods (wide output); a failure of
    # this command is ignored so it cannot hide the final error below.
    - name: Show kube-vip pod status on rollout failure
      command: kubectl -n kube-system get pods -l app.kubernetes.io/name=kube-vip -o wide
      register: kube_vip_pods_after_join
      failed_when: false
      changed_when: false

    # Best-effort `kubectl describe` of the same pods, also never failing.
    - name: Describe kube-vip pods on rollout failure
      command: kubectl -n kube-system describe pods -l app.kubernetes.io/name=kube-vip
      register: kube_vip_describe_after_join
      failed_when: false
      changed_when: false

    # Abort the play, embedding the rollout output and both diagnostics.
    # The default() filters keep the message renderable when a field is absent.
    - name: Fail when kube-vip is not healthy on all control planes
      fail:
        msg: |
          kube-vip DaemonSet did not become healthy after secondary control planes joined.
          Rollout:
          {{ kube_vip_rollout.stdout | default('') }}
          {{ kube_vip_rollout.stderr | default('') }}

          Pods:
          {{ kube_vip_pods_after_join.stdout | default('n/a') }}

          Describe:
          {{ kube_vip_describe_after_join.stdout | default('n/a') }}
|
||||||
|
|
||||||
- name: Wait for control plane node readiness
|
- name: Wait for control plane node readiness
|
||||||
command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
|
command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
|
||||||
register: control_plane_ready
|
register: control_plane_ready
|
||||||
@@ -125,6 +160,49 @@
|
|||||||
delay: 10
|
delay: 10
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
|
# Pre-flight play for the `workers` group: each worker must be able to open a
# TCP connection to `kube_api_endpoint` on port 6443 within 180s. If it
# cannot, network diagnostics are collected on that worker and the host is
# failed with a message that includes them.
# `kube_api_endpoint` is not defined in this play; it is expected to come from
# inventory/vars (NOTE(review): confirm where it is set).
- name: Verify worker reachability to Kubernetes API VIP
  hosts: workers
  become: true
  tasks:
    # The probe itself. `ignore_errors` lets the play continue to the
    # diagnostics tasks while keeping the real failed status on the
    # registered result. The follow-up tasks test that status directly
    # instead of guessing from the presence of `msg`, which results other
    # than a timeout (for example a skipped task) may also carry.
    - name: Wait for Kubernetes API VIP from worker
      wait_for:
        host: "{{ kube_api_endpoint }}"
        port: 6443
        state: started
        timeout: 180
      register: worker_vip_wait
      ignore_errors: true

    # Best-effort diagnostics. The script deliberately runs without `set -e`
    # so that one failing command (e.g. `ip route`) does not abort the script
    # and drop the remaining sections; the task itself never fails either.
    - name: Collect worker network diagnostics when VIP is unreachable
      shell: |
        set -uo pipefail
        echo "== ip addr =="
        ip addr
        echo "== ip route =="
        ip route
        echo "== ip neigh =="
        ip neigh || true
        echo "== vip route =="
        ip route get {{ kube_api_endpoint }} || true
        echo "== tcp probe =="
        timeout 5 bash -c '</dev/tcp/{{ kube_api_endpoint }}/6443' && echo connected || echo failed
      args:
        executable: /bin/bash
      register: worker_vip_diagnostics
      changed_when: false
      failed_when: false
      when: worker_vip_wait is failed

    # Fail the worker with the collected diagnostics; default('n/a') covers
    # the case where the diagnostics task produced no stdout.
    - name: Fail when worker cannot reach Kubernetes API VIP
      fail:
        msg: |
          Worker {{ inventory_hostname }} cannot reach Kubernetes API VIP {{ kube_api_endpoint }}:6443.
          This blocks k3s agent join and points to kube-vip/L2/routing reachability, not agent install.

          Diagnostics:
          {{ worker_vip_diagnostics.stdout | default('n/a') }}
      when: worker_vip_wait is failed
|
||||||
|
|
||||||
- name: Setup workers
|
- name: Setup workers
|
||||||
hosts: workers
|
hosts: workers
|
||||||
become: true
|
become: true
|
||||||
|
|||||||
Reference in New Issue
Block a user