Merge pull request 'fix: recover from kubeadm CRISocket node-registration race' (#111) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 18m6s

Reviewed-on: #111
This commit was merged in pull request #111.
This commit is contained in:
2026-03-04 03:03:17 +00:00

View File

@@ -210,27 +210,47 @@ in
echo "==> kube-vip manifest kubeconfig mount" echo "==> kube-vip manifest kubeconfig mount"
grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \ KUBEADM_INIT_LOG=/tmp/kubeadm-init.log
if ! env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
--config /tmp/kubeadm/init-config.yaml \ --config /tmp/kubeadm/init-config.yaml \
--upload-certs \ --upload-certs \
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || { --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
echo "==> kubeadm init failed, checking pod status:" if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
crictl pods || true echo "==> kubeadm hit CRISocket race; waiting for node registration"
crictl ps -a || true registered=0
echo "==> kube-vip containers:" for i in $(seq 1 60); do
crictl ps -a --name kube-vip || true if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
echo "==> kube-vip logs:" echo "==> node $node_name registered; uploading kubelet config"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init phase upload-config kubelet --config /tmp/kubeadm/init-config.yaml
echo "--- kube-vip container $container_id ---" registered=1
crictl logs "$container_id" 2>/dev/null || true break
crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true fi
done sleep 2
echo "==> Checking if VIP is bound:" done
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND" if [ "$registered" -ne 1 ]; then
echo "==> kubelet logs:" echo "==> node $node_name did not register after kubeadm init failure"
journalctl -xeu kubelet --no-pager -n 50 KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
exit 1 exit 1
} fi
else
echo "==> kubeadm init failed, checking pod status:"
crictl pods || true
crictl ps -a || true
echo "==> kube-vip containers:"
crictl ps -a --name kube-vip || true
echo "==> kube-vip logs:"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
done
echo "==> Checking if VIP is bound:"
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
echo "==> kubelet logs:"
journalctl -xeu kubelet --no-pager -n 50
exit 1
fi
fi
echo "==> Waiting for kube-vip to claim VIP $vip" echo "==> Waiting for kube-vip to claim VIP $vip"
for i in $(seq 1 90); do for i in $(seq 1 90); do