From d2dd6105a64a36003e77711247e9c8cfe51e8b60 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Wed, 4 Mar 2026 03:00:34 +0000
Subject: [PATCH] fix: recover from kubeadm CRISocket node-registration race

Handle kubeadm init failures where upload-config/kubelet runs before the
node object exists. When that specific error occurs, wait for cp-1
registration and run upload-config kubelet phase explicitly instead of
aborting immediately.
---
Review notes (text between the "---" marker and the diff is not part of
the commit message):
- kubeadm init now runs inside a subshell with pipefail enabled, so the
  "if !" test sees the kubeadm exit status instead of the tee one.
- The line structure of this patch was rebuilt from a whitespace-collapsed
  copy; leading indentation is a best guess. If the context lines do not
  match the target file, apply with --ignore-whitespace.
- Only the upload-config/kubelet phase is replayed after the race. Please
  confirm that the init phases which normally follow it are handled
  elsewhere in the script.

 nixos/kubeadm/modules/k8s-common.nix | 67 ++++++++++++++++++++--------
 1 file changed, 48 insertions(+), 19 deletions(-)

diff --git a/nixos/kubeadm/modules/k8s-common.nix b/nixos/kubeadm/modules/k8s-common.nix
index a6e792d..2512c4a 100644
--- a/nixos/kubeadm/modules/k8s-common.nix
+++ b/nixos/kubeadm/modules/k8s-common.nix
@@ -210,27 +210,56 @@ in
         echo "==> kube-vip manifest kubeconfig mount"
         grep -E 'mountPath:|path:' /etc/kubernetes/manifests/kube-vip.yaml | grep -E 'kubernetes/(admin|super-admin)\.conf' || true
 
-        env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
+        # Keep a copy of the kubeadm output so the failure mode can be checked below.
+        KUBEADM_INIT_LOG=/tmp/kubeadm-init.log
+        # pipefail is enabled only inside this subshell so the pipeline reports
+        # the kubeadm exit status instead of the tee one, whatever the script set.
+        if ! ( set -o pipefail; env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
           --config /tmp/kubeadm/init-config.yaml \
           --upload-certs \
-          --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
-          echo "==> kubeadm init failed, checking pod status:"
-          crictl pods || true
-          crictl ps -a || true
-          echo "==> kube-vip containers:"
-          crictl ps -a --name kube-vip || true
-          echo "==> kube-vip logs:"
-          for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
-            echo "--- kube-vip container $container_id ---"
-            crictl logs "$container_id" 2>/dev/null || true
-            crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
-          done
-          echo "==> Checking if VIP is bound:"
-          ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
-          echo "==> kubelet logs:"
-          journalctl -xeu kubelet --no-pager -n 50
-          exit 1
-        }
+          --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG" ); then
+          # Recoverable case: the log shows the CRISocket write failed on a missing
+          # node and admin.conf exists, which the kubectl calls below rely on.
+          if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
+            echo "==> kubeadm hit CRISocket race; waiting for node registration"
+            registered=0
+            # Poll for the node object: up to 60 attempts, 2 seconds apart.
+            for i in $(seq 1 60); do
+              if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
+                echo "==> node $node_name registered; uploading kubelet config"
+                # NOTE(review): only the failed upload-config/kubelet phase is replayed;
+                # confirm the init phases that normally follow it are handled elsewhere.
+                env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init phase upload-config kubelet --config /tmp/kubeadm/init-config.yaml
+                registered=1
+                break
+              fi
+              sleep 2
+            done
+            if [ "$registered" -ne 1 ]; then
+              echo "==> node $node_name did not register after kubeadm init failure"
+              KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
+              exit 1
+            fi
+          else
+            # Any other failure: print diagnostics and abort.
+            echo "==> kubeadm init failed, checking pod status:"
+            crictl pods || true
+            crictl ps -a || true
+            echo "==> kube-vip containers:"
+            crictl ps -a --name kube-vip || true
+            echo "==> kube-vip logs:"
+            for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
+              echo "--- kube-vip container $container_id ---"
+              crictl logs "$container_id" 2>/dev/null || true
+              crictl inspect "$container_id" 2>/dev/null | jq -r '.status | "exitCode=\(.exitCode) reason=\(.reason // "") message=\(.message // "")"' || true
+            done
+            echo "==> Checking if VIP is bound:"
+            ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
+            echo "==> kubelet logs:"
+            journalctl -xeu kubelet --no-pager -n 50
+            exit 1
+          fi
+        fi
 
         echo "==> Waiting for kube-vip to claim VIP $vip"
         for i in $(seq 1 90); do