fix: restart kubelet during CRISocket recovery and add registration diagnostics
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 16s

When kubeadm init fails at upload-config/kubelet due to a missing node object, explicitly restart kubelet to ensure bootstrap flags are loaded before waiting for node registration. Add a kubelet flag dump and focused registration log output to surface auth/cert errors.
This commit is contained in:
2026-03-04 18:37:50 +00:00
parent 3cd0c70727
commit ba6cf42c04

View File

@@ -209,6 +209,12 @@ in
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
echo "==> kubeadm hit CRISocket race; waiting for node registration"
echo "==> forcing kubelet restart to pick bootstrap flags"
systemctl daemon-reload || true
systemctl restart kubelet || true
sleep 3
echo "==> kubelet bootstrap flags"
cat /var/lib/kubelet/kubeadm-flags.env || true
registered=0
for i in $(seq 1 60); do
if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
@@ -222,6 +228,8 @@ in
if [ "$registered" -ne 1 ]; then
echo "==> node $node_name did not register after kubeadm init failure"
KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
echo "==> kubelet logs (registration hints)"
journalctl -u kubelet --no-pager -n 120 | grep -Ei "register|node|bootstrap|certificate|forbidden|unauthorized|refused|x509" || true
exit 1
fi
else