fix: restart kubelet during CRISocket recovery and add registration diagnostics
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 16s

When kubeadm init fails at upload-config/kubelet due to a missing node object, explicitly restart kubelet to ensure its bootstrap flags are loaded before waiting for node registration. Also add a dump of the kubelet flags and focused registration log output to surface auth/cert errors.
This commit is contained in:
2026-03-04 18:37:50 +00:00
parent 3cd0c70727
commit ba6cf42c04

View File

@@ -209,6 +209,12 @@ in
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 2>&1 | tee "$KUBEADM_INIT_LOG"; then
if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then if grep -q "error writing CRISocket for this node: nodes" "$KUBEADM_INIT_LOG" && [ -f /etc/kubernetes/admin.conf ]; then
echo "==> kubeadm hit CRISocket race; waiting for node registration" echo "==> kubeadm hit CRISocket race; waiting for node registration"
echo "==> forcing kubelet restart to pick bootstrap flags"
systemctl daemon-reload || true
systemctl restart kubelet || true
sleep 3
echo "==> kubelet bootstrap flags"
cat /var/lib/kubelet/kubeadm-flags.env || true
registered=0 registered=0
for i in $(seq 1 60); do for i in $(seq 1 60); do
if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then if KUBECONFIG=/etc/kubernetes/admin.conf kubectl get node "$node_name" >/dev/null 2>&1; then
@@ -222,6 +228,8 @@ in
if [ "$registered" -ne 1 ]; then if [ "$registered" -ne 1 ]; then
echo "==> node $node_name did not register after kubeadm init failure" echo "==> node $node_name did not register after kubeadm init failure"
KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true KUBECONFIG=/etc/kubernetes/admin.conf kubectl get nodes -o wide || true
echo "==> kubelet logs (registration hints)"
journalctl -u kubelet --no-pager -n 120 | grep -Ei "register|node|bootstrap|certificate|forbidden|unauthorized|refused|x509" || true
exit 1 exit 1
fi fi
else else