fix: stabilize kubeadm bootstrap and reduce Proxmox plan latency
Some checks failed
Terraform Plan / Terraform Plan (push) Has been cancelled

Move kubeadm reset ahead of kube-vip manifest generation, point the kube-vip manifest at super-admin.conf during bootstrap, and switch it back to admin.conf after init. Also switch nixos-rebuild from --use-remote-sudo to --sudo, and make the QEMU guest agent optional so Terraform plan can skip slow guest-agent refreshes when the agent is not installed.
This commit is contained in:
2026-03-02 22:09:10 +00:00
parent 46c0786e57
commit a81799a2b5
4 changed files with 45 additions and 60 deletions

View File

@@ -129,51 +129,6 @@ in
echo "Using control-plane endpoint: $vip:6443"
echo "Using kube-vip interface: $iface"
mkdir -p /etc/kubernetes/manifests
ctr image pull "${kubeVipImage}"
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
echo "==> Starting kube-vip daemon to claim VIP $vip"
ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection
sleep 3
echo "==> Waiting for VIP $vip to be claimed"
for i in $(seq 1 30); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
echo "Waiting for VIP... ($i/30)"
sleep 1
done
if ! ip -4 addr show | grep -q "$vip"; then
echo "==> WARNING: VIP not bound, checking kube-vip logs:"
ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true
fi
echo "==> Creating kube-vip static pod manifest"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
echo "==> kube-vip static pod manifest created"
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
systemctl unmask kubelet || true
@@ -193,6 +148,7 @@ in
exit 1
fi
mkdir -p /etc/kubernetes/manifests
mkdir -p /tmp/kubeadm
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
apiVersion: kubeadm.k8s.io/v1beta4
@@ -225,31 +181,56 @@ in
echo "==> Pre-pulling kubeadm images"
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
echo "==> Creating kube-vip static pod manifest"
ctr image pull "${kubeVipImage}"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
# kube-vip bootstrap workaround for Kubernetes >=1.29.
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
--config /tmp/kubeadm/init-config.yaml \
--upload-certs \
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \
--skip-phases=wait-control-plane || {
echo "==> kubeadm init phases failed, checking pod status:"
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
echo "==> kubeadm init failed, checking pod status:"
crictl pods || true
crictl ps -a || true
echo "==> kube-vip containers:"
crictl ps -a --name kube-vip || true
echo "==> kube-vip logs:"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
echo "==> Checking if VIP is bound:"
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
echo "==> kube-vip logs:"
crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs"
echo "==> kubelet logs:"
journalctl -xeu kubelet --no-pager -n 50
exit 1
}
echo "==> Waiting for kube-vip to claim VIP $vip"
for i in $(seq 1 60); do
for i in $(seq 1 90); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
if [ "$i" -eq 60 ]; then
echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway"
if [ "$i" -eq 90 ]; then
echo "==> ERROR: VIP not bound after 3 minutes"
crictl ps -a --name kube-vip || true
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
exit 1
fi
sleep 2
done
@@ -269,10 +250,8 @@ in
sleep 2
done
echo "==> Stopping bootstrap kube-vip (static pod will take over)"
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
# Switch kube-vip to normal admin.conf after bootstrap finishes.
sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
mkdir -p /root/.kube
cp /etc/kubernetes/admin.conf /root/.kube/config

View File

@@ -164,7 +164,7 @@ rebuild_node() {
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
--flake "$FLAKE_DIR#$node_name" \
--target-host "$ACTIVE_SSH_USER@$node_ip" \
--use-remote-sudo
--sudo
}
rebuild_node_with_retry() {