fix: stabilize kubeadm bootstrap and reduce Proxmox plan latency #94

Merged
micqdf merged 1 commit from stage into master 2026-03-02 22:13:29 +00:00
4 changed files with 45 additions and 60 deletions

View File

@@ -129,51 +129,6 @@ in
echo "Using control-plane endpoint: $vip:6443" echo "Using control-plane endpoint: $vip:6443"
echo "Using kube-vip interface: $iface" echo "Using kube-vip interface: $iface"
mkdir -p /etc/kubernetes/manifests
ctr image pull "${kubeVipImage}"
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
echo "==> Starting kube-vip daemon to claim VIP $vip"
ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection
sleep 3
echo "==> Waiting for VIP $vip to be claimed"
for i in $(seq 1 30); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
echo "Waiting for VIP... ($i/30)"
sleep 1
done
if ! ip -4 addr show | grep -q "$vip"; then
echo "==> WARNING: VIP not bound, checking kube-vip logs:"
ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true
fi
echo "==> Creating kube-vip static pod manifest"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
echo "==> kube-vip static pod manifest created"
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
systemctl unmask kubelet || true systemctl unmask kubelet || true
@@ -193,6 +148,7 @@ in
exit 1 exit 1
fi fi
mkdir -p /etc/kubernetes/manifests
mkdir -p /tmp/kubeadm mkdir -p /tmp/kubeadm
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG' cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
apiVersion: kubeadm.k8s.io/v1beta4 apiVersion: kubeadm.k8s.io/v1beta4
@@ -225,31 +181,56 @@ in
echo "==> Pre-pulling kubeadm images" echo "==> Pre-pulling kubeadm images"
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
echo "==> Creating kube-vip static pod manifest"
ctr image pull "${kubeVipImage}"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
# kube-vip bootstrap workaround for Kubernetes >=1.29.
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \ env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
--config /tmp/kubeadm/init-config.yaml \ --config /tmp/kubeadm/init-config.yaml \
--upload-certs \ --upload-certs \
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \ --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
--skip-phases=wait-control-plane || { echo "==> kubeadm init failed, checking pod status:"
echo "==> kubeadm init phases failed, checking pod status:"
crictl pods || true crictl pods || true
crictl ps -a || true crictl ps -a || true
echo "==> kube-vip containers:"
crictl ps -a --name kube-vip || true
echo "==> kube-vip logs:"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
echo "==> Checking if VIP is bound:" echo "==> Checking if VIP is bound:"
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND" ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
echo "==> kube-vip logs:"
crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs"
echo "==> kubelet logs:" echo "==> kubelet logs:"
journalctl -xeu kubelet --no-pager -n 50 journalctl -xeu kubelet --no-pager -n 50
exit 1 exit 1
} }
echo "==> Waiting for kube-vip to claim VIP $vip" echo "==> Waiting for kube-vip to claim VIP $vip"
for i in $(seq 1 60); do for i in $(seq 1 90); do
if ip -4 addr show | grep -q "$vip"; then if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound" echo "==> VIP $vip is bound"
break break
fi fi
if [ "$i" -eq 60 ]; then if [ "$i" -eq 90 ]; then
echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway" echo "==> ERROR: VIP not bound after 3 minutes"
crictl ps -a --name kube-vip || true
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
exit 1
fi fi
sleep 2 sleep 2
done done
@@ -269,10 +250,8 @@ in
sleep 2 sleep 2
done done
echo "==> Stopping bootstrap kube-vip (static pod will take over)" # Switch kube-vip to normal admin.conf after bootstrap finishes.
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
mkdir -p /root/.kube mkdir -p /root/.kube
cp /etc/kubernetes/admin.conf /root/.kube/config cp /etc/kubernetes/admin.conf /root/.kube/config

View File

@@ -164,7 +164,7 @@ rebuild_node() {
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \ timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
--flake "$FLAKE_DIR#$node_name" \ --flake "$FLAKE_DIR#$node_name" \
--target-host "$ACTIVE_SSH_USER@$node_ip" \ --target-host "$ACTIVE_SSH_USER@$node_ip" \
--use-remote-sudo --sudo
} }
rebuild_node_with_retry() { rebuild_node_with_retry() {

View File

@@ -24,7 +24,7 @@ resource "proxmox_vm_qemu" "control_planes" {
clone = var.clone_template clone = var.clone_template
full_clone = true full_clone = true
os_type = "cloud-init" os_type = "cloud-init"
agent = 1 agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true automatic_reboot = true
cpu { cpu {
@@ -79,7 +79,7 @@ resource "proxmox_vm_qemu" "workers" {
clone = var.clone_template clone = var.clone_template
full_clone = true full_clone = true
os_type = "cloud-init" os_type = "cloud-init"
agent = 1 agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true automatic_reboot = true
cpu { cpu {

View File

@@ -99,6 +99,12 @@ variable "pm_api_url" {
type = string type = string
} }
variable "qemu_agent_enabled" {
type = bool
default = false
description = "Enable QEMU guest agent integration in Proxmox resources"
}
variable "SSH_KEY_PUBLIC" { variable "SSH_KEY_PUBLIC" {
type = string type = string
description = "Public SSH key injected via cloud-init" description = "Public SSH key injected via cloud-init"