Merge pull request 'fix: stabilize kubeadm bootstrap and reduce Proxmox plan latency' (#94) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 16m3s

Reviewed-on: #94
This commit was merged in pull request #94.
This commit is contained in:
2026-03-02 22:13:28 +00:00
4 changed files with 45 additions and 60 deletions

View File

@@ -129,51 +129,6 @@ in
echo "Using control-plane endpoint: $vip:6443"
echo "Using kube-vip interface: $iface"
mkdir -p /etc/kubernetes/manifests
ctr image pull "${kubeVipImage}"
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
echo "==> Starting kube-vip daemon to claim VIP $vip"
ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection
sleep 3
echo "==> Waiting for VIP $vip to be claimed"
for i in $(seq 1 30); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
echo "Waiting for VIP... ($i/30)"
sleep 1
done
if ! ip -4 addr show | grep -q "$vip"; then
echo "==> WARNING: VIP not bound, checking kube-vip logs:"
ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true
fi
echo "==> Creating kube-vip static pod manifest"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
echo "==> kube-vip static pod manifest created"
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
systemctl unmask kubelet || true
@@ -193,6 +148,7 @@ in
exit 1
fi
mkdir -p /etc/kubernetes/manifests
mkdir -p /tmp/kubeadm
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
apiVersion: kubeadm.k8s.io/v1beta4
@@ -225,31 +181,56 @@ in
echo "==> Pre-pulling kubeadm images"
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
echo "==> Creating kube-vip static pod manifest"
ctr image pull "${kubeVipImage}"
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
--interface "$iface" \
--address "$vip" \
--controlplane \
--services \
--arp \
--leaderElection \
> /etc/kubernetes/manifests/kube-vip.yaml
# kube-vip bootstrap workaround for Kubernetes >=1.29.
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
--config /tmp/kubeadm/init-config.yaml \
--upload-certs \
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \
--skip-phases=wait-control-plane || {
echo "==> kubeadm init phases failed, checking pod status:"
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
echo "==> kubeadm init failed, checking pod status:"
crictl pods || true
crictl ps -a || true
echo "==> kube-vip containers:"
crictl ps -a --name kube-vip || true
echo "==> kube-vip logs:"
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
echo "==> Checking if VIP is bound:"
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
echo "==> kube-vip logs:"
crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs"
echo "==> kubelet logs:"
journalctl -xeu kubelet --no-pager -n 50
exit 1
}
echo "==> Waiting for kube-vip to claim VIP $vip"
for i in $(seq 1 60); do
for i in $(seq 1 90); do
if ip -4 addr show | grep -q "$vip"; then
echo "==> VIP $vip is bound"
break
fi
if [ "$i" -eq 60 ]; then
echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway"
if [ "$i" -eq 90 ]; then
echo "==> ERROR: VIP not bound after 3 minutes"
crictl ps -a --name kube-vip || true
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
echo "--- kube-vip container $container_id ---"
crictl logs "$container_id" 2>/dev/null || true
done
exit 1
fi
sleep 2
done
@@ -269,10 +250,8 @@ in
sleep 2
done
echo "==> Stopping bootstrap kube-vip (static pod will take over)"
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
# Switch kube-vip to normal admin.conf after bootstrap finishes.
sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
mkdir -p /root/.kube
cp /etc/kubernetes/admin.conf /root/.kube/config

View File

@@ -164,7 +164,7 @@ rebuild_node() {
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
--flake "$FLAKE_DIR#$node_name" \
--target-host "$ACTIVE_SSH_USER@$node_ip" \
--use-remote-sudo
--sudo
}
rebuild_node_with_retry() {

View File

@@ -24,7 +24,7 @@ resource "proxmox_vm_qemu" "control_planes" {
clone = var.clone_template
full_clone = true
os_type = "cloud-init"
agent = 1
agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true
cpu {
@@ -79,7 +79,7 @@ resource "proxmox_vm_qemu" "workers" {
clone = var.clone_template
full_clone = true
os_type = "cloud-init"
agent = 1
agent = var.qemu_agent_enabled ? 1 : 0
automatic_reboot = true
cpu {

View File

@@ -99,6 +99,12 @@ variable "pm_api_url" {
type = string
}
# Toggle for the Proxmox `agent` setting on the control-plane and worker VMs:
# both proxmox_vm_qemu resources set `agent = var.qemu_agent_enabled ? 1 : 0`.
# With the default of false, `agent` evaluates to 0 unless this is overridden.
variable "qemu_agent_enabled" {
type = bool
default = false
description = "Enable QEMU guest agent integration in Proxmox resources"
}
variable "SSH_KEY_PUBLIC" {
type = string
description = "Public SSH key injected via cloud-init"