fix: stabilize kubeadm bootstrap and reduce Proxmox plan latency
Some checks failed
Terraform Plan / Terraform Plan (push) Has been cancelled
Some checks failed
Terraform Plan / Terraform Plan (push) Has been cancelled
Move kubeadm reset ahead of kube-vip manifest generation, use super-admin.conf for kube-vip during bootstrap, and restore admin.conf after init. Also switch nixos-rebuild to --sudo and make the QEMU guest agent optional so that Terraform plan can skip slow guest-agent refreshes when the agent is not installed.
This commit is contained in:
@@ -129,51 +129,6 @@ in
|
||||
echo "Using control-plane endpoint: $vip:6443"
|
||||
echo "Using kube-vip interface: $iface"
|
||||
|
||||
mkdir -p /etc/kubernetes/manifests
|
||||
ctr image pull "${kubeVipImage}"
|
||||
|
||||
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
|
||||
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
|
||||
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
|
||||
|
||||
echo "==> Starting kube-vip daemon to claim VIP $vip"
|
||||
ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \
|
||||
--interface "$iface" \
|
||||
--address "$vip" \
|
||||
--controlplane \
|
||||
--services \
|
||||
--arp \
|
||||
--leaderElection
|
||||
|
||||
sleep 3
|
||||
|
||||
echo "==> Waiting for VIP $vip to be claimed"
|
||||
for i in $(seq 1 30); do
|
||||
if ip -4 addr show | grep -q "$vip"; then
|
||||
echo "==> VIP $vip is bound"
|
||||
break
|
||||
fi
|
||||
echo "Waiting for VIP... ($i/30)"
|
||||
sleep 1
|
||||
done
|
||||
|
||||
if ! ip -4 addr show | grep -q "$vip"; then
|
||||
echo "==> WARNING: VIP not bound, checking kube-vip logs:"
|
||||
ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true
|
||||
fi
|
||||
|
||||
echo "==> Creating kube-vip static pod manifest"
|
||||
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
||||
--interface "$iface" \
|
||||
--address "$vip" \
|
||||
--controlplane \
|
||||
--services \
|
||||
--arp \
|
||||
--leaderElection \
|
||||
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||
|
||||
echo "==> kube-vip static pod manifest created"
|
||||
|
||||
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||
|
||||
systemctl unmask kubelet || true
|
||||
@@ -193,6 +148,7 @@ in
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mkdir -p /etc/kubernetes/manifests
|
||||
mkdir -p /tmp/kubeadm
|
||||
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
|
||||
apiVersion: kubeadm.k8s.io/v1beta4
|
||||
@@ -225,31 +181,56 @@ in
|
||||
echo "==> Pre-pulling kubeadm images"
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
|
||||
|
||||
echo "==> Creating kube-vip static pod manifest"
|
||||
ctr image pull "${kubeVipImage}"
|
||||
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
||||
--interface "$iface" \
|
||||
--address "$vip" \
|
||||
--controlplane \
|
||||
--services \
|
||||
--arp \
|
||||
--leaderElection \
|
||||
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||
|
||||
# kube-vip bootstrap workaround for Kubernetes >=1.29.
|
||||
# During early kubeadm init phases, admin.conf has no effective permissions yet (its RBAC binding is created later in init), whereas super-admin.conf is usable immediately.
|
||||
sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||
|
||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
|
||||
--config /tmp/kubeadm/init-config.yaml \
|
||||
--upload-certs \
|
||||
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \
|
||||
--skip-phases=wait-control-plane || {
|
||||
echo "==> kubeadm init phases failed, checking pod status:"
|
||||
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
|
||||
echo "==> kubeadm init failed, checking pod status:"
|
||||
crictl pods || true
|
||||
crictl ps -a || true
|
||||
echo "==> kube-vip containers:"
|
||||
crictl ps -a --name kube-vip || true
|
||||
echo "==> kube-vip logs:"
|
||||
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||
echo "--- kube-vip container $container_id ---"
|
||||
crictl logs "$container_id" 2>/dev/null || true
|
||||
done
|
||||
echo "==> Checking if VIP is bound:"
|
||||
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
|
||||
echo "==> kube-vip logs:"
|
||||
crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs"
|
||||
echo "==> kubelet logs:"
|
||||
journalctl -xeu kubelet --no-pager -n 50
|
||||
exit 1
|
||||
}
|
||||
|
||||
echo "==> Waiting for kube-vip to claim VIP $vip"
|
||||
for i in $(seq 1 60); do
|
||||
for i in $(seq 1 90); do
|
||||
if ip -4 addr show | grep -q "$vip"; then
|
||||
echo "==> VIP $vip is bound"
|
||||
break
|
||||
fi
|
||||
if [ "$i" -eq 60 ]; then
|
||||
echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway"
|
||||
if [ "$i" -eq 90 ]; then
|
||||
echo "==> ERROR: VIP not bound after 3 minutes"
|
||||
crictl ps -a --name kube-vip || true
|
||||
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||
echo "--- kube-vip container $container_id ---"
|
||||
crictl logs "$container_id" 2>/dev/null || true
|
||||
done
|
||||
exit 1
|
||||
fi
|
||||
sleep 2
|
||||
done
|
||||
@@ -269,10 +250,8 @@ in
|
||||
sleep 2
|
||||
done
|
||||
|
||||
echo "==> Stopping bootstrap kube-vip (static pod will take over)"
|
||||
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
|
||||
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
|
||||
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
|
||||
# Switch kube-vip to normal admin.conf after bootstrap finishes.
|
||||
sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||
|
||||
mkdir -p /root/.kube
|
||||
cp /etc/kubernetes/admin.conf /root/.kube/config
|
||||
|
||||
@@ -164,7 +164,7 @@ rebuild_node() {
|
||||
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
|
||||
--flake "$FLAKE_DIR#$node_name" \
|
||||
--target-host "$ACTIVE_SSH_USER@$node_ip" \
|
||||
--use-remote-sudo
|
||||
--sudo
|
||||
}
|
||||
|
||||
rebuild_node_with_retry() {
|
||||
|
||||
@@ -24,7 +24,7 @@ resource "proxmox_vm_qemu" "control_planes" {
|
||||
clone = var.clone_template
|
||||
full_clone = true
|
||||
os_type = "cloud-init"
|
||||
agent = 1
|
||||
agent = var.qemu_agent_enabled ? 1 : 0
|
||||
automatic_reboot = true
|
||||
|
||||
cpu {
|
||||
@@ -79,7 +79,7 @@ resource "proxmox_vm_qemu" "workers" {
|
||||
clone = var.clone_template
|
||||
full_clone = true
|
||||
os_type = "cloud-init"
|
||||
agent = 1
|
||||
agent = var.qemu_agent_enabled ? 1 : 0
|
||||
automatic_reboot = true
|
||||
|
||||
cpu {
|
||||
|
||||
@@ -99,6 +99,12 @@ variable "pm_api_url" {
|
||||
type = string
|
||||
}
|
||||
|
||||
variable "qemu_agent_enabled" {
|
||||
type = bool
|
||||
default = false
|
||||
description = "Enable QEMU guest agent integration in Proxmox resources"
|
||||
}
|
||||
|
||||
variable "SSH_KEY_PUBLIC" {
|
||||
type = string
|
||||
description = "Public SSH key injected via cloud-init"
|
||||
|
||||
Reference in New Issue
Block a user