fix: stabilize kubeadm bootstrap and reduce Proxmox plan latency #94
@@ -129,51 +129,6 @@ in
|
|||||||
echo "Using control-plane endpoint: $vip:6443"
|
echo "Using control-plane endpoint: $vip:6443"
|
||||||
echo "Using kube-vip interface: $iface"
|
echo "Using kube-vip interface: $iface"
|
||||||
|
|
||||||
mkdir -p /etc/kubernetes/manifests
|
|
||||||
ctr image pull "${kubeVipImage}"
|
|
||||||
|
|
||||||
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
|
|
||||||
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
|
|
||||||
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
|
|
||||||
|
|
||||||
echo "==> Starting kube-vip daemon to claim VIP $vip"
|
|
||||||
ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \
|
|
||||||
--interface "$iface" \
|
|
||||||
--address "$vip" \
|
|
||||||
--controlplane \
|
|
||||||
--services \
|
|
||||||
--arp \
|
|
||||||
--leaderElection
|
|
||||||
|
|
||||||
sleep 3
|
|
||||||
|
|
||||||
echo "==> Waiting for VIP $vip to be claimed"
|
|
||||||
for i in $(seq 1 30); do
|
|
||||||
if ip -4 addr show | grep -q "$vip"; then
|
|
||||||
echo "==> VIP $vip is bound"
|
|
||||||
break
|
|
||||||
fi
|
|
||||||
echo "Waiting for VIP... ($i/30)"
|
|
||||||
sleep 1
|
|
||||||
done
|
|
||||||
|
|
||||||
if ! ip -4 addr show | grep -q "$vip"; then
|
|
||||||
echo "==> WARNING: VIP not bound, checking kube-vip logs:"
|
|
||||||
ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "==> Creating kube-vip static pod manifest"
|
|
||||||
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
|
||||||
--interface "$iface" \
|
|
||||||
--address "$vip" \
|
|
||||||
--controlplane \
|
|
||||||
--services \
|
|
||||||
--arp \
|
|
||||||
--leaderElection \
|
|
||||||
> /etc/kubernetes/manifests/kube-vip.yaml
|
|
||||||
|
|
||||||
echo "==> kube-vip static pod manifest created"
|
|
||||||
|
|
||||||
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env
|
||||||
|
|
||||||
systemctl unmask kubelet || true
|
systemctl unmask kubelet || true
|
||||||
@@ -193,6 +148,7 @@ in
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
mkdir -p /etc/kubernetes/manifests
|
||||||
mkdir -p /tmp/kubeadm
|
mkdir -p /tmp/kubeadm
|
||||||
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
|
cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG'
|
||||||
apiVersion: kubeadm.k8s.io/v1beta4
|
apiVersion: kubeadm.k8s.io/v1beta4
|
||||||
@@ -225,31 +181,56 @@ in
|
|||||||
echo "==> Pre-pulling kubeadm images"
|
echo "==> Pre-pulling kubeadm images"
|
||||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true
|
||||||
|
|
||||||
|
echo "==> Creating kube-vip static pod manifest"
|
||||||
|
ctr image pull "${kubeVipImage}"
|
||||||
|
ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \
|
||||||
|
--interface "$iface" \
|
||||||
|
--address "$vip" \
|
||||||
|
--controlplane \
|
||||||
|
--services \
|
||||||
|
--arp \
|
||||||
|
--leaderElection \
|
||||||
|
> /etc/kubernetes/manifests/kube-vip.yaml
|
||||||
|
|
||||||
|
# kube-vip bootstrap workaround for Kubernetes >=1.29.
|
||||||
|
# During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable.
|
||||||
|
sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||||
|
|
||||||
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
|
env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \
|
||||||
--config /tmp/kubeadm/init-config.yaml \
|
--config /tmp/kubeadm/init-config.yaml \
|
||||||
--upload-certs \
|
--upload-certs \
|
||||||
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \
|
--ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || {
|
||||||
--skip-phases=wait-control-plane || {
|
echo "==> kubeadm init failed, checking pod status:"
|
||||||
echo "==> kubeadm init phases failed, checking pod status:"
|
|
||||||
crictl pods || true
|
crictl pods || true
|
||||||
crictl ps -a || true
|
crictl ps -a || true
|
||||||
|
echo "==> kube-vip containers:"
|
||||||
|
crictl ps -a --name kube-vip || true
|
||||||
|
echo "==> kube-vip logs:"
|
||||||
|
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||||
|
echo "--- kube-vip container $container_id ---"
|
||||||
|
crictl logs "$container_id" 2>/dev/null || true
|
||||||
|
done
|
||||||
echo "==> Checking if VIP is bound:"
|
echo "==> Checking if VIP is bound:"
|
||||||
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
|
ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND"
|
||||||
echo "==> kube-vip logs:"
|
|
||||||
crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs"
|
|
||||||
echo "==> kubelet logs:"
|
echo "==> kubelet logs:"
|
||||||
journalctl -xeu kubelet --no-pager -n 50
|
journalctl -xeu kubelet --no-pager -n 50
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
echo "==> Waiting for kube-vip to claim VIP $vip"
|
echo "==> Waiting for kube-vip to claim VIP $vip"
|
||||||
for i in $(seq 1 60); do
|
for i in $(seq 1 90); do
|
||||||
if ip -4 addr show | grep -q "$vip"; then
|
if ip -4 addr show | grep -q "$vip"; then
|
||||||
echo "==> VIP $vip is bound"
|
echo "==> VIP $vip is bound"
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
if [ "$i" -eq 60 ]; then
|
if [ "$i" -eq 90 ]; then
|
||||||
echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway"
|
echo "==> ERROR: VIP not bound after 3 minutes"
|
||||||
|
crictl ps -a --name kube-vip || true
|
||||||
|
for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do
|
||||||
|
echo "--- kube-vip container $container_id ---"
|
||||||
|
crictl logs "$container_id" 2>/dev/null || true
|
||||||
|
done
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
sleep 2
|
sleep 2
|
||||||
done
|
done
|
||||||
@@ -269,10 +250,8 @@ in
|
|||||||
sleep 2
|
sleep 2
|
||||||
done
|
done
|
||||||
|
|
||||||
echo "==> Stopping bootstrap kube-vip (static pod will take over)"
|
# Switch kube-vip to normal admin.conf after bootstrap finishes.
|
||||||
ctr tasks kill kube-vip-bootstrap 2>/dev/null || true
|
sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true
|
||||||
ctr tasks rm kube-vip-bootstrap 2>/dev/null || true
|
|
||||||
ctr containers rm kube-vip-bootstrap 2>/dev/null || true
|
|
||||||
|
|
||||||
mkdir -p /root/.kube
|
mkdir -p /root/.kube
|
||||||
cp /etc/kubernetes/admin.conf /root/.kube/config
|
cp /etc/kubernetes/admin.conf /root/.kube/config
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ rebuild_node() {
|
|||||||
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
|
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
|
||||||
--flake "$FLAKE_DIR#$node_name" \
|
--flake "$FLAKE_DIR#$node_name" \
|
||||||
--target-host "$ACTIVE_SSH_USER@$node_ip" \
|
--target-host "$ACTIVE_SSH_USER@$node_ip" \
|
||||||
--use-remote-sudo
|
--sudo
|
||||||
}
|
}
|
||||||
|
|
||||||
rebuild_node_with_retry() {
|
rebuild_node_with_retry() {
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ resource "proxmox_vm_qemu" "control_planes" {
|
|||||||
clone = var.clone_template
|
clone = var.clone_template
|
||||||
full_clone = true
|
full_clone = true
|
||||||
os_type = "cloud-init"
|
os_type = "cloud-init"
|
||||||
agent = 1
|
agent = var.qemu_agent_enabled ? 1 : 0
|
||||||
automatic_reboot = true
|
automatic_reboot = true
|
||||||
|
|
||||||
cpu {
|
cpu {
|
||||||
@@ -79,7 +79,7 @@ resource "proxmox_vm_qemu" "workers" {
|
|||||||
clone = var.clone_template
|
clone = var.clone_template
|
||||||
full_clone = true
|
full_clone = true
|
||||||
os_type = "cloud-init"
|
os_type = "cloud-init"
|
||||||
agent = 1
|
agent = var.qemu_agent_enabled ? 1 : 0
|
||||||
automatic_reboot = true
|
automatic_reboot = true
|
||||||
|
|
||||||
cpu {
|
cpu {
|
||||||
|
|||||||
@@ -99,6 +99,12 @@ variable "pm_api_url" {
|
|||||||
type = string
|
type = string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "qemu_agent_enabled" {
|
||||||
|
type = bool
|
||||||
|
default = false
|
||||||
|
description = "Enable QEMU guest agent integration in Proxmox resources"
|
||||||
|
}
|
||||||
|
|
||||||
variable "SSH_KEY_PUBLIC" {
|
variable "SSH_KEY_PUBLIC" {
|
||||||
type = string
|
type = string
|
||||||
description = "Public SSH key injected via cloud-init"
|
description = "Public SSH key injected via cloud-init"
|
||||||
|
|||||||
Reference in New Issue
Block a user