diff --git a/nixos/kubeadm/modules/k8s-common.nix b/nixos/kubeadm/modules/k8s-common.nix index 718281d..8a4b255 100644 --- a/nixos/kubeadm/modules/k8s-common.nix +++ b/nixos/kubeadm/modules/k8s-common.nix @@ -129,51 +129,6 @@ in echo "Using control-plane endpoint: $vip:6443" echo "Using kube-vip interface: $iface" - mkdir -p /etc/kubernetes/manifests - ctr image pull "${kubeVipImage}" - - ctr tasks kill kube-vip-bootstrap 2>/dev/null || true - ctr tasks rm kube-vip-bootstrap 2>/dev/null || true - ctr containers rm kube-vip-bootstrap 2>/dev/null || true - - echo "==> Starting kube-vip daemon to claim VIP $vip" - ctr run --net-host -d "${kubeVipImage}" kube-vip-bootstrap /kube-vip \ - --interface "$iface" \ - --address "$vip" \ - --controlplane \ - --services \ - --arp \ - --leaderElection - - sleep 3 - - echo "==> Waiting for VIP $vip to be claimed" - for i in $(seq 1 30); do - if ip -4 addr show | grep -q "$vip"; then - echo "==> VIP $vip is bound" - break - fi - echo "Waiting for VIP... ($i/30)" - sleep 1 - done - - if ! ip -4 addr show | grep -q "$vip"; then - echo "==> WARNING: VIP not bound, checking kube-vip logs:" - ctr task logs kube-vip-bootstrap 2>&1 | tail -20 || true - fi - - echo "==> Creating kube-vip static pod manifest" - ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \ - --interface "$iface" \ - --address "$vip" \ - --controlplane \ - --services \ - --arp \ - --leaderElection \ - > /etc/kubernetes/manifests/kube-vip.yaml - - echo "==> kube-vip static pod manifest created" - rm -f /var/lib/kubelet/config.yaml /var/lib/kubelet/kubeadm-flags.env systemctl unmask kubelet || true @@ -193,6 +148,7 @@ in exit 1 fi + mkdir -p /etc/kubernetes/manifests mkdir -p /tmp/kubeadm cat > /tmp/kubeadm/init-config.yaml << 'KUBEADMCONFIG' apiVersion: kubeadm.k8s.io/v1beta4 @@ -225,31 +181,56 @@ in echo "==> Pre-pulling kubeadm images" env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm config images pull --config /tmp/kubeadm/init-config.yaml || true + echo "==> Creating kube-vip static pod manifest" + ctr image pull "${kubeVipImage}" + ctr run --rm --net-host "${kubeVipImage}" kube-vip-manifest /kube-vip manifest pod \ + --interface "$iface" \ + --address "$vip" \ + --controlplane \ + --services \ + --arp \ + --leaderElection \ + > /etc/kubernetes/manifests/kube-vip.yaml + + # kube-vip bootstrap workaround for Kubernetes >=1.29. + # During early kubeadm phases, super-admin.conf is available before admin.conf is fully usable. + sed -i 's#/etc/kubernetes/admin.conf#/etc/kubernetes/super-admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true + env -i PATH=/run/current-system/sw/bin:/usr/bin:/bin kubeadm init \ --config /tmp/kubeadm/init-config.yaml \ --upload-certs \ - --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 \ - --skip-phases=wait-control-plane || { - echo "==> kubeadm init phases failed, checking pod status:" + --ignore-preflight-errors=NumCPU,HTTPProxyCIDR,Port-10250 || { + echo "==> kubeadm init failed, checking pod status:" crictl pods || true crictl ps -a || true + echo "==> kube-vip containers:" + crictl ps -a --name kube-vip || true + echo "==> kube-vip logs:" + for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do + echo "--- kube-vip container $container_id ---" + crictl logs "$container_id" 2>/dev/null || true + done echo "==> Checking if VIP is bound:" ip -4 addr show | grep "$vip" || echo "VIP NOT BOUND" - echo "==> kube-vip logs:" - crictl logs $(crictl ps --name kube-vip -q 2>/dev/null | head -1) 2>/dev/null || echo "Could not get kube-vip logs" echo "==> kubelet logs:" journalctl -xeu kubelet --no-pager -n 50 exit 1 } echo "==> Waiting for kube-vip to claim VIP $vip" - for i in $(seq 1 60); do + for i in $(seq 1 90); do if ip -4 addr show | grep -q "$vip"; then echo "==> VIP $vip is bound" break fi - if [ "$i" -eq 60 ]; then - echo "==> WARNING: VIP not bound after 2 minutes, proceeding anyway" + if [ "$i" -eq 90 ]; then + echo "==> ERROR: VIP not bound after 3 minutes" + crictl ps -a --name kube-vip || true + for container_id in $(crictl ps -a --name kube-vip -q 2>/dev/null); do + echo "--- kube-vip container $container_id ---" + crictl logs "$container_id" 2>/dev/null || true + done + exit 1 fi sleep 2 done @@ -269,10 +250,8 @@ in sleep 2 done - echo "==> Stopping bootstrap kube-vip (static pod will take over)" - ctr tasks kill kube-vip-bootstrap 2>/dev/null || true - ctr tasks rm kube-vip-bootstrap 2>/dev/null || true - ctr containers rm kube-vip-bootstrap 2>/dev/null || true + # Switch kube-vip to normal admin.conf after bootstrap finishes. + sed -i 's#/etc/kubernetes/super-admin.conf#/etc/kubernetes/admin.conf#g' /etc/kubernetes/manifests/kube-vip.yaml || true mkdir -p /root/.kube cp /etc/kubernetes/admin.conf /root/.kube/config diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh index ff873cc..1b60dbc 100755 --- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh +++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh @@ -164,7 +164,7 @@ rebuild_node() { timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \ --flake "$FLAKE_DIR#$node_name" \ --target-host "$ACTIVE_SSH_USER@$node_ip" \ - --use-remote-sudo + --sudo } rebuild_node_with_retry() { diff --git a/terraform/main.tf b/terraform/main.tf index 9a68fcf..90fd52a 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -24,7 +24,7 @@ resource "proxmox_vm_qemu" "control_planes" { clone = var.clone_template full_clone = true os_type = "cloud-init" - agent = 1 + agent = var.qemu_agent_enabled ? 1 : 0 automatic_reboot = true cpu { @@ -79,7 +79,7 @@ resource "proxmox_vm_qemu" "workers" { clone = var.clone_template full_clone = true os_type = "cloud-init" - agent = 1 + agent = var.qemu_agent_enabled ? 1 : 0 automatic_reboot = true cpu { diff --git a/terraform/variables.tf b/terraform/variables.tf index 91ebd27..2df61d9 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -99,6 +99,12 @@ variable "pm_api_url" { type = string } +variable "qemu_agent_enabled" { + type = bool + default = false + description = "Enable QEMU guest agent integration in Proxmox resources" +} + variable "SSH_KEY_PUBLIC" { type = string description = "Public SSH key injected via cloud-init"