From ca54c44fa4e9f5f6412bb8d4dd2647841a3eef18 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Thu, 5 Mar 2026 00:48:41 +0000
Subject: [PATCH] fix: stabilize Cilium install defaults and add rollout diagnostics

Set Cilium kubeProxyReplacement from env (default false for homelab
stability) and collect cilium daemonset/pod/log diagnostics when rollout
times out during verification.
---
 nixos/kubeadm/bootstrap/controller.py | 35 +++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 5 deletions(-)

diff --git a/nixos/kubeadm/bootstrap/controller.py b/nixos/kubeadm/bootstrap/controller.py
index 3707ff7..99cf082 100755
--- a/nixos/kubeadm/bootstrap/controller.py
+++ b/nixos/kubeadm/bootstrap/controller.py
@@ -125,6 +125,7 @@ class Controller:
         self.fast_mode = self.env.get("FAST_MODE", "1")
         self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
         self.force_reinit = False
+        self.cilium_kpr = self.env.get("CILIUM_KUBE_PROXY_REPLACEMENT", "false")
 
     def log(self, msg):
         print(f"==> {msg}")
@@ -338,7 +339,12 @@ class Controller:
         self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf create namespace kube-system >/dev/null 2>&1 || true")
         self.remote(
             self.primary_ip,
-            "sudo KUBECONFIG=/etc/kubernetes/admin.conf helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true",
+            (
+                "sudo KUBECONFIG=/etc/kubernetes/admin.conf "
+                "helm upgrade --install cilium cilium/cilium "
+                "--namespace kube-system "
+                f"--set kubeProxyReplacement={shlex.quote(self.cilium_kpr)}"
+            ),
         )
         self.mark_done("cni_installed")
 
@@ -397,10 +403,29 @@ class Controller:
             self.log("Verification already complete")
             return
         self.log("Final node verification")
-        self.remote(
-            self.primary_ip,
-            "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system rollout status ds/cilium --timeout=10m",
-        )
+        try:
+            self.remote(
+                self.primary_ip,
+                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system rollout status ds/cilium --timeout=10m",
+            )
+        except Exception:
+            self.log("Cilium rollout failed; collecting diagnostics")
+            self.remote(
+                self.primary_ip,
+                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get ds cilium -o wide || true",
+                check=False,
+            )
+            self.remote(
+                self.primary_ip,
+                "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o wide || true",
+                check=False,
+            )
+            self.remote(
+                self.primary_ip,
+                "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system logs --tail=120 $p || true; done",
+                check=False,
+            )
+            raise
         self.remote(
             self.primary_ip,
             "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf wait --for=condition=Ready nodes --all --timeout=10m",