From 3ebeb121b4f87ab2b1e24f69f4d0f95b27af283e Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Wed, 4 Mar 2026 00:26:37 +0000 Subject: [PATCH] fix: force fresh bootstrap stages after rebuild and stabilize join node identity Clear completed bootstrap stage checkpoints whenever nodes are rebuilt so reconcile does not skip required init/cni/join work on fresh hosts. Also pass explicit --node-name for control-plane and worker joins, and ensure kubelet is enabled before join commands run. --- nixos/kubeadm/bootstrap/controller.py | 27 +++++++++++++++++++++------ nixos/kubeadm/modules/k8s-common.nix | 2 ++ 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/nixos/kubeadm/bootstrap/controller.py b/nixos/kubeadm/bootstrap/controller.py index 0c7b393..19d28f9 100755 --- a/nixos/kubeadm/bootstrap/controller.py +++ b/nixos/kubeadm/bootstrap/controller.py @@ -198,6 +198,13 @@ class Controller: state["updated_at"] = int(time.time()) self.set_state(state) + def clear_done(self, keys): + state = self.get_state() + for key in keys: + state.pop(key, None) + state["updated_at"] = int(time.time()) + self.set_state(state) + def stage_done(self, key): return bool(self.get_state().get(key)) @@ -291,6 +298,14 @@ class Controller: if failures: raise RuntimeError(f"Worker rebuild failures: {failures}") + # Rebuild can invalidate prior bootstrap stages; force reconciliation. + self.clear_done([ + "primary_initialized", + "cni_installed", + "control_planes_joined", + "workers_joined", + "verified", + ]) self.mark_done("nodes_rebuilt") def has_admin_conf(self): @@ -301,7 +316,7 @@ class Controller: return self.remote(self.primary_ip, cmd, check=False).returncode == 0 def stage_init_primary(self): - if self.stage_done("primary_initialized"): + if self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready(): self.log("Primary control plane init already complete") return if self.has_admin_conf() and self.cluster_ready(): @@ -312,7 +327,7 @@ class Controller: self.mark_done("primary_initialized") def stage_install_cni(self): - if self.stage_done("cni_installed"): + if self.stage_done("cni_installed") and self.cluster_ready(): self.log("CNI install already complete") return self.log("Installing or upgrading Cilium") @@ -348,7 +363,6 @@ class Controller: self.log("Control-plane join already complete") return _, cp_join = self.build_join_cmds() - encoded = base64.b64encode(cp_join.encode()).decode() for node in self.cp_names: if node == self.primary_cp: continue @@ -357,7 +371,8 @@ class Controller: continue self.log(f"Joining control plane {node}") ip = self.node_ips[node] - self.remote(ip, f"sudo th-kubeadm-join-control-plane \"$(echo {encoded} | base64 -d)\"") + node_join = f"{cp_join} --node-name {node}" + self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}") self.mark_done("control_planes_joined") def stage_join_workers(self): @@ -365,14 +380,14 @@ class Controller: self.log("Worker join already complete") return join_cmd, _ = self.build_join_cmds() - encoded = base64.b64encode(join_cmd.encode()).decode() for node in self.wk_names: if self.cluster_has_node(node): self.log(f"{node} already joined") continue self.log(f"Joining worker {node}") ip = self.node_ips[node] - self.remote(ip, f"sudo th-kubeadm-join-worker \"$(echo {encoded} | base64 -d)\"") + node_join = f"{join_cmd} --node-name {node}" + self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}") self.mark_done("workers_joined") def stage_verify(self): diff --git a/nixos/kubeadm/modules/k8s-common.nix b/nixos/kubeadm/modules/k8s-common.nix index 6456531..9c9a53d 100644 --- a/nixos/kubeadm/modules/k8s-common.nix +++ b/nixos/kubeadm/modules/k8s-common.nix @@ -309,6 +309,7 @@ in systemctl unmask kubelet || true systemctl stop kubelet || true + systemctl enable kubelet || true systemctl reset-failed kubelet || true systemctl daemon-reload eval "$1" @@ -326,6 +327,7 @@ in systemctl unmask kubelet || true systemctl stop kubelet || true + systemctl enable kubelet || true systemctl reset-failed kubelet || true systemctl daemon-reload eval "$1"