Merge pull request 'fix: force fresh bootstrap stages after rebuild and stabilize join node identity' (#106) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 20m28s

Reviewed-on: #106
This commit was merged in pull request #106.
This commit is contained in:
2026-03-04 00:32:06 +00:00
2 changed files with 23 additions and 6 deletions

View File

@@ -198,6 +198,13 @@ class Controller:
state["updated_at"] = int(time.time())
self.set_state(state)
def clear_done(self, keys):
    """Drop the given stage-completion keys from persisted state.

    Each key in *keys* is removed if present (missing keys are ignored),
    the state's ``updated_at`` timestamp is refreshed to the current epoch
    second, and the modified state is written back via ``set_state``.
    """
    snapshot = self.get_state()
    for stage_key in keys:
        # pop with a default so absent stages are silently skipped
        snapshot.pop(stage_key, None)
    snapshot["updated_at"] = int(time.time())
    self.set_state(snapshot)
def stage_done(self, key):
    """Return True if persisted state records stage *key* as completed."""
    current = self.get_state()
    return bool(current.get(key))
@@ -291,6 +298,14 @@ class Controller:
if failures:
raise RuntimeError(f"Worker rebuild failures: {failures}")
# Rebuild can invalidate prior bootstrap stages; force reconciliation.
self.clear_done([
"primary_initialized",
"cni_installed",
"control_planes_joined",
"workers_joined",
"verified",
])
self.mark_done("nodes_rebuilt")
def has_admin_conf(self):
@@ -301,7 +316,7 @@ class Controller:
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
def stage_init_primary(self):
if self.stage_done("primary_initialized"):
if self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
self.log("Primary control plane init already complete")
return
if self.has_admin_conf() and self.cluster_ready():
@@ -312,7 +327,7 @@ class Controller:
self.mark_done("primary_initialized")
def stage_install_cni(self):
if self.stage_done("cni_installed"):
if self.stage_done("cni_installed") and self.cluster_ready():
self.log("CNI install already complete")
return
self.log("Installing or upgrading Cilium")
@@ -348,7 +363,6 @@ class Controller:
self.log("Control-plane join already complete")
return
_, cp_join = self.build_join_cmds()
encoded = base64.b64encode(cp_join.encode()).decode()
for node in self.cp_names:
if node == self.primary_cp:
continue
@@ -357,7 +371,8 @@ class Controller:
continue
self.log(f"Joining control plane {node}")
ip = self.node_ips[node]
self.remote(ip, f"sudo th-kubeadm-join-control-plane \"$(echo {encoded} | base64 -d)\"")
node_join = f"{cp_join} --node-name {node}"
self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
self.mark_done("control_planes_joined")
def stage_join_workers(self):
@@ -365,14 +380,14 @@ class Controller:
self.log("Worker join already complete")
return
join_cmd, _ = self.build_join_cmds()
encoded = base64.b64encode(join_cmd.encode()).decode()
for node in self.wk_names:
if self.cluster_has_node(node):
self.log(f"{node} already joined")
continue
self.log(f"Joining worker {node}")
ip = self.node_ips[node]
self.remote(ip, f"sudo th-kubeadm-join-worker \"$(echo {encoded} | base64 -d)\"")
node_join = f"{join_cmd} --node-name {node}"
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
self.mark_done("workers_joined")
def stage_verify(self):

View File

@@ -309,6 +309,7 @@ in
systemctl unmask kubelet || true
systemctl stop kubelet || true
systemctl enable kubelet || true
systemctl reset-failed kubelet || true
systemctl daemon-reload
eval "$1"
@@ -326,6 +327,7 @@ in
systemctl unmask kubelet || true
systemctl stop kubelet || true
systemctl enable kubelet || true
systemctl reset-failed kubelet || true
systemctl daemon-reload
eval "$1"