Merge pull request 'fix: force fresh bootstrap stages after rebuild and stabilize join node identity' (#106) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 20m28s
Reviewed-on: #106
This commit was merged in pull request #106.
@@ -198,6 +198,13 @@ class Controller:
         state["updated_at"] = int(time.time())
         self.set_state(state)
 
+    def clear_done(self, keys):
+        state = self.get_state()
+        for key in keys:
+            state.pop(key, None)
+        state["updated_at"] = int(time.time())
+        self.set_state(state)
+
     def stage_done(self, key):
         return bool(self.get_state().get(key))
 
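For context, a minimal, self-contained sketch of the state machinery the new `clear_done` helper operates on, assuming `get_state`/`set_state` persist a flat JSON dict (the file path and structure here are illustrative, not taken from the repository):

```python
import json
import time
from pathlib import Path

STATE_FILE = Path("/tmp/bootstrap-state.json")  # illustrative path

def get_state():
    # A missing file means "no stages recorded yet".
    return json.loads(STATE_FILE.read_text()) if STATE_FILE.exists() else {}

def set_state(state):
    STATE_FILE.write_text(json.dumps(state))

def clear_done(keys):
    # Mirrors the new helper: drop stage markers so those stages re-run.
    state = get_state()
    for key in keys:
        state.pop(key, None)  # pop with default: no KeyError for unset markers
    state["updated_at"] = int(time.time())
    set_state(state)

set_state({"primary_initialized": True, "cni_installed": True})
clear_done(["cni_installed", "workers_joined"])  # clearing an unset key is a no-op
print(get_state())  # {'primary_initialized': True, 'updated_at': ...}
```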
@@ -291,6 +298,14 @@ class Controller:
         if failures:
             raise RuntimeError(f"Worker rebuild failures: {failures}")
 
+        # Rebuild can invalidate prior bootstrap stages; force reconciliation.
+        self.clear_done([
+            "primary_initialized",
+            "cni_installed",
+            "control_planes_joined",
+            "workers_joined",
+            "verified",
+        ])
         self.mark_done("nodes_rebuilt")
 
     def has_admin_conf(self):
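Why clearing the markers is enough to force reconciliation: every stage method starts with a `stage_done` guard and returns early when its marker is set, so removing the markers is what makes the next pass redo the work. A toy demonstration of that skip-marker mechanic (`FakeController` is a stand-in, not the repository's class):

```python
class FakeController:
    """Toy stand-in exercising only the skip-marker logic."""

    def __init__(self):
        self.state = {}
        self.ran = []

    def stage_done(self, key):
        return bool(self.state.get(key))

    def mark_done(self, key):
        self.state[key] = True

    def clear_done(self, keys):
        for key in keys:
            self.state.pop(key, None)

    def stage(self, key):
        if self.stage_done(key):
            return  # marker set: stage is skipped
        self.ran.append(key)
        self.mark_done(key)

c = FakeController()
c.stage("cni_installed")         # runs and records its marker
c.stage("cni_installed")         # skipped: marker is set
c.clear_done(["cni_installed"])  # simulate the post-rebuild reset
c.stage("cni_installed")         # re-runs against the rebuilt nodes
print(c.ran)                     # ['cni_installed', 'cni_installed']
```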
@@ -301,7 +316,7 @@ class Controller:
         return self.remote(self.primary_ip, cmd, check=False).returncode == 0
 
     def stage_init_primary(self):
-        if self.stage_done("primary_initialized"):
+        if self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
             self.log("Primary control plane init already complete")
             return
         if self.has_admin_conf() and self.cluster_ready():
@@ -312,7 +327,7 @@ class Controller:
         self.mark_done("primary_initialized")
 
     def stage_install_cni(self):
-        if self.stage_done("cni_installed"):
+        if self.stage_done("cni_installed") and self.cluster_ready():
             self.log("CNI install already complete")
             return
         self.log("Installing or upgrading Cilium")
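Both guard changes above apply the same rule: a recorded marker alone no longer short-circuits a stage; the live cluster must also check out, so a stale marker left over from before a rebuild no longer wins. A sketch of that pattern using `kubectl` reachability as the liveness probe (the probe and kubeconfig path are illustrative; the repository checks readiness over SSH via `self.remote`):

```python
import subprocess

def cluster_ready(kubeconfig="/etc/kubernetes/admin.conf"):
    # Illustrative probe: the API server answering `kubectl get nodes`
    # is taken as "the cluster still exists and responds".
    result = subprocess.run(
        ["kubectl", "--kubeconfig", kubeconfig, "get", "nodes"],
        capture_output=True,
    )
    return result.returncode == 0

def stage_install_cni(stage_done, log):
    # The fixed guard: skip only when the marker AND reality agree.
    if stage_done("cni_installed") and cluster_ready():
        log("CNI install already complete")
        return
    # ...(re)install the CNI here...
```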
@@ -348,7 +363,6 @@ class Controller:
             self.log("Control-plane join already complete")
             return
         _, cp_join = self.build_join_cmds()
-        encoded = base64.b64encode(cp_join.encode()).decode()
         for node in self.cp_names:
             if node == self.primary_cp:
                 continue
@@ -357,7 +371,8 @@ class Controller:
                 continue
             self.log(f"Joining control plane {node}")
             ip = self.node_ips[node]
-            self.remote(ip, f"sudo th-kubeadm-join-control-plane \"$(echo {encoded} | base64 -d)\"")
+            node_join = f"{cp_join} --node-name {node}"
+            self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
         self.mark_done("control_planes_joined")
 
     def stage_join_workers(self):
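On the quoting half of this change: the old form shipped the join command as a base64 blob and relied on the remote shell's `"$(echo ... | base64 -d)"` round-trip, which is sensitive to word splitting and nested quoting; `shlex.quote` instead delivers the whole command as one safely escaped shell word. A quick comparison with made-up join parameters:

```python
import shlex

cp_join = "kubeadm join 10.0.0.1:6443 --token abc.def --control-plane"
node_join = f"{cp_join} --node-name cp-2"

# New approach: the entire join command becomes a single quoted argument.
print(f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
# sudo th-kubeadm-join-control-plane 'kubeadm join 10.0.0.1:6443 --token abc.def --control-plane --node-name cp-2'
```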
@@ -365,14 +380,14 @@ class Controller:
             self.log("Worker join already complete")
             return
         join_cmd, _ = self.build_join_cmds()
-        encoded = base64.b64encode(join_cmd.encode()).decode()
         for node in self.wk_names:
             if self.cluster_has_node(node):
                 self.log(f"{node} already joined")
                 continue
             self.log(f"Joining worker {node}")
             ip = self.node_ips[node]
-            self.remote(ip, f"sudo th-kubeadm-join-worker \"$(echo {encoded} | base64 -d)\"")
+            node_join = f"{join_cmd} --node-name {node}"
+            self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
         self.mark_done("workers_joined")
 
     def stage_verify(self):
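On the "stabilize join node identity" half: without `--node-name`, kubeadm derives the Kubernetes Node name from the machine's hostname, which can drift across rebuilds; pinning it to the controller's inventory name keeps later lookups such as `cluster_has_node(node)` matching. A sketch with made-up inventory values:

```python
# Hypothetical inventory values, for illustration only.
join_cmd = "kubeadm join 10.0.0.1:6443 --token abc.def --discovery-token-ca-cert-hash sha256:0000"
wk_names = ["worker-1", "worker-2"]

for node in wk_names:
    # Pin the Node object's name to the inventory name so it stays
    # stable even if the rebuilt machine boots with a different hostname.
    node_join = f"{join_cmd} --node-name {node}"
    print(node_join)
```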
@@ -309,6 +309,7 @@ in
 
       systemctl unmask kubelet || true
       systemctl stop kubelet || true
+      systemctl enable kubelet || true
       systemctl reset-failed kubelet || true
       systemctl daemon-reload
       eval "$1"
@@ -326,6 +327,7 @@ in
 
       systemctl unmask kubelet || true
       systemctl stop kubelet || true
+      systemctl enable kubelet || true
       systemctl reset-failed kubelet || true
      systemctl daemon-reload
       eval "$1"
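On the NixOS wrapper change: `systemctl stop` does not touch a unit's boot-time enablement, so a node that went through unmask/stop could come back from a reboot with kubelet disabled; the added `systemctl enable kubelet || true` closes that gap. A Python rendering of the same tolerant reset sequence (the wrapper function is illustrative; the real script is shell embedded in Nix):

```python
import subprocess

def reset_kubelet():
    # check=False mirrors the script's `|| true`: a step that fails on an
    # already-sane unit must not abort the join wrapper.
    for step in (
        ["systemctl", "unmask", "kubelet"],
        ["systemctl", "stop", "kubelet"],
        ["systemctl", "enable", "kubelet"],  # the added step: start on boot again
        ["systemctl", "reset-failed", "kubelet"],
    ):
        subprocess.run(step, check=False)
    # daemon-reload has no `|| true` in the script, so let it raise on failure.
    subprocess.run(["systemctl", "daemon-reload"], check=True)
```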