fix: force fresh kubeadm init after rebuild and make kubelet enableable
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s

Always re-run the primary init when reconcile performs node rebuilds, to avoid stale or partial cluster state causing join preflight failures. Also add a `wantedBy` target to the kubelet unit so that `systemctl enable` works as expected during join/init flows.
This commit is contained in:
2026-03-04 00:55:20 +00:00
parent 3ebeb121b4
commit 422b7d7f23
2 changed files with 5 additions and 2 deletions

View File

@@ -124,6 +124,7 @@ class Controller:
self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
self.fast_mode = self.env.get("FAST_MODE", "1")
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
self.force_reinit = False
def log(self, msg):
    """Emit a progress line for *msg*, marked with the '==> ' prefix."""
    line = f"==> {msg}"
    print(line)
@@ -299,6 +300,7 @@ class Controller:
raise RuntimeError(f"Worker rebuild failures: {failures}")
# Rebuild can invalidate prior bootstrap stages; force reconciliation.
self.force_reinit = True
self.clear_done([
"primary_initialized",
"cni_installed",
@@ -316,10 +318,10 @@ class Controller:
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
def stage_init_primary(self):
if self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
if (not self.force_reinit) and self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
self.log("Primary control plane init already complete")
return
if self.has_admin_conf() and self.cluster_ready():
if (not self.force_reinit) and self.has_admin_conf() and self.cluster_ready():
self.log("Existing cluster detected on primary control plane")
else:
self.log(f"Initializing primary control plane on {self.primary_cp}")

View File

@@ -343,6 +343,7 @@ in
systemd.services.kubelet = {
description = "Kubernetes Kubelet";
wantedBy = [ "multi-user.target" ];
wants = [ "network-online.target" ];
after = [ "containerd.service" "network-online.target" ];
serviceConfig = {