fix: vendor Flannel manifest and harden CNI bootstrap timing
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s

Stop depending on GitHub during cluster bring-up by shipping the Flannel manifest in-repo, ensure required host paths exist on NixOS nodes, and wait/retry against a stable API before applying the CNI. This removes the TLS handshake timeout failure mode and makes early network bootstrap deterministic.
This commit is contained in:
2026-03-08 03:24:16 +00:00
parent bd866f7dac
commit b7b364a112
3 changed files with 247 additions and 1 deletions

View File

@@ -265,11 +265,42 @@ class Controller:
def stage_install_cni(self):
self.log("Installing Flannel")
manifest_path = self.script_dir.parent / "manifests" / "kube-flannel.yml"
manifest_b64 = base64.b64encode(manifest_path.read_bytes()).decode()
self.remote(
self.primary_ip,
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml",
(
"sudo mkdir -p /var/lib/terrahome && "
f"echo {shlex.quote(manifest_b64)} | base64 -d | sudo tee /var/lib/terrahome/kube-flannel.yml >/dev/null"
),
)
self.log("Waiting for API readiness before applying Flannel")
ready = False
for _ in range(30):
if self.cluster_ready():
ready = True
break
time.sleep(10)
if not ready:
raise RuntimeError("API server did not become ready before Flannel install")
last_error = None
for attempt in range(1, 6):
proc = self.remote(
self.primary_ip,
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f /var/lib/terrahome/kube-flannel.yml",
check=False,
)
if proc.returncode == 0:
return
last_error = (proc.stdout or "") + ("\n" if proc.stdout and proc.stderr else "") + (proc.stderr or "")
self.log(f"Flannel apply attempt {attempt}/5 failed; retrying in 15s")
time.sleep(15)
raise RuntimeError(f"Flannel apply failed after retries\n{last_error or ''}")
def cluster_has_node(self, name):
cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
return self.remote(self.primary_ip, cmd, check=False).returncode == 0