fix: vendor Flannel manifest and harden CNI bootstrap timing
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s
Stop depending on GitHub during cluster bring-up by shipping the Flannel manifest in-repo, ensure required host paths exist on NixOS nodes, and wait/retry against a stable API before applying the CNI. This removes the TLS handshake timeout failure mode and makes early network bootstrap deterministic.
This commit is contained in:
@@ -265,11 +265,42 @@ class Controller:
|
||||
|
||||
def stage_install_cni(self):
    """Install the Flannel CNI on the primary node from the vendored manifest.

    Steps:
      1. Read ``manifests/kube-flannel.yml`` (next to ``self.script_dir``'s
         parent) and copy it to ``/var/lib/terrahome/kube-flannel.yml`` on
         the primary node, so bring-up never fetches anything from GitHub.
      2. Poll ``self.cluster_ready()`` until the API server answers.
      3. ``kubectl apply`` the staged manifest, retrying on failure.

    Raises:
        RuntimeError: if the API server never reports ready, or if every
            apply attempt fails (the last attempt's output is appended).
    """
    api_ready_checks = 30
    api_ready_delay = 10
    apply_attempts = 5
    apply_delay = 15
    remote_manifest = "/var/lib/terrahome/kube-flannel.yml"

    self.log("Installing Flannel")
    manifest_path = self.script_dir.parent / "manifests" / "kube-flannel.yml"
    # Base64 keeps the YAML intact through the remote shell's quoting rules.
    manifest_b64 = base64.b64encode(manifest_path.read_bytes()).decode()

    # Stage the manifest on the node; `sudo tee` is needed because the
    # redirect itself would otherwise run without root privileges.
    # NOTE(review): relies on self.remote() raising on failure by default
    # (other call sites pass check=False explicitly) - confirm.
    self.remote(
        self.primary_ip,
        (
            "sudo mkdir -p /var/lib/terrahome && "
            f"echo {shlex.quote(manifest_b64)} | base64 -d | sudo tee {remote_manifest} >/dev/null"
        ),
    )

    self.log("Waiting for API readiness before applying Flannel")
    for check in range(1, api_ready_checks + 1):
        if self.cluster_ready():
            break
        # No point sleeping after the final check; fail immediately instead.
        if check < api_ready_checks:
            time.sleep(api_ready_delay)
    else:
        raise RuntimeError("API server did not become ready before Flannel install")

    apply_cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f {remote_manifest}"
    last_error = ""
    for attempt in range(1, apply_attempts + 1):
        proc = self.remote(self.primary_ip, apply_cmd, check=False)
        if proc.returncode == 0:
            return
        # Keep whichever of stdout/stderr is non-empty for the final error.
        last_error = "\n".join(part for part in (proc.stdout, proc.stderr) if part)
        if attempt < apply_attempts:
            self.log(f"Flannel apply attempt {attempt}/{apply_attempts} failed; retrying in {apply_delay}s")
            time.sleep(apply_delay)
        else:
            self.log(f"Flannel apply attempt {attempt}/{apply_attempts} failed; giving up")

    raise RuntimeError(f"Flannel apply failed after retries\n{last_error}")
|
||||
|
||||
def cluster_has_node(self, name):
    """Report whether the cluster knows a node called ``name``.

    Runs ``kubectl get node`` on the primary node with all output
    discarded; only the exit status is inspected.

    Returns:
        True when the remote command exits with status 0, else False.
    """
    # Quote the node name so it reaches the remote shell as a single word.
    quoted_name = shlex.quote(name)
    probe = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {quoted_name} >/dev/null 2>&1"
    # check=False: a non-zero exit is an expected answer here, not an error.
    outcome = self.remote(self.primary_ip, probe, check=False)
    return outcome.returncode == 0
|
||||
|
||||
Reference in New Issue
Block a user