From 4c167f618ad74244043b8a03d08d7f97fc91698a Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sun, 8 Mar 2026 05:00:39 +0000 Subject: [PATCH] fix: wait for SSH readiness after VM provisioning Freshly recreated VMs can take a few minutes before cloud-init users and SSH are available. Retry SSH authentication in the bootstrap controller before failing so rebuild/bootstrap does not abort immediately on new hosts. --- nixos/kubeadm/bootstrap/controller.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/nixos/kubeadm/bootstrap/controller.py b/nixos/kubeadm/bootstrap/controller.py index 7549684..0faf76e 100755 --- a/nixos/kubeadm/bootstrap/controller.py +++ b/nixos/kubeadm/bootstrap/controller.py @@ -121,6 +121,8 @@ class Controller: self.fast_mode = self.env.get("FAST_MODE", "1") self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1" self.force_reinit = True + self.ssh_ready_retries = int(self.env.get("SSH_READY_RETRIES", "20")) + self.ssh_ready_delay = int(self.env.get("SSH_READY_DELAY_SEC", "15")) def log(self, msg): print(f"==> {msg}") @@ -130,12 +132,19 @@ class Controller: return run_local(full, check=check, capture=True) def detect_user(self, ip): - for user in self.ssh_candidates: - proc = self._ssh(user, ip, "true", check=False) - if proc.returncode == 0: - self.active_ssh_user = user - self.log(f"Using SSH user '{user}' for {ip}") - return + for attempt in range(1, self.ssh_ready_retries + 1): + for user in self.ssh_candidates: + proc = self._ssh(user, ip, "true", check=False) + if proc.returncode == 0: + self.active_ssh_user = user + self.log(f"Using SSH user '{user}' for {ip}") + return + if attempt < self.ssh_ready_retries: + self.log( + f"SSH not ready on {ip} yet; retrying in {self.ssh_ready_delay}s " + f"({attempt}/{self.ssh_ready_retries})" + ) + time.sleep(self.ssh_ready_delay) raise RuntimeError(f"Unable to authenticate to {ip} with users: {', '.join(self.ssh_candidates)}") def remote(self, ip, cmd, check=True):