fix: wait for SSH readiness after VM provisioning
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s
Freshly recreated VMs can take a few minutes before cloud-init has created the login users and SSH accepts connections. Retry SSH authentication in the bootstrap controller before failing, so that rebuild/bootstrap does not abort immediately on new hosts.
This commit is contained in:
@@ -121,6 +121,8 @@ class Controller:
|
|||||||
self.fast_mode = self.env.get("FAST_MODE", "1")
|
self.fast_mode = self.env.get("FAST_MODE", "1")
|
||||||
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
||||||
self.force_reinit = True
|
self.force_reinit = True
|
||||||
|
self.ssh_ready_retries = int(self.env.get("SSH_READY_RETRIES", "20"))
|
||||||
|
self.ssh_ready_delay = int(self.env.get("SSH_READY_DELAY_SEC", "15"))
|
||||||
|
|
||||||
def log(self, msg):
|
def log(self, msg):
|
||||||
print(f"==> {msg}")
|
print(f"==> {msg}")
|
||||||
@@ -130,12 +132,19 @@ class Controller:
|
|||||||
return run_local(full, check=check, capture=True)
|
return run_local(full, check=check, capture=True)
|
||||||
|
|
||||||
def detect_user(self, ip):
|
def detect_user(self, ip):
|
||||||
for user in self.ssh_candidates:
|
for attempt in range(1, self.ssh_ready_retries + 1):
|
||||||
proc = self._ssh(user, ip, "true", check=False)
|
for user in self.ssh_candidates:
|
||||||
if proc.returncode == 0:
|
proc = self._ssh(user, ip, "true", check=False)
|
||||||
self.active_ssh_user = user
|
if proc.returncode == 0:
|
||||||
self.log(f"Using SSH user '{user}' for {ip}")
|
self.active_ssh_user = user
|
||||||
return
|
self.log(f"Using SSH user '{user}' for {ip}")
|
||||||
|
return
|
||||||
|
if attempt < self.ssh_ready_retries:
|
||||||
|
self.log(
|
||||||
|
f"SSH not ready on {ip} yet; retrying in {self.ssh_ready_delay}s "
|
||||||
|
f"({attempt}/{self.ssh_ready_retries})"
|
||||||
|
)
|
||||||
|
time.sleep(self.ssh_ready_delay)
|
||||||
raise RuntimeError(f"Unable to authenticate to {ip} with users: {', '.join(self.ssh_candidates)}")
|
raise RuntimeError(f"Unable to authenticate to {ip} with users: {', '.join(self.ssh_candidates)}")
|
||||||
|
|
||||||
def remote(self, ip, cmd, check=True):
|
def remote(self, ip, cmd, check=True):
|
||||||
|
|||||||
Reference in New Issue
Block a user