fix: make SSH inventory discovery more reliable on CI
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 16s

Increase the default SSH timeout (3s to 6s), reduce the default scan concurrency (64 to 32 workers), and add a second, slower scan pass to avoid transient misses on busy runners. Also print the hostnames discovered during the scan to improve failure diagnostics when node-name matching fails.
This commit is contained in:
2026-03-03 21:08:29 +00:00
parent a66ae788f6
commit 9ae8eb6134

View File

@@ -80,8 +80,8 @@ def main() -> int:
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "3"))
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "64"))
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6"))
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32"))
prefix = derive_prefix(payload)
start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
@@ -90,20 +90,33 @@ def main() -> int:
scan_ips = [str(ipaddress.IPv4Address(f"{prefix}.{i}")) for i in range(start, end + 1)]
found: Dict[str, str] = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
futures = [pool.submit(ssh_hostname, ip, users, key_path, timeout_sec) for ip in scan_ips]
for fut in concurrent.futures.as_completed(futures):
result = fut.result()
if not result:
continue
host, ip = result
if host in target_names and host not in found:
found[host] = ip
if all(name in found for name in target_names):
break
seen_hostnames: Dict[str, str] = {}
def run_pass(pass_timeout: int, pass_workers: int) -> None:
    """Probe every address in ``scan_ips`` once and record the hostnames found.

    Each address is submitted to ``ssh_hostname`` on a thread pool. Every
    truthy result is unpacked as ``(host, ip)``; the first IP seen for a
    hostname is stored in ``seen_hostnames``, and additionally in ``found``
    when the hostname is one of ``target_names``. Both dicts live in the
    enclosing scope and are updated in place, so results accumulate across
    passes.

    Args:
        pass_timeout: Timeout value forwarded to ``ssh_hostname`` for each probe.
        pass_workers: Maximum number of worker threads for this pass.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool:
        futures = [
            pool.submit(ssh_hostname, ip, users, key_path, pass_timeout)
            for ip in scan_ips
        ]
        for fut in concurrent.futures.as_completed(futures):
            # NOTE(review): result() re-raises anything ssh_hostname raised;
            # confirm that helper reports failures by returning a falsy value.
            result = fut.result()
            if not result:
                continue
            host, ip = result
            # Keep the first IP observed for a hostname; later duplicates are ignored.
            seen_hostnames.setdefault(host, ip)
            if host in target_names:
                found.setdefault(host, ip)
            if all(name in found for name in target_names):
                # Leaving the ``with`` block waits for queued work to finish,
                # so cancel probes that have not started yet; otherwise the
                # early exit would still pay for the whole scan.
                for pending in futures:
                    pending.cancel()
                return
run_pass(timeout_sec, max_workers)
if not all(name in found for name in target_names):
# Slower second pass for busy runners/networks.
run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2))
missing = sorted([n for n in target_names if n not in found])
if missing:
discovered = ", ".join(sorted(seen_hostnames.keys())[:20])
if discovered:
sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n")
raise SystemExit(
"Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
f" (scanned {prefix}.{start}-{prefix}.{end})"