From 9ae8eb613444996802b2f24523c592bfeb780ba6 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Tue, 3 Mar 2026 21:08:29 +0000 Subject: [PATCH] fix: make SSH inventory discovery more reliable on CI Increase default SSH timeout, reduce scan concurrency, and add a second slower scan pass to avoid transient misses on busy runners. Also print discovered hostnames to improve failure diagnostics when node-name matching fails. --- .../scripts/discover-inventory-from-ssh.py | 39 ++++++++++++------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/nixos/kubeadm/scripts/discover-inventory-from-ssh.py b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py index f29a154..01b0a69 100755 --- a/nixos/kubeadm/scripts/discover-inventory-from-ssh.py +++ b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py @@ -80,8 +80,8 @@ def main() -> int: ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf" users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u] key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519")) - timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "3")) - max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "64")) + timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6")) + max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32")) prefix = derive_prefix(payload) start = int(os.environ.get("KUBEADM_SUBNET_START", "2")) @@ -90,20 +90,33 @@ def main() -> int: scan_ips = [str(ipaddress.IPv4Address(f"{prefix}.{i}")) for i in range(start, end + 1)] found: Dict[str, str] = {} - with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool: - futures = [pool.submit(ssh_hostname, ip, users, key_path, timeout_sec) for ip in scan_ips] - for fut in concurrent.futures.as_completed(futures): - result = fut.result() - if not result: - continue - host, ip = result - if host in target_names and host not in found: - found[host] = ip - if all(name in found for name in target_names): - break + seen_hostnames: Dict[str, str] = {} + + def run_pass(pass_timeout: int, pass_workers: int) -> None: + with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool: + futures = [pool.submit(ssh_hostname, ip, users, key_path, pass_timeout) for ip in scan_ips] + for fut in concurrent.futures.as_completed(futures): + result = fut.result() + if not result: + continue + host, ip = result + if host not in seen_hostnames: + seen_hostnames[host] = ip + if host in target_names and host not in found: + found[host] = ip + if all(name in found for name in target_names): + return + + run_pass(timeout_sec, max_workers) + if not all(name in found for name in target_names): + # Slower second pass for busy runners/networks. + run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2)) missing = sorted([n for n in target_names if n not in found]) if missing: + discovered = ", ".join(sorted(seen_hostnames.keys())[:20]) + if discovered: + sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n") raise SystemExit( "Failed SSH-based IP discovery for nodes: " + ", ".join(missing) + f" (scanned {prefix}.{start}-{prefix}.{end})"