fix: make SSH inventory discovery more reliable on CI #104
@@ -80,8 +80,8 @@ def main() -> int:
|
|||||||
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
|
ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
|
||||||
users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
|
users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u]
|
||||||
key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
|
key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519"))
|
||||||
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "3"))
|
timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "6"))
|
||||||
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "64"))
|
max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "32"))
|
||||||
|
|
||||||
prefix = derive_prefix(payload)
|
prefix = derive_prefix(payload)
|
||||||
start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
|
start = int(os.environ.get("KUBEADM_SUBNET_START", "2"))
|
||||||
@@ -90,20 +90,33 @@ def main() -> int:
|
|||||||
scan_ips = [str(ipaddress.IPv4Address(f"{prefix}.{i}")) for i in range(start, end + 1)]
|
scan_ips = [str(ipaddress.IPv4Address(f"{prefix}.{i}")) for i in range(start, end + 1)]
|
||||||
found: Dict[str, str] = {}
|
found: Dict[str, str] = {}
|
||||||
|
|
||||||
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
|
seen_hostnames: Dict[str, str] = {}
|
||||||
futures = [pool.submit(ssh_hostname, ip, users, key_path, timeout_sec) for ip in scan_ips]
|
|
||||||
for fut in concurrent.futures.as_completed(futures):
|
def run_pass(pass_timeout: int, pass_workers: int) -> None:
|
||||||
result = fut.result()
|
with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool:
|
||||||
if not result:
|
futures = [pool.submit(ssh_hostname, ip, users, key_path, pass_timeout) for ip in scan_ips]
|
||||||
continue
|
for fut in concurrent.futures.as_completed(futures):
|
||||||
host, ip = result
|
result = fut.result()
|
||||||
if host in target_names and host not in found:
|
if not result:
|
||||||
found[host] = ip
|
continue
|
||||||
if all(name in found for name in target_names):
|
host, ip = result
|
||||||
break
|
if host not in seen_hostnames:
|
||||||
|
seen_hostnames[host] = ip
|
||||||
|
if host in target_names and host not in found:
|
||||||
|
found[host] = ip
|
||||||
|
if all(name in found for name in target_names):
|
||||||
|
return
|
||||||
|
|
||||||
|
run_pass(timeout_sec, max_workers)
|
||||||
|
if not all(name in found for name in target_names):
|
||||||
|
# Slower second pass for busy runners/networks.
|
||||||
|
run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2))
|
||||||
|
|
||||||
missing = sorted([n for n in target_names if n not in found])
|
missing = sorted([n for n in target_names if n not in found])
|
||||||
if missing:
|
if missing:
|
||||||
|
discovered = ", ".join(sorted(seen_hostnames.keys())[:20])
|
||||||
|
if discovered:
|
||||||
|
sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n")
|
||||||
raise SystemExit(
|
raise SystemExit(
|
||||||
"Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
|
"Failed SSH-based IP discovery for nodes: " + ", ".join(missing) +
|
||||||
f" (scanned {prefix}.{start}-{prefix}.{end})"
|
f" (scanned {prefix}.{start}-{prefix}.{end})"
|
||||||
|
|||||||
Reference in New Issue
Block a user