diff --git a/nixos/kubeadm/scripts/discover-inventory-from-ssh.py b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py index bbbc1f2..57b7b37 100755 --- a/nixos/kubeadm/scripts/discover-inventory-from-ssh.py +++ b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py @@ -117,6 +117,7 @@ def main() -> int: vmid_to_name[str(vmid)] = name seen_hostnames: Dict[str, str] = {} + seen_ips: Dict[str, Tuple[str, str]] = {} def run_pass(pass_timeout: int, pass_workers: int) -> None: with concurrent.futures.ThreadPoolExecutor(max_workers=pass_workers) as pool: @@ -128,6 +129,8 @@ def main() -> int: host, ip, serial = result if host not in seen_hostnames: seen_hostnames[host] = ip + if ip not in seen_ips: + seen_ips[ip] = (host, serial) target = None if serial in vmid_to_name: inferred = vmid_to_name[serial] @@ -147,11 +150,25 @@ def main() -> int: # Slower second pass for busy runners/networks. run_pass(max(timeout_sec + 2, 8), max(8, max_workers // 2)) + # Heuristic fallback: if nodes are still missing, assign them from the remaining SSH-reachable + # IPs not already in use, in string-sorted IP order. This helps when cloned nodes temporarily + # share a generic hostname (e.g. "flex") and DMI serial mapping is unavailable.
+ missing = sorted([n for n in target_names if n not in found]) + if missing: + used_ips = set(found.values()) + candidates = sorted(ip for ip in seen_ips.keys() if ip not in used_ips) + if len(candidates) >= len(missing): + for name, ip in zip(missing, candidates): + found[name] = ip + missing = sorted([n for n in target_names if n not in found]) if missing: discovered = ", ".join(sorted(seen_hostnames.keys())[:20]) if discovered: sys.stderr.write(f"Discovered hostnames during scan: {discovered}\n") + if seen_ips: + sample = ", ".join(f"{ip}={meta[0]}" for ip, meta in list(sorted(seen_ips.items()))[:20]) + sys.stderr.write(f"SSH-reachable IPs: {sample}\n") raise SystemExit( "Failed SSH-based IP discovery for nodes: " + ", ".join(missing) + f" (scanned {prefix}.{start}-{prefix}.{end})"