From 3fa227d7c9e2fd85b1d0ff1318687a3aec3b6323 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sun, 1 Mar 2026 19:28:00 +0000 Subject: [PATCH] feat: add SSH-based fallback for kubeadm IP inventory --- .gitea/workflows/kubeadm-bootstrap.yml | 6 +- .gitea/workflows/kubeadm-reset.yml | 6 +- .gitea/workflows/terraform-apply.yml | 6 +- nixos/kubeadm/README.md | 1 + .../scripts/discover-inventory-from-ssh.py | 117 ++++++++++++++++++ 5 files changed, 130 insertions(+), 6 deletions(-) create mode 100755 nixos/kubeadm/scripts/discover-inventory-from-ssh.py diff --git a/.gitea/workflows/kubeadm-bootstrap.yml b/.gitea/workflows/kubeadm-bootstrap.yml index 4b19cb8..99371b8 100644 --- a/.gitea/workflows/kubeadm-bootstrap.yml +++ b/.gitea/workflows/kubeadm-bootstrap.yml @@ -103,8 +103,10 @@ jobs: - name: Create kubeadm inventory env: KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} + KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }} run: | set -euo pipefail + TF_OUTPUT_JSON="" for attempt in 1 2 3 4 5 6; do echo "Inventory render attempt $attempt/6" TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" @@ -118,8 +120,8 @@ jobs: fi done - echo "Failed to render kubeadm inventory after retries" - exit 1 + echo "Falling back to SSH-based inventory discovery" + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env - name: Validate nix installation run: | diff --git a/.gitea/workflows/kubeadm-reset.yml b/.gitea/workflows/kubeadm-reset.yml index 7ff1435..054e5b2 100644 --- a/.gitea/workflows/kubeadm-reset.yml +++ b/.gitea/workflows/kubeadm-reset.yml @@ -103,8 +103,10 @@ jobs: - name: Create kubeadm inventory env: KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} + KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }} run: | set -euo pipefail + TF_OUTPUT_JSON="" for attempt in 1 2 3 4 5 6; do echo "Inventory render attempt $attempt/6" TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" @@ -118,8 +120,8 @@ jobs: fi done - echo "Failed to render kubeadm inventory after retries" - exit 1 + echo "Falling back to SSH-based inventory discovery" + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env - name: Run cluster reset run: | diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml index eb03bf0..43ab4b5 100644 --- a/.gitea/workflows/terraform-apply.yml +++ b/.gitea/workflows/terraform-apply.yml @@ -151,8 +151,10 @@ jobs: - name: Create kubeadm inventory from Terraform outputs env: KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} + KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }} run: | set -euo pipefail + TF_OUTPUT_JSON="" for attempt in 1 2 3 4 5 6; do echo "Inventory render attempt $attempt/6" TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" @@ -166,8 +168,8 @@ jobs: fi done - echo "Failed to render kubeadm inventory after retries" - exit 1 + echo "Falling back to SSH-based inventory discovery" + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env - name: Ensure nix and nixos-rebuild env: diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md index 1ba65ee..2325064 100644 --- a/nixos/kubeadm/README.md +++ b/nixos/kubeadm/README.md @@ -147,6 +147,7 @@ Required repository secrets: Optional secrets: - `KUBEADM_SSH_USER` (defaults to `micqdf`) +- `KUBEADM_SUBNET_PREFIX` (optional, e.g. `10.27.27`; used for SSH-based IP discovery fallback) Node IPs are auto-discovered from Terraform state outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets. diff --git a/nixos/kubeadm/scripts/discover-inventory-from-ssh.py b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py new file mode 100755 index 0000000..f29a154 --- /dev/null +++ b/nixos/kubeadm/scripts/discover-inventory-from-ssh.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 + +import concurrent.futures +import ipaddress +import json +import os +import subprocess +import sys +from typing import Dict, Set + + +def derive_prefix(payload: dict) -> str: + explicit = os.environ.get("KUBEADM_SUBNET_PREFIX", "").strip() + if explicit: + return explicit + + for key in ("control_plane_vm_ipv4", "worker_vm_ipv4"): + values = payload.get(key, {}).get("value", {}) + for ip in values.values(): + if ip: + parts = ip.split(".") + if len(parts) == 4: + return ".".join(parts[:3]) + + return "10.27.27" + + +def ssh_hostname(ip: str, users: list[str], key_path: str, timeout_sec: int) -> tuple[str, str] | None: + cmd_tail = [ + "-o", + "BatchMode=yes", + "-o", + "IdentitiesOnly=yes", + "-o", + "StrictHostKeyChecking=accept-new", + "-o", + f"ConnectTimeout={timeout_sec}", + "-i", + key_path, + ] + for user in users: + cmd = ["ssh", *cmd_tail, f"{user}@{ip}", "hostnamectl --static 2>/dev/null || hostname"] + try: + out = subprocess.check_output(cmd, stderr=subprocess.DEVNULL, text=True, timeout=timeout_sec + 2).strip() + except Exception: + continue + if out: + return out.splitlines()[0].strip(), ip + return None + + +def build_inventory(names: Set[str], found: Dict[str, str], ssh_user: str) -> str: + cp = sorted([n for n in names if n.startswith("cp-")], key=lambda x: int(x.split("-")[1])) + wk = sorted([n for n in names if n.startswith("wk-")], key=lambda x: int(x.split("-")[1])) + + cp_pairs = " ".join(f"{n}={found[n]}" for n in cp) + wk_pairs = " ".join(f"{n}={found[n]}" for n in wk) + primary = cp[0] if cp else "cp-1" + + return "\n".join( + [ + f"SSH_USER={ssh_user}", + f"PRIMARY_CONTROL_PLANE={primary}", + f'CONTROL_PLANES="{cp_pairs}"', + f'WORKERS="{wk_pairs}"', + "", + ] + ) + + +def main() -> int: + payload = json.load(sys.stdin) + + cp_names = set(payload.get("control_plane_vm_ids", {}).get("value", {}).keys()) + wk_names = set(payload.get("worker_vm_ids", {}).get("value", {}).keys()) + target_names = cp_names | wk_names + if not target_names: + raise SystemExit("Could not determine target node names from Terraform outputs") + + ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf" + users = [u for u in os.environ.get("SSH_USER_CANDIDATES", f"{ssh_user} root").split() if u] + key_path = os.environ.get("SSH_KEY_PATH", os.path.expanduser("~/.ssh/id_ed25519")) + timeout_sec = int(os.environ.get("SSH_DISCOVERY_TIMEOUT_SEC", "3")) + max_workers = int(os.environ.get("SSH_DISCOVERY_WORKERS", "64")) + + prefix = derive_prefix(payload) + start = int(os.environ.get("KUBEADM_SUBNET_START", "2")) + end = int(os.environ.get("KUBEADM_SUBNET_END", "254")) + + scan_ips = [str(ipaddress.IPv4Address(f"{prefix}.{i}")) for i in range(start, end + 1)] + found: Dict[str, str] = {} + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool: + futures = [pool.submit(ssh_hostname, ip, users, key_path, timeout_sec) for ip in scan_ips] + for fut in concurrent.futures.as_completed(futures): + result = fut.result() + if not result: + continue + host, ip = result + if host in target_names and host not in found: + found[host] = ip + if all(name in found for name in target_names): + break + + missing = sorted([n for n in target_names if n not in found]) + if missing: + raise SystemExit( + "Failed SSH-based IP discovery for nodes: " + ", ".join(missing) + + f" (scanned {prefix}.{start}-{prefix}.{end})" + ) + + sys.stdout.write(build_inventory(target_names, found, ssh_user)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main())