diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml index 7684db6..1f12c35 100644 --- a/.gitea/workflows/terraform-apply.yml +++ b/.gitea/workflows/terraform-apply.yml @@ -24,6 +24,7 @@ jobs: cat > secrets.auto.tfvars << EOF pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}" SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')" + TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')" EOF cat > backend.hcl << EOF bucket = "${{ secrets.B2_TF_BUCKET }}" @@ -70,174 +71,3 @@ jobs: - name: Terraform Apply working-directory: terraform run: terraform apply -auto-approve tfplan - - - name: Enroll VMs in Tailscale - env: - TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }} - PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }} - TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }} - working-directory: terraform - run: | - if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then - echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)." - exit 0 - fi - - PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars) - PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars) - TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars) - - export PM_API_URL PM_API_TOKEN_ID TARGET_NODE - - terraform output -json > tfoutputs.json - cat > enroll_tailscale.py <<'PY' - import json - import os - import ssl - import sys - import time - import urllib.error - import urllib.parse - import urllib.request - - api_url = os.environ["PM_API_URL"].rstrip("/") - if api_url.endswith("/api2/json"): - api_url = api_url[: -len("/api2/json")] - token_id = os.environ["PM_API_TOKEN_ID"].strip() - token_secret = os.environ["PM_API_TOKEN_SECRET"].strip() - target_node = os.environ["TARGET_NODE"].strip() - ts_authkey = os.environ["TS_AUTHKEY"] - enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true" - - if not token_id or not token_secret: - raise SystemExit("Missing Proxmox token id/secret") - - raw_outputs = open("tfoutputs.json", "rb").read().decode("utf-8", "ignore") - start = raw_outputs.find("{") - if start == -1: - raise SystemExit("Could not find JSON payload in terraform output") - outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0] - - targets = [] - for output_name in ("alpaca_vm_ids", "llama_vm_ids"): - mapping = outputs.get(output_name, {}).get("value", {}) - if isinstance(mapping, dict): - for hostname, vmid in mapping.items(): - targets.append((str(hostname), int(vmid))) - - if not targets: - print("No VMs found in terraform outputs; skipping tailscale enrollment") - raise SystemExit(0) - - print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets)) - - ssl_ctx = ssl._create_unverified_context() - auth_header = f"PVEAPIToken={token_id}={token_secret}" - - def api_request(method, path, data=None): - url = f"{api_url}{path}" - headers = {"Authorization": auth_header} - body = None - if data is not None: - body = urllib.parse.urlencode(data, doseq=True).encode("utf-8") - headers["Content-Type"] = "application/x-www-form-urlencoded" - req = urllib.request.Request(url, data=body, headers=headers, method=method) - with urllib.request.urlopen(req, context=ssl_ctx, timeout=30) as resp: - payload = resp.read().decode("utf-8") - return json.loads(payload) - - def wait_for_guest_agent(vmid, timeout_seconds=120): - deadline = time.time() + timeout_seconds - tries = 0 - while time.time() < deadline: - tries += 1 - try: - res = api_request("POST", f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping") - if res.get("data") == "pong": - print(f"Guest agent ready for vmid {vmid}", flush=True) - return True - except urllib.error.HTTPError as exc: - detail = exc.read().decode("utf-8", "ignore") - print(f"Agent ping HTTP error for vmid {vmid}: {exc.code} {detail}", flush=True) - if exc.code in (401, 403): - return False - except Exception as exc: - if tries == 1: - print(f"Agent ping error for vmid {vmid}: {exc}", flush=True) - if tries % 6 == 0: - remaining = int(deadline - time.time()) - print(f"Waiting for guest agent on vmid {vmid} ({remaining}s left)", flush=True) - time.sleep(5) - return False - - def exec_guest(vmid, command): - res = api_request( - "POST", - f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec", - { - "command": "/run/current-system/sw/bin/sh", - "extra-args": ["-lc", command], - }, - ) - pid = res["data"]["pid"] - for _ in range(120): - status = api_request( - "GET", - f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec-status?pid={pid}", - ).get("data", {}) - if status.get("exited"): - return ( - int(status.get("exitcode", 1)), - status.get("out-data", ""), - status.get("err-data", ""), - ) - time.sleep(2) - return (124, "", "Timed out waiting for guest command") - - failures = [] - safe_key = ts_authkey.replace("'", "'\"'\"'") - - for hostname, vmid in targets: - print(f"\n== Enrolling {hostname} (vmid {vmid}) ==") - if not wait_for_guest_agent(vmid): - failures.append((hostname, "agent_not_ready", "guest agent not ready")) - print(f"ERROR: guest agent not ready for vmid {vmid}") - continue - - safe_hostname = hostname.replace("'", "'\"'\"'") - cmd = ( - "set -e; " - f"hostnamectl set-hostname '{safe_hostname}' || true; " - "install -d -m 700 /var/lib/tailscale; " - "rm -f /var/lib/tailscale/tailscaled.state; " - "systemctl restart tailscaled; " - f"/run/current-system/sw/bin/tailscale up --reset --auth-key='{safe_key}' --hostname='{safe_hostname}'; " - "/run/current-system/sw/bin/tailscale status || true" - ) - - exitcode, stdout, stderr = exec_guest(vmid, cmd) - if stdout: - print(stdout) - if stderr: - print(stderr, file=sys.stderr) - - if exitcode != 0: - failures.append((hostname, "command_failed", f"command failed exit {exitcode}")) - print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})") - - if failures: - print("\nEnrollment failures:") - for hostname, kind, detail in failures: - print(f"- {hostname}: {detail}") - - only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures) - if only_agent_ready_failures and not enroll_strict: - print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.") - raise SystemExit(0) - - raise SystemExit(1) - - print("\nTailscale enrollment completed for all managed VMs") - PY - - python3 -u enroll_tailscale.py diff --git a/.gitea/workflows/terraform-destroy.yml b/.gitea/workflows/terraform-destroy.yml index e592710..c07d3a1 100644 --- a/.gitea/workflows/terraform-destroy.yml +++ b/.gitea/workflows/terraform-destroy.yml @@ -44,6 +44,7 @@ jobs: cat > secrets.auto.tfvars << EOF pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}" SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')" + TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')" EOF cat > backend.hcl << EOF bucket = "${{ secrets.B2_TF_BUCKET }}" diff --git a/.gitea/workflows/terraform-plan.yml b/.gitea/workflows/terraform-plan.yml index 3bd9459..8fb5688 100644 --- a/.gitea/workflows/terraform-plan.yml +++ b/.gitea/workflows/terraform-plan.yml @@ -26,6 +26,7 @@ jobs: cat > secrets.auto.tfvars << EOF pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}" SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')" + TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')" EOF cat > backend.hcl << EOF bucket = "${{ secrets.B2_TF_BUCKET }}" diff --git a/terraform/cloud-init.tf b/terraform/cloud-init.tf index 2750576..afe4e58 100644 --- a/terraform/cloud-init.tf +++ b/terraform/cloud-init.tf @@ -2,9 +2,8 @@ data "template_file" "cloud_init_global" { template = file("${path.module}/files/cloud_init_global.tpl") vars = { - hostname = "generic" - domain = "home.arpa" SSH_KEY_PUBLIC = var.SSH_KEY_PUBLIC + TS_AUTHKEY = var.TS_AUTHKEY } } diff --git a/terraform/files/cloud_init_global.tpl b/terraform/files/cloud_init_global.tpl index 91f161a..633522d 100644 --- a/terraform/files/cloud_init_global.tpl +++ b/terraform/files/cloud_init_global.tpl @@ -1,5 +1,4 @@ #cloud-config -hostname: ${hostname} manage_etc_hosts: true resolv_conf: nameservers: @@ -7,9 +6,12 @@ resolv_conf: - 1.1.1.1 preserve_hostname: false -fqdn: ${hostname}.${domain} users: - name: micqdf ssh_authorized_keys: - ${SSH_KEY_PUBLIC} + +runcmd: + - [ /run/current-system/sw/bin/sh, -lc, "install -d -m 700 /var/lib/tailscale && rm -f /var/lib/tailscale/tailscaled.state" ] + - [ /run/current-system/sw/bin/sh, -lc, "/run/current-system/sw/bin/tailscale up --reset --auth-key='${TS_AUTHKEY}' --hostname='$(hostname)' --advertise-tags='tag:k8s'" ] diff --git a/terraform/variables.tf b/terraform/variables.tf index e805ebe..020f304 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -71,3 +71,9 @@ variable "SSH_KEY_PUBLIC" { type = string description = "Public SSH key injected via cloud-init" } + +variable "TS_AUTHKEY" { + type = string + sensitive = true + description = "Tailscale auth key used during cloud-init enrollment" +}