Compare commits

6 Commits

Author SHA1 Message Date
a2d61d6972 Merge pull request 'fix: make tailscale enrollment resilient when guest agent is unavailable' (#25) from stage into master
Some checks are pending
Terraform Apply / Terraform Apply (push) Waiting to run
Reviewed-on: #25
2026-02-28 11:36:29 +00:00
6fbc4dd80f fix: make tailscale enrollment resilient when guest agent is unavailable
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 18s
Increase guest-agent wait window and treat agent-unavailable as warning by default, while keeping strict failure optional via TAILSCALE_ENROLL_STRICT secret.
2026-02-28 10:34:46 +00:00
5acb8370cc Merge pull request 'fix: parse terraform output JSON robustly in enroll step' (#24) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 16m5s
Reviewed-on: #24
2026-02-28 02:29:06 +00:00
f207f774de fix: parse terraform output JSON robustly in enroll step
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 19s
Handle setup-terraform wrapper prefixes by decoding from first JSON object before reading VM outputs.
2026-02-28 02:21:57 +00:00
1a309cbe4f Merge pull request 'feat: enroll tailscale via Proxmox guest agent by VMID' (#23) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 1m56s
Reviewed-on: #23
2026-02-28 02:16:58 +00:00
83d277d144 feat: enroll tailscale via Proxmox guest agent by VMID
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 19s
Replace SSH/IP-based enrollment with Proxmox API guest-agent execution using Terraform outputs, set per-VM hostnames from resource names, and reset cloned tailscale state before join for unique node identities.
2026-02-28 02:14:39 +00:00

View File

@@ -73,33 +73,158 @@ jobs:
- name: Enroll VMs in Tailscale - name: Enroll VMs in Tailscale
env: env:
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }} TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
TAILSCALE_ENROLL_HOSTS: ${{ secrets.TAILSCALE_ENROLL_HOSTS }} PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
VM_SSH_PRIVATE_KEY: ${{ secrets.VM_SSH_PRIVATE_KEY }} TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
working-directory: terraform
run: | run: |
if [ -z "$TS_AUTHKEY" ] || [ -z "$TAILSCALE_ENROLL_HOSTS" ] || [ -z "$VM_SSH_PRIVATE_KEY" ]; then if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
echo "Skipping Tailscale enrollment (missing TS_AUTHKEY, TAILSCALE_ENROLL_HOSTS, or VM_SSH_PRIVATE_KEY)." echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)."
exit 0 exit 0
fi fi
echo "Expected format: host or host=hostname (comma-separated)" PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars)
PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars)
TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars)
install -m 700 -d ~/.ssh export PM_API_URL PM_API_TOKEN_ID TARGET_NODE
printf '%s\n' "$VM_SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
for target in $(printf '%s' "$TAILSCALE_ENROLL_HOSTS" | tr ',' ' '); do terraform output -json > tfoutputs.json
host="${target%%=*}" cat > enroll_tailscale.py <<'PY'
ts_hostname="" import json
if [ "$host" != "$target" ]; then import os
ts_hostname="${target#*=}" import ssl
fi import sys
import time
import urllib.parse
import urllib.request
echo "Enrolling $host into Tailscale" api_url = os.environ["PM_API_URL"].rstrip("/")
if [ -n "$ts_hostname" ]; then if api_url.endswith("/api2/json"):
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ~/.ssh/id_rsa "micqdf@$host" \ api_url = api_url[: -len("/api2/json")]
"set -e; echo '$TS_AUTHKEY' | sudo tee /etc/tailscale/authkey >/dev/null; echo '$ts_hostname' | sudo tee /etc/tailscale/hostname >/dev/null; sudo chmod 600 /etc/tailscale/authkey; sudo hostnamectl set-hostname '$ts_hostname' || true; sudo systemctl restart tailscaled; sudo systemctl start tailscale-firstboot.service" token_id = os.environ["PM_API_TOKEN_ID"].strip()
else token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ~/.ssh/id_rsa "micqdf@$host" \ target_node = os.environ["TARGET_NODE"].strip()
"set -e; echo '$TS_AUTHKEY' | sudo tee /etc/tailscale/authkey >/dev/null; sudo chmod 600 /etc/tailscale/authkey; sudo systemctl restart tailscaled; sudo systemctl start tailscale-firstboot.service" ts_authkey = os.environ["TS_AUTHKEY"]
fi enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
done
# Both halves of the API token are needed to build the Authorization header
# below, so stop early with a readable message if either is empty.
if not token_id or not token_secret:
    raise SystemExit("Missing Proxmox token id/secret")

# Read the captured `terraform output -json` text. The context manager closes
# the file handle deterministically instead of leaving it to the garbage
# collector. Bytes that are not valid UTF-8 are dropped rather than raising.
with open("tfoutputs.json", "rb") as outputs_file:
    raw_outputs = outputs_file.read().decode("utf-8", "ignore")

# The capture may carry non-JSON text before the payload, so decode starting
# at the first "{" and let raw_decode ignore anything after the object.
start = raw_outputs.find("{")
if start == -1:
    raise SystemExit("Could not find JSON payload in terraform output")
outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]

# Build the (hostname, vmid) list from the two VM-id outputs. Each output is
# expected to be {"value": {hostname: vmid, ...}}; a missing output or a
# non-dict value contributes no targets.
targets = []
for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
    mapping = outputs.get(output_name, {}).get("value", {})
    if isinstance(mapping, dict):
        targets.extend(
            (str(hostname), int(vmid)) for hostname, vmid in mapping.items()
        )

# Nothing to enroll is not an error: report it and exit successfully.
if not targets:
    print("No VMs found in terraform outputs; skipping tailscale enrollment")
    raise SystemExit(0)

print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets))

# NOTE(review): this context performs no certificate or hostname verification,
# so the API connection is open to interception. Presumably the Proxmox host
# uses a self-signed certificate; consider pinning its CA instead - confirm.
ssl_ctx = ssl._create_unverified_context()

# Authorization header value sent with every API request.
auth_header = f"PVEAPIToken={token_id}={token_secret}"
def api_request(method, path, data=None):
    """Send one authenticated HTTP request to the API and return parsed JSON.

    Args:
        method: HTTP verb, e.g. "GET" or "POST".
        path: Path (including any query string) appended to ``api_url``.
        data: Optional mapping sent as a form-encoded body. Sequence values
            are expanded into repeated fields.

    Returns:
        The response body decoded as UTF-8 and parsed as JSON.

    Raises:
        Whatever ``urllib.request.urlopen`` or ``json.loads`` raise, e.g. on
        HTTP errors, timeouts (30 seconds) or a non-JSON response.
    """
    request_headers = {"Authorization": auth_header}

    if data is None:
        encoded_body = None
    else:
        # doseq=True turns list values into repeated key=value pairs.
        encoded_body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
        request_headers["Content-Type"] = "application/x-www-form-urlencoded"

    request = urllib.request.Request(
        f"{api_url}{path}",
        data=encoded_body,
        headers=request_headers,
        method=method,
    )
    with urllib.request.urlopen(request, context=ssl_ctx, timeout=30) as response:
        text = response.read().decode("utf-8")
    return json.loads(text)
def wait_for_guest_agent(vmid, timeout_seconds=900):
    """Poll the guest-agent ping endpoint until the agent answers.

    Args:
        vmid: Numeric id of the VM on ``target_node``.
        timeout_seconds: How long to keep retrying before giving up.

    Returns:
        True as soon as a ping succeeds, False if the deadline passes first.
        Request errors never propagate; they are treated as "not ready yet".
    """
    ping_path = f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping"
    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout_seconds
    last_error = None

    while time.monotonic() < deadline:
        try:
            # The Proxmox API exposes agent/ping as a POST endpoint; a GET is
            # rejected, which previously made every attempt fail silently.
            res = api_request("POST", ping_path)
        except Exception as exc:
            # Deliberately best-effort: the agent being down surfaces as an
            # HTTP error, so remember the reason and retry.
            last_error = exc
        else:
            data = res.get("data") if isinstance(res, dict) else None
            # A successful ping is reported as a JSON object; the legacy
            # "pong" string is still accepted for compatibility.
            if data == "pong" or isinstance(data, dict):
                return True
            last_error = f"unexpected ping response: {res!r}"
        time.sleep(5)

    # Surface the most recent failure so a timeout is diagnosable from the log.
    if last_error is not None:
        print(
            f"guest agent ping for vmid {vmid} last failed with: {last_error}",
            file=sys.stderr,
        )
    return False
def exec_guest(vmid, command):
    """Run a shell command in a VM through the guest agent and wait for it.

    The command string is handed to ``/run/current-system/sw/bin/sh -lc``.

    Args:
        vmid: Numeric id of the VM on ``target_node``.
        command: Shell command line to execute inside the guest.

    Returns:
        A ``(exitcode, stdout, stderr)`` tuple. If the command has not exited
        after 120 status polls spaced 2 seconds apart, returns
        ``(124, "", "Timed out waiting for guest command")``.

    Raises:
        Any error raised by ``api_request``; ``KeyError`` if the exec
        response carries no pid.
    """
    agent_base = f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent"

    # NOTE(review): confirm that the agent/exec endpoint accepts an
    # "extra-args" field; it is sent here as two repeated form values.
    launch = api_request(
        "POST",
        f"{agent_base}/exec",
        {
            "command": "/run/current-system/sw/bin/sh",
            "extra-args": ["-lc", command],
        },
    )
    pid = launch["data"]["pid"]
    status_path = f"{agent_base}/exec-status?pid={pid}"

    polls_remaining = 120
    while polls_remaining:
        polls_remaining -= 1
        status = api_request("GET", status_path).get("data", {})
        if not status.get("exited"):
            # Still running: pause before asking again.
            time.sleep(2)
            continue
        # A finished command without an exit code is reported as failure (1).
        exit_code = int(status.get("exitcode", 1))
        captured_out = status.get("out-data", "")
        captured_err = status.get("err-data", "")
        return (exit_code, captured_out, captured_err)

    return (124, "", "Timed out waiting for guest command")
# One (hostname, kind, detail) record per VM that could not be enrolled.
failures = []

# Escape single quotes so the auth key can sit inside a single-quoted shell word.
safe_key = ts_authkey.replace("'", "'\"'\"'")

for hostname, vmid in targets:
    print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")

    agent_ready = wait_for_guest_agent(vmid)
    if not agent_ready:
        failures.append((hostname, "agent_not_ready", "guest agent not ready"))
        print(f"ERROR: guest agent not ready for vmid {vmid}")
        continue

    safe_hostname = hostname.replace("'", "'\"'\"'")

    # Guest-side script: write the auth key and hostname files, set the
    # hostname (failure tolerated), restart tailscaled, start the first-boot
    # unit, then print the tailscale status (failure tolerated).
    script_parts = [
        "set -e; ",
        f"printf '%s' '{safe_key}' > /etc/tailscale/authkey; ",
        f"printf '%s' '{safe_hostname}' > /etc/tailscale/hostname; ",
        "chmod 600 /etc/tailscale/authkey; ",
        f"hostnamectl set-hostname '{safe_hostname}' || true; ",
        "systemctl restart tailscaled; ",
        "systemctl start tailscale-firstboot.service; ",
        "tailscale status || true",
    ]
    cmd = "".join(script_parts)

    exitcode, stdout, stderr = exec_guest(vmid, cmd)
    if stdout:
        print(stdout)
    if stderr:
        print(stderr, file=sys.stderr)
    if exitcode != 0:
        failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
        print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")

if not failures:
    print("\nTailscale enrollment completed for all managed VMs")
else:
    print("\nEnrollment failures:")
    for failed_host, _failure_kind, failure_detail in failures:
        print(f"- {failed_host}: {failure_detail}")

    # An unreachable guest agent is only a warning unless strict mode is on;
    # any other kind of failure always fails the run.
    failure_kinds = {record[1] for record in failures}
    if failure_kinds == {"agent_not_ready"} and not enroll_strict:
        print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
        raise SystemExit(0)
    raise SystemExit(1)
PY
python3 enroll_tailscale.py