Compare commits

4 Commits

Author SHA1 Message Date
a2d61d6972 Merge pull request 'fix: make tailscale enrollment resilient when guest agent is unavailable' (#25) from stage into master
Some checks are pending
Terraform Apply / Terraform Apply (push) Waiting to run
Reviewed-on: #25
2026-02-28 11:36:29 +00:00
6fbc4dd80f fix: make tailscale enrollment resilient when guest agent is unavailable
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 18s
Increase the guest-agent wait window and treat an unavailable agent as a warning by default, while keeping strict failure optional via the TAILSCALE_ENROLL_STRICT secret.
2026-02-28 10:34:46 +00:00
5acb8370cc Merge pull request 'fix: parse terraform output JSON robustly in enroll step' (#24) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 16m5s
Reviewed-on: #24
2026-02-28 02:29:06 +00:00
f207f774de fix: parse terraform output JSON robustly in enroll step
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 19s
Handle setup-terraform wrapper prefixes by decoding from the first JSON object before reading the VM outputs.
2026-02-28 02:21:57 +00:00

View File

@@ -74,6 +74,7 @@ jobs:
env: env:
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }} TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }} PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
working-directory: terraform working-directory: terraform
run: | run: |
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
@@ -104,12 +105,16 @@ jobs:
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip() token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
target_node = os.environ["TARGET_NODE"].strip() target_node = os.environ["TARGET_NODE"].strip()
ts_authkey = os.environ["TS_AUTHKEY"] ts_authkey = os.environ["TS_AUTHKEY"]
enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
if not token_id or not token_secret: if not token_id or not token_secret:
raise SystemExit("Missing Proxmox token id/secret") raise SystemExit("Missing Proxmox token id/secret")
with open("tfoutputs.json", "r", encoding="utf-8") as f: raw_outputs = open("tfoutputs.json", "rb").read().decode("utf-8", "ignore")
outputs = json.load(f) start = raw_outputs.find("{")
if start == -1:
raise SystemExit("Could not find JSON payload in terraform output")
outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]
targets = [] targets = []
for output_name in ("alpaca_vm_ids", "llama_vm_ids"): for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
@@ -139,7 +144,7 @@ jobs:
payload = resp.read().decode("utf-8") payload = resp.read().decode("utf-8")
return json.loads(payload) return json.loads(payload)
def wait_for_guest_agent(vmid, timeout_seconds=420): def wait_for_guest_agent(vmid, timeout_seconds=900):
deadline = time.time() + timeout_seconds deadline = time.time() + timeout_seconds
while time.time() < deadline: while time.time() < deadline:
try: try:
@@ -181,7 +186,7 @@ jobs:
for hostname, vmid in targets: for hostname, vmid in targets:
print(f"\n== Enrolling {hostname} (vmid {vmid}) ==") print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
if not wait_for_guest_agent(vmid): if not wait_for_guest_agent(vmid):
failures.append(f"{hostname}: guest agent not ready") failures.append((hostname, "agent_not_ready", "guest agent not ready"))
print(f"ERROR: guest agent not ready for vmid {vmid}") print(f"ERROR: guest agent not ready for vmid {vmid}")
continue continue
@@ -204,13 +209,19 @@ jobs:
print(stderr, file=sys.stderr) print(stderr, file=sys.stderr)
if exitcode != 0: if exitcode != 0:
failures.append(f"{hostname}: command failed exit {exitcode}") failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})") print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")
if failures: if failures:
print("\nEnrollment failures:") print("\nEnrollment failures:")
for failure in failures: for hostname, kind, detail in failures:
print(f"- {failure}") print(f"- {hostname}: {detail}")
only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures)
if only_agent_ready_failures and not enroll_strict:
print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
raise SystemExit(0)
raise SystemExit(1) raise SystemExit(1)
print("\nTailscale enrollment completed for all managed VMs") print("\nTailscale enrollment completed for all managed VMs")