From 6fbc4dd80f226e3c073639c2099222836d2bc68f Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 28 Feb 2026 10:34:46 +0000 Subject: [PATCH] fix: make tailscale enrollment resilient when guest agent is unavailable Increase guest-agent wait window and treat agent-unavailable as a warning by default, while keeping strict failure optional via the TAILSCALE_ENROLL_STRICT secret. --- .gitea/workflows/terraform-apply.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml index 626ddce..2fef320 100644 --- a/.gitea/workflows/terraform-apply.yml +++ b/.gitea/workflows/terraform-apply.yml @@ -74,6 +74,7 @@ jobs: env: TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }} PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }} + TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }} working-directory: terraform run: | if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then @@ -104,6 +105,7 @@ jobs: token_secret = os.environ["PM_API_TOKEN_SECRET"].strip() target_node = os.environ["TARGET_NODE"].strip() ts_authkey = os.environ["TS_AUTHKEY"] + enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true" if not token_id or not token_secret: raise SystemExit("Missing Proxmox token id/secret") @@ -142,7 +144,7 @@ jobs: payload = resp.read().decode("utf-8") return json.loads(payload) - def wait_for_guest_agent(vmid, timeout_seconds=420): + def wait_for_guest_agent(vmid, timeout_seconds=900): deadline = time.time() + timeout_seconds while time.time() < deadline: try: @@ -184,7 +186,7 @@ jobs: for hostname, vmid in targets: print(f"\n== Enrolling {hostname} (vmid {vmid}) ==") if not wait_for_guest_agent(vmid): - failures.append(f"{hostname}: guest agent not ready") + failures.append((hostname, "agent_not_ready", "guest agent not ready")) print(f"ERROR: guest agent not ready for vmid {vmid}") continue @@ -207,13 +209,19 @@ jobs: print(stderr, 
file=sys.stderr) if exitcode != 0: - failures.append(f"{hostname}: command failed exit {exitcode}") + failures.append((hostname, "command_failed", f"command failed exit {exitcode}")) print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})") if failures: print("\nEnrollment failures:") - for failure in failures: - print(f"- {failure}") + for hostname, kind, detail in failures: + print(f"- {hostname}: {detail}") + + only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures) + if only_agent_ready_failures and not enroll_strict: + print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.") + raise SystemExit(0) + raise SystemExit(1) print("\nTailscale enrollment completed for all managed VMs")