fix: make tailscale enrollment resilient when guest agent is unavailable
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 18s

Increase the guest-agent wait window (default timeout raised from 420 to 900 seconds) and treat an unavailable guest agent as a warning by default, while keeping strict failure optional via the TAILSCALE_ENROLL_STRICT secret.
This commit is contained in:
2026-02-28 10:34:46 +00:00
parent f207f774de
commit 6fbc4dd80f

View File

@@ -74,6 +74,7 @@ jobs:
env:
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
working-directory: terraform
run: |
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
@@ -104,6 +105,7 @@ jobs:
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
target_node = os.environ["TARGET_NODE"].strip()
ts_authkey = os.environ["TS_AUTHKEY"]
enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
if not token_id or not token_secret:
raise SystemExit("Missing Proxmox token id/secret")
@@ -142,7 +144,7 @@ jobs:
payload = resp.read().decode("utf-8")
return json.loads(payload)
def wait_for_guest_agent(vmid, timeout_seconds=420):
def wait_for_guest_agent(vmid, timeout_seconds=900):
deadline = time.time() + timeout_seconds
while time.time() < deadline:
try:
@@ -184,7 +186,7 @@ jobs:
for hostname, vmid in targets:
print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
if not wait_for_guest_agent(vmid):
failures.append(f"{hostname}: guest agent not ready")
failures.append((hostname, "agent_not_ready", "guest agent not ready"))
print(f"ERROR: guest agent not ready for vmid {vmid}")
continue
@@ -207,13 +209,19 @@ jobs:
print(stderr, file=sys.stderr)
if exitcode != 0:
failures.append(f"{hostname}: command failed exit {exitcode}")
failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")
if failures:
print("\nEnrollment failures:")
for failure in failures:
print(f"- {failure}")
for hostname, kind, detail in failures:
print(f"- {hostname}: {detail}")
only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures)
if only_agent_ready_failures and not enroll_strict:
print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
raise SystemExit(0)
raise SystemExit(1)
print("\nTailscale enrollment completed for all managed VMs")