fix: make tailscale enrollment resilient when guest agent is unavailable
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 18s

Increase the guest-agent wait window (420 s → 900 s) and treat an unavailable guest agent as a warning by default; strict failure remains opt-in via the TAILSCALE_ENROLL_STRICT secret.
This commit is contained in:
2026-02-28 10:34:46 +00:00
parent f207f774de
commit 6fbc4dd80f

View File

@@ -74,6 +74,7 @@ jobs:
env: env:
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }} TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }} PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
working-directory: terraform working-directory: terraform
run: | run: |
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
@@ -104,6 +105,7 @@ jobs:
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip() token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
target_node = os.environ["TARGET_NODE"].strip() target_node = os.environ["TARGET_NODE"].strip()
ts_authkey = os.environ["TS_AUTHKEY"] ts_authkey = os.environ["TS_AUTHKEY"]
enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
if not token_id or not token_secret: if not token_id or not token_secret:
raise SystemExit("Missing Proxmox token id/secret") raise SystemExit("Missing Proxmox token id/secret")
@@ -142,7 +144,7 @@ jobs:
payload = resp.read().decode("utf-8") payload = resp.read().decode("utf-8")
return json.loads(payload) return json.loads(payload)
def wait_for_guest_agent(vmid, timeout_seconds=420): def wait_for_guest_agent(vmid, timeout_seconds=900):
deadline = time.time() + timeout_seconds deadline = time.time() + timeout_seconds
while time.time() < deadline: while time.time() < deadline:
try: try:
@@ -184,7 +186,7 @@ jobs:
for hostname, vmid in targets: for hostname, vmid in targets:
print(f"\n== Enrolling {hostname} (vmid {vmid}) ==") print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
if not wait_for_guest_agent(vmid): if not wait_for_guest_agent(vmid):
failures.append(f"{hostname}: guest agent not ready") failures.append((hostname, "agent_not_ready", "guest agent not ready"))
print(f"ERROR: guest agent not ready for vmid {vmid}") print(f"ERROR: guest agent not ready for vmid {vmid}")
continue continue
@@ -207,13 +209,19 @@ jobs:
print(stderr, file=sys.stderr) print(stderr, file=sys.stderr)
if exitcode != 0: if exitcode != 0:
failures.append(f"{hostname}: command failed exit {exitcode}") failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})") print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")
if failures: if failures:
print("\nEnrollment failures:") print("\nEnrollment failures:")
for failure in failures: for hostname, kind, detail in failures:
print(f"- {failure}") print(f"- {hostname}: {detail}")
only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures)
if only_agent_ready_failures and not enroll_strict:
print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
raise SystemExit(0)
raise SystemExit(1) raise SystemExit(1)
print("\nTailscale enrollment completed for all managed VMs") print("\nTailscale enrollment completed for all managed VMs")