refactor: move tailscale join fully into cloud-init

Remove guest-agent enrollment workflow, pass TS auth key through Terraform variables/secrets, and run tailscale up with tag:k8s during cloud-init bootstrap alongside SSH key injection.
2026-02-28 13:13:34 +00:00
parent c87bb16f10
commit 8887a8bb87
6 changed files with 14 additions and 175 deletions
--- a/.gitea/workflows/terraform-apply.yml
+++ b/.gitea/workflows/terraform-apply.yml
@@ -24,6 +24,7 @@ jobs:
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC      = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
+          TS_AUTHKEY          = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket                      = "${{ secrets.B2_TF_BUCKET }}"
@@ -70,174 +71,3 @@ jobs:
      - name: Terraform Apply
        working-directory: terraform
        run: terraform apply -auto-approve tfplan
-
-      - name: Enroll VMs in Tailscale
-        env:
-          TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
-          PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
-          TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
-        working-directory: terraform
-        run: |
-          if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
-            echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)."
-            exit 0
-          fi
-
-          PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars)
-          PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars)
-          TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars)
-
-          export PM_API_URL PM_API_TOKEN_ID TARGET_NODE
-
-          terraform output -json > tfoutputs.json
-          cat > enroll_tailscale.py <<'PY'
-          import json
-          import os
-          import ssl
-          import sys
-          import time
-          import urllib.error
-          import urllib.parse
-          import urllib.request
-
-          api_url = os.environ["PM_API_URL"].rstrip("/")
-          if api_url.endswith("/api2/json"):
-              api_url = api_url[: -len("/api2/json")]
-          token_id = os.environ["PM_API_TOKEN_ID"].strip()
-          token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
-          target_node = os.environ["TARGET_NODE"].strip()
-          ts_authkey = os.environ["TS_AUTHKEY"]
-          enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"
-
-          if not token_id or not token_secret:
-              raise SystemExit("Missing Proxmox token id/secret")
-
-          raw_outputs = open("tfoutputs.json", "rb").read().decode("utf-8", "ignore")
-          start = raw_outputs.find("{")
-          if start == -1:
-              raise SystemExit("Could not find JSON payload in terraform output")
-          outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]
-
-          targets = []
-          for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
-              mapping = outputs.get(output_name, {}).get("value", {})
-              if isinstance(mapping, dict):
-                  for hostname, vmid in mapping.items():
-                      targets.append((str(hostname), int(vmid)))
-
-          if not targets:
-              print("No VMs found in terraform outputs; skipping tailscale enrollment")
-              raise SystemExit(0)
-
-          print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets))
-
-          ssl_ctx = ssl._create_unverified_context()
-          auth_header = f"PVEAPIToken={token_id}={token_secret}"
-
-          def api_request(method, path, data=None):
-              url = f"{api_url}{path}"
-              headers = {"Authorization": auth_header}
-              body = None
-              if data is not None:
-                  body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
-                  headers["Content-Type"] = "application/x-www-form-urlencoded"
-              req = urllib.request.Request(url, data=body, headers=headers, method=method)
-              with urllib.request.urlopen(req, context=ssl_ctx, timeout=30) as resp:
-                  payload = resp.read().decode("utf-8")
-              return json.loads(payload)
-
-          def wait_for_guest_agent(vmid, timeout_seconds=120):
-              deadline = time.time() + timeout_seconds
-              tries = 0
-              while time.time() < deadline:
-                  tries += 1
-                  try:
-                      res = api_request("POST", f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping")
-                      if res.get("data") == "pong":
-                          print(f"Guest agent ready for vmid {vmid}", flush=True)
-                          return True
-                  except urllib.error.HTTPError as exc:
-                      detail = exc.read().decode("utf-8", "ignore")
-                      print(f"Agent ping HTTP error for vmid {vmid}: {exc.code} {detail}", flush=True)
-                      if exc.code in (401, 403):
-                          return False
-                  except Exception as exc:
-                      if tries == 1:
-                          print(f"Agent ping error for vmid {vmid}: {exc}", flush=True)
-                  if tries % 6 == 0:
-                      remaining = int(deadline - time.time())
-                      print(f"Waiting for guest agent on vmid {vmid} ({remaining}s left)", flush=True)
-                  time.sleep(5)
-              return False
-
-          def exec_guest(vmid, command):
-              res = api_request(
-                  "POST",
-                  f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec",
-                  {
-                      "command": "/run/current-system/sw/bin/sh",
-                      "extra-args": ["-lc", command],
-                  },
-              )
-              pid = res["data"]["pid"]
-              for _ in range(120):
-                  status = api_request(
-                      "GET",
-                      f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec-status?pid={pid}",
-                  ).get("data", {})
-                  if status.get("exited"):
-                      return (
-                          int(status.get("exitcode", 1)),
-                          status.get("out-data", ""),
-                          status.get("err-data", ""),
-                      )
-                  time.sleep(2)
-              return (124, "", "Timed out waiting for guest command")
-
-          failures = []
-          safe_key = ts_authkey.replace("'", "'\"'\"'")
-
-          for hostname, vmid in targets:
-              print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
-              if not wait_for_guest_agent(vmid):
-                  failures.append((hostname, "agent_not_ready", "guest agent not ready"))
-                  print(f"ERROR: guest agent not ready for vmid {vmid}")
-                  continue
-
-              safe_hostname = hostname.replace("'", "'\"'\"'")
-              cmd = (
-                  "set -e; "
-                  f"hostnamectl set-hostname '{safe_hostname}' || true; "
-                  "install -d -m 700 /var/lib/tailscale; "
-                  "rm -f /var/lib/tailscale/tailscaled.state; "
-                  "systemctl restart tailscaled; "
-                  f"/run/current-system/sw/bin/tailscale up --reset --auth-key='{safe_key}' --hostname='{safe_hostname}'; "
-                  "/run/current-system/sw/bin/tailscale status || true"
-              )
-
-              exitcode, stdout, stderr = exec_guest(vmid, cmd)
-              if stdout:
-                  print(stdout)
-              if stderr:
-                  print(stderr, file=sys.stderr)
-
-              if exitcode != 0:
-                  failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
-                  print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")
-
-          if failures:
-              print("\nEnrollment failures:")
-              for hostname, kind, detail in failures:
-                  print(f"- {hostname}: {detail}")
-
-              only_agent_ready_failures = all(kind == "agent_not_ready" for _, kind, _ in failures)
-              if only_agent_ready_failures and not enroll_strict:
-                  print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
-                  raise SystemExit(0)
-
-              raise SystemExit(1)
-
-          print("\nTailscale enrollment completed for all managed VMs")
-          PY
-
-          python3 -u enroll_tailscale.py
--- a/.gitea/workflows/terraform-destroy.yml
+++ b/.gitea/workflows/terraform-destroy.yml
@@ -44,6 +44,7 @@ jobs:
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC      = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
+          TS_AUTHKEY          = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket                      = "${{ secrets.B2_TF_BUCKET }}"
--- a/.gitea/workflows/terraform-plan.yml
+++ b/.gitea/workflows/terraform-plan.yml
@@ -26,6 +26,7 @@ jobs:
          cat > secrets.auto.tfvars << EOF
          pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
          SSH_KEY_PUBLIC      = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
+          TS_AUTHKEY          = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
          EOF
          cat > backend.hcl << EOF
          bucket                      = "${{ secrets.B2_TF_BUCKET }}"
--- a/terraform/cloud-init.tf
+++ b/terraform/cloud-init.tf
@@ -2,9 +2,8 @@ data "template_file" "cloud_init_global" {
  template = file("${path.module}/files/cloud_init_global.tpl")

  vars = {
-    hostname       = "generic"
-    domain         = "home.arpa"
    SSH_KEY_PUBLIC = var.SSH_KEY_PUBLIC
+    TS_AUTHKEY     = var.TS_AUTHKEY
  }
 }

--- a/terraform/files/cloud_init_global.tpl
+++ b/terraform/files/cloud_init_global.tpl
@@ -1,5 +1,4 @@
 #cloud-config
-hostname: ${hostname}
 manage_etc_hosts: true
 resolv_conf:
  nameservers:
@@ -7,9 +6,12 @@ resolv_conf:
    - 1.1.1.1

 preserve_hostname: false
-fqdn: ${hostname}.${domain}

 users:
  - name: micqdf
    ssh_authorized_keys:
      - ${SSH_KEY_PUBLIC}
+
+# First-boot Tailscale enrollment. Terraform renders this template before
+# cloud-init reads it, so the TS_AUTHKEY placeholder is filled in at plan time,
+# while the hostname command substitution is left for the guest shell.
+runcmd:
+  # Clear any stored node state, then restart tailscaled so the daemon starts
+  # from the cleared state rather than whatever it loaded at boot
+  # (presumably only matters when the VM template shipped with a state file).
+  - [ /run/current-system/sw/bin/sh, -lc, "install -d -m 700 /var/lib/tailscale && rm -f /var/lib/tailscale/tailscaled.state && /run/current-system/sw/bin/systemctl restart tailscaled" ]
+  # --hostname is wrapped in (escaped) double quotes: inside single quotes the
+  # shell would not run the substitution and tailscale would receive the
+  # literal text $(hostname) instead of the machine's hostname.
+  - [ /run/current-system/sw/bin/sh, -lc, "/run/current-system/sw/bin/tailscale up --reset --auth-key='${TS_AUTHKEY}' --hostname=\"$(hostname)\" --advertise-tags='tag:k8s'" ]
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -71,3 +71,9 @@ variable "SSH_KEY_PUBLIC" {
  type        = string
  description = "Public SSH key injected via cloud-init"
 }
+
+# Tailscale auth key; cloud-init.tf passes it to the cloud-init template as
+# TS_AUTHKEY.
+# NOTE(review): `sensitive` redacts the value in plan/apply output only. It is
+# presumably still stored in state and in the rendered user data, so confirm
+# both are access-controlled.
+variable "TS_AUTHKEY" {
+  description = "Tailscale auth key used during cloud-init enrollment"
+  sensitive   = true
+  type        = string
+}