Merge pull request 'refactor: move tailscale join fully into cloud-init' (#30) from stage into master
Reviewed-on: micqdf/TerraHome#30
This commit is contained in:
@@ -24,6 +24,7 @@ jobs:
|
|||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
|
TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
@@ -70,174 +71,3 @@ jobs:
|
|||||||
- name: Terraform Apply
|
- name: Terraform Apply
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform apply -auto-approve tfplan
|
run: terraform apply -auto-approve tfplan
|
||||||
|
|
||||||
- name: Enroll VMs in Tailscale
|
|
||||||
env:
|
|
||||||
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
|
|
||||||
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
|
|
||||||
TAILSCALE_ENROLL_STRICT: ${{ secrets.TAILSCALE_ENROLL_STRICT }}
|
|
||||||
working-directory: terraform
|
|
||||||
run: |
|
|
||||||
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
|
|
||||||
echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)."
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars)
|
|
||||||
PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars)
|
|
||||||
TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars)
|
|
||||||
|
|
||||||
export PM_API_URL PM_API_TOKEN_ID TARGET_NODE
|
|
||||||
|
|
||||||
terraform output -json > tfoutputs.json
|
|
||||||
cat > enroll_tailscale.py <<'PY'
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import ssl
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
# --- Configuration -----------------------------------------------------------
# Proxmox connection settings and the Tailscale auth key all come from the
# environment; a missing variable raises KeyError and aborts the script.
api_url = os.environ["PM_API_URL"].rstrip("/")
# Request paths built later already start with "/api2/json", so strip that
# suffix from the base URL to avoid duplicating it.
if api_url.endswith("/api2/json"):
    api_url = api_url[: -len("/api2/json")]
token_id = os.environ["PM_API_TOKEN_ID"].strip()
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
target_node = os.environ["TARGET_NODE"].strip()
ts_authkey = os.environ["TS_AUTHKEY"]
# Only the exact (case-insensitive) value "true" enables strict mode.
enroll_strict = os.environ.get("TAILSCALE_ENROLL_STRICT", "false").strip().lower() == "true"

if not token_id or not token_secret:
    raise SystemExit("Missing Proxmox token id/secret")

# --- Terraform outputs -------------------------------------------------------
# Read through a context manager so the file handle is closed deterministically.
with open("tfoutputs.json", "rb") as outputs_file:
    raw_outputs = outputs_file.read().decode("utf-8", "ignore")
# Parsing starts at the first "{" and stops after the first complete JSON
# value, so any text before or after the payload is ignored.
start = raw_outputs.find("{")
if start == -1:
    raise SystemExit("Could not find JSON payload in terraform output")
outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]

# Build the (hostname, vmid) work list from the two VM-id outputs.
targets = []
for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
    mapping = outputs.get(output_name, {}).get("value", {})
    if isinstance(mapping, dict):
        targets.extend((str(hostname), int(vmid)) for hostname, vmid in mapping.items())

if not targets:
    print("No VMs found in terraform outputs; skipping tailscale enrollment")
    raise SystemExit(0)

print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets))

# NOTE(review): this disables TLS certificate verification for every Proxmox
# API call and relies on a private ssl helper. Presumably the Proxmox host uses
# a self-signed certificate; prefer pinning its CA instead -- confirm.
ssl_ctx = ssl._create_unverified_context()
auth_header = f"PVEAPIToken={token_id}={token_secret}"
|
|
||||||
|
|
||||||
def api_request(method, path, data=None):
    """Send one authenticated request to the Proxmox API and decode the JSON reply.

    Args:
        method: HTTP verb passed to ``urllib.request.Request``.
        path: Path appended verbatim to the module-level ``api_url``.
        data: Optional mapping sent as a form-encoded body; sequence values are
            expanded into repeated fields (``doseq=True``).

    Returns:
        The response body parsed with ``json.loads``.

    Raises:
        urllib.error.HTTPError / URLError: propagated from ``urlopen``.
    """
    request_headers = {"Authorization": auth_header}
    encoded_form = None
    if data is not None:
        request_headers["Content-Type"] = "application/x-www-form-urlencoded"
        encoded_form = urllib.parse.urlencode(data, doseq=True).encode("utf-8")

    request = urllib.request.Request(
        f"{api_url}{path}",
        data=encoded_form,
        headers=request_headers,
        method=method,
    )
    with urllib.request.urlopen(request, context=ssl_ctx, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
|
|
||||||
|
|
||||||
def wait_for_guest_agent(vmid, timeout_seconds=120):
    """Poll the guest agent of VM *vmid* until it answers a ping or time runs out.

    Args:
        vmid: VM id whose ``agent/ping`` endpoint on ``target_node`` is polled.
        timeout_seconds: Total time budget for polling, in seconds.

    Returns:
        True as soon as a ping reply carries ``"data": "pong"``. False when the
        budget is exhausted, or immediately on HTTP 401/403 since retrying
        cannot fix an authentication/authorization failure.
    """
    # Use the monotonic clock so a wall-clock adjustment on the runner can
    # neither cut the wait short nor extend it.
    deadline = time.monotonic() + timeout_seconds
    tries = 0
    while time.monotonic() < deadline:
        tries += 1
        try:
            res = api_request("POST", f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping")
            # NOTE(review): success is recognised only when "data" equals "pong";
            # verify this matches the actual response shape of the ping endpoint.
            if res.get("data") == "pong":
                print(f"Guest agent ready for vmid {vmid}", flush=True)
                return True
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", "ignore")
            print(f"Agent ping HTTP error for vmid {vmid}: {exc.code} {detail}", flush=True)
            if exc.code in (401, 403):
                return False
        except Exception as exc:
            # Best-effort polling: report the first non-HTTP error only, to
            # keep the log readable while the VM is still booting.
            if tries == 1:
                print(f"Agent ping error for vmid {vmid}: {exc}", flush=True)

        if tries % 6 == 0:
            remaining = int(deadline - time.monotonic())
            print(f"Waiting for guest agent on vmid {vmid} ({remaining}s left)", flush=True)

        # Never sleep past the deadline (and never pass a negative duration).
        time.sleep(max(0.0, min(5.0, deadline - time.monotonic())))
    return False
|
|
||||||
|
|
||||||
def exec_guest(vmid, command):
    """Run *command* through the guest agent of VM *vmid* and wait for it to exit.

    The command string is handed to ``/run/current-system/sw/bin/sh -lc``.

    Args:
        vmid: VM id on ``target_node`` to execute in.
        command: Shell command line to run inside the guest.

    Returns:
        A ``(exitcode, stdout, stderr)`` tuple. ``exitcode`` falls back to 1
        when the status carries none. If the process has not exited after 120
        status polls (2 s apart), ``(124, "", "Timed out waiting for guest
        command")`` is returned.
    """
    exec_payload = {
        "command": "/run/current-system/sw/bin/sh",
        "extra-args": ["-lc", command],
    }
    launched = api_request(
        "POST",
        f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec",
        exec_payload,
    )
    pid = launched["data"]["pid"]

    polls_left = 120
    while polls_left > 0:
        polls_left -= 1
        reply = api_request(
            "GET",
            f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/exec-status?pid={pid}",
        )
        state = reply.get("data", {})
        if not state.get("exited"):
            time.sleep(2)
            continue
        code = int(state.get("exitcode", 1))
        return (code, state.get("out-data", ""), state.get("err-data", ""))

    return (124, "", "Timed out waiting for guest command")
|
|
||||||
|
|
||||||
# Every VM that could not be enrolled is recorded as (hostname, kind, detail).
failures = []
# Replacement that closes, escapes and re-opens a single-quoted shell string,
# so values can be embedded safely inside '...' in the guest command.
quote_escape = "'\"'\"'"
safe_key = ts_authkey.replace("'", quote_escape)

for hostname, vmid in targets:
    print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
    agent_ready = wait_for_guest_agent(vmid)
    if not agent_ready:
        failures.append((hostname, "agent_not_ready", "guest agent not ready"))
        print(f"ERROR: guest agent not ready for vmid {vmid}")
        continue

    safe_hostname = hostname.replace("'", quote_escape)
    # One shell line: set the hostname, wipe the tailscaled state, restart the
    # daemon, join with the auth key, then show status. The hostname and status
    # steps are allowed to fail ("|| true"); everything else aborts via set -e.
    script_parts = [
        "set -e; ",
        f"hostnamectl set-hostname '{safe_hostname}' || true; ",
        "install -d -m 700 /var/lib/tailscale; ",
        "rm -f /var/lib/tailscale/tailscaled.state; ",
        "systemctl restart tailscaled; ",
        f"/run/current-system/sw/bin/tailscale up --reset --auth-key='{safe_key}' --hostname='{safe_hostname}'; ",
        "/run/current-system/sw/bin/tailscale status || true",
    ]
    cmd = "".join(script_parts)

    exitcode, stdout, stderr = exec_guest(vmid, cmd)
    if stdout:
        print(stdout)
    if stderr:
        print(stderr, file=sys.stderr)

    if exitcode != 0:
        failures.append((hostname, "command_failed", f"command failed exit {exitcode}"))
        print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")

if not failures:
    print("\nTailscale enrollment completed for all managed VMs")
else:
    print("\nEnrollment failures:")
    for failed_host, _kind, reason in failures:
        print(f"- {failed_host}: {reason}")

    # Unless strict mode is on, a run in which the guest agent was the only
    # problem is downgraded to a warning; any other failure exits non-zero.
    only_agent_ready_failures = not any(kind != "agent_not_ready" for _, kind, _ in failures)
    if enroll_strict or not only_agent_ready_failures:
        raise SystemExit(1)

    print("\nWARNING: Enrollment skipped because guest agent was unavailable. Set TAILSCALE_ENROLL_STRICT=true to fail the workflow in this case.")
    raise SystemExit(0)
|
|
||||||
PY
|
|
||||||
|
|
||||||
python3 -u enroll_tailscale.py
|
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ jobs:
|
|||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
|
TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
|
|||||||
@@ -26,6 +26,7 @@ jobs:
|
|||||||
cat > secrets.auto.tfvars << EOF
|
cat > secrets.auto.tfvars << EOF
|
||||||
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
pm_api_token_secret = "${{ secrets.PM_API_TOKEN_SECRET }}"
|
||||||
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
SSH_KEY_PUBLIC = "$(printf '%s' "${{ secrets.SSH_KEY_PUBLIC }}" | tr -d '\r\n')"
|
||||||
|
TS_AUTHKEY = "$(printf '%s' "${{ secrets.TS_AUTHKEY }}" | tr -d '\r\n')"
|
||||||
EOF
|
EOF
|
||||||
cat > backend.hcl << EOF
|
cat > backend.hcl << EOF
|
||||||
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
bucket = "${{ secrets.B2_TF_BUCKET }}"
|
||||||
|
|||||||
@@ -2,9 +2,8 @@ data "template_file" "cloud_init_global" {
|
|||||||
template = file("${path.module}/files/cloud_init_global.tpl")
|
template = file("${path.module}/files/cloud_init_global.tpl")
|
||||||
|
|
||||||
vars = {
|
vars = {
|
||||||
hostname = "generic"
|
|
||||||
domain = "home.arpa"
|
|
||||||
SSH_KEY_PUBLIC = var.SSH_KEY_PUBLIC
|
SSH_KEY_PUBLIC = var.SSH_KEY_PUBLIC
|
||||||
|
TS_AUTHKEY = var.TS_AUTHKEY
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
#cloud-config
|
#cloud-config
|
||||||
hostname: ${hostname}
|
|
||||||
manage_etc_hosts: true
|
manage_etc_hosts: true
|
||||||
resolv_conf:
|
resolv_conf:
|
||||||
nameservers:
|
nameservers:
|
||||||
@@ -7,9 +6,12 @@ resolv_conf:
|
|||||||
- 1.1.1.1
|
- 1.1.1.1
|
||||||
|
|
||||||
preserve_hostname: false
|
preserve_hostname: false
|
||||||
fqdn: ${hostname}.${domain}
|
|
||||||
|
|
||||||
users:
|
users:
|
||||||
- name: micqdf
|
- name: micqdf
|
||||||
ssh_authorized_keys:
|
ssh_authorized_keys:
|
||||||
- ${SSH_KEY_PUBLIC}
|
- ${SSH_KEY_PUBLIC}
|
||||||
|
|
||||||
|
runcmd:
|
||||||
|
- [ /run/current-system/sw/bin/sh, -lc, "install -d -m 700 /var/lib/tailscale && rm -f /var/lib/tailscale/tailscaled.state" ]
|
||||||
|
# $(hostname) must be double-quoted: inside single quotes the shell passes the literal text "$(hostname)" to tailscale.
- [ /run/current-system/sw/bin/sh, -lc, "/run/current-system/sw/bin/tailscale up --reset --auth-key='${TS_AUTHKEY}' --hostname=\"$(hostname)\" --advertise-tags='tag:k8s'" ]
|
||||||
|
|||||||
@@ -71,3 +71,9 @@ variable "SSH_KEY_PUBLIC" {
|
|||||||
type = string
|
type = string
|
||||||
description = "Public SSH key injected via cloud-init"
|
description = "Public SSH key injected via cloud-init"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "TS_AUTHKEY" {
|
||||||
|
type = string
|
||||||
|
sensitive = true
|
||||||
|
description = "Tailscale auth key used during cloud-init enrollment"
|
||||||
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user