7 Commits

Author SHA1 Message Date
f207f774de fix: parse terraform output JSON robustly in enroll step
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 19s
Handle setup-terraform wrapper prefixes by decoding from first JSON object before reading VM outputs.
2026-02-28 02:21:57 +00:00
83d277d144 feat: enroll tailscale via Proxmox guest agent by VMID
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 19s
Replace SSH/IP-based enrollment with Proxmox API guest-agent execution using Terraform outputs, set per-VM hostnames from resource names, and reset cloned tailscale state before join for unique node identities.
2026-02-28 02:14:39 +00:00
3335020db5 fix: make tailscale enrollment clone-safe and hostname-aware
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 17s
Reset cloned tailscale state before first join, remove one-shot marker dependency, and allow workflow host entries in host=hostname format so nodes join with VM-aligned tailscale names.
2026-02-28 02:01:48 +00:00
a7f68c0c4b fix: tolerate extra output in destroy guard parser
All checks were successful
Terraform Plan / Terraform Plan (push) Successful in 3m34s
Parse the first JSON object from terraform show output to avoid failures when extra non-JSON lines are present.
2026-02-28 01:23:07 +00:00
d1a7ccc98c chore: serialize Terraform workflows to prevent races
Some checks failed
Terraform Plan / Terraform Plan (push) Failing after 3m34s
Add global workflow concurrency group with queueing enabled so plan/apply/destroy runs do not overlap and contend for shared remote state.
2026-02-28 01:17:51 +00:00
afe19041d9 fix: make destroy guard parse tfplan JSON robustly
Some checks failed
Terraform Plan / Terraform Plan (push) Has been cancelled
Use terraform show with no-color and resilient JSON extraction to avoid parser failures when workflow output includes non-JSON noise.
2026-02-28 01:16:19 +00:00
c9be2a2fc8 fix: align VM boot disk and add Terraform safety workflows
Some checks failed
Terraform Plan / Terraform Plan (push) Failing after 3m35s
Switch VM boot order/disks to scsi0 to match cloned NixOS template boot layout, add destroy guards to plan/apply workflows, and replace destroy workflow with a confirmed manual dispatch nuke flow that uses remote B2 state.
2026-02-28 01:10:31 +00:00
5 changed files with 273 additions and 49 deletions

View File

@@ -5,6 +5,10 @@ on:
branches:
- master
concurrency:
group: terraform-global
cancel-in-progress: false
jobs:
terraform:
name: "Terraform Apply"
@@ -47,29 +51,172 @@ jobs:
- name: Terraform Plan
working-directory: terraform
run: terraform plan
run: terraform plan -out=tfplan
# Guard step: stop the run when the saved plan contains delete actions,
# unless the ALLOW_TF_DESTROY secret is exactly "true".
- name: Block accidental destroy
  working-directory: terraform
  env:
    ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
  run: |
    # Render the saved plan as JSON. The parser below starts decoding at the
    # first "{" so extra non-JSON text in the file does not break it.
    terraform show -json -no-color tfplan > tfplan.json

    DESTROY_COUNT=$(python3 - <<'PY'
    import json

    with open("tfplan.json", "rb") as fh:
        raw = fh.read().decode("utf-8", "ignore")
    start = raw.find("{")
    data = json.JSONDecoder().raw_decode(raw[start:])[0]

    # Count every resource change whose action list includes "delete".
    deletes = 0
    for rc in data.get("resource_changes", []):
        if "delete" in rc.get("change", {}).get("actions", []):
            deletes += 1
    print(deletes)
    PY
    )

    echo "Planned deletes: $DESTROY_COUNT"
    if [ "$DESTROY_COUNT" -gt 0 ]; then
      if [ "${ALLOW_TF_DESTROY}" != "true" ]; then
        echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
        exit 1
      fi
    fi
- name: Terraform Apply
working-directory: terraform
run: terraform apply -auto-approve
run: terraform apply -auto-approve tfplan
- name: Enroll VMs in Tailscale
env:
TS_AUTHKEY: ${{ secrets.TS_AUTHKEY }}
TAILSCALE_ENROLL_HOSTS: ${{ secrets.TAILSCALE_ENROLL_HOSTS }}
VM_SSH_PRIVATE_KEY: ${{ secrets.VM_SSH_PRIVATE_KEY }}
PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
working-directory: terraform
run: |
if [ -z "$TS_AUTHKEY" ] || [ -z "$TAILSCALE_ENROLL_HOSTS" ] || [ -z "$VM_SSH_PRIVATE_KEY" ]; then
echo "Skipping Tailscale enrollment (missing TS_AUTHKEY, TAILSCALE_ENROLL_HOSTS, or VM_SSH_PRIVATE_KEY)."
if [ -z "$TS_AUTHKEY" ] || [ -z "$PM_API_TOKEN_SECRET" ]; then
echo "Skipping Tailscale enrollment (missing TS_AUTHKEY or PM_API_TOKEN_SECRET)."
exit 0
fi
install -m 700 -d ~/.ssh
printf '%s\n' "$VM_SSH_PRIVATE_KEY" > ~/.ssh/id_rsa
chmod 600 ~/.ssh/id_rsa
PM_API_URL=$(awk -F'"' '/^pm_api_url/{print $2}' terraform.tfvars)
PM_API_TOKEN_ID=$(awk -F'"' '/^pm_api_token_id/{print $2}' terraform.tfvars)
TARGET_NODE=$(awk -F'"' '/^target_node/{print $2}' terraform.tfvars)
for host in $(printf '%s' "$TAILSCALE_ENROLL_HOSTS" | tr ',' ' '); do
echo "Enrolling $host into Tailscale"
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -i ~/.ssh/id_rsa "micqdf@$host" \
"echo '$TS_AUTHKEY' | sudo tee /etc/tailscale/authkey >/dev/null && sudo chmod 600 /etc/tailscale/authkey && sudo systemctl start tailscale-firstboot.service"
done
export PM_API_URL PM_API_TOKEN_ID TARGET_NODE
terraform output -json > tfoutputs.json
cat > enroll_tailscale.py <<'PY'
import json
import os
import ssl
import sys
import time
import urllib.parse
import urllib.request
# --- Connection settings -----------------------------------------------------
# Everything is read from environment variables exported by the surrounding
# workflow step. PM_API_URL may or may not carry the "/api2/json" suffix; it is
# normalised to the bare base URL because request paths add the suffix back.
api_url = os.environ["PM_API_URL"].rstrip("/")
if api_url.endswith("/api2/json"):
    api_url = api_url[: -len("/api2/json")]
token_id = os.environ["PM_API_TOKEN_ID"].strip()
token_secret = os.environ["PM_API_TOKEN_SECRET"].strip()
target_node = os.environ["TARGET_NODE"].strip()
ts_authkey = os.environ["TS_AUTHKEY"]

if not token_id or not token_secret:
    raise SystemExit("Missing Proxmox token id/secret")
# The workflow extracts these two with awk from terraform.tfvars, which yields
# an empty string when the key is absent. Fail fast here instead of building
# malformed URLs and waiting out the guest-agent timeout for every VM.
if not api_url or not target_node:
    raise SystemExit("Missing Proxmox API URL or target node")

# --- Terraform outputs -------------------------------------------------------
# The file may contain non-JSON text before the payload, so decoding starts at
# the first "{" and stops at the end of the first JSON object.
with open("tfoutputs.json", "rb") as outputs_file:
    raw_outputs = outputs_file.read().decode("utf-8", "ignore")
start = raw_outputs.find("{")
if start == -1:
    raise SystemExit("Could not find JSON payload in terraform output")
outputs = json.JSONDecoder().raw_decode(raw_outputs[start:])[0]

# Collect (hostname, vmid) pairs from both VM output maps.
targets = []
for output_name in ("alpaca_vm_ids", "llama_vm_ids"):
    mapping = outputs.get(output_name, {}).get("value", {})
    if isinstance(mapping, dict):
        for hostname, vmid in mapping.items():
            targets.append((str(hostname), int(vmid)))

if not targets:
    print("No VMs found in terraform outputs; skipping tailscale enrollment")
    raise SystemExit(0)

print("Tailscale enrollment targets:", ", ".join(f"{h}:{v}" for h, v in targets))

# TLS certificate verification is disabled for every Proxmox API call.
# NOTE(review): presumably the host uses a self-signed certificate - confirm,
# and consider trusting its CA instead of turning verification off.
ssl_ctx = ssl._create_unverified_context()
auth_header = f"PVEAPIToken={token_id}={token_secret}"
def api_request(method, path, data=None):
    """Send one authenticated request to the Proxmox API and return its JSON.

    ``path`` is appended to ``api_url``. When ``data`` is given it is sent as
    a form-encoded body; sequence values expand into repeated fields.
    HTTP and network errors propagate from ``urllib`` to the caller.
    """
    request_headers = {"Authorization": auth_header}
    encoded_body = None
    if data is not None:
        request_headers["Content-Type"] = "application/x-www-form-urlencoded"
        encoded_body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")

    request = urllib.request.Request(
        f"{api_url}{path}",
        data=encoded_body,
        headers=request_headers,
        method=method,
    )
    with urllib.request.urlopen(request, context=ssl_ctx, timeout=30) as response:
        return json.loads(response.read().decode("utf-8"))
def wait_for_guest_agent(vmid, timeout_seconds=420):
    """Poll the QEMU guest agent of ``vmid`` until it answers a ping.

    Returns True as soon as one ping request succeeds, and False once
    ``timeout_seconds`` have passed without a successful ping. Polls every
    five seconds.
    """
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        try:
            # The agent "ping" command is exposed as a POST endpoint and
            # answers with a result object, not the string "pong"; a GET is
            # rejected and the rejection would be swallowed below, so the
            # wait could never succeed. Any successful response means the
            # agent is up (failures raise from api_request).
            # NOTE(review): confirm method/response in the Proxmox API viewer
            # for the PVE version in use.
            api_request(
                "POST",
                f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent/ping",
                {},  # empty form body so the POST carries Content-Length: 0
            )
            return True
        except Exception:
            # Expected while the VM is booting or the agent has not started;
            # keep polling until the deadline.
            pass
        time.sleep(5)
    return False
def exec_guest(vmid, command):
    """Run ``command`` via ``sh -lc`` inside the guest and wait for it to exit.

    Returns a tuple ``(exit_code, stdout, stderr)``. The exit status is polled
    up to 120 times, two seconds apart; if the command has still not exited,
    ``(124, "", "Timed out waiting for guest command")`` is returned.
    """
    agent_base = f"/api2/json/nodes/{target_node}/qemu/{vmid}/agent"

    # NOTE(review): verify that the agent/exec endpoint accepts an
    # "extra-args" field on the PVE version in use - the API viewer may expect
    # the whole argv in "command" instead.
    launch = api_request(
        "POST",
        f"{agent_base}/exec",
        {
            "command": "/run/current-system/sw/bin/sh",
            "extra-args": ["-lc", command],
        },
    )
    pid = launch["data"]["pid"]
    status_path = f"{agent_base}/exec-status?pid={pid}"

    polls_left = 120
    while polls_left > 0:
        report = api_request("GET", status_path).get("data", {})
        if report.get("exited"):
            exit_code = int(report.get("exitcode", 1))
            return (exit_code, report.get("out-data", ""), report.get("err-data", ""))
        time.sleep(2)
        polls_left -= 1

    return (124, "", "Timed out waiting for guest command")
# --- Enrollment loop ---------------------------------------------------------
# Each VM is handled independently; problems are collected in `failures` so
# one broken VM does not prevent the others from being enrolled.
failures = []
# Escape single quotes so the values can sit inside '...' in the guest shell.
safe_key = ts_authkey.replace("'", "'\"'\"'")

for hostname, vmid in targets:
    print(f"\n== Enrolling {hostname} (vmid {vmid}) ==")
    if not wait_for_guest_agent(vmid):
        failures.append(f"{hostname}: guest agent not ready")
        print(f"ERROR: guest agent not ready for vmid {vmid}")
        continue

    safe_hostname = hostname.replace("'", "'\"'\"'")
    cmd = (
        "set -e; "
        # Restrict permissions before the auth key is written, so the file is
        # never readable by other users between creation and the chmod.
        "umask 077; "
        f"printf '%s' '{safe_key}' > /etc/tailscale/authkey; "
        f"printf '%s' '{safe_hostname}' > /etc/tailscale/hostname; "
        "chmod 600 /etc/tailscale/authkey; "
        f"hostnamectl set-hostname '{safe_hostname}' || true; "
        "systemctl restart tailscaled; "
        # The firstboot unit is declared with RemainAfterExit = true, so a
        # plain `start` does nothing if the unit has already run (for example
        # at boot, before the auth key existed). `restart` runs it either way.
        "systemctl restart tailscale-firstboot.service; "
        "tailscale status || true"
    )

    try:
        exitcode, stdout, stderr = exec_guest(vmid, cmd)
    except Exception as exc:
        # An API or transport error for one VM is recorded like any other
        # failure instead of aborting the remaining enrollments.
        failures.append(f"{hostname}: guest exec request failed ({exc})")
        print(f"ERROR: guest exec request failed for {hostname}: {exc}")
        continue

    if stdout:
        print(stdout)
    if stderr:
        print(stderr, file=sys.stderr)
    if exitcode != 0:
        failures.append(f"{hostname}: command failed exit {exitcode}")
        print(f"ERROR: tailscale enrollment failed for {hostname} (exit {exitcode})")

if failures:
    print("\nEnrollment failures:")
    for failure in failures:
        print(f"- {failure}")
    raise SystemExit(1)

print("\nTailscale enrollment completed for all managed VMs")
PY
python3 enroll_tailscale.py

View File

@@ -1,28 +1,65 @@
name: Gitea Destroy Terraform
run-name: ${{ gitea.actor }} triggered a Terraform Destroy 🧨
name: Terraform Destroy
run-name: ${{ gitea.actor }} requested Terraform destroy
on:
workflow_dispatch: # Manual trigger
workflow_dispatch:
inputs:
confirm:
description: "Type NUKE to confirm destroy"
required: true
type: string
target:
description: "Destroy scope"
required: true
default: all
type: choice
options:
- all
- alpacas
- llamas
concurrency:
group: terraform-global
cancel-in-progress: false
jobs:
destroy:
name: "Terraform Destroy"
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
env:
TF_VAR_SSH_KEY: ${{ secrets.TF_VAR_SSH_KEY_PUBLIC }}
TF_VAR_TS_AUTHKEY: ${{ secrets.TF_VAR_TS_AUTHKEY }}
TF_VAR_PROXMOX_PASSWORD: ${{ secrets.TF_VAR_PROXMOX_PASSWORD }}
steps:
# Abort the destroy workflow unless the operator typed the exact phrase NUKE.
- name: Validate confirmation phrase
  env:
    # Handed over through the environment instead of being expanded into the
    # script text, so the typed value is always treated as data and can never
    # be interpreted as shell code.
    CONFIRM_PHRASE: ${{ inputs.confirm }}
  run: |
    if [ "$CONFIRM_PHRASE" != "NUKE" ]; then
      echo "Confirmation failed. You must type NUKE."
      exit 1
    fi
- name: Checkout repository
uses: actions/checkout@v4
# Writes secrets.auto.tfvars (Proxmox token secret) and backend.hcl (remote
# state backend settings) into the terraform directory.
- name: Create Terraform secret files
  working-directory: terraform
  env:
    # Secrets are passed through the environment rather than expanded into the
    # script text: a value containing quotes, "$" or backticks would otherwise
    # be parsed by the shell. Variable expansion inside the heredocs inserts
    # the values verbatim.
    PM_API_TOKEN_SECRET: ${{ secrets.PM_API_TOKEN_SECRET }}
    B2_TF_BUCKET: ${{ secrets.B2_TF_BUCKET }}
    B2_TF_ENDPOINT: ${{ secrets.B2_TF_ENDPOINT }}
    B2_KEY_ID: ${{ secrets.B2_KEY_ID }}
    B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
  run: |
    cat > secrets.auto.tfvars << EOF
    pm_api_token_secret = "${PM_API_TOKEN_SECRET}"
    EOF

    # The key id and application key have CR/LF characters stripped before
    # they are written into the backend configuration.
    cat > backend.hcl << EOF
    bucket = "${B2_TF_BUCKET}"
    key = "terraform.tfstate"
    region = "us-east-005"
    endpoints = {
      s3 = "${B2_TF_ENDPOINT}"
    }
    access_key = "$(printf '%s' "$B2_KEY_ID" | tr -d '\r\n')"
    secret_key = "$(printf '%s' "$B2_APPLICATION_KEY" | tr -d '\r\n')"
    skip_credentials_validation = true
    skip_metadata_api_check = true
    skip_region_validation = true
    skip_requesting_account_id = true
    use_path_style = true
    EOF
- name: Set up Terraform
uses: hashicorp/setup-terraform@v2
with:
@@ -30,9 +67,27 @@ jobs:
- name: Terraform Init
working-directory: terraform
run: terraform init
run: terraform init -reconfigure -backend-config=backend.hcl
- name: Terraform Destroy
- name: Terraform Destroy Plan
working-directory: terraform
run: terraform destroy -auto-approve
run: |
case "${{ inputs.target }}" in
all)
terraform plan -destroy -out=tfdestroy
;;
alpacas)
terraform plan -destroy -target=proxmox_vm_qemu.alpacas -out=tfdestroy
;;
llamas)
terraform plan -destroy -target=proxmox_vm_qemu.llamas -out=tfdestroy
;;
*)
echo "Invalid destroy target: ${{ inputs.target }}"
exit 1
;;
esac
- name: Terraform Destroy Apply
working-directory: terraform
run: terraform apply -auto-approve tfdestroy

View File

@@ -6,6 +6,10 @@ on:
- stage
- test
concurrency:
group: terraform-global
cancel-in-progress: false
jobs:
terraform:
name: "Terraform Plan"
@@ -63,6 +67,19 @@ jobs:
working-directory: terraform
run: terraform plan -out=tfplan
# Guard step: stop the run when the saved plan contains delete actions,
# unless the ALLOW_TF_DESTROY secret is exactly "true".
- name: Block accidental destroy
  working-directory: terraform
  env:
    ALLOW_TF_DESTROY: ${{ secrets.ALLOW_TF_DESTROY }}
  run: |
    # Render the saved plan as JSON. The parser below starts decoding at the
    # first "{" so extra non-JSON text in the file does not break it.
    terraform show -json -no-color tfplan > tfplan.json

    DESTROY_COUNT=$(python3 - <<'PY'
    import json

    with open("tfplan.json", "rb") as fh:
        raw = fh.read().decode("utf-8", "ignore")
    start = raw.find("{")
    data = json.JSONDecoder().raw_decode(raw[start:])[0]

    # Count every resource change whose action list includes "delete".
    deletes = 0
    for rc in data.get("resource_changes", []):
        if "delete" in rc.get("change", {}).get("actions", []):
            deletes += 1
    print(deletes)
    PY
    )

    echo "Planned deletes: $DESTROY_COUNT"
    if [ "$DESTROY_COUNT" -gt 0 ]; then
      if [ "${ALLOW_TF_DESTROY}" != "true" ]; then
        echo "Destroy actions detected. Set ALLOW_TF_DESTROY=true to allow."
        exit 1
      fi
    fi
- name: Upload Terraform Plan
uses: actions/upload-artifact@v3
with:

View File

@@ -49,20 +49,21 @@
RemainAfterExit = true;
};
script = ''
if [ -f /var/lib/tailscale/.joined ]; then
exit 0
fi
if [ ! -s /etc/tailscale/authkey ]; then
exit 0
fi
key="$(cat /etc/tailscale/authkey)"
${pkgs.tailscale}/bin/tailscale up --auth-key="$key" --hostname="$(hostname)"
ts_hostname=""
if [ -s /etc/tailscale/hostname ]; then
ts_hostname="--hostname=$(cat /etc/tailscale/hostname)"
fi
rm -f /var/lib/tailscale/tailscaled.state
${pkgs.tailscale}/bin/tailscale up --reset --auth-key="$key" $ts_hostname
install -d -m 0700 /var/lib/tailscale
touch /var/lib/tailscale/.joined
rm -f /etc/tailscale/authkey
rm -f /etc/tailscale/hostname
'';
};

View File

@@ -26,19 +26,21 @@ resource "proxmox_vm_qemu" "alpacas" {
os_type = "cloud-init"
agent = 1
sockets = var.sockets
cores = var.cores
cpu {
sockets = var.sockets
cores = var.cores
}
memory = var.memory
scsihw = "virtio-scsi-pci"
boot = "order=virtio0"
bootdisk = "virtio0"
boot = "order=scsi0"
bootdisk = "scsi0"
ipconfig0 = "ip=dhcp"
cicustom = "user=local:snippets/cloud_init_global.yaml"
disks {
virtio {
virtio0 {
scsi {
scsi0 {
disk {
size = var.disk_size
storage = var.storage
@@ -73,18 +75,20 @@ resource "proxmox_vm_qemu" "llamas" {
os_type = "cloud-init"
agent = 1
sockets = var.sockets
cores = var.cores
cpu {
sockets = var.sockets
cores = var.cores
}
memory = var.memory
scsihw = "virtio-scsi-pci"
boot = "order=virtio0"
bootdisk = "virtio0"
boot = "order=scsi0"
bootdisk = "scsi0"
ipconfig0 = "ip=dhcp"
cicustom = "user=local:snippets/cloud_init_global.yaml"
disks {
virtio {
virtio0 {
scsi {
scsi0 {
disk {
size = var.disk_size
storage = var.storage