diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 10f22eb..a98a6d6 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -154,10 +154,162 @@ jobs:
         if: github.ref == 'refs/heads/main' && github.event_name == 'push'
         working-directory: terraform
         run: |
-          terraform apply \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -auto-approve
+          set -euo pipefail
+
+          # Run one terraform apply, teeing output to a log file so the
+          # failure mode can be inspected; with pipefail, PIPESTATUS[0]
+          # preserves terraform's exit code through the tee.
+          run_apply() {
+            local log_file="$1"
+            terraform apply \
+              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
+              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
+              -auto-approve 2>&1 | tee "${log_file}"
+            return "${PIPESTATUS[0]}"
+          }
+
+          # Delete VMs from the fixed VMID/name allow-list below that exist
+          # on Proxmox but are missing from terraform state (partial creates
+          # left behind by a failed apply), so the retry can re-create them.
+          cleanup_untracked_target_vms() {
+            python3 - <<'PY'
+          import json
+          import os
+          import ssl
+          import subprocess
+          import time
+          import urllib.error
+          import urllib.parse
+          import urllib.request
+
+          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
+          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
+          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
+          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
+          node = "flex"
+          storage = "Flash"
+          context = ssl._create_unverified_context() if insecure else None
+          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
+          # Only these exact VMID/name pairs may ever be deleted.
+          targets = {
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"),
+              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"),
+          }
+
+          def request(method, path, data=None):
+              body = None
+              req_headers = dict(headers)
+              if data is not None:
+                  body = urllib.parse.urlencode(data).encode()
+                  req_headers["Content-Type"] = "application/x-www-form-urlencoded"
+              req = urllib.request.Request(
+                  f"{endpoint}/api2/json{path}",
+                  method=method,
+                  headers=req_headers,
+                  data=body,
+              )
+              with urllib.request.urlopen(req, context=context, timeout=60) as resp:
+                  return resp.read()
+
+          def vm_status(vmid):
+              try:
+                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current")
+                  return True
+              except urllib.error.HTTPError as err:
+                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
+                      return False
+                  raise
+
+          def vm_config(vmid):
+              try:
+                  raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
+              except urllib.error.HTTPError as err:
+                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
+                      return {}
+                  raise
+              return json.loads(raw).get("data", {})
+
+          def wait_absent(vmid):
+              for _ in range(60):
+                  if not vm_status(vmid):
+                      return
+                  time.sleep(5)
+              raise RuntimeError(f"VM {vmid} still exists after delete")
+
+          state = set(
+              subprocess.run(
+                  ["terraform", "state", "list"],
+                  check=False,
+                  text=True,
+                  stdout=subprocess.PIPE,
+              ).stdout.splitlines()
+          )
+
+          for address, (vmid, expected_name) in targets.items():
+              if address in state:
+                  continue
+              if not vm_status(vmid):
+                  continue
+
+              # Safety check: never delete a VM whose actual name does not
+              # match the name expected for that VMID.
+              config = vm_config(vmid)
+              actual_name = config.get("name")
+              if actual_name != expected_name:
+                  raise RuntimeError(
+                      f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
+                  )
+
+              print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")
+              try:
+                  request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
+                  time.sleep(10)
+              except urllib.error.HTTPError as err:
+                  if err.code not in (400, 500):
+                      raise
+
+              request(
+                  "DELETE",
+                  f"/nodes/{node}/qemu/{vmid}",
+                  {"purge": "1", "destroy-unreferenced-disks": "1"},
+              )
+              wait_absent(vmid)
+
+              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
+              try:
+                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
+              except urllib.error.HTTPError as err:
+                  if err.code != 404:
+                      raise
+          PY
+          }
+
+          # Up to three apply attempts; retry only on error signatures known
+          # to be transient Proxmox API failures.
+          for attempt in 1 2 3; do
+            log_file="/tmp/terraform-apply-${attempt}.log"
+            if run_apply "${log_file}"; then
+              exit 0
+            fi
+
+            if [ "${attempt}" = "3" ]; then
+              exit 1
+            fi
+
+            if ! grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then
+              exit 1
+            fi
+
+            echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
+            cleanup_untracked_target_vms
+            sleep 20
+          done
 
       - name: Save Terraform Outputs
         if: github.ref == 'refs/heads/main' && github.event_name == 'push'