fix: retry transient Proxmox apply failures
Deploy Cluster / Terraform (push) Successful in 1m39s
Deploy Cluster / Ansible (push) Failing after 22m17s

This commit is contained in:
2026-04-26 04:02:14 +00:00
parent ded8efe7fb
commit 24851f5a9b
+142 -1
View File
@@ -154,10 +154,151 @@ jobs:
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
working-directory: terraform
run: |
set -euo pipefail
# Run `terraform apply` non-interactively and mirror its combined
# stdout/stderr into a log file so the caller can inspect the failure text.
# Arguments:
#   $1 - path of the log file written by tee (overwritten on each call).
# Returns:
#   terraform's own exit status (first pipeline element), not tee's.
run_apply() {
  local log_file="$1"
  # -auto-approve must be the final argument of the single continued command;
  # a stray bare "-auto-approve" line here would end the command early and
  # leave the tee pipeline detached from terraform.
  terraform apply \
    -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
    -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
    -auto-approve 2>&1 | tee "${log_file}"
  return "${PIPESTATUS[0]}"
}
cleanup_untracked_target_vms() {
python3 - <<'PY'
import os
import ssl
import subprocess
import time
import urllib.error
import urllib.parse
import urllib.request
# Proxmox base URL from the Terraform variable. A trailing "/api2/json" and any
# trailing slash are stripped because request() appends "/api2/json" itself.
endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
# API token id and secret; both are required (KeyError if unset) and are
# combined below into the PVEAPIToken Authorization header.
token_id = os.environ["TF_VAR_proxmox_api_token_id"]
token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
# TLS verification is skipped only when the variable is "true" (case-insensitive).
insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
# Hard-coded node and storage names.
# NOTE(review): presumably these must match the Terraform configuration — confirm.
node = "flex"
storage = "Flash"
# None leaves urlopen on its default TLS handling; the unverified context is
# produced by a private (underscore-prefixed) ssl helper.
context = ssl._create_unverified_context() if insecure else None
headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
# Maps each Terraform resource address to the (VM id, VM name) it is expected
# to own. The VM name doubles as the key of the `nodes` for_each instance, so
# the address is derived from it instead of being spelled out twice.
targets = {
    f'proxmox_virtual_environment_vm.nodes["{vm_name}"]': (vm_id, vm_name)
    for vm_id, vm_name in (
        (200, "k8s-cluster-cp-1"),
        (201, "k8s-cluster-cp-2"),
        (202, "k8s-cluster-cp-3"),
        (210, "k8s-cluster-worker-1"),
        (211, "k8s-cluster-worker-2"),
        (212, "k8s-cluster-worker-3"),
        (213, "k8s-cluster-worker-4"),
        (214, "k8s-cluster-worker-5"),
    )
}
def request(method, path, data=None):
    """Call the Proxmox API and return the raw response body.

    Args:
        method: HTTP verb, e.g. "GET", "POST" or "DELETE".
        path: API path below ``/api2/json``, starting with a slash.
        data: Optional mapping of parameters. Sent as a form-encoded body for
            POST/PUT and as a URL query string for every other verb.

    Returns:
        The response body as bytes.

    Raises:
        urllib.error.HTTPError: The API answered with an error status.
        urllib.error.URLError: The endpoint could not be reached.
    """
    url = f"{endpoint}/api2/json{path}"
    body = None
    req_headers = dict(headers)  # copy so the shared auth headers are never mutated
    if data is not None:
        encoded = urllib.parse.urlencode(data)
        if method.upper() in ("POST", "PUT"):
            body = encoded.encode()
            req_headers["Content-Type"] = "application/x-www-form-urlencoded"
        elif encoded:
            # A body on GET/DELETE has no defined HTTP semantics, so options
            # such as purge=1 on a VM delete are passed in the query string.
            # NOTE(review): the Proxmox API server is understood to ignore or
            # reject bodies on DELETE — confirm against the PVE version in use.
            separator = "&" if "?" in path else "?"
            url = f"{url}{separator}{encoded}"
    req = urllib.request.Request(
        url,
        method=method,
        headers=req_headers,
        data=body,
    )
    with urllib.request.urlopen(req, context=context, timeout=60) as resp:
        return resp.read()
def vm_status(vmid):
    """Return True when the VM exists on the node, False when it does not.

    A 404, or a 500 whose reason mentions a missing config file, is treated as
    "VM absent"; any other HTTP error is re-raised.
    """
    try:
        request("GET", f"/nodes/{node}/qemu/{vmid}/status/current")
    except urllib.error.HTTPError as exc:
        config_missing = exc.code == 500 and "conf' does not exist" in exc.reason
        if exc.code != 404 and not config_missing:
            raise
        return False
    return True
def vm_config(vmid):
    """Fetch the configuration of a VM as a dict.

    Args:
        vmid: Numeric id of the VM on the node.

    Returns:
        The ``data`` object of the API response, or an empty dict when the VM
        does not exist or the response carries no data.

    Raises:
        urllib.error.HTTPError: For any API error other than "VM not found".
    """
    import json

    try:
        raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
    except urllib.error.HTTPError as err:
        # A 404, or a 500 complaining about a missing config file, both mean
        # the VM is gone; report that as "no config".
        if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
            return {}
        raise
    # `or {}` also covers a literal `"data": null`, so callers can always
    # use .get() on the result.
    return json.loads(raw).get("data") or {}
def wait_absent(vmid):
    """Poll until the VM no longer exists, checking up to 60 times 5 s apart.

    Raises:
        RuntimeError: The VM is still present after the last poll.
    """
    polls_left = 60
    while polls_left > 0:
        if not vm_status(vmid):
            return
        time.sleep(5)
        polls_left -= 1
    raise RuntimeError(f"VM {vmid} still exists after delete")
# Resource addresses Terraform currently tracks. check=True is essential: if
# the listing fails, an empty result would make every existing cluster VM look
# untracked and eligible for deletion, so abort instead.
state = set(
    subprocess.run(
        ["terraform", "state", "list"],
        check=True,
        text=True,
        stdout=subprocess.PIPE,
    ).stdout.splitlines()
)

# Delete only VMs that exist on the node, are absent from Terraform state, and
# carry exactly the expected name (leftovers of a partially failed create).
for address, (vmid, expected_name) in targets.items():
    if address in state:
        continue
    if not vm_status(vmid):
        continue
    config = vm_config(vmid)
    actual_name = config.get("name")
    if actual_name != expected_name:
        # The id is occupied by something this pipeline did not create.
        raise RuntimeError(
            f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
        )
    print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")
    try:
        request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
        # Give the stop request time to take effect before deleting.
        time.sleep(10)
    except urllib.error.HTTPError as err:
        # Best effort: 400/500 from the stop call are tolerated and the
        # delete is attempted anyway.
        if err.code not in (400, 500):
            raise
    request(
        "DELETE",
        f"/nodes/{node}/qemu/{vmid}",
        {"purge": "1", "destroy-unreferenced-disks": "1"},
    )
    wait_absent(vmid)
    # Remove a leftover cloud-init volume; a 404 means there was none.
    volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
    try:
        request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
PY
}
# Try the apply up to three times. A retry happens only when the failed run's
# log contains one of the known transient Proxmox error signatures; partially
# created VMs are cleaned up before each retry.
for attempt in 1 2 3; do
  log_file="/tmp/terraform-apply-${attempt}.log"

  # Success ends the step immediately.
  run_apply "${log_file}" && exit 0

  # The third failure is final.
  [ "${attempt}" != "3" ] || exit 1

  # Any failure that is not a recognised transient error is final too.
  grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}" || exit 1

  echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
  cleanup_untracked_target_vms
  sleep 20
done
- name: Save Terraform Outputs
if: github.ref == 'refs/heads/main' && github.event_name == 'push'