fix: retry transient Proxmox apply failures
This commit is contained in:
+145
-4
@@ -154,10 +154,151 @@ jobs:
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform apply \
|
||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||
-auto-approve
|
||||
set -euo pipefail
|
||||
|
||||
# Run a non-interactive `terraform apply`, mirroring its combined
# stdout/stderr to the console and to the log file named by $1.
# Returns terraform's exit status (not tee's) so callers can branch on it.
run_apply() {
  local log_path="$1"
  local -a apply_args=(
    -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub"
    -var="ssh_private_key=$HOME/.ssh/id_ed25519"
    -auto-approve
  )

  terraform apply "${apply_args[@]}" 2>&1 | tee "${log_path}"
  # PIPESTATUS[0] is the first pipeline stage, i.e. terraform itself.
  return "${PIPESTATUS[0]}"
}
|
||||
|
||||
cleanup_untracked_target_vms() {
|
||||
python3 - <<'PY'
|
||||
import os
|
||||
import ssl
|
||||
import subprocess
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
|
||||
# --- Connection settings, taken from the same TF_VAR_* variables Terraform uses ---

# Base URL of the API host: surrounding whitespace, a trailing "/api2/json"
# and trailing slashes are stripped so request paths can be appended uniformly.
raw_endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip()
endpoint = raw_endpoint.removesuffix("/api2/json").rstrip("/")
token_id = os.environ["TF_VAR_proxmox_api_token_id"]
token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
# Only the literal string "true" (any letter case) disables TLS verification.
insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"

# Hard-coded node and storage names used when building API paths.
node = "flex"
storage = "Flash"

# None makes urllib fall back to its default (verifying) TLS behaviour.
if insecure:
    context = ssl._create_unverified_context()
else:
    context = None

headers = {"Authorization": "PVEAPIToken=" + token_id + "=" + token_secret}

# (VM name, VM id) pairs this script is allowed to clean up.
vm_table = (
    ("k8s-cluster-cp-1", 200),
    ("k8s-cluster-cp-2", 201),
    ("k8s-cluster-cp-3", 202),
    ("k8s-cluster-worker-1", 210),
    ("k8s-cluster-worker-2", 211),
    ("k8s-cluster-worker-3", 212),
    ("k8s-cluster-worker-4", 213),
    ("k8s-cluster-worker-5", 214),
)

# Terraform resource address -> (VM id, expected VM name).
targets = {
    f'proxmox_virtual_environment_vm.nodes["{vm_name}"]': (vm_number, vm_name)
    for vm_name, vm_number in vm_table
}
|
||||
|
||||
def request(method, path, data=None):
    """Send one authenticated Proxmox API request and return the raw body.

    Args:
        method: HTTP verb such as "GET", "POST" or "DELETE".
        path: API path below ``/api2/json``, starting with "/".
        data: Optional mapping of API parameters. For POST/PUT it is sent as
            a form-encoded body; for every other verb it is appended to the
            URL as a query string.

    Returns:
        The undecoded response body as bytes.

    Raises:
        urllib.error.HTTPError: the server answered with an error status.
        urllib.error.URLError: the request could not be completed.
    """
    url = f"{endpoint}/api2/json{path}"
    req_headers = dict(headers)
    body = None

    if data is not None:
        encoded = urllib.parse.urlencode(data)
        if method.upper() in ("POST", "PUT"):
            body = encoded.encode()
            req_headers["Content-Type"] = "application/x-www-form-urlencoded"
        elif encoded:
            # NOTE(review): the Proxmox API server is understood to read a
            # request body only for POST/PUT. Parameters sent as a DELETE
            # body (e.g. purge / destroy-unreferenced-disks) would be ignored
            # or rejected, so they travel in the query string instead.
            separator = "&" if "?" in path else "?"
            url = f"{url}{separator}{encoded}"

    req = urllib.request.Request(
        url,
        method=method,
        headers=req_headers,
        data=body,
    )
    with urllib.request.urlopen(req, context=context, timeout=60) as resp:
        return resp.read()
|
||||
|
||||
def vm_status(vmid):
    """Report whether a VM with the given id currently exists on the node.

    Returns True when the status query succeeds. Returns False when the API
    answers 404, or 500 with a reason mentioning a missing ``.conf`` file.
    Any other HTTP error is re-raised.
    """
    status_path = f"/nodes/{node}/qemu/{vmid}/status/current"
    try:
        request("GET", status_path)
    except urllib.error.HTTPError as http_err:
        config_missing = (
            http_err.code == 500 and "conf' does not exist" in http_err.reason
        )
        if http_err.code == 404 or config_missing:
            return False
        raise
    return True
|
||||
|
||||
def vm_config(vmid):
    """Fetch the configuration mapping of a VM.

    Returns the ``data`` member of the JSON response, or an empty dict when
    that member is absent or the VM does not exist (404, or 500 with a reason
    mentioning a missing ``.conf`` file). Other HTTP errors are re-raised.
    """
    import json

    try:
        payload = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
    except urllib.error.HTTPError as http_err:
        vm_missing = http_err.code == 404 or (
            http_err.code == 500 and "conf' does not exist" in http_err.reason
        )
        if not vm_missing:
            raise
        return {}
    return json.loads(payload).get("data", {})
|
||||
|
||||
def wait_absent(vmid):
    """Block until the VM no longer exists, polling up to 60 times.

    Sleeps 5 seconds after each poll that still finds the VM.

    Raises:
        RuntimeError: the VM is still present after the final poll.
    """
    polls_left = 60
    while polls_left:
        if not vm_status(vmid):
            return
        time.sleep(5)
        polls_left -= 1
    raise RuntimeError(f"VM {vmid} still exists after delete")
|
||||
|
||||
# Resource addresses Terraform currently tracks. Anything in `targets` that is
# missing from this set is treated as an untracked leftover and may be deleted,
# so a failed listing must never be mistaken for an empty state.
state_list = subprocess.run(
    ["terraform", "state", "list"],
    check=False,
    text=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
# NOTE(review): Terraform is expected to exit non-zero with "No state file was
# found" when nothing has been applied yet; that is the only failure that is
# safe to read as "nothing is tracked". Any other failure (backend, auth,
# lock, ...) aborts the cleanup instead of deleting healthy VMs.
if state_list.returncode != 0 and "No state file was found" not in state_list.stderr:
    raise RuntimeError(
        f"terraform state list failed with exit code {state_list.returncode}; "
        "refusing to treat every target VM as untracked:\n"
        f"{state_list.stderr.strip()}"
    )
state = set(state_list.stdout.splitlines())
|
||||
|
||||
# Remove every target VM that exists on the node but is absent from the
# Terraform state, so the next apply can recreate it from scratch.
for tf_address, (vm_id, wanted_name) in targets.items():
    # Nothing to do for VMs Terraform already tracks or that do not exist.
    if tf_address in state or not vm_status(vm_id):
        continue

    # Safety check: only delete a VM whose name matches the one expected for
    # this id; anything else aborts the whole cleanup.
    found_name = vm_config(vm_id).get("name")
    if found_name != wanted_name:
        raise RuntimeError(
            f"Refusing to delete VM {vm_id}: expected name {wanted_name!r}, got {found_name!r}"
        )

    print(f"Deleting partial Terraform-untracked VM {vm_id} ({wanted_name}) before retry")
    vm_path = f"/nodes/{node}/qemu/{vm_id}"
    try:
        request("POST", f"{vm_path}/status/stop")
    except urllib.error.HTTPError as stop_err:
        # 400/500 from the stop call are tolerated (presumably the VM is not
        # in a stoppable state - TODO confirm); anything else is fatal.
        if stop_err.code not in (400, 500):
            raise
    else:
        # Fixed pause after a successful stop request before deleting.
        time.sleep(10)

    request(
        "DELETE",
        vm_path,
        {"purge": "1", "destroy-unreferenced-disks": "1"},
    )
    wait_absent(vm_id)

    # Also remove the VM's cloud-init volume; a 404 means it is already gone.
    cloudinit_volume = urllib.parse.quote(f"{storage}:vm-{vm_id}-cloudinit", safe="")
    try:
        request("DELETE", f"/nodes/{node}/storage/{storage}/content/{cloudinit_volume}")
    except urllib.error.HTTPError as delete_err:
        if delete_err.code != 404:
            raise
|
||||
PY
|
||||
}
|
||||
|
||||
# Up to three apply attempts. A retry happens only when the failed attempt's
# log matches one of the known transient Proxmox error patterns; partially
# created, untracked VMs are cleaned up before each retry.
for attempt in 1 2 3; do
  log_file="/tmp/terraform-apply-${attempt}.log"

  # Success ends the step immediately.
  run_apply "${log_file}" && exit 0

  # Give up when this was the last attempt or the failure is not a
  # recognised transient error.
  if [ "${attempt}" = "3" ] || ! grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then
    exit 1
  fi

  echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
  cleanup_untracked_target_vms
  sleep 20
done
|
||||
|
||||
- name: Save Terraform Outputs
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
|
||||
Reference in New Issue
Block a user