fix: retry transient Proxmox apply failures
This commit is contained in:
+145
-4
@@ -154,10 +154,151 @@ jobs:
|
|||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform apply \
|
set -euo pipefail
|
||||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
|
||||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
# run_apply LOG_FILE
# Runs `terraform apply -auto-approve` with the SSH key variables, merging
# stderr into stdout and copying the combined output to LOG_FILE via tee.
# Returns terraform's own exit status (PIPESTATUS[0]) rather than tee's.
run_apply() {
|
||||||
-auto-approve
|
local log_file="$1"
|
||||||
|
terraform apply \
|
||||||
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||||
|
-auto-approve 2>&1 | tee "${log_file}"
|
||||||
|
return "${PIPESTATUS[0]}"
|
||||||
|
}
|
||||||
|
|
||||||
|
cleanup_untracked_target_vms() {
|
||||||
|
python3 - <<'PY'
|
||||||
|
import os
|
||||||
|
import ssl
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# Proxmox connection settings, read from the TF_VAR_* environment variables.
# The endpoint is normalised by trimming whitespace, a trailing "/api2/json"
# and any trailing slashes.
endpoint = (
    os.environ["TF_VAR_proxmox_endpoint"]
    .strip()
    .removesuffix("/api2/json")
    .rstrip("/")
)
token_id = os.environ["TF_VAR_proxmox_api_token_id"]
token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"

# Node and storage names used when building API paths and volume IDs.
node = "flex"
storage = "Flash"

# Certificate verification is skipped only when TF_VAR_proxmox_insecure is "true".
if insecure:
    context = ssl._create_unverified_context()
else:
    context = None

headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}

# Terraform resource address -> (VM ID, expected VM name).
targets = {
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"),
    'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"),
}
|
||||||
|
|
||||||
|
def request(method, path, data=None):
    """Send one authenticated Proxmox API request and return the raw response body.

    Args:
        method: HTTP method name, e.g. "GET", "POST" or "DELETE".
        path: API path appended to ``{endpoint}/api2/json``.
        data: Optional mapping of request parameters. For POST/PUT it is sent
            as a form-encoded body; for every other method it is appended to
            the URL as a query string.

    Returns:
        The response body as bytes.

    Raises:
        urllib.error.HTTPError: when the server answers with an error status.
    """
    url = f"{endpoint}/api2/json{path}"
    req_headers = dict(headers)
    body = None
    if data is not None:
        encoded = urllib.parse.urlencode(data)
        if method.upper() in ("POST", "PUT"):
            body = encoded.encode()
            req_headers["Content-Type"] = "application/x-www-form-urlencoded"
        elif encoded:
            # The Proxmox API server only accepts a request body on POST/PUT
            # (other methods are answered with "501 Unexpected content"), so
            # parameters for GET/DELETE have to travel in the query string.
            separator = "&" if "?" in url else "?"
            url = f"{url}{separator}{encoded}"
    req = urllib.request.Request(
        url,
        method=method,
        headers=req_headers,
        data=body,
    )
    with urllib.request.urlopen(req, context=context, timeout=60) as resp:
        return resp.read()
|
||||||
|
|
||||||
|
def vm_status(vmid):
    """Report whether a VM with the given ID exists on the node.

    Returns True when the status query succeeds, and False when the API
    answers 404 or a 500 whose reason mentions a missing ``.conf`` file.
    Any other HTTP error is re-raised.
    """
    path = f"/nodes/{node}/qemu/{vmid}/status/current"
    try:
        request("GET", path)
    except urllib.error.HTTPError as exc:
        config_missing = exc.code == 500 and "conf' does not exist" in exc.reason
        if exc.code != 404 and not config_missing:
            raise
        return False
    return True
|
||||||
|
|
||||||
|
def vm_config(vmid):
    """Fetch the configuration of a VM as a dict.

    Returns the ``data`` member of the API's JSON reply (``{}`` if absent).
    An empty dict is also returned when the API answers 404 or a 500 whose
    reason mentions a missing ``.conf`` file; other HTTP errors are re-raised.
    """
    import json

    path = f"/nodes/{node}/qemu/{vmid}/config"
    try:
        payload = request("GET", path)
    except urllib.error.HTTPError as exc:
        config_missing = exc.code == 500 and "conf' does not exist" in exc.reason
        if exc.code != 404 and not config_missing:
            raise
        return {}
    parsed = json.loads(payload)
    return parsed.get("data", {})
|
||||||
|
|
||||||
|
def wait_absent(vmid):
    """Poll until the VM no longer exists.

    Checks up to 60 times, sleeping 5 seconds after each check that still
    finds the VM.

    Raises:
        RuntimeError: if the VM is still present after the final check.
    """
    checks_left = 60
    while checks_left > 0:
        if not vm_status(vmid):
            return
        time.sleep(5)
        checks_left -= 1
    raise RuntimeError(f"VM {vmid} still exists after delete")
|
||||||
|
|
||||||
|
# Addresses of every resource Terraform currently tracks. Any entry of
# `targets` missing from this set is treated as untracked and may be deleted,
# so a failed listing must never be mistaken for an empty state.
state_listing = subprocess.run(
    ["terraform", "state", "list"],
    check=False,
    text=True,
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
)
if state_listing.returncode != 0:
    # A missing state file genuinely means "nothing is tracked" and is the one
    # failure that is tolerated. Every other error (backend unreachable, state
    # lock, ...) aborts instead of yielding an empty set.
    # NOTE(review): the message text comes from Terraform's CLI output;
    # confirm it against the Terraform version this workflow pins.
    if "No state file was found" not in state_listing.stderr:
        raise RuntimeError(
            "terraform state list failed with exit code "
            f"{state_listing.returncode}; refusing to treat tracked VMs as untracked:\n"
            f"{state_listing.stderr}"
        )
if state_listing.stderr:
    # stderr is captured above, so echo it to keep it visible in the job log.
    print(state_listing.stderr, end="")
state = set(state_listing.stdout.splitlines())
|
||||||
|
|
||||||
|
# Delete every target VM that exists on the node but is absent from the
# Terraform state, so a retried apply can create it again.
for address, target in targets.items():
    vmid, expected_name = target

    # Nothing to do for VMs Terraform tracks or that do not exist.
    if address in state or not vm_status(vmid):
        continue

    # Safety check: only delete a VM whose name matches the expected one.
    actual_name = vm_config(vmid).get("name")
    if actual_name != expected_name:
        raise RuntimeError(
            f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
        )

    print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")

    # Ask for a stop first; 400 and 500 answers to the stop call are ignored.
    try:
        request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
    except urllib.error.HTTPError as stop_error:
        if stop_error.code not in (400, 500):
            raise
    else:
        # Pause only when the stop request was accepted.
        time.sleep(10)

    delete_params = {"purge": "1", "destroy-unreferenced-disks": "1"}
    request("DELETE", f"/nodes/{node}/qemu/{vmid}", delete_params)
    wait_absent(vmid)

    # Remove the VM's cloud-init volume as well; a 404 means it is already gone.
    volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
    try:
        request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
    except urllib.error.HTTPError as volume_error:
        if volume_error.code != 404:
            raise
|
||||||
|
PY
|
||||||
|
}
|
||||||
|
|
||||||
|
# Try the apply up to three times. A retry happens only when the failed
# attempt's log contains one of the known transient Proxmox error markers;
# before retrying, partially created VMs are cleaned up.
for attempt in 1 2 3; do
  log_file="/tmp/terraform-apply-${attempt}.log"

  # Success ends the step immediately.
  run_apply "${log_file}" && exit 0

  # The third failure is final.
  [ "${attempt}" != "3" ] || exit 1

  # Failures without a transient-error marker are not retried.
  grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}" || exit 1

  echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
  cleanup_untracked_target_vms
  sleep 20
done
|
||||||
|
|
||||||
- name: Save Terraform Outputs
|
- name: Save Terraform Outputs
|
||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
|
|||||||
Reference in New Issue
Block a user