From a33a9938672e5bf678fbf3bab51e43908414494d Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Thu, 30 Apr 2026 07:36:27 +0000 Subject: [PATCH] fix: harden cluster rebuild determinism --- .gitea/workflows/dashboards.yml | 9 +- .gitea/workflows/deploy.yml | 479 +++++++++--------- .gitea/workflows/destroy.yml | 25 +- .gitignore | 1 - .../bootstrap-image-prepull/tasks/main.yml | 11 - ansible/roles/common/tasks/main.yml | 9 + ansible/roles/k3s-agent/tasks/main.yml | 15 +- ansible/roles/k3s-server/tasks/main.yml | 12 +- ansible/roles/kube-vip-deploy/tasks/main.yml | 15 +- .../observability-content/tasks/main.yml | 5 + .../rancher-image-prepull/tasks/main.yml | 11 - .../roles/tailscale-cleanup/tasks/main.yml | 7 + ansible/site.yml | 37 ++ ...ustersecretstore-doppler-hetznerterra.yaml | 0 .../external-secrets-store/kustomization.yaml | 4 + .../kustomization-external-secrets-store.yaml | 21 + .../kustomization-external-secrets.yaml | 10 +- .../kustomization-observability-secrets.yaml | 26 + .../addons/kustomization-observability.yaml | 5 +- .../addons/kustomization-rancher-config.yaml | 2 +- .../addons/kustomization-rancher-secrets.yaml | 34 ++ .../addons/kustomization-rancher.yaml | 20 +- .../kustomization-tailscale-operator.yaml | 9 +- infrastructure/addons/kustomization.yaml | 3 + .../grafana-admin-externalsecret.yaml | 0 .../observability-secrets/kustomization.yaml | 5 + .../namespace.yaml | 0 .../addons/observability/kustomization.yaml | 2 - .../addons/rancher-secrets/kustomization.yaml | 6 + .../namespace.yaml | 0 ...her-bootstrap-password-externalsecret.yaml | 0 ...ootstrap-password-flux-externalsecret.yaml | 0 .../addons/rancher/kustomization.yaml | 3 - scripts/proxmox-rebuild-cleanup.py | 275 ++++++++++ terraform/.terraform.lock.hcl | 44 ++ terraform/main.tf | 4 +- terraform/outputs.tf | 15 + terraform/variables.tf | 30 ++ 38 files changed, 865 insertions(+), 289 deletions(-) rename infrastructure/addons/{external-secrets => external-secrets-store}/clustersecretstore-doppler-hetznerterra.yaml (100%) create mode 100644 infrastructure/addons/external-secrets-store/kustomization.yaml create mode 100644 infrastructure/addons/kustomization-external-secrets-store.yaml create mode 100644 infrastructure/addons/kustomization-observability-secrets.yaml create mode 100644 infrastructure/addons/kustomization-rancher-secrets.yaml rename infrastructure/addons/{observability => observability-secrets}/grafana-admin-externalsecret.yaml (100%) create mode 100644 infrastructure/addons/observability-secrets/kustomization.yaml rename infrastructure/addons/{observability => observability-secrets}/namespace.yaml (100%) create mode 100644 infrastructure/addons/rancher-secrets/kustomization.yaml rename infrastructure/addons/{rancher => rancher-secrets}/namespace.yaml (100%) rename infrastructure/addons/{rancher => rancher-secrets}/rancher-bootstrap-password-externalsecret.yaml (100%) rename infrastructure/addons/{rancher => rancher-secrets}/rancher-bootstrap-password-flux-externalsecret.yaml (100%) create mode 100644 scripts/proxmox-rebuild-cleanup.py create mode 100644 terraform/.terraform.lock.hcl diff --git a/.gitea/workflows/dashboards.yml b/.gitea/workflows/dashboards.yml index 750f856..22aedc2 100644 --- a/.gitea/workflows/dashboards.yml +++ b/.gitea/workflows/dashboards.yml @@ -9,6 +9,10 @@ on: - "ansible/roles/observability-content/**" workflow_dispatch: +concurrency: + group: prod-cluster + cancel-in-progress: false + env: TF_VERSION: "1.7.0" TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} @@ 
-24,7 +28,7 @@ env: jobs: dashboards: name: Grafana Content - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 @@ -46,6 +50,7 @@ jobs: working-directory: terraform run: | terraform init \ + -lockfile=readonly \ -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \ -backend-config="bucket=${{ secrets.S3_BUCKET }}" \ -backend-config="region=auto" \ @@ -56,7 +61,7 @@ jobs: - name: Install Python Dependencies run: | apt-get update && apt-get install -y python3-pip - pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml + pip3 install --break-system-packages ansible==11.2.0 kubernetes==32.0.1 jinja2==3.1.5 pyyaml==6.0.2 - name: Install Ansible Collections run: ansible-galaxy collection install -r ansible/requirements.yml diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 4fee677..94a9f96 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -15,6 +15,7 @@ concurrency: env: TF_VERSION: "1.7.0" + KUBECTL_VERSION: "v1.34.6" TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} @@ -30,7 +31,7 @@ env: jobs: terraform: name: Terraform - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 @@ -48,6 +49,7 @@ jobs: working-directory: terraform run: | terraform init \ + -lockfile=readonly \ -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \ -backend-config="bucket=${{ secrets.S3_BUCKET }}" \ -backend-config="region=auto" \ @@ -100,59 +102,7 @@ jobs: - name: Cleanup orphan Proxmox cloud-init volumes if: github.ref == 'refs/heads/main' && github.event_name == 'push' - run: | - set -euo pipefail - python3 - <<'PY' - import os - import ssl - import urllib.error - import urllib.parse - import urllib.request - - endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/") - token_id = os.environ["TF_VAR_proxmox_api_token_id"] - token_secret = os.environ["TF_VAR_proxmox_api_token_secret"] - insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true" - node = "flex" - storage = "Flash" - vm_ids = [200, 201, 202, 210, 211, 212, 213, 214] - context = ssl._create_unverified_context() if insecure else None - headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"} - - def request(method, path): - req = urllib.request.Request( - f"{endpoint}/api2/json{path}", - method=method, - headers=headers, - ) - return urllib.request.urlopen(req, context=context, timeout=30) - - def vm_exists(vmid): - try: - request("GET", f"/nodes/{node}/qemu/{vmid}/status/current").close() - return True - except urllib.error.HTTPError as err: - if err.code == 404: - return False - if err.code == 500 and "conf' does not exist" in err.reason: - return False - raise - - for vmid in vm_ids: - if vm_exists(vmid): - print(f"VM {vmid} exists; keeping cloud-init volume") - continue - - volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="") - try: - request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}").close() - print(f"Deleted orphan cloud-init volume for VM {vmid}") - except urllib.error.HTTPError as err: - if err.code == 404: - print(f"No orphan cloud-init volume for VM {vmid}") - continue - raise - PY + run: python3 scripts/proxmox-rebuild-cleanup.py --mode orphan-cloudinit --terraform-dir terraform --plan tfplan - name: Terraform Apply if: github.ref == 'refs/heads/main' && github.event_name 
== 'push' @@ -163,6 +113,7 @@ jobs: run_apply() { local log_file="$1" terraform apply \ + -parallelism=2 \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ -auto-approve 2>&1 | tee "${log_file}" @@ -170,124 +121,10 @@ jobs: } cleanup_untracked_target_vms() { - python3 - <<'PY' - import os - import ssl - import subprocess - import time - import urllib.error - import urllib.parse - import urllib.request - - endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/") - token_id = os.environ["TF_VAR_proxmox_api_token_id"] - token_secret = os.environ["TF_VAR_proxmox_api_token_secret"] - insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true" - node = "flex" - storage = "Flash" - context = ssl._create_unverified_context() if insecure else None - headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"} - targets = { - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"), - 'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"), + python3 ../scripts/proxmox-rebuild-cleanup.py --mode untracked-vms --terraform-dir . 
--plan tfplan } - def request(method, path, data=None): - body = None - req_headers = dict(headers) - if data is not None: - encoded = urllib.parse.urlencode(data) - if method == "DELETE": - path = f"{path}?{encoded}" - else: - body = encoded.encode() - req_headers["Content-Type"] = "application/x-www-form-urlencoded" - req = urllib.request.Request( - f"{endpoint}/api2/json{path}", - method=method, - headers=req_headers, - data=body, - ) - with urllib.request.urlopen(req, context=context, timeout=60) as resp: - return resp.read() - - def vm_status(vmid): - try: - request("GET", f"/nodes/{node}/qemu/{vmid}/status/current") - return True - except urllib.error.HTTPError as err: - if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason): - return False - raise - - def vm_config(vmid): - try: - raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config") - except urllib.error.HTTPError as err: - if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason): - return {} - raise - import json - return json.loads(raw).get("data", {}) - - def wait_absent(vmid): - for _ in range(60): - if not vm_status(vmid): - return - time.sleep(5) - raise RuntimeError(f"VM {vmid} still exists after delete") - - state = set( - subprocess.run( - ["terraform", "state", "list"], - check=False, - text=True, - stdout=subprocess.PIPE, - ).stdout.splitlines() - ) - - for address, (vmid, expected_name) in targets.items(): - if address in state: - continue - if not vm_status(vmid): - continue - - config = vm_config(vmid) - actual_name = config.get("name") - if actual_name != expected_name: - raise RuntimeError( - f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}" - ) - - print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry") - try: - request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop") - time.sleep(10) - except urllib.error.HTTPError as err: - if err.code not in (400, 500): - raise - - request( - "DELETE", - f"/nodes/{node}/qemu/{vmid}", - {"purge": "1", "destroy-unreferenced-disks": "1"}, - ) - wait_absent(vmid) - - volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="") - try: - request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}") - except urllib.error.HTTPError as err: - if err.code != 404: - raise - PY - } + cleanup_untracked_target_vms for attempt in 1 2 3; do log_file="/tmp/terraform-apply-${attempt}.log" @@ -299,11 +136,7 @@ jobs: exit 1 fi - if ! 
grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then - exit 1 - fi - - echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2" + echo "Terraform apply failed; cleaning Terraform-untracked partial VM creates before retry ${attempt}/2" cleanup_untracked_target_vms sleep 20 done @@ -320,11 +153,11 @@ jobs: uses: actions/upload-artifact@v3 with: name: terraform-outputs - path: outputs/terraform_outputs.json + path: terraform/outputs/terraform_outputs.json ansible: name: Ansible - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 needs: terraform if: github.ref == 'refs/heads/main' && github.event_name == 'push' steps: @@ -348,6 +181,7 @@ jobs: working-directory: terraform run: | terraform init \ + -lockfile=readonly \ -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \ -backend-config="bucket=${{ secrets.S3_BUCKET }}" \ -backend-config="region=auto" \ @@ -364,7 +198,7 @@ jobs: - name: Install Python Dependencies run: | apt-get update && apt-get install -y python3-pip - pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml + pip3 install --break-system-packages ansible==11.2.0 kubernetes==32.0.1 jinja2==3.1.5 pyyaml==6.0.2 - name: Install Ansible Collections run: ansible-galaxy collection install -r ansible/requirements.yml @@ -461,7 +295,7 @@ jobs: - name: Install kubectl run: | - curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl" chmod +x /usr/local/bin/kubectl - name: Rewrite kubeconfig for runner-reachable API @@ -476,6 +310,7 @@ jobs: KUBECONFIG: outputs/kubeconfig FLUX_GIT_HOST: 64.176.189.59 FLUX_GIT_PORT: "2222" + FLUX_KNOWN_HOSTS: ${{ secrets.FLUX_KNOWN_HOSTS }} run: | set -euo pipefail flux_rollout_status() { @@ -512,6 +347,52 @@ jobs: done } + wait_for_reconcile_handled() { + local namespace="$1" + local resource="$2" + local reconcile_at="$3" + local timeout_seconds="$4" + local elapsed=0 + local handled + + while [ "${elapsed}" -lt "${timeout_seconds}" ]; do + handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)" + if [ "${handled}" = "${reconcile_at}" ]; then + return 0 + fi + + sleep 5 + elapsed=$((elapsed + 5)) + done + + echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2 + kubectl -n "${namespace}" describe "${resource}" || true + exit 1 + } + + reconcile_flux_resource() { + local namespace="$1" + local resource="$2" + local timeout_seconds="$3" + local reconcile_at + reconcile_at="$(date +%s%N)" + kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}" + } + + reconcile_helmrelease() { + local release_name="$1" + local timeout_seconds="$2" + local reconcile_at + reconcile_at="$(date +%s%N)" + kubectl -n flux-system annotate "helmrelease/${release_name}" \ + reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ + reconcile.fluxcd.io/resetAt="${reconcile_at}" \ + reconcile.fluxcd.io/forceAt="${reconcile_at}" \ + --overwrite + wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}" + } + pull_required_image() { local image="$1" local host_ip="$2" @@ -594,12 +475,16 @@ jobs: local elapsed=0 local 
ready local stalled + local generation + local observed_generation while [ "${elapsed}" -lt "${timeout_seconds}" ]; do ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)" + generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)" + observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)" - if [ "${ready}" = "True" ]; then + if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then return 0 fi @@ -626,16 +511,10 @@ jobs: local target_namespace="$3" local oci_timeout="$4" local release_timeout="$5" - local reconcile_at local artifact_storage wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600 - reconcile_at="$(date +%s)" - kubectl -n flux-system annotate "helmrelease/${release_name}" \ - reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ - reconcile.fluxcd.io/resetAt="${reconcile_at}" \ - reconcile.fluxcd.io/forceAt="${reconcile_at}" \ - --overwrite + reconcile_helmrelease "${release_name}" 300 if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then artifact_storage="$(kubectl -n flux-system get "ocirepository/${oci_name}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)" @@ -671,7 +550,6 @@ jobs: local repo_timeout="$5" local chart_timeout="$6" local release_timeout="$7" - local reconcile_at wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600 if ! 
kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then @@ -680,13 +558,8 @@ jobs: fi wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600 - reconcile_at="$(date +%s)" - kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite - kubectl -n flux-system annotate "helmrelease/${release_name}" \ - reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ - reconcile.fluxcd.io/resetAt="${reconcile_at}" \ - reconcile.fluxcd.io/forceAt="${reconcile_at}" \ - --overwrite + reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300 + reconcile_helmrelease "${release_name}" 300 for attempt in $(seq 1 6); do if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then @@ -695,9 +568,8 @@ jobs: fi echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2 - reconcile_at="$(date +%s)" - kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite - kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300 + reconcile_helmrelease "${release_name}" 300 done flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}" @@ -705,7 +577,11 @@ jobs: } kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f - - ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts + if [ -n "${FLUX_KNOWN_HOSTS}" ]; then + printf '%s\n' "${FLUX_KNOWN_HOSTS}" > /tmp/flux_known_hosts + else + ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts + fi kubectl -n flux-system create secret generic flux-system \ --from-file=identity="$HOME/.ssh/id_ed25519" \ --from-file=known_hosts=/tmp/flux_known_hosts \ @@ -741,18 +617,17 @@ jobs: kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH" kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH" kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH" - kubectl -n flux-system delete pod --field-selector=status.phase!=Running || true flux_rollout_status source-controller flux_rollout_status kustomize-controller flux_rollout_status helm-controller kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s - kubectl -n flux-system annotate kustomization/addon-cert-manager reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite + reconcile_flux_resource flux-system kustomization/addon-cert-manager 300 kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=1200s kubectl -n flux-system wait --for=condition=Ready helmrelease/cert-manager --timeout=1200s # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details. 
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600 - kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite + reconcile_flux_resource flux-system kustomization/addon-external-secrets 300 import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}" wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600 wait_for_resource "" crd/clustersecretstores.external-secrets.io 900 @@ -764,35 +639,68 @@ jobs: wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600 wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600 kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s - # Create Doppler ClusterSecretStore now that ESO CRDs are available - kubectl apply -f - <<'EOF' - apiVersion: external-secrets.io/v1 - kind: ClusterSecretStore - metadata: - name: doppler-hetznerterra - spec: - provider: - doppler: - auth: - secretRef: - dopplerToken: - name: doppler-hetznerterra-service-token - key: dopplerToken - namespace: external-secrets - EOF + wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets-store 600 + reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 300 + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s # Wait for the storage layer and private access components import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}" import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}" - kubectl -n flux-system annotate kustomization/addon-tailscale-operator reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite + reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 300 kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=600s kubectl -n tailscale-system rollout status deployment/operator --timeout=600s import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}" - kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite + reconcile_flux_resource flux-system kustomization/addon-nfs-storage 300 kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite kubectl get storageclass flash-nfs + import_required_image docker.io/library/busybox:1.31.1 "${PRIMARY_CP_IP}" + kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true + kubectl apply -f - <<'EOF' + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: nfs-smoke + namespace: kube-system + spec: + accessModes: + - ReadWriteOnce + storageClassName: flash-nfs + resources: + requests: + storage: 1Mi + --- + apiVersion: v1 + kind: Pod + metadata: + name: nfs-smoke + namespace: kube-system + spec: + restartPolicy: Never + nodeSelector: + kubernetes.io/hostname: k8s-cluster-cp-1 + tolerations: + - key: 
node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + containers: + - name: smoke + image: docker.io/library/busybox:1.31.1 + command: + - sh + - -c + - echo ok >/data/smoke && test -s /data/smoke && sleep 30 + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + persistentVolumeClaim: + claimName: nfs-smoke + EOF + kubectl -n kube-system wait --for=condition=Ready pod/nfs-smoke --timeout=180s + kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true --wait=false - name: Wait for Rancher env: @@ -823,15 +731,50 @@ jobs: done } + wait_for_reconcile_handled() { + local namespace="$1" + local resource="$2" + local reconcile_at="$3" + local timeout_seconds="$4" + local elapsed=0 + local handled + + while [ "${elapsed}" -lt "${timeout_seconds}" ]; do + handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)" + if [ "${handled}" = "${reconcile_at}" ]; then + return 0 + fi + + sleep 5 + elapsed=$((elapsed + 5)) + done + + echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2 + kubectl -n "${namespace}" describe "${resource}" || true + exit 1 + } + + reconcile_flux_resource() { + local namespace="$1" + local resource="$2" + local timeout_seconds="$3" + local reconcile_at + reconcile_at="$(date +%s%N)" + kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}" + } + reconcile_helmrelease() { local release_name="$1" + local timeout_seconds="${2:-300}" local reconcile_at - reconcile_at="$(date +%s)" + reconcile_at="$(date +%s%N)" kubectl -n flux-system annotate "helmrelease/${release_name}" \ reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ reconcile.fluxcd.io/resetAt="${reconcile_at}" \ reconcile.fluxcd.io/forceAt="${reconcile_at}" \ --overwrite + wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}" } wait_for_helmchart_ready() { @@ -839,13 +782,11 @@ jobs: local release_name="$2" local timeout="$3" local attempts="$4" - local reconcile_at wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600 for attempt in $(seq 1 "${attempts}"); do - reconcile_at="$(date +%s)" - kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite - kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300 + reconcile_helmrelease "${release_name}" 300 if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then return 0 @@ -866,12 +807,16 @@ jobs: local elapsed=0 local ready local stalled + local generation + local observed_generation while [ "${elapsed}" -lt "${timeout_seconds}" ]; do ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)" stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)" + generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)" 
+ observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)" - if [ "${ready}" = "True" ]; then + if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then return 0 fi @@ -928,10 +873,13 @@ jobs: } echo "Waiting for Rancher..." + wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-secrets 600 + reconcile_flux_resource flux-system kustomization/addon-rancher-secrets 300 + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=600s wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600 - kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite + reconcile_flux_resource flux-system kustomization/addon-rancher 300 wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600 - reconcile_helmrelease rancher + reconcile_helmrelease rancher 300 wait_for_helmchart_ready flux-system-rancher rancher 180s 5 wait_for_helmrelease_ready rancher cattle-system 900 wait_for_resource "" namespace/cattle-system 600 @@ -956,6 +904,66 @@ jobs: printf '%s' "$1" | tr '/:' '__' } + wait_for_resource() { + local namespace="$1" + local resource="$2" + local timeout_seconds="$3" + local elapsed=0 + + until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do + if [ "${elapsed}" -ge "${timeout_seconds}" ]; then + echo "Timed out waiting for ${resource} to exist" >&2 + kubectl -n flux-system get kustomizations,helmreleases || true + exit 1 + fi + + sleep 10 + elapsed=$((elapsed + 10)) + done + } + + wait_for_reconcile_handled() { + local resource="$1" + local reconcile_at="$2" + local timeout_seconds="$3" + local elapsed=0 + local handled + + while [ "${elapsed}" -lt "${timeout_seconds}" ]; do + handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)" + if [ "${handled}" = "${reconcile_at}" ]; then + return 0 + fi + + sleep 5 + elapsed=$((elapsed + 5)) + done + + echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2 + kubectl -n flux-system describe "${resource}" || true + exit 1 + } + + reconcile_flux_resource() { + local resource="$1" + local reconcile_at + reconcile_at="$(date +%s%N)" + kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + wait_for_reconcile_handled "${resource}" "${reconcile_at}" 300 + } + + reconcile_helmrelease() { + local release="$1" + local reconcile_at + reconcile_at="$(date +%s%N)" + kubectl -n flux-system annotate "helmrelease/${release}" \ + reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ + reconcile.fluxcd.io/resetAt="${reconcile_at}" \ + reconcile.fluxcd.io/forceAt="${reconcile_at}" \ + --overwrite + wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300 + } + import_required_image() { local image="$1" local host_ip="$2" @@ -1015,7 +1023,8 @@ jobs: done if [ "${failed}" = "true" ]; then - echo "Warning: failed to import ${image} on one or more nodes; continuing so Kubernetes can use already-seeded nodes or retry pulls" >&2 + echo "Failed to import required image ${image} on one or more nodes" >&2 + exit 1 fi } @@ -1034,13 +1043,10 @@ jobs: quay.io/prometheus/node-exporter:v1.8.2; do import_required_image_on_all_nodes "${image}" done - reconcile_at="$(date +%s)" + reconcile_flux_resource 
kustomization/addon-observability + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s for release in kube-prometheus-stack loki promtail; do - kubectl -n flux-system annotate "helmrelease/${release}" \ - reconcile.fluxcd.io/requestedAt="${reconcile_at}" \ - reconcile.fluxcd.io/resetAt="${reconcile_at}" \ - reconcile.fluxcd.io/forceAt="${reconcile_at}" \ - --overwrite + reconcile_helmrelease "${release}" done kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true @@ -1055,11 +1061,14 @@ jobs: kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s diff --git a/.gitea/workflows/destroy.yml b/.gitea/workflows/destroy.yml index b4a1ea1..e6cdad6 100644 --- a/.gitea/workflows/destroy.yml +++ b/.gitea/workflows/destroy.yml @@ -27,7 +27,7 @@ env: jobs: destroy: name: Destroy Cluster - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: github.event.inputs.confirm == 'destroy' environment: destroy steps: @@ -51,6 +51,7 @@ jobs: working-directory: terraform run: | terraform init \ + -lockfile=readonly \ -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \ -backend-config="bucket=${{ secrets.S3_BUCKET }}" \ -backend-config="region=auto" \ @@ -58,6 +59,19 @@ jobs: -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ -backend-config="skip_requesting_account_id=true" + - name: Save Proxmox target list + run: | + mkdir -p outputs + if ! 
terraform -chdir=terraform output -json proxmox_target_vms > outputs/proxmox_target_vms.json; then + terraform -chdir=terraform plan \ + -refresh=false \ + -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ + -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ + -out=cleanup.tfplan \ + -no-color || true + printf '[]' > outputs/proxmox_target_vms.json + fi + - name: Terraform Destroy id: destroy working-directory: terraform @@ -66,6 +80,7 @@ jobs: for attempt in 1 2 3; do echo "Terraform destroy attempt ${attempt}/3" terraform destroy \ + -parallelism=2 \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ -auto-approve @@ -83,6 +98,14 @@ jobs: done exit "$rc" + - name: Verify Proxmox target VMs removed + if: success() + run: | + python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --targets-file outputs/proxmox_target_vms.json + if [ -f terraform/cleanup.tfplan ]; then + python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --terraform-dir terraform --plan cleanup.tfplan + fi + - name: Terraform state diagnostics if: failure() && steps.destroy.outcome == 'failure' run: | diff --git a/.gitignore b/.gitignore index 90bff6b..b0a52da 100644 --- a/.gitignore +++ b/.gitignore @@ -3,7 +3,6 @@ *.tfstate.* *.tfstate.backup .terraform/ -.terraform.lock.hcl terraform.tfvars crash.log override.tf diff --git a/ansible/roles/bootstrap-image-prepull/tasks/main.yml b/ansible/roles/bootstrap-image-prepull/tasks/main.yml index 77bc04f..b1998bb 100644 --- a/ansible/roles/bootstrap-image-prepull/tasks/main.yml +++ b/ansible/roles/bootstrap-image-prepull/tasks/main.yml @@ -21,14 +21,3 @@ register: bootstrap_image_pull loop: "{{ bootstrap_prepull_images }}" changed_when: "'pulled image' in bootstrap_image_pull.stdout" - failed_when: false - -- name: Report bootstrap images that did not pre-pull after retries - debug: - msg: >- - Best-effort bootstrap image pre-pull did not complete for {{ item.item }} after - 3 attempt(s): {{ item.stderr | default('no stderr') }} - loop: "{{ bootstrap_image_pull.results | default([]) }}" - loop_control: - label: "{{ item.item }}" - when: item.rc is defined and item.rc != 0 diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index a34104f..17ad3b2 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -95,6 +95,10 @@ - name: Install tailscale shell: curl -fsSL https://tailscale.com/install.sh | sh + register: tailscale_install + until: tailscale_install.rc == 0 + retries: 5 + delay: 15 when: - tailscale_auth_key | length > 0 - tailscale_binary.rc != 0 @@ -117,6 +121,11 @@ - name: Connect node to tailnet command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }} + register: tailscale_up + until: tailscale_up.rc == 0 + retries: 5 + delay: 15 + no_log: true when: - tailscale_auth_key | length > 0 - tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running' diff --git a/ansible/roles/k3s-agent/tasks/main.yml b/ansible/roles/k3s-agent/tasks/main.yml index 243b512..8ce510e 100644 --- a/ansible/roles/k3s-agent/tasks/main.yml +++ b/ansible/roles/k3s-agent/tasks/main.yml @@ -32,11 +32,22 @@ url: https://get.k3s.io dest: /tmp/install-k3s.sh mode: "0755" + register: k3s_agent_install_script + until: k3s_agent_install_script is succeeded + retries: 5 + delay: 10 when: 
k3s_agent_install_needed - name: Install k3s agent when: k3s_agent_install_needed block: + - name: Wait for Kubernetes API endpoint before agent join + wait_for: + host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}" + port: 6443 + state: started + timeout: 180 + - name: Run k3s agent install environment: INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}" @@ -48,7 +59,9 @@ --flannel-iface={{ k3s_flannel_iface }} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} register: k3s_agent_install - failed_when: false + until: k3s_agent_install.rc == 0 + retries: 3 + delay: 20 - name: Wait for k3s agent to be ready command: systemctl is-active k3s-agent diff --git a/ansible/roles/k3s-server/tasks/main.yml b/ansible/roles/k3s-server/tasks/main.yml index 693dfbe..89a36c3 100644 --- a/ansible/roles/k3s-server/tasks/main.yml +++ b/ansible/roles/k3s-server/tasks/main.yml @@ -62,6 +62,10 @@ url: https://get.k3s.io dest: /tmp/install-k3s.sh mode: "0755" + register: k3s_install_script + until: k3s_install_script is succeeded + retries: 5 + delay: 10 when: k3s_install_needed - name: Install k3s server (primary) @@ -82,7 +86,9 @@ {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} register: primary_install - failed_when: false + until: primary_install.rc == 0 + retries: 3 + delay: 20 when: - k3s_install_needed - k3s_primary | default(false) @@ -106,7 +112,9 @@ {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} register: secondary_install - failed_when: false + until: secondary_install.rc == 0 + retries: 3 + delay: 20 - name: Wait for k3s to be ready command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}" diff --git a/ansible/roles/kube-vip-deploy/tasks/main.yml b/ansible/roles/kube-vip-deploy/tasks/main.yml index 5964da1..5c00bfe 100644 --- a/ansible/roles/kube-vip-deploy/tasks/main.yml +++ b/ansible/roles/kube-vip-deploy/tasks/main.yml @@ -40,17 +40,6 @@ register: kube_vip_image_pull loop: "{{ kube_vip_prepull_images }}" changed_when: "'pulled image' in kube_vip_image_pull.stdout" - failed_when: false - -- name: Report kube-vip images that did not pre-pull after retries - debug: - msg: >- - Best-effort kube-vip image pre-pull did not complete for {{ item.item }} after - 3 attempt(s): {{ item.stderr | default('no stderr') }} - loop: "{{ kube_vip_image_pull.results | default([]) }}" - loop_control: - label: "{{ item.item }}" - when: item.rc is defined and item.rc != 0 - name: Render kube-vip control plane manifest template: @@ -60,6 +49,10 @@ - name: Apply kube-vip control plane manifest command: kubectl apply -f /tmp/kube-vip-control-plane.yaml + register: kube_vip_apply + until: kube_vip_apply.rc == 0 + retries: 3 + delay: 10 changed_when: true - name: Wait for local kube-vip pod to be ready diff --git a/ansible/roles/observability-content/tasks/main.yml b/ansible/roles/observability-content/tasks/main.yml index 6b5379e..3502234 100644 --- a/ansible/roles/observability-content/tasks/main.yml +++ b/ansible/roles/observability-content/tasks/main.yml @@ -105,6 +105,11 @@ register: grafana_loki_labels changed_when: false failed_when: false + until: >- + grafana_loki_labels.rc != 0 or + '"data":[]' not in (grafana_loki_labels.stdout | 
replace(' ', '')) + retries: 30 + delay: 10 when: loki_enabled - name: Fail when Loki is reachable but has zero indexed labels diff --git a/ansible/roles/rancher-image-prepull/tasks/main.yml b/ansible/roles/rancher-image-prepull/tasks/main.yml index 76ae8ae..eac5075 100644 --- a/ansible/roles/rancher-image-prepull/tasks/main.yml +++ b/ansible/roles/rancher-image-prepull/tasks/main.yml @@ -21,14 +21,3 @@ register: rancher_image_pull loop: "{{ rancher_images_to_prepull }}" changed_when: "'pulled image' in rancher_image_pull.stdout" - failed_when: false - -- name: Report Rancher images that did not pre-pull after retries - debug: - msg: >- - Best-effort Rancher image pre-pull did not complete for {{ item.item }} after - 3 attempt(s): {{ item.stderr | default('no stderr') }} - loop: "{{ rancher_image_pull.results | default([]) }}" - loop_control: - label: "{{ item.item }}" - when: item.rc is defined and item.rc != 0 diff --git a/ansible/roles/tailscale-cleanup/tasks/main.yml b/ansible/roles/tailscale-cleanup/tasks/main.yml index 967429d..7bad46b 100644 --- a/ansible/roles/tailscale-cleanup/tasks/main.yml +++ b/ansible/roles/tailscale-cleanup/tasks/main.yml @@ -9,6 +9,9 @@ Authorization: "Bearer {{ tailscale_api_key }}" return_content: true register: ts_devices + until: ts_devices.status == 200 + retries: 5 + delay: 10 - name: Find stale devices matching reserved hostnames set_fact: @@ -34,6 +37,10 @@ headers: Authorization: "Bearer {{ tailscale_api_key }}" status_code: 200 + register: ts_delete_device + until: ts_delete_device.status == 200 + retries: 3 + delay: 5 loop: "{{ stale_devices }}" loop_control: label: "{{ item.name }} ({{ item.id }})" diff --git a/ansible/site.yml b/ansible/site.yml index 3507cd5..38bb454 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -76,6 +76,18 @@ roles: - kube-vip-deploy +- name: Wait for Kubernetes API VIP readiness + hosts: control_plane[0] + become: true + tasks: + - name: Wait for Kubernetes readyz through the VIP + command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz + register: api_readyz + until: api_readyz.rc == 0 + retries: 30 + delay: 10 + changed_when: false + - name: Setup secondary control planes hosts: control_plane[1:] become: true @@ -123,6 +135,31 @@ - name: Import kube-vip image into containerd command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar + register: kube_vip_secondary_import + until: kube_vip_secondary_import.rc == 0 + retries: 3 + delay: 10 + changed_when: false + +- name: Wait for all control plane nodes to be Ready + hosts: control_plane[0] + become: true + tasks: + - name: Wait for control plane node readiness + command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s + register: control_plane_ready + until: control_plane_ready.rc == 0 + retries: 20 + delay: 15 + changed_when: false + loop: "{{ groups['control_plane'] }}" + + - name: Wait for Kubernetes readyz before worker joins + command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz + register: api_readyz_before_workers + until: api_readyz_before_workers.rc == 0 + retries: 30 + delay: 10 changed_when: false - name: Setup workers diff --git a/infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml b/infrastructure/addons/external-secrets-store/clustersecretstore-doppler-hetznerterra.yaml similarity index 100% rename from infrastructure/addons/external-secrets/clustersecretstore-doppler-hetznerterra.yaml rename to 
infrastructure/addons/external-secrets-store/clustersecretstore-doppler-hetznerterra.yaml diff --git a/infrastructure/addons/external-secrets-store/kustomization.yaml b/infrastructure/addons/external-secrets-store/kustomization.yaml new file mode 100644 index 0000000..15238f5 --- /dev/null +++ b/infrastructure/addons/external-secrets-store/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - clustersecretstore-doppler-hetznerterra.yaml diff --git a/infrastructure/addons/kustomization-external-secrets-store.yaml b/infrastructure/addons/kustomization-external-secrets-store.yaml new file mode 100644 index 0000000..daf5b3e --- /dev/null +++ b/infrastructure/addons/kustomization-external-secrets-store.yaml @@ -0,0 +1,21 @@ +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: addon-external-secrets-store + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: GitRepository + name: platform + path: ./infrastructure/addons/external-secrets-store + dependsOn: + - name: addon-external-secrets + wait: false + healthChecks: + - apiVersion: external-secrets.io/v1 + kind: ClusterSecretStore + name: doppler-hetznerterra + timeout: 5m + suspend: false diff --git a/infrastructure/addons/kustomization-external-secrets.yaml b/infrastructure/addons/kustomization-external-secrets.yaml index 6ec3eea..a426d58 100644 --- a/infrastructure/addons/kustomization-external-secrets.yaml +++ b/infrastructure/addons/kustomization-external-secrets.yaml @@ -16,5 +16,13 @@ spec: kind: HelmRelease name: external-secrets namespace: flux-system - timeout: 5m + - apiVersion: apps/v1 + kind: Deployment + name: external-secrets-external-secrets + namespace: external-secrets + - apiVersion: apps/v1 + kind: Deployment + name: external-secrets-external-secrets-webhook + namespace: external-secrets + timeout: 10m suspend: false diff --git a/infrastructure/addons/kustomization-observability-secrets.yaml b/infrastructure/addons/kustomization-observability-secrets.yaml new file mode 100644 index 0000000..760ff85 --- /dev/null +++ b/infrastructure/addons/kustomization-observability-secrets.yaml @@ -0,0 +1,26 @@ +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: addon-observability-secrets + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: GitRepository + name: platform + path: ./infrastructure/addons/observability-secrets + dependsOn: + - name: addon-external-secrets-store + wait: false + healthChecks: + - apiVersion: external-secrets.io/v1 + kind: ExternalSecret + name: grafana-admin + namespace: observability + - apiVersion: v1 + kind: Secret + name: grafana-admin-credentials + namespace: observability + timeout: 5m + suspend: false diff --git a/infrastructure/addons/kustomization-observability.yaml b/infrastructure/addons/kustomization-observability.yaml index 2f9b2f3..f4ff8ff 100644 --- a/infrastructure/addons/kustomization-observability.yaml +++ b/infrastructure/addons/kustomization-observability.yaml @@ -11,7 +11,8 @@ spec: name: platform path: ./infrastructure/addons/observability dependsOn: - - name: addon-external-secrets + - name: addon-observability-secrets + - name: addon-nfs-storage - name: addon-tailscale-operator - name: addon-tailscale-proxyclass wait: false @@ -28,5 +29,5 @@ spec: kind: HelmRelease name: promtail namespace: flux-system - timeout: 5m + timeout: 15m suspend: false diff --git 
a/infrastructure/addons/kustomization-rancher-config.yaml b/infrastructure/addons/kustomization-rancher-config.yaml index 6c18099..005e685 100644 --- a/infrastructure/addons/kustomization-rancher-config.yaml +++ b/infrastructure/addons/kustomization-rancher-config.yaml @@ -13,5 +13,5 @@ spec: dependsOn: - name: addon-rancher wait: true - timeout: 5m + timeout: 10m suspend: false diff --git a/infrastructure/addons/kustomization-rancher-secrets.yaml b/infrastructure/addons/kustomization-rancher-secrets.yaml new file mode 100644 index 0000000..0e1d38a --- /dev/null +++ b/infrastructure/addons/kustomization-rancher-secrets.yaml @@ -0,0 +1,34 @@ +apiVersion: kustomize.toolkit.fluxcd.io/v1 +kind: Kustomization +metadata: + name: addon-rancher-secrets + namespace: flux-system +spec: + interval: 10m + prune: true + sourceRef: + kind: GitRepository + name: platform + path: ./infrastructure/addons/rancher-secrets + dependsOn: + - name: addon-external-secrets-store + wait: false + healthChecks: + - apiVersion: external-secrets.io/v1 + kind: ExternalSecret + name: rancher-bootstrap-password + namespace: flux-system + - apiVersion: v1 + kind: Secret + name: rancher-bootstrap-password + namespace: flux-system + - apiVersion: external-secrets.io/v1 + kind: ExternalSecret + name: rancher-bootstrap-password + namespace: cattle-system + - apiVersion: v1 + kind: Secret + name: rancher-bootstrap-password + namespace: cattle-system + timeout: 5m + suspend: false diff --git a/infrastructure/addons/kustomization-rancher.yaml b/infrastructure/addons/kustomization-rancher.yaml index 93d892f..fa719fd 100644 --- a/infrastructure/addons/kustomization-rancher.yaml +++ b/infrastructure/addons/kustomization-rancher.yaml @@ -10,12 +10,12 @@ spec: kind: GitRepository name: platform path: ./infrastructure/addons/rancher - timeout: 15m + timeout: 30m suspend: false dependsOn: - name: addon-tailscale-operator - name: addon-tailscale-proxyclass - - name: addon-external-secrets + - name: addon-rancher-secrets - name: addon-cert-manager wait: false healthChecks: @@ -23,3 +23,19 @@ spec: kind: HelmRelease name: rancher namespace: flux-system + - apiVersion: apps/v1 + kind: Deployment + name: cattle-system-rancher + namespace: cattle-system + - apiVersion: apps/v1 + kind: Deployment + name: rancher-webhook + namespace: cattle-system + - apiVersion: cert-manager.io/v1 + kind: Issuer + name: cattle-system-rancher + namespace: cattle-system + - apiVersion: cert-manager.io/v1 + kind: Certificate + name: tls-rancher-ingress + namespace: cattle-system diff --git a/infrastructure/addons/kustomization-tailscale-operator.yaml b/infrastructure/addons/kustomization-tailscale-operator.yaml index 44a6dd4..2e52943 100644 --- a/infrastructure/addons/kustomization-tailscale-operator.yaml +++ b/infrastructure/addons/kustomization-tailscale-operator.yaml @@ -16,5 +16,12 @@ spec: kind: HelmRelease name: tailscale-operator namespace: flux-system - timeout: 5m + - apiVersion: apps/v1 + kind: Deployment + name: operator + namespace: tailscale-system + - apiVersion: apiextensions.k8s.io/v1 + kind: CustomResourceDefinition + name: proxyclasses.tailscale.com + timeout: 10m suspend: false diff --git a/infrastructure/addons/kustomization.yaml b/infrastructure/addons/kustomization.yaml index d10dc64..32f4995 100644 --- a/infrastructure/addons/kustomization.yaml +++ b/infrastructure/addons/kustomization.yaml @@ -3,11 +3,14 @@ kind: Kustomization resources: - kustomization-nfs-storage.yaml - kustomization-external-secrets.yaml + - 
kustomization-external-secrets-store.yaml - kustomization-cert-manager.yaml - kustomization-tailscale-operator.yaml - kustomization-tailscale-proxyclass.yaml - traefik + - kustomization-observability-secrets.yaml - kustomization-observability.yaml - kustomization-observability-content.yaml + - kustomization-rancher-secrets.yaml - kustomization-rancher.yaml - kustomization-rancher-config.yaml diff --git a/infrastructure/addons/observability/grafana-admin-externalsecret.yaml b/infrastructure/addons/observability-secrets/grafana-admin-externalsecret.yaml similarity index 100% rename from infrastructure/addons/observability/grafana-admin-externalsecret.yaml rename to infrastructure/addons/observability-secrets/grafana-admin-externalsecret.yaml diff --git a/infrastructure/addons/observability-secrets/kustomization.yaml b/infrastructure/addons/observability-secrets/kustomization.yaml new file mode 100644 index 0000000..bdf93b1 --- /dev/null +++ b/infrastructure/addons/observability-secrets/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - grafana-admin-externalsecret.yaml diff --git a/infrastructure/addons/observability/namespace.yaml b/infrastructure/addons/observability-secrets/namespace.yaml similarity index 100% rename from infrastructure/addons/observability/namespace.yaml rename to infrastructure/addons/observability-secrets/namespace.yaml diff --git a/infrastructure/addons/observability/kustomization.yaml b/infrastructure/addons/observability/kustomization.yaml index 273635a..d1373cd 100644 --- a/infrastructure/addons/observability/kustomization.yaml +++ b/infrastructure/addons/observability/kustomization.yaml @@ -1,8 +1,6 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - namespace.yaml - - grafana-admin-externalsecret.yaml - ocirepository-loki.yaml - ocirepository-promtail.yaml - helmrelease-kube-prometheus-stack.yaml diff --git a/infrastructure/addons/rancher-secrets/kustomization.yaml b/infrastructure/addons/rancher-secrets/kustomization.yaml new file mode 100644 index 0000000..55b2d82 --- /dev/null +++ b/infrastructure/addons/rancher-secrets/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - namespace.yaml + - rancher-bootstrap-password-flux-externalsecret.yaml + - rancher-bootstrap-password-externalsecret.yaml diff --git a/infrastructure/addons/rancher/namespace.yaml b/infrastructure/addons/rancher-secrets/namespace.yaml similarity index 100% rename from infrastructure/addons/rancher/namespace.yaml rename to infrastructure/addons/rancher-secrets/namespace.yaml diff --git a/infrastructure/addons/rancher/rancher-bootstrap-password-externalsecret.yaml b/infrastructure/addons/rancher-secrets/rancher-bootstrap-password-externalsecret.yaml similarity index 100% rename from infrastructure/addons/rancher/rancher-bootstrap-password-externalsecret.yaml rename to infrastructure/addons/rancher-secrets/rancher-bootstrap-password-externalsecret.yaml diff --git a/infrastructure/addons/rancher/rancher-bootstrap-password-flux-externalsecret.yaml b/infrastructure/addons/rancher-secrets/rancher-bootstrap-password-flux-externalsecret.yaml similarity index 100% rename from infrastructure/addons/rancher/rancher-bootstrap-password-flux-externalsecret.yaml rename to infrastructure/addons/rancher-secrets/rancher-bootstrap-password-flux-externalsecret.yaml diff --git a/infrastructure/addons/rancher/kustomization.yaml 
b/infrastructure/addons/rancher/kustomization.yaml index b92e381..8b7e743 100644 --- a/infrastructure/addons/rancher/kustomization.yaml +++ b/infrastructure/addons/rancher/kustomization.yaml @@ -1,8 +1,5 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - namespace.yaml - helmrelease-rancher.yaml - - rancher-bootstrap-password-flux-externalsecret.yaml - - rancher-bootstrap-password-externalsecret.yaml - rancher-tailscale-service.yaml diff --git a/scripts/proxmox-rebuild-cleanup.py b/scripts/proxmox-rebuild-cleanup.py new file mode 100644 index 0000000..1e2e480 --- /dev/null +++ b/scripts/proxmox-rebuild-cleanup.py @@ -0,0 +1,275 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import ssl +import subprocess +import sys +import time +import urllib.error +import urllib.parse +import urllib.request + + +def api_context(): + endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/") + token_id = os.environ["TF_VAR_proxmox_api_token_id"] + token_secret = os.environ["TF_VAR_proxmox_api_token_secret"] + insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true" + context = ssl._create_unverified_context() if insecure else None + headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"} + return endpoint, context, headers + + +ENDPOINT, SSL_CONTEXT, HEADERS = api_context() + + +def request(method, path, data=None, timeout=60): + body = None + headers = dict(HEADERS) + if data is not None: + encoded = urllib.parse.urlencode(data) + if method == "DELETE": + path = f"{path}?{encoded}" + else: + body = encoded.encode() + headers["Content-Type"] = "application/x-www-form-urlencoded" + + req = urllib.request.Request( + f"{ENDPOINT}/api2/json{path}", + method=method, + headers=headers, + data=body, + ) + with urllib.request.urlopen(req, context=SSL_CONTEXT, timeout=timeout) as resp: + return resp.read() + + +def is_missing_vm_error(err): + return err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason) + + +def vm_exists(target): + try: + request("GET", f"/nodes/{target['node_name']}/qemu/{target['vm_id']}/status/current") + return True + except urllib.error.HTTPError as err: + if is_missing_vm_error(err): + return False + raise + + +def vm_config(target): + try: + raw = request("GET", f"/nodes/{target['node_name']}/qemu/{target['vm_id']}/config") + except urllib.error.HTTPError as err: + if is_missing_vm_error(err): + return {} + raise + return json.loads(raw).get("data", {}) + + +def wait_absent(target): + for _ in range(60): + if not vm_exists(target): + return + time.sleep(5) + raise RuntimeError(f"VM {target['vm_id']} still exists after delete") + + +def normalize_target(raw, address=None): + initialization = raw.get("initialization") or [] + cloud_init_storage = raw.get("cloud_init_storage") + if not cloud_init_storage and initialization and isinstance(initialization, list): + cloud_init_storage = (initialization[0] or {}).get("datastore_id") + + return { + "address": address or raw.get("address"), + "name": raw["name"], + "vm_id": int(raw["vm_id"]), + "node_name": raw.get("node_name") or os.environ.get("TF_VAR_proxmox_node_name", "flex"), + "cloud_init_storage": cloud_init_storage + or os.environ.get("TF_VAR_proxmox_cloud_init_storage_pool", "Flash"), + "tags": raw.get("tags") or [], + "description": raw.get("description") or "", + } + + +def targets_from_plan(terraform_dir, plan_path): + result = subprocess.run( + ["terraform", "-chdir=" + terraform_dir, 
"show", "-json", plan_path], + check=True, + text=True, + stdout=subprocess.PIPE, + ) + plan = json.loads(result.stdout) + targets = [] + for change in plan.get("resource_changes", []): + if change.get("type") != "proxmox_virtual_environment_vm": + continue + after = (change.get("change") or {}).get("after") or {} + if not after.get("name") or after.get("vm_id") is None: + continue + targets.append(normalize_target(after, change.get("address"))) + return targets + + +def targets_from_output(terraform_dir): + result = subprocess.run( + ["terraform", "-chdir=" + terraform_dir, "output", "-json", "proxmox_target_vms"], + check=True, + text=True, + stdout=subprocess.PIPE, + ) + return [normalize_target(target) for target in json.loads(result.stdout)] + + +def targets_from_file(path): + with open(path, encoding="utf-8") as handle: + data = json.load(handle) + if isinstance(data, dict) and "proxmox_target_vms" in data: + data = data["proxmox_target_vms"]["value"] + return [normalize_target(target) for target in data] + + +def load_targets(args): + if args.targets_file: + return targets_from_file(args.targets_file) + if args.plan: + return targets_from_plan(args.terraform_dir, args.plan) + return targets_from_output(args.terraform_dir) + + +def terraform_state(terraform_dir): + result = subprocess.run( + ["terraform", "-chdir=" + terraform_dir, "state", "list"], + check=False, + text=True, + stdout=subprocess.PIPE, + ) + return set(result.stdout.splitlines()) + + +def tags_from_config(config): + raw = config.get("tags") or "" + if isinstance(raw, list): + return set(raw) + return {tag for tag in raw.split(";") if tag} + + +def assert_owned(target, config): + actual_name = config.get("name") + if actual_name != target["name"]: + raise RuntimeError( + f"Refusing to delete VM {target['vm_id']}: expected name {target['name']!r}, got {actual_name!r}" + ) + + tags = tags_from_config(config) + expected_tags = set(target.get("tags") or []) + description = config.get("description") or "" + expected_description = target.get("description") or "" + has_expected_tags = bool(expected_tags) and expected_tags.issubset(tags) + has_expected_description = bool(expected_description) and description == expected_description + + if not has_expected_tags and not has_expected_description: + raise RuntimeError( + f"Refusing to delete VM {target['vm_id']} ({target['name']}): ownership tags/description do not match" + ) + + +def delete_cloud_init(target): + volume = urllib.parse.quote( + f"{target['cloud_init_storage']}:vm-{target['vm_id']}-cloudinit", + safe="", + ) + try: + request( + "DELETE", + f"/nodes/{target['node_name']}/storage/{target['cloud_init_storage']}/content/{volume}", + ) + print(f"Deleted orphan cloud-init volume for VM {target['vm_id']}") + except urllib.error.HTTPError as err: + if err.code == 404: + print(f"No orphan cloud-init volume for VM {target['vm_id']}") + return + raise + + +def delete_vm(target): + config = vm_config(target) + assert_owned(target, config) + print(f"Deleting Terraform-owned VM {target['vm_id']} ({target['name']})") + try: + request("POST", f"/nodes/{target['node_name']}/qemu/{target['vm_id']}/status/stop") + time.sleep(10) + except urllib.error.HTTPError as err: + if err.code not in (400, 500): + raise + + request( + "DELETE", + f"/nodes/{target['node_name']}/qemu/{target['vm_id']}", + {"purge": "1", "destroy-unreferenced-disks": "1"}, + ) + wait_absent(target) + delete_cloud_init(target) + + +def cleanup_orphan_cloud_init(targets): + for target in targets: + if 
+            print(f"VM {target['vm_id']} exists; keeping cloud-init volume")
+            continue
+        delete_cloud_init(target)
+
+
+def cleanup_untracked_vms(targets, terraform_dir):
+    state = terraform_state(terraform_dir)
+    for target in targets:
+        if target.get("address") and target["address"] in state:
+            continue
+        if not vm_exists(target):
+            continue
+        delete_vm(target)
+
+
+def cleanup_post_destroy(targets):
+    remaining = []
+    for target in targets:
+        if vm_exists(target):
+            delete_vm(target)
+        if vm_exists(target):
+            remaining.append(f"{target['vm_id']} ({target['name']})")
+
+    if remaining:
+        raise RuntimeError("Target VMs still exist after cleanup: " + ", ".join(remaining))
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--mode", choices=("orphan-cloudinit", "untracked-vms", "post-destroy"), required=True)
+    parser.add_argument("--terraform-dir", default="terraform")
+    parser.add_argument("--plan")
+    parser.add_argument("--targets-file")
+    args = parser.parse_args()
+
+    targets = load_targets(args)
+    if not targets:
+        print("No Proxmox target VMs found")
+        return
+
+    if args.mode == "orphan-cloudinit":
+        cleanup_orphan_cloud_init(targets)
+    elif args.mode == "untracked-vms":
+        cleanup_untracked_vms(targets, args.terraform_dir)
+    else:
+        cleanup_post_destroy(targets)
+
+
+if __name__ == "__main__":
+    try:
+        main()
+    except Exception as err:
+        print(f"ERROR: {err}", file=sys.stderr)
+        sys.exit(1)
diff --git a/terraform/.terraform.lock.hcl b/terraform/.terraform.lock.hcl
new file mode 100644
index 0000000..8b7cada
--- /dev/null
+++ b/terraform/.terraform.lock.hcl
@@ -0,0 +1,44 @@
+# This file is maintained automatically by "terraform init".
+# Manual edits may be lost in future updates.
+
+provider "registry.terraform.io/bpg/proxmox" {
+  version     = "0.103.0"
+  constraints = "~> 0.103.0"
+  hashes = [
+    "h1:jC9kBUJj9zUCLmM3ApA7OzZXHE1G+DcqxqdRR1fesGc=",
+    "zh:03ffc90757ed3827bbe50997664ed3ddf6d9b6419723a8091c5d5f81d65f8066",
+    "zh:1aef5db248cf68976fc0b5c032e1da7fca0a3c2ea6e9074aebb99828a561a898",
+    "zh:3deab5284c81c92524203a93a0dd21509eb89b867911a3612b0524f05f400740",
+    "zh:6b44e3293475d528e7a0fd298880652fa6283093ea368e227ebffaa00c3b8821",
+    "zh:739246a7653ae7052e0398bdb53d07a103aa018de5d7547d423ff5cca8b4a973",
+    "zh:74adb0f6936460318b3f0af14e11fa6483b7a8551ee592d24e2c855bf952f9ee",
+    "zh:8eac58a1d8c571bc9e997f21473fd140d8e89ff631b538e3f614dd8aa2fb2cfa",
+    "zh:ab4415f2ecafa81df3208a940ddf6efc24a661001b5003b04ba5c08b35e98b4c",
+    "zh:b6a551cf318a6e02fc04f9c817bb53ba6ab39ff7c3fa9a222529ddde7870cbad",
+    "zh:c1e4c97e079139420d9b158cb6a1008951a3b2f0280fdbe517c3026d413c71d9",
+    "zh:c2b6ac65a9d78a7558b573279a7c6afd130c9d1b6edd7819786b3eb77183f95f",
+    "zh:c8544a696504cdae6e3739e6b74372fe57b19ac081232970db8348519e23c4d5",
+    "zh:ccf3cee3bd04d339380db00b7d35eedf329c42e9441ff06e4e58682a1cccc42e",
+    "zh:f26e0763dbe6a6b2195c94b44696f2110f7f55433dc142839be16b9697fa5597",
+  ]
+}
+
+provider "registry.terraform.io/hashicorp/local" {
+  version     = "2.8.0"
+  constraints = "~> 2.8.0"
+  hashes = [
+    "h1:KCuj8nPbNP/ofQrAoQIuQ3CP6k+ADpULvxr7dw2PrpM=",
+    "zh:05f18164beab4a84753e5fedf463771ee0c6eca8e90346b8766f1e1c186dec1e",
+    "zh:563a0702e3711e25ba8930120899b681378b50cbb957fd204b37745c7c9b5f40",
+    "zh:5b56ab2ed70ed92721febb4a070af0837f1084c44825c18e4b95f7efb1d45d26",
+    "zh:6cbedc09b67a5cdb9501ff1b18a315fa46a38e0530424cab1c7f4b3acc75f489",
+    "zh:71b3bd50f89fb385a42a436ba2ce2b8e00f9de53535ce956deff1477b0b117dc",
+    "zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
"zh:9d45ac0a00b85cabdd398b859349d17f124c598b6e6bf272f1bb01321ce708a8", + "zh:a453efe8641a8f31fe806b597bf2b34d7b78b971a8e3919061ea89d61fda7b8d", + "zh:ac692bacb8c3dca8b5b37e5383168aca1f87d3cd7b40615efd300defb76494f5", + "zh:bda9e90c8547d90c9c573206985c5675cc1406047605af037a5069942c3c5966", + "zh:c30a1967de040d00f5038086dd53cdbfb78cc05d1dbc75037410f011bf2a20d8", + "zh:c80bbd1c3f56b3c836d80cf93ac0e8809305c2642f0c98b54bf5d05d3b12718c", + ] +} diff --git a/terraform/main.tf b/terraform/main.tf index 7f8c37a..324007f 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -4,12 +4,12 @@ terraform { required_providers { local = { source = "hashicorp/local" - version = "~> 2.5" + version = "~> 2.8.0" } proxmox = { source = "bpg/proxmox" - version = ">= 0.60.0" + version = "~> 0.103.0" } } } diff --git a/terraform/outputs.tf b/terraform/outputs.tf index e7e9ef8..696a5a2 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -62,3 +62,18 @@ output "kube_api_lb_ip" { description = "Load Balancer private IP for Kubernetes API (used for cluster joins)" value = var.kube_api_vip } + +output "proxmox_target_vms" { + description = "Proxmox VM targets managed by Terraform, used by rebuild cleanup checks" + value = [ + for name, node in local.nodes : { + name = name + vm_id = node.vm_id + role = node.role + node_name = var.proxmox_node_name + cloud_init_storage = var.proxmox_cloud_init_storage_pool + tags = ["terraform", var.cluster_name, node.role] + description = "Managed by Terraform for ${var.cluster_name}" + } + ] +} diff --git a/terraform/variables.tf b/terraform/variables.tf index 2965a62..cffed5e 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -20,6 +20,11 @@ variable "control_plane_count" { description = "Number of control plane nodes" type = number default = 3 + + validation { + condition = var.control_plane_count > 0 + error_message = "control_plane_count must be greater than zero." + } } variable "control_plane_cores" { @@ -44,6 +49,11 @@ variable "worker_count" { description = "Number of worker nodes" type = number default = 5 + + validation { + condition = var.worker_count >= 0 + error_message = "worker_count must be zero or greater." + } } variable "worker_cores" { @@ -193,24 +203,44 @@ variable "control_plane_ips" { description = "Static IPv4 addresses for control plane VMs" type = list(string) default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"] + + validation { + condition = length(var.control_plane_ips) == length(distinct(var.control_plane_ips)) + error_message = "control_plane_ips must be unique." + } } variable "worker_ips" { description = "Static IPv4 addresses for worker VMs" type = list(string) default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"] + + validation { + condition = length(var.worker_ips) == length(distinct(var.worker_ips)) + error_message = "worker_ips must be unique." + } } variable "control_plane_vm_ids" { description = "Fixed VMIDs for control plane VMs" type = list(number) default = [200, 201, 202] + + validation { + condition = length(var.control_plane_vm_ids) == length(distinct(var.control_plane_vm_ids)) + error_message = "control_plane_vm_ids must be unique." + } } variable "worker_vm_ids" { description = "Fixed VMIDs for worker VMs" type = list(number) default = [210, 211, 212, 213, 214] + + validation { + condition = length(var.worker_vm_ids) == length(distinct(var.worker_vm_ids)) + error_message = "worker_vm_ids must be unique." + } } variable "kube_api_vip" {