fix: harden cluster rebuild determinism
This commit is contained in:
@@ -9,6 +9,10 @@ on:
|
|||||||
- "ansible/roles/observability-content/**"
|
- "ansible/roles/observability-content/**"
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
|
concurrency:
|
||||||
|
group: prod-cluster
|
||||||
|
cancel-in-progress: false
|
||||||
|
|
||||||
env:
|
env:
|
||||||
TF_VERSION: "1.7.0"
|
TF_VERSION: "1.7.0"
|
||||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||||
@@ -24,7 +28,7 @@ env:
|
|||||||
jobs:
|
jobs:
|
||||||
dashboards:
|
dashboards:
|
||||||
name: Grafana Content
|
name: Grafana Content
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-22.04
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -46,6 +50,7 @@ jobs:
|
|||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform init \
|
terraform init \
|
||||||
|
-lockfile=readonly \
|
||||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||||
-backend-config="region=auto" \
|
-backend-config="region=auto" \
|
||||||
@@ -56,7 +61,7 @@ jobs:
|
|||||||
- name: Install Python Dependencies
|
- name: Install Python Dependencies
|
||||||
run: |
|
run: |
|
||||||
apt-get update && apt-get install -y python3-pip
|
apt-get update && apt-get install -y python3-pip
|
||||||
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
|
pip3 install --break-system-packages ansible==11.2.0 kubernetes==32.0.1 jinja2==3.1.5 pyyaml==6.0.2
|
||||||
|
|
||||||
- name: Install Ansible Collections
|
- name: Install Ansible Collections
|
||||||
run: ansible-galaxy collection install -r ansible/requirements.yml
|
run: ansible-galaxy collection install -r ansible/requirements.yml
|
||||||
|
|||||||
+244
-235
@@ -15,6 +15,7 @@ concurrency:
|
|||||||
|
|
||||||
env:
|
env:
|
||||||
TF_VERSION: "1.7.0"
|
TF_VERSION: "1.7.0"
|
||||||
|
KUBECTL_VERSION: "v1.34.6"
|
||||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||||
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
||||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||||
@@ -30,7 +31,7 @@ env:
|
|||||||
jobs:
|
jobs:
|
||||||
terraform:
|
terraform:
|
||||||
name: Terraform
|
name: Terraform
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-22.04
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout
|
- name: Checkout
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
@@ -48,6 +49,7 @@ jobs:
|
|||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform init \
|
terraform init \
|
||||||
|
-lockfile=readonly \
|
||||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||||
-backend-config="region=auto" \
|
-backend-config="region=auto" \
|
||||||
@@ -100,59 +102,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Cleanup orphan Proxmox cloud-init volumes
|
- name: Cleanup orphan Proxmox cloud-init volumes
|
||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
run: |
|
run: python3 scripts/proxmox-rebuild-cleanup.py --mode orphan-cloudinit --terraform-dir terraform --plan tfplan
|
||||||
set -euo pipefail
|
|
||||||
python3 - <<'PY'
|
|
||||||
import os
|
|
||||||
import ssl
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
|
|
||||||
token_id = os.environ["TF_VAR_proxmox_api_token_id"]
|
|
||||||
token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
|
|
||||||
insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
|
|
||||||
node = "flex"
|
|
||||||
storage = "Flash"
|
|
||||||
vm_ids = [200, 201, 202, 210, 211, 212, 213, 214]
|
|
||||||
context = ssl._create_unverified_context() if insecure else None
|
|
||||||
headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
|
|
||||||
|
|
||||||
def request(method, path):
|
|
||||||
req = urllib.request.Request(
|
|
||||||
f"{endpoint}/api2/json{path}",
|
|
||||||
method=method,
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
return urllib.request.urlopen(req, context=context, timeout=30)
|
|
||||||
|
|
||||||
def vm_exists(vmid):
|
|
||||||
try:
|
|
||||||
request("GET", f"/nodes/{node}/qemu/{vmid}/status/current").close()
|
|
||||||
return True
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code == 404:
|
|
||||||
return False
|
|
||||||
if err.code == 500 and "conf' does not exist" in err.reason:
|
|
||||||
return False
|
|
||||||
raise
|
|
||||||
|
|
||||||
for vmid in vm_ids:
|
|
||||||
if vm_exists(vmid):
|
|
||||||
print(f"VM {vmid} exists; keeping cloud-init volume")
|
|
||||||
continue
|
|
||||||
|
|
||||||
volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
|
|
||||||
try:
|
|
||||||
request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}").close()
|
|
||||||
print(f"Deleted orphan cloud-init volume for VM {vmid}")
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code == 404:
|
|
||||||
print(f"No orphan cloud-init volume for VM {vmid}")
|
|
||||||
continue
|
|
||||||
raise
|
|
||||||
PY
|
|
||||||
|
|
||||||
- name: Terraform Apply
|
- name: Terraform Apply
|
||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
@@ -163,6 +113,7 @@ jobs:
|
|||||||
run_apply() {
|
run_apply() {
|
||||||
local log_file="$1"
|
local log_file="$1"
|
||||||
terraform apply \
|
terraform apply \
|
||||||
|
-parallelism=2 \
|
||||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||||
-auto-approve 2>&1 | tee "${log_file}"
|
-auto-approve 2>&1 | tee "${log_file}"
|
||||||
@@ -170,124 +121,10 @@ jobs:
|
|||||||
}
|
}
|
||||||
|
|
||||||
cleanup_untracked_target_vms() {
|
cleanup_untracked_target_vms() {
|
||||||
python3 - <<'PY'
|
python3 ../scripts/proxmox-rebuild-cleanup.py --mode untracked-vms --terraform-dir . --plan tfplan
|
||||||
import os
|
|
||||||
import ssl
|
|
||||||
import subprocess
|
|
||||||
import time
|
|
||||||
import urllib.error
|
|
||||||
import urllib.parse
|
|
||||||
import urllib.request
|
|
||||||
|
|
||||||
endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
|
|
||||||
token_id = os.environ["TF_VAR_proxmox_api_token_id"]
|
|
||||||
token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
|
|
||||||
insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
|
|
||||||
node = "flex"
|
|
||||||
storage = "Flash"
|
|
||||||
context = ssl._create_unverified_context() if insecure else None
|
|
||||||
headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
|
|
||||||
targets = {
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"),
|
|
||||||
'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def request(method, path, data=None):
|
cleanup_untracked_target_vms
|
||||||
body = None
|
|
||||||
req_headers = dict(headers)
|
|
||||||
if data is not None:
|
|
||||||
encoded = urllib.parse.urlencode(data)
|
|
||||||
if method == "DELETE":
|
|
||||||
path = f"{path}?{encoded}"
|
|
||||||
else:
|
|
||||||
body = encoded.encode()
|
|
||||||
req_headers["Content-Type"] = "application/x-www-form-urlencoded"
|
|
||||||
req = urllib.request.Request(
|
|
||||||
f"{endpoint}/api2/json{path}",
|
|
||||||
method=method,
|
|
||||||
headers=req_headers,
|
|
||||||
data=body,
|
|
||||||
)
|
|
||||||
with urllib.request.urlopen(req, context=context, timeout=60) as resp:
|
|
||||||
return resp.read()
|
|
||||||
|
|
||||||
def vm_status(vmid):
|
|
||||||
try:
|
|
||||||
request("GET", f"/nodes/{node}/qemu/{vmid}/status/current")
|
|
||||||
return True
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
|
|
||||||
return False
|
|
||||||
raise
|
|
||||||
|
|
||||||
def vm_config(vmid):
|
|
||||||
try:
|
|
||||||
raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
|
|
||||||
return {}
|
|
||||||
raise
|
|
||||||
import json
|
|
||||||
return json.loads(raw).get("data", {})
|
|
||||||
|
|
||||||
def wait_absent(vmid):
|
|
||||||
for _ in range(60):
|
|
||||||
if not vm_status(vmid):
|
|
||||||
return
|
|
||||||
time.sleep(5)
|
|
||||||
raise RuntimeError(f"VM {vmid} still exists after delete")
|
|
||||||
|
|
||||||
state = set(
|
|
||||||
subprocess.run(
|
|
||||||
["terraform", "state", "list"],
|
|
||||||
check=False,
|
|
||||||
text=True,
|
|
||||||
stdout=subprocess.PIPE,
|
|
||||||
).stdout.splitlines()
|
|
||||||
)
|
|
||||||
|
|
||||||
for address, (vmid, expected_name) in targets.items():
|
|
||||||
if address in state:
|
|
||||||
continue
|
|
||||||
if not vm_status(vmid):
|
|
||||||
continue
|
|
||||||
|
|
||||||
config = vm_config(vmid)
|
|
||||||
actual_name = config.get("name")
|
|
||||||
if actual_name != expected_name:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")
|
|
||||||
try:
|
|
||||||
request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
|
|
||||||
time.sleep(10)
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code not in (400, 500):
|
|
||||||
raise
|
|
||||||
|
|
||||||
request(
|
|
||||||
"DELETE",
|
|
||||||
f"/nodes/{node}/qemu/{vmid}",
|
|
||||||
{"purge": "1", "destroy-unreferenced-disks": "1"},
|
|
||||||
)
|
|
||||||
wait_absent(vmid)
|
|
||||||
|
|
||||||
volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
|
|
||||||
try:
|
|
||||||
request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
|
|
||||||
except urllib.error.HTTPError as err:
|
|
||||||
if err.code != 404:
|
|
||||||
raise
|
|
||||||
PY
|
|
||||||
}
|
|
||||||
|
|
||||||
for attempt in 1 2 3; do
|
for attempt in 1 2 3; do
|
||||||
log_file="/tmp/terraform-apply-${attempt}.log"
|
log_file="/tmp/terraform-apply-${attempt}.log"
|
||||||
@@ -299,11 +136,7 @@ jobs:
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if ! grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then
|
echo "Terraform apply failed; cleaning Terraform-untracked partial VM creates before retry ${attempt}/2"
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
|
|
||||||
cleanup_untracked_target_vms
|
cleanup_untracked_target_vms
|
||||||
sleep 20
|
sleep 20
|
||||||
done
|
done
|
||||||
@@ -320,11 +153,11 @@ jobs:
|
|||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v3
|
||||||
with:
|
with:
|
||||||
name: terraform-outputs
|
name: terraform-outputs
|
||||||
path: outputs/terraform_outputs.json
|
path: terraform/outputs/terraform_outputs.json
|
||||||
|
|
||||||
ansible:
|
ansible:
|
||||||
name: Ansible
|
name: Ansible
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-22.04
|
||||||
needs: terraform
|
needs: terraform
|
||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
steps:
|
steps:
|
||||||
@@ -348,6 +181,7 @@ jobs:
|
|||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform init \
|
terraform init \
|
||||||
|
-lockfile=readonly \
|
||||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||||
-backend-config="region=auto" \
|
-backend-config="region=auto" \
|
||||||
@@ -364,7 +198,7 @@ jobs:
|
|||||||
- name: Install Python Dependencies
|
- name: Install Python Dependencies
|
||||||
run: |
|
run: |
|
||||||
apt-get update && apt-get install -y python3-pip
|
apt-get update && apt-get install -y python3-pip
|
||||||
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
|
pip3 install --break-system-packages ansible==11.2.0 kubernetes==32.0.1 jinja2==3.1.5 pyyaml==6.0.2
|
||||||
|
|
||||||
- name: Install Ansible Collections
|
- name: Install Ansible Collections
|
||||||
run: ansible-galaxy collection install -r ansible/requirements.yml
|
run: ansible-galaxy collection install -r ansible/requirements.yml
|
||||||
@@ -461,7 +295,7 @@ jobs:
|
|||||||
|
|
||||||
- name: Install kubectl
|
- name: Install kubectl
|
||||||
run: |
|
run: |
|
||||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
|
||||||
chmod +x /usr/local/bin/kubectl
|
chmod +x /usr/local/bin/kubectl
|
||||||
|
|
||||||
- name: Rewrite kubeconfig for runner-reachable API
|
- name: Rewrite kubeconfig for runner-reachable API
|
||||||
@@ -476,6 +310,7 @@ jobs:
|
|||||||
KUBECONFIG: outputs/kubeconfig
|
KUBECONFIG: outputs/kubeconfig
|
||||||
FLUX_GIT_HOST: 64.176.189.59
|
FLUX_GIT_HOST: 64.176.189.59
|
||||||
FLUX_GIT_PORT: "2222"
|
FLUX_GIT_PORT: "2222"
|
||||||
|
FLUX_KNOWN_HOSTS: ${{ secrets.FLUX_KNOWN_HOSTS }}
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
flux_rollout_status() {
|
flux_rollout_status() {
|
||||||
@@ -512,6 +347,52 @@ jobs:
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_for_reconcile_handled() {
|
||||||
|
local namespace="$1"
|
||||||
|
local resource="$2"
|
||||||
|
local reconcile_at="$3"
|
||||||
|
local timeout_seconds="$4"
|
||||||
|
local elapsed=0
|
||||||
|
local handled
|
||||||
|
|
||||||
|
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
||||||
|
handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
|
||||||
|
if [ "${handled}" = "${reconcile_at}" ]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 5
|
||||||
|
elapsed=$((elapsed + 5))
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
|
||||||
|
kubectl -n "${namespace}" describe "${resource}" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
reconcile_flux_resource() {
|
||||||
|
local namespace="$1"
|
||||||
|
local resource="$2"
|
||||||
|
local timeout_seconds="$3"
|
||||||
|
local reconcile_at
|
||||||
|
reconcile_at="$(date +%s%N)"
|
||||||
|
kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
||||||
|
wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
|
||||||
|
}
|
||||||
|
|
||||||
|
reconcile_helmrelease() {
|
||||||
|
local release_name="$1"
|
||||||
|
local timeout_seconds="$2"
|
||||||
|
local reconcile_at
|
||||||
|
reconcile_at="$(date +%s%N)"
|
||||||
|
kubectl -n flux-system annotate "helmrelease/${release_name}" \
|
||||||
|
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
||||||
|
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
||||||
|
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||||
|
--overwrite
|
||||||
|
wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
|
||||||
|
}
|
||||||
|
|
||||||
pull_required_image() {
|
pull_required_image() {
|
||||||
local image="$1"
|
local image="$1"
|
||||||
local host_ip="$2"
|
local host_ip="$2"
|
||||||
@@ -594,12 +475,16 @@ jobs:
|
|||||||
local elapsed=0
|
local elapsed=0
|
||||||
local ready
|
local ready
|
||||||
local stalled
|
local stalled
|
||||||
|
local generation
|
||||||
|
local observed_generation
|
||||||
|
|
||||||
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
||||||
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
||||||
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
|
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
|
||||||
|
generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
|
||||||
|
observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
|
||||||
|
|
||||||
if [ "${ready}" = "True" ]; then
|
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -626,16 +511,10 @@ jobs:
|
|||||||
local target_namespace="$3"
|
local target_namespace="$3"
|
||||||
local oci_timeout="$4"
|
local oci_timeout="$4"
|
||||||
local release_timeout="$5"
|
local release_timeout="$5"
|
||||||
local reconcile_at
|
|
||||||
local artifact_storage
|
local artifact_storage
|
||||||
|
|
||||||
wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
|
wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_helmrelease "${release_name}" 300
|
||||||
kubectl -n flux-system annotate "helmrelease/${release_name}" \
|
|
||||||
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
|
||||||
--overwrite
|
|
||||||
|
|
||||||
if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
|
if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
|
||||||
artifact_storage="$(kubectl -n flux-system get "ocirepository/${oci_name}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
|
artifact_storage="$(kubectl -n flux-system get "ocirepository/${oci_name}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
|
||||||
@@ -671,7 +550,6 @@ jobs:
|
|||||||
local repo_timeout="$5"
|
local repo_timeout="$5"
|
||||||
local chart_timeout="$6"
|
local chart_timeout="$6"
|
||||||
local release_timeout="$7"
|
local release_timeout="$7"
|
||||||
local reconcile_at
|
|
||||||
|
|
||||||
wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
|
wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
|
||||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
|
if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
|
||||||
@@ -680,13 +558,8 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
|
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
|
||||||
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
reconcile_helmrelease "${release_name}" 300
|
||||||
kubectl -n flux-system annotate "helmrelease/${release_name}" \
|
|
||||||
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
|
||||||
--overwrite
|
|
||||||
|
|
||||||
for attempt in $(seq 1 6); do
|
for attempt in $(seq 1 6); do
|
||||||
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
|
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
|
||||||
@@ -695,9 +568,8 @@ jobs:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
|
echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
|
||||||
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
reconcile_helmrelease "${release_name}" 300
|
||||||
kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
|
||||||
done
|
done
|
||||||
|
|
||||||
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
||||||
@@ -705,7 +577,11 @@ jobs:
|
|||||||
}
|
}
|
||||||
|
|
||||||
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
||||||
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
|
if [ -n "${FLUX_KNOWN_HOSTS}" ]; then
|
||||||
|
printf '%s\n' "${FLUX_KNOWN_HOSTS}" > /tmp/flux_known_hosts
|
||||||
|
else
|
||||||
|
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
|
||||||
|
fi
|
||||||
kubectl -n flux-system create secret generic flux-system \
|
kubectl -n flux-system create secret generic flux-system \
|
||||||
--from-file=identity="$HOME/.ssh/id_ed25519" \
|
--from-file=identity="$HOME/.ssh/id_ed25519" \
|
||||||
--from-file=known_hosts=/tmp/flux_known_hosts \
|
--from-file=known_hosts=/tmp/flux_known_hosts \
|
||||||
@@ -741,18 +617,17 @@ jobs:
|
|||||||
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
|
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
|
||||||
kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
|
kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
|
||||||
kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
|
kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
|
||||||
kubectl -n flux-system delete pod --field-selector=status.phase!=Running || true
|
|
||||||
flux_rollout_status source-controller
|
flux_rollout_status source-controller
|
||||||
flux_rollout_status kustomize-controller
|
flux_rollout_status kustomize-controller
|
||||||
flux_rollout_status helm-controller
|
flux_rollout_status helm-controller
|
||||||
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
|
||||||
kubectl -n flux-system annotate kustomization/addon-cert-manager reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
reconcile_flux_resource flux-system kustomization/addon-cert-manager 300
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=1200s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=1200s
|
||||||
kubectl -n flux-system wait --for=condition=Ready helmrelease/cert-manager --timeout=1200s
|
kubectl -n flux-system wait --for=condition=Ready helmrelease/cert-manager --timeout=1200s
|
||||||
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
|
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
|
||||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
|
||||||
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
reconcile_flux_resource flux-system kustomization/addon-external-secrets 300
|
||||||
import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}"
|
import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}"
|
||||||
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
|
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
|
||||||
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
|
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
|
||||||
@@ -764,35 +639,68 @@ jobs:
|
|||||||
wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
|
wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
|
||||||
wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
|
wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
|
||||||
kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
|
kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
|
||||||
# Create Doppler ClusterSecretStore now that ESO CRDs are available
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets-store 600
|
||||||
kubectl apply -f - <<'EOF'
|
reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 300
|
||||||
apiVersion: external-secrets.io/v1
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s
|
||||||
kind: ClusterSecretStore
|
|
||||||
metadata:
|
|
||||||
name: doppler-hetznerterra
|
|
||||||
spec:
|
|
||||||
provider:
|
|
||||||
doppler:
|
|
||||||
auth:
|
|
||||||
secretRef:
|
|
||||||
dopplerToken:
|
|
||||||
name: doppler-hetznerterra-service-token
|
|
||||||
key: dopplerToken
|
|
||||||
namespace: external-secrets
|
|
||||||
EOF
|
|
||||||
# Wait for the storage layer and private access components
|
# Wait for the storage layer and private access components
|
||||||
import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}"
|
import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}"
|
||||||
import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}"
|
import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}"
|
||||||
kubectl -n flux-system annotate kustomization/addon-tailscale-operator reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 300
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=600s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=600s
|
||||||
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
||||||
import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}"
|
import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}"
|
||||||
kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
reconcile_flux_resource flux-system kustomization/addon-nfs-storage 300
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
|
||||||
kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
|
kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
|
||||||
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
|
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
|
||||||
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
|
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
|
||||||
kubectl get storageclass flash-nfs
|
kubectl get storageclass flash-nfs
|
||||||
|
import_required_image docker.io/library/busybox:1.31.1 "${PRIMARY_CP_IP}"
|
||||||
|
kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true
|
||||||
|
kubectl apply -f - <<'EOF'
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: nfs-smoke
|
||||||
|
namespace: kube-system
|
||||||
|
spec:
|
||||||
|
accessModes:
|
||||||
|
- ReadWriteOnce
|
||||||
|
storageClassName: flash-nfs
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 1Mi
|
||||||
|
---
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Pod
|
||||||
|
metadata:
|
||||||
|
name: nfs-smoke
|
||||||
|
namespace: kube-system
|
||||||
|
spec:
|
||||||
|
restartPolicy: Never
|
||||||
|
nodeSelector:
|
||||||
|
kubernetes.io/hostname: k8s-cluster-cp-1
|
||||||
|
tolerations:
|
||||||
|
- key: node-role.kubernetes.io/control-plane
|
||||||
|
operator: Exists
|
||||||
|
effect: NoSchedule
|
||||||
|
containers:
|
||||||
|
- name: smoke
|
||||||
|
image: docker.io/library/busybox:1.31.1
|
||||||
|
command:
|
||||||
|
- sh
|
||||||
|
- -c
|
||||||
|
- echo ok >/data/smoke && test -s /data/smoke && sleep 30
|
||||||
|
volumeMounts:
|
||||||
|
- name: data
|
||||||
|
mountPath: /data
|
||||||
|
volumes:
|
||||||
|
- name: data
|
||||||
|
persistentVolumeClaim:
|
||||||
|
claimName: nfs-smoke
|
||||||
|
EOF
|
||||||
|
kubectl -n kube-system wait --for=condition=Ready pod/nfs-smoke --timeout=180s
|
||||||
|
kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true --wait=false
|
||||||
|
|
||||||
- name: Wait for Rancher
|
- name: Wait for Rancher
|
||||||
env:
|
env:
|
||||||
@@ -823,15 +731,50 @@ jobs:
|
|||||||
done
|
done
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_for_reconcile_handled() {
|
||||||
|
local namespace="$1"
|
||||||
|
local resource="$2"
|
||||||
|
local reconcile_at="$3"
|
||||||
|
local timeout_seconds="$4"
|
||||||
|
local elapsed=0
|
||||||
|
local handled
|
||||||
|
|
||||||
|
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
||||||
|
handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
|
||||||
|
if [ "${handled}" = "${reconcile_at}" ]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 5
|
||||||
|
elapsed=$((elapsed + 5))
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
|
||||||
|
kubectl -n "${namespace}" describe "${resource}" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
reconcile_flux_resource() {
|
||||||
|
local namespace="$1"
|
||||||
|
local resource="$2"
|
||||||
|
local timeout_seconds="$3"
|
||||||
|
local reconcile_at
|
||||||
|
reconcile_at="$(date +%s%N)"
|
||||||
|
kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
||||||
|
wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
|
||||||
|
}
|
||||||
|
|
||||||
reconcile_helmrelease() {
|
reconcile_helmrelease() {
|
||||||
local release_name="$1"
|
local release_name="$1"
|
||||||
|
local timeout_seconds="${2:-300}"
|
||||||
local reconcile_at
|
local reconcile_at
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_at="$(date +%s%N)"
|
||||||
kubectl -n flux-system annotate "helmrelease/${release_name}" \
|
kubectl -n flux-system annotate "helmrelease/${release_name}" \
|
||||||
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
||||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
||||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||||
--overwrite
|
--overwrite
|
||||||
|
wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
|
||||||
}
|
}
|
||||||
|
|
||||||
wait_for_helmchart_ready() {
|
wait_for_helmchart_ready() {
|
||||||
@@ -839,13 +782,11 @@ jobs:
|
|||||||
local release_name="$2"
|
local release_name="$2"
|
||||||
local timeout="$3"
|
local timeout="$3"
|
||||||
local attempts="$4"
|
local attempts="$4"
|
||||||
local reconcile_at
|
|
||||||
|
|
||||||
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
|
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
|
||||||
for attempt in $(seq 1 "${attempts}"); do
|
for attempt in $(seq 1 "${attempts}"); do
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
|
||||||
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
reconcile_helmrelease "${release_name}" 300
|
||||||
kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
|
||||||
|
|
||||||
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then
|
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then
|
||||||
return 0
|
return 0
|
||||||
@@ -866,12 +807,16 @@ jobs:
|
|||||||
local elapsed=0
|
local elapsed=0
|
||||||
local ready
|
local ready
|
||||||
local stalled
|
local stalled
|
||||||
|
local generation
|
||||||
|
local observed_generation
|
||||||
|
|
||||||
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
||||||
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
|
||||||
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
|
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
|
||||||
|
generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
|
||||||
|
observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
|
||||||
|
|
||||||
if [ "${ready}" = "True" ]; then
|
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
|
||||||
return 0
|
return 0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@@ -928,10 +873,13 @@ jobs:
|
|||||||
}
|
}
|
||||||
|
|
||||||
echo "Waiting for Rancher..."
|
echo "Waiting for Rancher..."
|
||||||
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-secrets 600
|
||||||
|
reconcile_flux_resource flux-system kustomization/addon-rancher-secrets 300
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=600s
|
||||||
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
|
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
|
||||||
kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
|
reconcile_flux_resource flux-system kustomization/addon-rancher 300
|
||||||
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
|
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
|
||||||
reconcile_helmrelease rancher
|
reconcile_helmrelease rancher 300
|
||||||
wait_for_helmchart_ready flux-system-rancher rancher 180s 5
|
wait_for_helmchart_ready flux-system-rancher rancher 180s 5
|
||||||
wait_for_helmrelease_ready rancher cattle-system 900
|
wait_for_helmrelease_ready rancher cattle-system 900
|
||||||
wait_for_resource "" namespace/cattle-system 600
|
wait_for_resource "" namespace/cattle-system 600
|
||||||
@@ -956,6 +904,66 @@ jobs:
|
|||||||
printf '%s' "$1" | tr '/:' '__'
|
printf '%s' "$1" | tr '/:' '__'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wait_for_resource() {
|
||||||
|
local namespace="$1"
|
||||||
|
local resource="$2"
|
||||||
|
local timeout_seconds="$3"
|
||||||
|
local elapsed=0
|
||||||
|
|
||||||
|
until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
|
||||||
|
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
|
||||||
|
echo "Timed out waiting for ${resource} to exist" >&2
|
||||||
|
kubectl -n flux-system get kustomizations,helmreleases || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 10
|
||||||
|
elapsed=$((elapsed + 10))
|
||||||
|
done
|
||||||
|
}
|
||||||
|
|
||||||
|
wait_for_reconcile_handled() {
|
||||||
|
local resource="$1"
|
||||||
|
local reconcile_at="$2"
|
||||||
|
local timeout_seconds="$3"
|
||||||
|
local elapsed=0
|
||||||
|
local handled
|
||||||
|
|
||||||
|
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
|
||||||
|
handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
|
||||||
|
if [ "${handled}" = "${reconcile_at}" ]; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
sleep 5
|
||||||
|
elapsed=$((elapsed + 5))
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
|
||||||
|
kubectl -n flux-system describe "${resource}" || true
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
|
|
||||||
|
reconcile_flux_resource() {
|
||||||
|
local resource="$1"
|
||||||
|
local reconcile_at
|
||||||
|
reconcile_at="$(date +%s%N)"
|
||||||
|
kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
||||||
|
wait_for_reconcile_handled "${resource}" "${reconcile_at}" 300
|
||||||
|
}
|
||||||
|
|
||||||
|
reconcile_helmrelease() {
|
||||||
|
local release="$1"
|
||||||
|
local reconcile_at
|
||||||
|
reconcile_at="$(date +%s%N)"
|
||||||
|
kubectl -n flux-system annotate "helmrelease/${release}" \
|
||||||
|
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
||||||
|
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
||||||
|
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||||
|
--overwrite
|
||||||
|
wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
|
||||||
|
}
|
||||||
|
|
||||||
import_required_image() {
|
import_required_image() {
|
||||||
local image="$1"
|
local image="$1"
|
||||||
local host_ip="$2"
|
local host_ip="$2"
|
||||||
@@ -1015,7 +1023,8 @@ jobs:
|
|||||||
done
|
done
|
||||||
|
|
||||||
if [ "${failed}" = "true" ]; then
|
if [ "${failed}" = "true" ]; then
|
||||||
echo "Warning: failed to import ${image} on one or more nodes; continuing so Kubernetes can use already-seeded nodes or retry pulls" >&2
|
echo "Failed to import required image ${image} on one or more nodes" >&2
|
||||||
|
exit 1
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1034,13 +1043,10 @@ jobs:
|
|||||||
quay.io/prometheus/node-exporter:v1.8.2; do
|
quay.io/prometheus/node-exporter:v1.8.2; do
|
||||||
import_required_image_on_all_nodes "${image}"
|
import_required_image_on_all_nodes "${image}"
|
||||||
done
|
done
|
||||||
reconcile_at="$(date +%s)"
|
reconcile_flux_resource kustomization/addon-observability
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
|
||||||
for release in kube-prometheus-stack loki promtail; do
|
for release in kube-prometheus-stack loki promtail; do
|
||||||
kubectl -n flux-system annotate "helmrelease/${release}" \
|
reconcile_helmrelease "${release}"
|
||||||
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
|
|
||||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
|
||||||
--overwrite
|
|
||||||
done
|
done
|
||||||
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
|
||||||
|
|
||||||
@@ -1055,11 +1061,14 @@ jobs:
|
|||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
|
||||||
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
|
||||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||||
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
|
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
|
||||||
|
|||||||
@@ -27,7 +27,7 @@ env:
|
|||||||
jobs:
|
jobs:
|
||||||
destroy:
|
destroy:
|
||||||
name: Destroy Cluster
|
name: Destroy Cluster
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-22.04
|
||||||
if: github.event.inputs.confirm == 'destroy'
|
if: github.event.inputs.confirm == 'destroy'
|
||||||
environment: destroy
|
environment: destroy
|
||||||
steps:
|
steps:
|
||||||
@@ -51,6 +51,7 @@ jobs:
|
|||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
terraform init \
|
terraform init \
|
||||||
|
-lockfile=readonly \
|
||||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||||
-backend-config="region=auto" \
|
-backend-config="region=auto" \
|
||||||
@@ -58,6 +59,19 @@ jobs:
|
|||||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||||
-backend-config="skip_requesting_account_id=true"
|
-backend-config="skip_requesting_account_id=true"
|
||||||
|
|
||||||
|
- name: Save Proxmox target list
|
||||||
|
run: |
|
||||||
|
mkdir -p outputs
|
||||||
|
if ! terraform -chdir=terraform output -json proxmox_target_vms > outputs/proxmox_target_vms.json; then
|
||||||
|
terraform -chdir=terraform plan \
|
||||||
|
-refresh=false \
|
||||||
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||||
|
-out=cleanup.tfplan \
|
||||||
|
-no-color || true
|
||||||
|
printf '[]' > outputs/proxmox_target_vms.json
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Terraform Destroy
|
- name: Terraform Destroy
|
||||||
id: destroy
|
id: destroy
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
@@ -66,6 +80,7 @@ jobs:
|
|||||||
for attempt in 1 2 3; do
|
for attempt in 1 2 3; do
|
||||||
echo "Terraform destroy attempt ${attempt}/3"
|
echo "Terraform destroy attempt ${attempt}/3"
|
||||||
terraform destroy \
|
terraform destroy \
|
||||||
|
-parallelism=2 \
|
||||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||||
-auto-approve
|
-auto-approve
|
||||||
@@ -83,6 +98,14 @@ jobs:
|
|||||||
done
|
done
|
||||||
exit "$rc"
|
exit "$rc"
|
||||||
|
|
||||||
|
- name: Verify Proxmox target VMs removed
|
||||||
|
if: success()
|
||||||
|
run: |
|
||||||
|
python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --targets-file outputs/proxmox_target_vms.json
|
||||||
|
if [ -f terraform/cleanup.tfplan ]; then
|
||||||
|
python3 scripts/proxmox-rebuild-cleanup.py --mode post-destroy --terraform-dir terraform --plan cleanup.tfplan
|
||||||
|
fi
|
||||||
|
|
||||||
- name: Terraform state diagnostics
|
- name: Terraform state diagnostics
|
||||||
if: failure() && steps.destroy.outcome == 'failure'
|
if: failure() && steps.destroy.outcome == 'failure'
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
*.tfstate.*
|
*.tfstate.*
|
||||||
*.tfstate.backup
|
*.tfstate.backup
|
||||||
.terraform/
|
.terraform/
|
||||||
.terraform.lock.hcl
|
|
||||||
terraform.tfvars
|
terraform.tfvars
|
||||||
crash.log
|
crash.log
|
||||||
override.tf
|
override.tf
|
||||||
|
|||||||
@@ -21,14 +21,3 @@
|
|||||||
register: bootstrap_image_pull
|
register: bootstrap_image_pull
|
||||||
loop: "{{ bootstrap_prepull_images }}"
|
loop: "{{ bootstrap_prepull_images }}"
|
||||||
changed_when: "'pulled image' in bootstrap_image_pull.stdout"
|
changed_when: "'pulled image' in bootstrap_image_pull.stdout"
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Report bootstrap images that did not pre-pull after retries
|
|
||||||
debug:
|
|
||||||
msg: >-
|
|
||||||
Best-effort bootstrap image pre-pull did not complete for {{ item.item }} after
|
|
||||||
3 attempt(s): {{ item.stderr | default('no stderr') }}
|
|
||||||
loop: "{{ bootstrap_image_pull.results | default([]) }}"
|
|
||||||
loop_control:
|
|
||||||
label: "{{ item.item }}"
|
|
||||||
when: item.rc is defined and item.rc != 0
|
|
||||||
|
|||||||
@@ -95,6 +95,10 @@
|
|||||||
|
|
||||||
- name: Install tailscale
|
- name: Install tailscale
|
||||||
shell: curl -fsSL https://tailscale.com/install.sh | sh
|
shell: curl -fsSL https://tailscale.com/install.sh | sh
|
||||||
|
register: tailscale_install
|
||||||
|
until: tailscale_install.rc == 0
|
||||||
|
retries: 5
|
||||||
|
delay: 15
|
||||||
when:
|
when:
|
||||||
- tailscale_auth_key | length > 0
|
- tailscale_auth_key | length > 0
|
||||||
- tailscale_binary.rc != 0
|
- tailscale_binary.rc != 0
|
||||||
@@ -117,6 +121,11 @@
|
|||||||
|
|
||||||
- name: Connect node to tailnet
|
- name: Connect node to tailnet
|
||||||
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
|
command: tailscale up --authkey {{ tailscale_auth_key }} --hostname {{ inventory_hostname }} --ssh={{ tailscale_ssh | ternary('true', 'false') }} --accept-routes={{ tailscale_accept_routes | ternary('true', 'false') }}
|
||||||
|
register: tailscale_up
|
||||||
|
until: tailscale_up.rc == 0
|
||||||
|
retries: 5
|
||||||
|
delay: 15
|
||||||
|
no_log: true
|
||||||
when:
|
when:
|
||||||
- tailscale_auth_key | length > 0
|
- tailscale_auth_key | length > 0
|
||||||
- tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
|
- tailscale_status.rc != 0 or (tailscale_backend_state | default('')) != 'Running'
|
||||||
|
|||||||
@@ -32,11 +32,22 @@
|
|||||||
url: https://get.k3s.io
|
url: https://get.k3s.io
|
||||||
dest: /tmp/install-k3s.sh
|
dest: /tmp/install-k3s.sh
|
||||||
mode: "0755"
|
mode: "0755"
|
||||||
|
register: k3s_agent_install_script
|
||||||
|
until: k3s_agent_install_script is succeeded
|
||||||
|
retries: 5
|
||||||
|
delay: 10
|
||||||
when: k3s_agent_install_needed
|
when: k3s_agent_install_needed
|
||||||
|
|
||||||
- name: Install k3s agent
|
- name: Install k3s agent
|
||||||
when: k3s_agent_install_needed
|
when: k3s_agent_install_needed
|
||||||
block:
|
block:
|
||||||
|
- name: Wait for Kubernetes API endpoint before agent join
|
||||||
|
wait_for:
|
||||||
|
host: "{{ k3s_server_url | regex_replace('^https?://([^:/]+).*$', '\\1') }}"
|
||||||
|
port: 6443
|
||||||
|
state: started
|
||||||
|
timeout: 180
|
||||||
|
|
||||||
- name: Run k3s agent install
|
- name: Run k3s agent install
|
||||||
environment:
|
environment:
|
||||||
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
INSTALL_K3S_VERSION: "{{ k3s_version if k3s_version != 'latest' else '' }}"
|
||||||
@@ -48,7 +59,9 @@
|
|||||||
--flannel-iface={{ k3s_flannel_iface }}
|
--flannel-iface={{ k3s_flannel_iface }}
|
||||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
register: k3s_agent_install
|
register: k3s_agent_install
|
||||||
failed_when: false
|
until: k3s_agent_install.rc == 0
|
||||||
|
retries: 3
|
||||||
|
delay: 20
|
||||||
|
|
||||||
- name: Wait for k3s agent to be ready
|
- name: Wait for k3s agent to be ready
|
||||||
command: systemctl is-active k3s-agent
|
command: systemctl is-active k3s-agent
|
||||||
|
|||||||
@@ -62,6 +62,10 @@
|
|||||||
url: https://get.k3s.io
|
url: https://get.k3s.io
|
||||||
dest: /tmp/install-k3s.sh
|
dest: /tmp/install-k3s.sh
|
||||||
mode: "0755"
|
mode: "0755"
|
||||||
|
register: k3s_install_script
|
||||||
|
until: k3s_install_script is succeeded
|
||||||
|
retries: 5
|
||||||
|
delay: 10
|
||||||
when: k3s_install_needed
|
when: k3s_install_needed
|
||||||
|
|
||||||
- name: Install k3s server (primary)
|
- name: Install k3s server (primary)
|
||||||
@@ -82,7 +86,9 @@
|
|||||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
register: primary_install
|
register: primary_install
|
||||||
failed_when: false
|
until: primary_install.rc == 0
|
||||||
|
retries: 3
|
||||||
|
delay: 20
|
||||||
when:
|
when:
|
||||||
- k3s_install_needed
|
- k3s_install_needed
|
||||||
- k3s_primary | default(false)
|
- k3s_primary | default(false)
|
||||||
@@ -106,7 +112,9 @@
|
|||||||
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
|
||||||
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
|
||||||
register: secondary_install
|
register: secondary_install
|
||||||
failed_when: false
|
until: secondary_install.rc == 0
|
||||||
|
retries: 3
|
||||||
|
delay: 20
|
||||||
|
|
||||||
- name: Wait for k3s to be ready
|
- name: Wait for k3s to be ready
|
||||||
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
|
command: "{{ (k3s_primary | default(false)) | ternary('kubectl get nodes', 'systemctl is-active k3s') }}"
|
||||||
|
|||||||
@@ -40,17 +40,6 @@
|
|||||||
register: kube_vip_image_pull
|
register: kube_vip_image_pull
|
||||||
loop: "{{ kube_vip_prepull_images }}"
|
loop: "{{ kube_vip_prepull_images }}"
|
||||||
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
|
changed_when: "'pulled image' in kube_vip_image_pull.stdout"
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Report kube-vip images that did not pre-pull after retries
|
|
||||||
debug:
|
|
||||||
msg: >-
|
|
||||||
Best-effort kube-vip image pre-pull did not complete for {{ item.item }} after
|
|
||||||
3 attempt(s): {{ item.stderr | default('no stderr') }}
|
|
||||||
loop: "{{ kube_vip_image_pull.results | default([]) }}"
|
|
||||||
loop_control:
|
|
||||||
label: "{{ item.item }}"
|
|
||||||
when: item.rc is defined and item.rc != 0
|
|
||||||
|
|
||||||
- name: Render kube-vip control plane manifest
|
- name: Render kube-vip control plane manifest
|
||||||
template:
|
template:
|
||||||
@@ -60,6 +49,10 @@
|
|||||||
|
|
||||||
- name: Apply kube-vip control plane manifest
|
- name: Apply kube-vip control plane manifest
|
||||||
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
|
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
|
||||||
|
register: kube_vip_apply
|
||||||
|
until: kube_vip_apply.rc == 0
|
||||||
|
retries: 3
|
||||||
|
delay: 10
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
- name: Wait for local kube-vip pod to be ready
|
- name: Wait for local kube-vip pod to be ready
|
||||||
|
|||||||
@@ -105,6 +105,11 @@
|
|||||||
register: grafana_loki_labels
|
register: grafana_loki_labels
|
||||||
changed_when: false
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false
|
||||||
|
until: >-
|
||||||
|
grafana_loki_labels.rc != 0 or
|
||||||
|
'"data":[]' not in (grafana_loki_labels.stdout | replace(' ', ''))
|
||||||
|
retries: 30
|
||||||
|
delay: 10
|
||||||
when: loki_enabled
|
when: loki_enabled
|
||||||
|
|
||||||
- name: Fail when Loki is reachable but has zero indexed labels
|
- name: Fail when Loki is reachable but has zero indexed labels
|
||||||
|
|||||||
@@ -21,14 +21,3 @@
|
|||||||
register: rancher_image_pull
|
register: rancher_image_pull
|
||||||
loop: "{{ rancher_images_to_prepull }}"
|
loop: "{{ rancher_images_to_prepull }}"
|
||||||
changed_when: "'pulled image' in rancher_image_pull.stdout"
|
changed_when: "'pulled image' in rancher_image_pull.stdout"
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Report Rancher images that did not pre-pull after retries
|
|
||||||
debug:
|
|
||||||
msg: >-
|
|
||||||
Best-effort Rancher image pre-pull did not complete for {{ item.item }} after
|
|
||||||
3 attempt(s): {{ item.stderr | default('no stderr') }}
|
|
||||||
loop: "{{ rancher_image_pull.results | default([]) }}"
|
|
||||||
loop_control:
|
|
||||||
label: "{{ item.item }}"
|
|
||||||
when: item.rc is defined and item.rc != 0
|
|
||||||
|
|||||||
@@ -9,6 +9,9 @@
|
|||||||
Authorization: "Bearer {{ tailscale_api_key }}"
|
Authorization: "Bearer {{ tailscale_api_key }}"
|
||||||
return_content: true
|
return_content: true
|
||||||
register: ts_devices
|
register: ts_devices
|
||||||
|
until: ts_devices.status == 200
|
||||||
|
retries: 5
|
||||||
|
delay: 10
|
||||||
|
|
||||||
- name: Find stale devices matching reserved hostnames
|
- name: Find stale devices matching reserved hostnames
|
||||||
set_fact:
|
set_fact:
|
||||||
@@ -34,6 +37,10 @@
|
|||||||
headers:
|
headers:
|
||||||
Authorization: "Bearer {{ tailscale_api_key }}"
|
Authorization: "Bearer {{ tailscale_api_key }}"
|
||||||
status_code: 200
|
status_code: 200
|
||||||
|
register: ts_delete_device
|
||||||
|
until: ts_delete_device.status == 200
|
||||||
|
retries: 3
|
||||||
|
delay: 5
|
||||||
loop: "{{ stale_devices }}"
|
loop: "{{ stale_devices }}"
|
||||||
loop_control:
|
loop_control:
|
||||||
label: "{{ item.name }} ({{ item.id }})"
|
label: "{{ item.name }} ({{ item.id }})"
|
||||||
|
|||||||
@@ -76,6 +76,18 @@
|
|||||||
roles:
|
roles:
|
||||||
- kube-vip-deploy
|
- kube-vip-deploy
|
||||||
|
|
||||||
|
- name: Wait for Kubernetes API VIP readiness
|
||||||
|
hosts: control_plane[0]
|
||||||
|
become: true
|
||||||
|
tasks:
|
||||||
|
- name: Wait for Kubernetes readyz through the VIP
|
||||||
|
command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
|
||||||
|
register: api_readyz
|
||||||
|
until: api_readyz.rc == 0
|
||||||
|
retries: 30
|
||||||
|
delay: 10
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
- name: Setup secondary control planes
|
- name: Setup secondary control planes
|
||||||
hosts: control_plane[1:]
|
hosts: control_plane[1:]
|
||||||
become: true
|
become: true
|
||||||
@@ -123,6 +135,31 @@
|
|||||||
|
|
||||||
- name: Import kube-vip image into containerd
|
- name: Import kube-vip image into containerd
|
||||||
command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
|
command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
|
||||||
|
register: kube_vip_secondary_import
|
||||||
|
until: kube_vip_secondary_import.rc == 0
|
||||||
|
retries: 3
|
||||||
|
delay: 10
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
- name: Wait for all control plane nodes to be Ready
|
||||||
|
hosts: control_plane[0]
|
||||||
|
become: true
|
||||||
|
tasks:
|
||||||
|
- name: Wait for control plane node readiness
|
||||||
|
command: kubectl wait --for=condition=Ready node/{{ item }} --timeout=30s
|
||||||
|
register: control_plane_ready
|
||||||
|
until: control_plane_ready.rc == 0
|
||||||
|
retries: 20
|
||||||
|
delay: 15
|
||||||
|
changed_when: false
|
||||||
|
loop: "{{ groups['control_plane'] }}"
|
||||||
|
|
||||||
|
- name: Wait for Kubernetes readyz before worker joins
|
||||||
|
command: kubectl --server=https://{{ kube_api_endpoint }}:6443 get --raw=/readyz
|
||||||
|
register: api_readyz_before_workers
|
||||||
|
until: api_readyz_before_workers.rc == 0
|
||||||
|
retries: 30
|
||||||
|
delay: 10
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Setup workers
|
- name: Setup workers
|
||||||
|
|||||||
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- clustersecretstore-doppler-hetznerterra.yaml
|
||||||
@@ -0,0 +1,21 @@
|
|||||||
|
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||||
|
kind: Kustomization
|
||||||
|
metadata:
|
||||||
|
name: addon-external-secrets-store
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
prune: true
|
||||||
|
sourceRef:
|
||||||
|
kind: GitRepository
|
||||||
|
name: platform
|
||||||
|
path: ./infrastructure/addons/external-secrets-store
|
||||||
|
dependsOn:
|
||||||
|
- name: addon-external-secrets
|
||||||
|
wait: false
|
||||||
|
healthChecks:
|
||||||
|
- apiVersion: external-secrets.io/v1
|
||||||
|
kind: ClusterSecretStore
|
||||||
|
name: doppler-hetznerterra
|
||||||
|
timeout: 5m
|
||||||
|
suspend: false
|
||||||
@@ -16,5 +16,13 @@ spec:
|
|||||||
kind: HelmRelease
|
kind: HelmRelease
|
||||||
name: external-secrets
|
name: external-secrets
|
||||||
namespace: flux-system
|
namespace: flux-system
|
||||||
timeout: 5m
|
- apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
name: external-secrets-external-secrets
|
||||||
|
namespace: external-secrets
|
||||||
|
- apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
name: external-secrets-external-secrets-webhook
|
||||||
|
namespace: external-secrets
|
||||||
|
timeout: 10m
|
||||||
suspend: false
|
suspend: false
|
||||||
|
|||||||
@@ -0,0 +1,26 @@
|
|||||||
|
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||||
|
kind: Kustomization
|
||||||
|
metadata:
|
||||||
|
name: addon-observability-secrets
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
prune: true
|
||||||
|
sourceRef:
|
||||||
|
kind: GitRepository
|
||||||
|
name: platform
|
||||||
|
path: ./infrastructure/addons/observability-secrets
|
||||||
|
dependsOn:
|
||||||
|
- name: addon-external-secrets-store
|
||||||
|
wait: false
|
||||||
|
healthChecks:
|
||||||
|
- apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
name: grafana-admin
|
||||||
|
namespace: observability
|
||||||
|
- apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
name: grafana-admin-credentials
|
||||||
|
namespace: observability
|
||||||
|
timeout: 5m
|
||||||
|
suspend: false
|
||||||
@@ -11,7 +11,8 @@ spec:
|
|||||||
name: platform
|
name: platform
|
||||||
path: ./infrastructure/addons/observability
|
path: ./infrastructure/addons/observability
|
||||||
dependsOn:
|
dependsOn:
|
||||||
- name: addon-external-secrets
|
- name: addon-observability-secrets
|
||||||
|
- name: addon-nfs-storage
|
||||||
- name: addon-tailscale-operator
|
- name: addon-tailscale-operator
|
||||||
- name: addon-tailscale-proxyclass
|
- name: addon-tailscale-proxyclass
|
||||||
wait: false
|
wait: false
|
||||||
@@ -28,5 +29,5 @@ spec:
|
|||||||
kind: HelmRelease
|
kind: HelmRelease
|
||||||
name: promtail
|
name: promtail
|
||||||
namespace: flux-system
|
namespace: flux-system
|
||||||
timeout: 5m
|
timeout: 15m
|
||||||
suspend: false
|
suspend: false
|
||||||
|
|||||||
@@ -13,5 +13,5 @@ spec:
|
|||||||
dependsOn:
|
dependsOn:
|
||||||
- name: addon-rancher
|
- name: addon-rancher
|
||||||
wait: true
|
wait: true
|
||||||
timeout: 5m
|
timeout: 10m
|
||||||
suspend: false
|
suspend: false
|
||||||
|
|||||||
@@ -0,0 +1,34 @@
|
|||||||
|
apiVersion: kustomize.toolkit.fluxcd.io/v1
|
||||||
|
kind: Kustomization
|
||||||
|
metadata:
|
||||||
|
name: addon-rancher-secrets
|
||||||
|
namespace: flux-system
|
||||||
|
spec:
|
||||||
|
interval: 10m
|
||||||
|
prune: true
|
||||||
|
sourceRef:
|
||||||
|
kind: GitRepository
|
||||||
|
name: platform
|
||||||
|
path: ./infrastructure/addons/rancher-secrets
|
||||||
|
dependsOn:
|
||||||
|
- name: addon-external-secrets-store
|
||||||
|
wait: false
|
||||||
|
healthChecks:
|
||||||
|
- apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
name: rancher-bootstrap-password
|
||||||
|
namespace: flux-system
|
||||||
|
- apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
name: rancher-bootstrap-password
|
||||||
|
namespace: flux-system
|
||||||
|
- apiVersion: external-secrets.io/v1
|
||||||
|
kind: ExternalSecret
|
||||||
|
name: rancher-bootstrap-password
|
||||||
|
namespace: cattle-system
|
||||||
|
- apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
name: rancher-bootstrap-password
|
||||||
|
namespace: cattle-system
|
||||||
|
timeout: 5m
|
||||||
|
suspend: false
|
||||||
@@ -10,12 +10,12 @@ spec:
|
|||||||
kind: GitRepository
|
kind: GitRepository
|
||||||
name: platform
|
name: platform
|
||||||
path: ./infrastructure/addons/rancher
|
path: ./infrastructure/addons/rancher
|
||||||
timeout: 15m
|
timeout: 30m
|
||||||
suspend: false
|
suspend: false
|
||||||
dependsOn:
|
dependsOn:
|
||||||
- name: addon-tailscale-operator
|
- name: addon-tailscale-operator
|
||||||
- name: addon-tailscale-proxyclass
|
- name: addon-tailscale-proxyclass
|
||||||
- name: addon-external-secrets
|
- name: addon-rancher-secrets
|
||||||
- name: addon-cert-manager
|
- name: addon-cert-manager
|
||||||
wait: false
|
wait: false
|
||||||
healthChecks:
|
healthChecks:
|
||||||
@@ -23,3 +23,19 @@ spec:
|
|||||||
kind: HelmRelease
|
kind: HelmRelease
|
||||||
name: rancher
|
name: rancher
|
||||||
namespace: flux-system
|
namespace: flux-system
|
||||||
|
- apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
name: cattle-system-rancher
|
||||||
|
namespace: cattle-system
|
||||||
|
- apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
name: rancher-webhook
|
||||||
|
namespace: cattle-system
|
||||||
|
- apiVersion: cert-manager.io/v1
|
||||||
|
kind: Issuer
|
||||||
|
name: cattle-system-rancher
|
||||||
|
namespace: cattle-system
|
||||||
|
- apiVersion: cert-manager.io/v1
|
||||||
|
kind: Certificate
|
||||||
|
name: tls-rancher-ingress
|
||||||
|
namespace: cattle-system
|
||||||
|
|||||||
@@ -16,5 +16,12 @@ spec:
|
|||||||
kind: HelmRelease
|
kind: HelmRelease
|
||||||
name: tailscale-operator
|
name: tailscale-operator
|
||||||
namespace: flux-system
|
namespace: flux-system
|
||||||
timeout: 5m
|
- apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
name: operator
|
||||||
|
namespace: tailscale-system
|
||||||
|
- apiVersion: apiextensions.k8s.io/v1
|
||||||
|
kind: CustomResourceDefinition
|
||||||
|
name: proxyclasses.tailscale.com
|
||||||
|
timeout: 10m
|
||||||
suspend: false
|
suspend: false
|
||||||
|
|||||||
@@ -3,11 +3,14 @@ kind: Kustomization
|
|||||||
resources:
|
resources:
|
||||||
- kustomization-nfs-storage.yaml
|
- kustomization-nfs-storage.yaml
|
||||||
- kustomization-external-secrets.yaml
|
- kustomization-external-secrets.yaml
|
||||||
|
- kustomization-external-secrets-store.yaml
|
||||||
- kustomization-cert-manager.yaml
|
- kustomization-cert-manager.yaml
|
||||||
- kustomization-tailscale-operator.yaml
|
- kustomization-tailscale-operator.yaml
|
||||||
- kustomization-tailscale-proxyclass.yaml
|
- kustomization-tailscale-proxyclass.yaml
|
||||||
- traefik
|
- traefik
|
||||||
|
- kustomization-observability-secrets.yaml
|
||||||
- kustomization-observability.yaml
|
- kustomization-observability.yaml
|
||||||
- kustomization-observability-content.yaml
|
- kustomization-observability-content.yaml
|
||||||
|
- kustomization-rancher-secrets.yaml
|
||||||
- kustomization-rancher.yaml
|
- kustomization-rancher.yaml
|
||||||
- kustomization-rancher-config.yaml
|
- kustomization-rancher-config.yaml
|
||||||
|
|||||||
@@ -0,0 +1,5 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- grafana-admin-externalsecret.yaml
|
||||||
@@ -1,8 +1,6 @@
|
|||||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
resources:
|
resources:
|
||||||
- namespace.yaml
|
|
||||||
- grafana-admin-externalsecret.yaml
|
|
||||||
- ocirepository-loki.yaml
|
- ocirepository-loki.yaml
|
||||||
- ocirepository-promtail.yaml
|
- ocirepository-promtail.yaml
|
||||||
- helmrelease-kube-prometheus-stack.yaml
|
- helmrelease-kube-prometheus-stack.yaml
|
||||||
|
|||||||
@@ -0,0 +1,6 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- rancher-bootstrap-password-flux-externalsecret.yaml
|
||||||
|
- rancher-bootstrap-password-externalsecret.yaml
|
||||||
@@ -1,8 +1,5 @@
|
|||||||
apiVersion: kustomize.config.k8s.io/v1beta1
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
kind: Kustomization
|
kind: Kustomization
|
||||||
resources:
|
resources:
|
||||||
- namespace.yaml
|
|
||||||
- helmrelease-rancher.yaml
|
- helmrelease-rancher.yaml
|
||||||
- rancher-bootstrap-password-flux-externalsecret.yaml
|
|
||||||
- rancher-bootstrap-password-externalsecret.yaml
|
|
||||||
- rancher-tailscale-service.yaml
|
- rancher-tailscale-service.yaml
|
||||||
|
|||||||
@@ -0,0 +1,275 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import ssl
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.error
|
||||||
|
import urllib.parse
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
|
||||||
|
def api_context():
    """Build the (endpoint, ssl_context, headers) triple for Proxmox API calls.

    Reads TF_VAR_proxmox_* environment variables; the endpoint is normalized
    to the bare host URL (any trailing "/api2/json" or "/" is stripped).
    """
    env = os.environ
    base = env["TF_VAR_proxmox_endpoint"].strip()
    base = base.removesuffix("/api2/json").rstrip("/")

    skip_verify = env.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
    if skip_verify:
        # Self-signed Proxmox certs: deliberately disable TLS verification.
        ssl_ctx = ssl._create_unverified_context()
    else:
        ssl_ctx = None

    auth = f"PVEAPIToken={env['TF_VAR_proxmox_api_token_id']}={env['TF_VAR_proxmox_api_token_secret']}"
    return base, ssl_ctx, {"Authorization": auth}
|
||||||
|
|
||||||
|
|
||||||
|
# Resolved once at import time; importing this module requires the
# TF_VAR_proxmox_* environment variables to be present.
ENDPOINT, SSL_CONTEXT, HEADERS = api_context()
|
||||||
|
|
||||||
|
|
||||||
|
def request(method, path, data=None, timeout=60):
    """Perform an authenticated Proxmox API call; return the raw response bytes.

    For DELETE the parameters are appended as a query string; for other
    methods they are form-encoded into the request body.
    """
    call_headers = dict(HEADERS)
    payload = None
    if data is not None:
        params = urllib.parse.urlencode(data)
        if method == "DELETE":
            path = f"{path}?{params}"
        else:
            payload = params.encode()
            call_headers["Content-Type"] = "application/x-www-form-urlencoded"

    url = f"{ENDPOINT}/api2/json{path}"
    req = urllib.request.Request(url, method=method, headers=call_headers, data=payload)
    with urllib.request.urlopen(req, context=SSL_CONTEXT, timeout=timeout) as resp:
        return resp.read()
|
||||||
|
|
||||||
|
|
||||||
|
def is_missing_vm_error(err):
    """Return True when *err* indicates the VM (or its config file) is gone.

    A missing guest surfaces either as a plain 404, or as a 500 whose message
    mentions that the ``<vmid>.conf`` file does not exist.
    """
    if err.code == 404:
        return True
    return err.code == 500 and "conf' does not exist" in err.reason
|
||||||
|
|
||||||
|
|
||||||
|
def vm_exists(target):
    """Return True if the target VM is currently known to its Proxmox node."""
    node, vm_id = target["node_name"], target["vm_id"]
    try:
        request("GET", f"/nodes/{node}/qemu/{vm_id}/status/current")
    except urllib.error.HTTPError as err:
        if is_missing_vm_error(err):
            return False
        raise
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def vm_config(target):
    """Fetch the VM's current config dict, or {} when the VM no longer exists."""
    node, vm_id = target["node_name"], target["vm_id"]
    try:
        raw = request("GET", f"/nodes/{node}/qemu/{vm_id}/config")
    except urllib.error.HTTPError as err:
        if not is_missing_vm_error(err):
            raise
        return {}
    return json.loads(raw).get("data", {})
|
||||||
|
|
||||||
|
|
||||||
|
def wait_absent(target):
    """Poll for up to ~5 minutes (60 x 5s) until the VM is gone; raise if it persists."""
    max_polls = 60
    for _attempt in range(max_polls):
        if not vm_exists(target):
            return
        time.sleep(5)
    raise RuntimeError(f"VM {target['vm_id']} still exists after delete")
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_target(raw, address=None):
    """Coerce a Terraform-shaped VM record into the canonical target dict.

    *raw* may come from a plan, a Terraform output, or a saved JSON file;
    *address* optionally pins the Terraform resource address. Missing
    node/storage fields fall back to TF_VAR_* environment defaults.
    """
    storage = raw.get("cloud_init_storage")
    if not storage:
        init_blocks = raw.get("initialization") or []
        if init_blocks and isinstance(init_blocks, list):
            storage = (init_blocks[0] or {}).get("datastore_id")
    if not storage:
        storage = os.environ.get("TF_VAR_proxmox_cloud_init_storage_pool", "Flash")

    node = raw.get("node_name") or os.environ.get("TF_VAR_proxmox_node_name", "flex")
    return {
        "address": address or raw.get("address"),
        "name": raw["name"],
        "vm_id": int(raw["vm_id"]),
        "node_name": node,
        "cloud_init_storage": storage,
        "tags": raw.get("tags") or [],
        "description": raw.get("description") or "",
    }
|
||||||
|
|
||||||
|
|
||||||
|
def targets_from_plan(terraform_dir, plan_path):
    """Extract Proxmox VM targets from a saved Terraform plan file."""
    show = subprocess.run(
        ["terraform", "-chdir=" + terraform_dir, "show", "-json", plan_path],
        check=True,
        text=True,
        stdout=subprocess.PIPE,
    )
    plan = json.loads(show.stdout)

    targets = []
    for change in plan.get("resource_changes", []):
        if change.get("type") != "proxmox_virtual_environment_vm":
            continue
        after = (change.get("change") or {}).get("after") or {}
        # Skip resources whose name/vm_id are not yet known (computed values).
        if not after.get("name") or after.get("vm_id") is None:
            continue
        targets.append(normalize_target(after, change.get("address")))
    return targets
|
||||||
|
|
||||||
|
|
||||||
|
def targets_from_output(terraform_dir):
    """Read targets from the 'proxmox_target_vms' Terraform output."""
    out = subprocess.run(
        ["terraform", "-chdir=" + terraform_dir, "output", "-json", "proxmox_target_vms"],
        check=True,
        text=True,
        stdout=subprocess.PIPE,
    )
    raw_targets = json.loads(out.stdout)
    return [normalize_target(item) for item in raw_targets]
|
||||||
|
|
||||||
|
|
||||||
|
def targets_from_file(path):
    """Load targets from JSON: either a raw list or a full 'terraform output -json' dump."""
    with open(path, encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict) and "proxmox_target_vms" in payload:
        payload = payload["proxmox_target_vms"]["value"]
    return [normalize_target(item) for item in payload]
|
||||||
|
|
||||||
|
|
||||||
|
def load_targets(args):
    """Resolve targets from --targets-file, --plan, or terraform output (in that order)."""
    if args.targets_file:
        return targets_from_file(args.targets_file)
    return (
        targets_from_plan(args.terraform_dir, args.plan)
        if args.plan
        else targets_from_output(args.terraform_dir)
    )
|
||||||
|
|
||||||
|
|
||||||
|
def terraform_state(terraform_dir):
    """Return the set of resource addresses currently tracked in Terraform state.

    NOTE(review): check=False means a failing `terraform state list` yields an
    empty set, which callers interpret as "nothing tracked" — presumably
    intended for the no-state-yet case; confirm this is safe after errors.
    """
    listing = subprocess.run(
        ["terraform", "-chdir=" + terraform_dir, "state", "list"],
        check=False,
        text=True,
        stdout=subprocess.PIPE,
    )
    return set(listing.stdout.splitlines())
|
||||||
|
|
||||||
|
|
||||||
|
def tags_from_config(config):
    """Normalize the Proxmox 'tags' field (list or ';'-separated string) into a set."""
    value = config.get("tags") or ""
    if isinstance(value, list):
        return set(value)
    return {part for part in value.split(";") if part}
|
||||||
|
|
||||||
|
|
||||||
|
def assert_owned(target, config):
    """Raise RuntimeError unless the live VM config matches the Terraform target.

    Ownership is accepted when the name matches AND at least one of the
    expected tag set / expected description also matches the live config.
    """
    live_name = config.get("name")
    if live_name != target["name"]:
        raise RuntimeError(
            f"Refusing to delete VM {target['vm_id']}: expected name {target['name']!r}, got {live_name!r}"
        )

    live_tags = tags_from_config(config)
    wanted_tags = set(target.get("tags") or [])
    live_description = config.get("description") or ""
    wanted_description = target.get("description") or ""

    tags_ok = bool(wanted_tags) and wanted_tags.issubset(live_tags)
    description_ok = bool(wanted_description) and live_description == wanted_description
    if not (tags_ok or description_ok):
        raise RuntimeError(
            f"Refusing to delete VM {target['vm_id']} ({target['name']}): ownership tags/description do not match"
        )
|
||||||
|
|
||||||
|
|
||||||
|
def delete_cloud_init(target):
    """Best-effort removal of the VM's cloud-init volume; 404 means already gone."""
    volume_id = urllib.parse.quote(
        f"{target['cloud_init_storage']}:vm-{target['vm_id']}-cloudinit",
        safe="",
    )
    storage_path = f"/nodes/{target['node_name']}/storage/{target['cloud_init_storage']}/content/{volume_id}"
    try:
        request("DELETE", storage_path)
    except urllib.error.HTTPError as err:
        if err.code != 404:
            raise
        print(f"No orphan cloud-init volume for VM {target['vm_id']}")
        return
    print(f"Deleted orphan cloud-init volume for VM {target['vm_id']}")
|
||||||
|
|
||||||
|
|
||||||
|
def delete_vm(target):
    """Stop and destroy a single Proxmox VM, then remove its cloud-init volume.

    Raises RuntimeError (via assert_owned / wait_absent) when the VM does not
    look Terraform-owned or still exists after the delete.
    """
    config = vm_config(target)
    # Ownership gate: refuse to touch VMs whose name/tags/description do not
    # match what Terraform says it created.
    assert_owned(target, config)
    print(f"Deleting Terraform-owned VM {target['vm_id']} ({target['name']})")
    try:
        request("POST", f"/nodes/{target['node_name']}/qemu/{target['vm_id']}/status/stop")
        # Give Proxmox a moment to process the stop before issuing the delete.
        time.sleep(10)
    except urllib.error.HTTPError as err:
        # 400/500 here presumably means "already stopped" or similar — TODO
        # confirm against the Proxmox API; anything else is re-raised.
        if err.code not in (400, 500):
            raise

    request(
        "DELETE",
        f"/nodes/{target['node_name']}/qemu/{target['vm_id']}",
        # purge also drops jobs/HA references; destroy-unreferenced-disks
        # removes disks no longer referenced by the VM config.
        {"purge": "1", "destroy-unreferenced-disks": "1"},
    )
    wait_absent(target)
    delete_cloud_init(target)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_orphan_cloud_init(targets):
    """Delete cloud-init volumes left behind by target VMs that no longer exist."""
    for target in targets:
        if not vm_exists(target):
            delete_cloud_init(target)
        else:
            print(f"VM {target['vm_id']} exists; keeping cloud-init volume")
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_untracked_vms(targets, terraform_dir):
    """Delete target VMs that exist on Proxmox but are absent from Terraform state.

    NOTE(review): if `terraform state list` fails, the state set is empty and
    every existing target is treated as untracked; delete_vm's ownership check
    is then the only guard — confirm that is acceptable.
    """
    tracked = terraform_state(terraform_dir)
    for target in targets:
        address = target.get("address")
        is_tracked = bool(address) and address in tracked
        if is_tracked or not vm_exists(target):
            continue
        delete_vm(target)
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_post_destroy(targets):
    """After terraform destroy, force-delete any stragglers and verify all are gone."""
    survivors = []
    for target in targets:
        if vm_exists(target):
            delete_vm(target)
        # Re-check: a VM that survived its own delete is a hard failure.
        if vm_exists(target):
            survivors.append(f"{target['vm_id']} ({target['name']})")

    if survivors:
        raise RuntimeError("Target VMs still exist after cleanup: " + ", ".join(survivors))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: resolve targets, then dispatch to the requested cleanup mode."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", choices=("orphan-cloudinit", "untracked-vms", "post-destroy"), required=True)
    parser.add_argument("--terraform-dir", default="terraform")
    parser.add_argument("--plan")
    parser.add_argument("--targets-file")
    args = parser.parse_args()

    targets = load_targets(args)
    if not targets:
        print("No Proxmox target VMs found")
        return

    # Dispatch table keyed by mode; post-destroy is the fallback branch.
    handlers = {
        "orphan-cloudinit": lambda: cleanup_orphan_cloud_init(targets),
        "untracked-vms": lambda: cleanup_untracked_vms(targets, args.terraform_dir),
    }
    handlers.get(args.mode, lambda: cleanup_post_destroy(targets))()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: surface any failure as a single stderr line and a
# non-zero exit code so CI steps fail loudly instead of half-succeeding.
if __name__ == "__main__":
    try:
        main()
    except Exception as err:
        print(f"ERROR: {err}", file=sys.stderr)
        sys.exit(1)
|
||||||
Generated
+44
@@ -0,0 +1,44 @@
|
|||||||
|
# This file is maintained automatically by "terraform init".
|
||||||
|
# Manual edits may be lost in future updates.
|
||||||
|
|
||||||
|
provider "registry.terraform.io/bpg/proxmox" {
|
||||||
|
version = "0.103.0"
|
||||||
|
constraints = "~> 0.103.0"
|
||||||
|
hashes = [
|
||||||
|
"h1:jC9kBUJj9zUCLmM3ApA7OzZXHE1G+DcqxqdRR1fesGc=",
|
||||||
|
"zh:03ffc90757ed3827bbe50997664ed3ddf6d9b6419723a8091c5d5f81d65f8066",
|
||||||
|
"zh:1aef5db248cf68976fc0b5c032e1da7fca0a3c2ea6e9074aebb99828a561a898",
|
||||||
|
"zh:3deab5284c81c92524203a93a0dd21509eb89b867911a3612b0524f05f400740",
|
||||||
|
"zh:6b44e3293475d528e7a0fd298880652fa6283093ea368e227ebffaa00c3b8821",
|
||||||
|
"zh:739246a7653ae7052e0398bdb53d07a103aa018de5d7547d423ff5cca8b4a973",
|
||||||
|
"zh:74adb0f6936460318b3f0af14e11fa6483b7a8551ee592d24e2c855bf952f9ee",
|
||||||
|
"zh:8eac58a1d8c571bc9e997f21473fd140d8e89ff631b538e3f614dd8aa2fb2cfa",
|
||||||
|
"zh:ab4415f2ecafa81df3208a940ddf6efc24a661001b5003b04ba5c08b35e98b4c",
|
||||||
|
"zh:b6a551cf318a6e02fc04f9c817bb53ba6ab39ff7c3fa9a222529ddde7870cbad",
|
||||||
|
"zh:c1e4c97e079139420d9b158cb6a1008951a3b2f0280fdbe517c3026d413c71d9",
|
||||||
|
"zh:c2b6ac65a9d78a7558b573279a7c6afd130c9d1b6edd7819786b3eb77183f95f",
|
||||||
|
"zh:c8544a696504cdae6e3739e6b74372fe57b19ac081232970db8348519e23c4d5",
|
||||||
|
"zh:ccf3cee3bd04d339380db00b7d35eedf329c42e9441ff06e4e58682a1cccc42e",
|
||||||
|
"zh:f26e0763dbe6a6b2195c94b44696f2110f7f55433dc142839be16b9697fa5597",
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
provider "registry.terraform.io/hashicorp/local" {
|
||||||
|
version = "2.8.0"
|
||||||
|
constraints = "~> 2.8.0"
|
||||||
|
hashes = [
|
||||||
|
"h1:KCuj8nPbNP/ofQrAoQIuQ3CP6k+ADpULvxr7dw2PrpM=",
|
||||||
|
"zh:05f18164beab4a84753e5fedf463771ee0c6eca8e90346b8766f1e1c186dec1e",
|
||||||
|
"zh:563a0702e3711e25ba8930120899b681378b50cbb957fd204b37745c7c9b5f40",
|
||||||
|
"zh:5b56ab2ed70ed92721febb4a070af0837f1084c44825c18e4b95f7efb1d45d26",
|
||||||
|
"zh:6cbedc09b67a5cdb9501ff1b18a315fa46a38e0530424cab1c7f4b3acc75f489",
|
||||||
|
"zh:71b3bd50f89fb385a42a436ba2ce2b8e00f9de53535ce956deff1477b0b117dc",
|
||||||
|
"zh:78d5eefdd9e494defcb3c68d282b8f96630502cac21d1ea161f53cfe9bb483b3",
|
||||||
|
"zh:9d45ac0a00b85cabdd398b859349d17f124c598b6e6bf272f1bb01321ce708a8",
|
||||||
|
"zh:a453efe8641a8f31fe806b597bf2b34d7b78b971a8e3919061ea89d61fda7b8d",
|
||||||
|
"zh:ac692bacb8c3dca8b5b37e5383168aca1f87d3cd7b40615efd300defb76494f5",
|
||||||
|
"zh:bda9e90c8547d90c9c573206985c5675cc1406047605af037a5069942c3c5966",
|
||||||
|
"zh:c30a1967de040d00f5038086dd53cdbfb78cc05d1dbc75037410f011bf2a20d8",
|
||||||
|
"zh:c80bbd1c3f56b3c836d80cf93ac0e8809305c2642f0c98b54bf5d05d3b12718c",
|
||||||
|
]
|
||||||
|
}
|
||||||
+2
-2
@@ -4,12 +4,12 @@ terraform {
|
|||||||
required_providers {
|
required_providers {
|
||||||
local = {
|
local = {
|
||||||
source = "hashicorp/local"
|
source = "hashicorp/local"
|
||||||
version = "~> 2.5"
|
version = "~> 2.8.0"
|
||||||
}
|
}
|
||||||
|
|
||||||
proxmox = {
|
proxmox = {
|
||||||
source = "bpg/proxmox"
|
source = "bpg/proxmox"
|
||||||
version = ">= 0.60.0"
|
version = "~> 0.103.0"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -62,3 +62,18 @@ output "kube_api_lb_ip" {
|
|||||||
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
|
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
|
||||||
value = var.kube_api_vip
|
value = var.kube_api_vip
|
||||||
}
|
}
|
||||||
|
|
||||||
|
output "proxmox_target_vms" {
|
||||||
|
description = "Proxmox VM targets managed by Terraform, used by rebuild cleanup checks"
|
||||||
|
value = [
|
||||||
|
for name, node in local.nodes : {
|
||||||
|
name = name
|
||||||
|
vm_id = node.vm_id
|
||||||
|
role = node.role
|
||||||
|
node_name = var.proxmox_node_name
|
||||||
|
cloud_init_storage = var.proxmox_cloud_init_storage_pool
|
||||||
|
tags = ["terraform", var.cluster_name, node.role]
|
||||||
|
description = "Managed by Terraform for ${var.cluster_name}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|||||||
@@ -20,6 +20,11 @@ variable "control_plane_count" {
|
|||||||
description = "Number of control plane nodes"
|
description = "Number of control plane nodes"
|
||||||
type = number
|
type = number
|
||||||
default = 3
|
default = 3
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = var.control_plane_count > 0
|
||||||
|
error_message = "control_plane_count must be greater than zero."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "control_plane_cores" {
|
variable "control_plane_cores" {
|
||||||
@@ -44,6 +49,11 @@ variable "worker_count" {
|
|||||||
description = "Number of worker nodes"
|
description = "Number of worker nodes"
|
||||||
type = number
|
type = number
|
||||||
default = 5
|
default = 5
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = var.worker_count >= 0
|
||||||
|
error_message = "worker_count must be zero or greater."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_cores" {
|
variable "worker_cores" {
|
||||||
@@ -193,24 +203,44 @@ variable "control_plane_ips" {
|
|||||||
description = "Static IPv4 addresses for control plane VMs"
|
description = "Static IPv4 addresses for control plane VMs"
|
||||||
type = list(string)
|
type = list(string)
|
||||||
default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
|
default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.control_plane_ips) == length(distinct(var.control_plane_ips))
|
||||||
|
error_message = "control_plane_ips must be unique."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_ips" {
|
variable "worker_ips" {
|
||||||
description = "Static IPv4 addresses for worker VMs"
|
description = "Static IPv4 addresses for worker VMs"
|
||||||
type = list(string)
|
type = list(string)
|
||||||
default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
|
default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.worker_ips) == length(distinct(var.worker_ips))
|
||||||
|
error_message = "worker_ips must be unique."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "control_plane_vm_ids" {
|
variable "control_plane_vm_ids" {
|
||||||
description = "Fixed VMIDs for control plane VMs"
|
description = "Fixed VMIDs for control plane VMs"
|
||||||
type = list(number)
|
type = list(number)
|
||||||
default = [200, 201, 202]
|
default = [200, 201, 202]
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.control_plane_vm_ids) == length(distinct(var.control_plane_vm_ids))
|
||||||
|
error_message = "control_plane_vm_ids must be unique."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "worker_vm_ids" {
|
variable "worker_vm_ids" {
|
||||||
description = "Fixed VMIDs for worker VMs"
|
description = "Fixed VMIDs for worker VMs"
|
||||||
type = list(number)
|
type = list(number)
|
||||||
default = [210, 211, 212, 213, 214]
|
default = [210, 211, 212, 213, 214]
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.worker_vm_ids) == length(distinct(var.worker_vm_ids))
|
||||||
|
error_message = "worker_vm_ids must be unique."
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
variable "kube_api_vip" {
|
variable "kube_api_vip" {
|
||||||
|
|||||||
Reference in New Issue
Block a user