fix: harden cluster rebuild determinism
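
Pin the runner image, kubectl, and Python dependencies; move the inline
Proxmox cleanup Python into scripts/proxmox-rebuild-cleanup.py; replace
fire-and-forget Flux annotations with helpers that block on
.status.lastHandledReconcileAt; gate HelmRelease readiness on
observedGeneration; prefer a pinned FLUX_KNOWN_HOSTS secret over
ssh-keyscan; fail hard when a required image cannot be imported; fix the
Terraform outputs artifact path.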
+244 −235
@@ -15,6 +15,7 @@ concurrency:
 env:
   TF_VERSION: "1.7.0"
+  KUBECTL_VERSION: "v1.34.6"
   TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
   TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
@@ -30,7 +31,7 @@ env:
 jobs:
   terraform:
     name: Terraform
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -48,6 +49,7 @@ jobs:
         working-directory: terraform
         run: |
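           # -lockfile=readonly: init fails instead of rewriting .terraform.lock.hcl, so provider versions stay pinned.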
           terraform init \
+            -lockfile=readonly \
             -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
             -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
             -backend-config="region=auto" \
@@ -100,59 +102,7 @@ jobs:

       - name: Cleanup orphan Proxmox cloud-init volumes
         if: github.ref == 'refs/heads/main' && github.event_name == 'push'
-        run: |
-          set -euo pipefail
-          python3 - <<'PY'
-          import os
-          import ssl
-          import urllib.error
-          import urllib.parse
-          import urllib.request
-
-          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
-          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
-          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
-          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
-          node = "flex"
-          storage = "Flash"
-          vm_ids = [200, 201, 202, 210, 211, 212, 213, 214]
-          context = ssl._create_unverified_context() if insecure else None
-          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
-
-          def request(method, path):
-              req = urllib.request.Request(
-                  f"{endpoint}/api2/json{path}",
-                  method=method,
-                  headers=headers,
-              )
-              return urllib.request.urlopen(req, context=context, timeout=30)
-
-          def vm_exists(vmid):
-              try:
-                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current").close()
-                  return True
-              except urllib.error.HTTPError as err:
-                  if err.code == 404:
-                      return False
-                  if err.code == 500 and "conf' does not exist" in err.reason:
-                      return False
-                  raise
-
-          for vmid in vm_ids:
-              if vm_exists(vmid):
-                  print(f"VM {vmid} exists; keeping cloud-init volume")
-                  continue
-
-              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
-              try:
-                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}").close()
-                  print(f"Deleted orphan cloud-init volume for VM {vmid}")
-              except urllib.error.HTTPError as err:
-                  if err.code == 404:
-                      print(f"No orphan cloud-init volume for VM {vmid}")
-                      continue
-                  raise
-          PY
+        run: python3 scripts/proxmox-rebuild-cleanup.py --mode orphan-cloudinit --terraform-dir terraform --plan tfplan
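         # Assumption: --mode orphan-cloudinit reproduces the inline cleanup it
         # replaces, deleting each Flash:vm-<id>-cloudinit volume whose VM no
         # longer exists on the node.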

       - name: Terraform Apply
         if: github.ref == 'refs/heads/main' && github.event_name == 'push'
@@ -163,6 +113,7 @@ jobs:
           run_apply() {
             local log_file="$1"
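             # Assumption: low parallelism throttles concurrent Proxmox VM
             # operations, which the retry loop below treats as flaky under load.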
             terraform apply \
+              -parallelism=2 \
               -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
               -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
               -auto-approve 2>&1 | tee "${log_file}"
@@ -170,124 +121,10 @@ jobs:
           }

           cleanup_untracked_target_vms() {
-            python3 - <<'PY'
-          import os
-          import ssl
-          import subprocess
-          import time
-          import urllib.error
-          import urllib.parse
-          import urllib.request
-
-          endpoint = os.environ["TF_VAR_proxmox_endpoint"].strip().removesuffix("/api2/json").rstrip("/")
-          token_id = os.environ["TF_VAR_proxmox_api_token_id"]
-          token_secret = os.environ["TF_VAR_proxmox_api_token_secret"]
-          insecure = os.environ.get("TF_VAR_proxmox_insecure", "false").lower() == "true"
-          node = "flex"
-          storage = "Flash"
-          context = ssl._create_unverified_context() if insecure else None
-          headers = {"Authorization": f"PVEAPIToken={token_id}={token_secret}"}
-          targets = {
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-1"]': (200, "k8s-cluster-cp-1"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-2"]': (201, "k8s-cluster-cp-2"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-cp-3"]': (202, "k8s-cluster-cp-3"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-1"]': (210, "k8s-cluster-worker-1"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-2"]': (211, "k8s-cluster-worker-2"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-3"]': (212, "k8s-cluster-worker-3"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-4"]': (213, "k8s-cluster-worker-4"),
-              'proxmox_virtual_environment_vm.nodes["k8s-cluster-worker-5"]': (214, "k8s-cluster-worker-5"),
+            python3 ../scripts/proxmox-rebuild-cleanup.py --mode untracked-vms --terraform-dir . --plan tfplan
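             # Assumption: --mode untracked-vms wraps the inline Python it
             # replaces, stopping and purging target VMs that exist in Proxmox
             # but are absent from Terraform state.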
-          }
-
-          def request(method, path, data=None):
-              body = None
-              req_headers = dict(headers)
-              if data is not None:
-                  encoded = urllib.parse.urlencode(data)
-                  if method == "DELETE":
-                      path = f"{path}?{encoded}"
-                  else:
-                      body = encoded.encode()
-                      req_headers["Content-Type"] = "application/x-www-form-urlencoded"
-              req = urllib.request.Request(
-                  f"{endpoint}/api2/json{path}",
-                  method=method,
-                  headers=req_headers,
-                  data=body,
-              )
-              with urllib.request.urlopen(req, context=context, timeout=60) as resp:
-                  return resp.read()
-
-          def vm_status(vmid):
-              try:
-                  request("GET", f"/nodes/{node}/qemu/{vmid}/status/current")
-                  return True
-              except urllib.error.HTTPError as err:
-                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
-                      return False
-                  raise
-
-          def vm_config(vmid):
-              try:
-                  raw = request("GET", f"/nodes/{node}/qemu/{vmid}/config")
-              except urllib.error.HTTPError as err:
-                  if err.code == 404 or (err.code == 500 and "conf' does not exist" in err.reason):
-                      return {}
-                  raise
-              import json
-              return json.loads(raw).get("data", {})
-
-          def wait_absent(vmid):
-              for _ in range(60):
-                  if not vm_status(vmid):
-                      return
-                  time.sleep(5)
-              raise RuntimeError(f"VM {vmid} still exists after delete")
-
-          state = set(
-              subprocess.run(
-                  ["terraform", "state", "list"],
-                  check=False,
-                  text=True,
-                  stdout=subprocess.PIPE,
-              ).stdout.splitlines()
-          )
-
-          for address, (vmid, expected_name) in targets.items():
-              if address in state:
-                  continue
-              if not vm_status(vmid):
-                  continue
-
-              config = vm_config(vmid)
-              actual_name = config.get("name")
-              if actual_name != expected_name:
-                  raise RuntimeError(
-                      f"Refusing to delete VM {vmid}: expected name {expected_name!r}, got {actual_name!r}"
-                  )
-
-              print(f"Deleting partial Terraform-untracked VM {vmid} ({expected_name}) before retry")
-              try:
-                  request("POST", f"/nodes/{node}/qemu/{vmid}/status/stop")
-                  time.sleep(10)
-              except urllib.error.HTTPError as err:
-                  if err.code not in (400, 500):
-                      raise
-
-              request(
-                  "DELETE",
-                  f"/nodes/{node}/qemu/{vmid}",
-                  {"purge": "1", "destroy-unreferenced-disks": "1"},
-              )
-              wait_absent(vmid)
-
-              volume = urllib.parse.quote(f"{storage}:vm-{vmid}-cloudinit", safe="")
-              try:
-                  request("DELETE", f"/nodes/{node}/storage/{storage}/content/{volume}")
-              except urllib.error.HTTPError as err:
-                  if err.code != 404:
-                      raise
-          PY
           }
           cleanup_untracked_target_vms

           for attempt in 1 2 3; do
             log_file="/tmp/terraform-apply-${attempt}.log"
@@ -299,11 +136,7 @@ jobs:
               exit 1
             fi

-            if ! grep -Eq 'HTTP 596|Broken pipe|disk update fails' "${log_file}"; then
-              exit 1
-            fi
-
-            echo "Terraform apply hit transient Proxmox API failure; cleaning partial VM creates before retry ${attempt}/2"
+            echo "Terraform apply failed; cleaning Terraform-untracked partial VM creates before retry ${attempt}/2"
             cleanup_untracked_target_vms
             sleep 20
           done
@@ -320,11 +153,11 @@ jobs:
         uses: actions/upload-artifact@v3
         with:
           name: terraform-outputs
-          path: outputs/terraform_outputs.json
+          path: terraform/outputs/terraform_outputs.json

   ansible:
     name: Ansible
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     needs: terraform
     if: github.ref == 'refs/heads/main' && github.event_name == 'push'
     steps:
@@ -348,6 +181,7 @@ jobs:
         working-directory: terraform
         run: |
           terraform init \
+            -lockfile=readonly \
             -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
             -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
             -backend-config="region=auto" \
@@ -364,7 +198,7 @@ jobs:
       - name: Install Python Dependencies
         run: |
           apt-get update && apt-get install -y python3-pip
-          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
+          pip3 install --break-system-packages ansible==11.2.0 kubernetes==32.0.1 jinja2==3.1.5 pyyaml==6.0.2
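           # Exact pins so every rebuild resolves the same dependency set.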

       - name: Install Ansible Collections
         run: ansible-galaxy collection install -r ansible/requirements.yml
@@ -461,7 +295,7 @@ jobs:

       - name: Install kubectl
         run: |
-          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
+          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
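           # ${KUBECTL_VERSION} is pinned in the workflow env block instead of tracking stable.txt.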
           chmod +x /usr/local/bin/kubectl

       - name: Rewrite kubeconfig for runner-reachable API
@@ -476,6 +310,7 @@ jobs:
           KUBECONFIG: outputs/kubeconfig
           FLUX_GIT_HOST: 64.176.189.59
           FLUX_GIT_PORT: "2222"
+          FLUX_KNOWN_HOSTS: ${{ secrets.FLUX_KNOWN_HOSTS }}
         run: |
           set -euo pipefail
           flux_rollout_status() {
@@ -512,6 +347,52 @@ jobs:
             done
           }

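           # Flux acknowledges a reconcile request by copying the
           # reconcile.fluxcd.io/requestedAt annotation value into
           # .status.lastHandledReconcileAt, so polling that field confirms the
           # request was actually handled instead of sleeping blindly.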
+          wait_for_reconcile_handled() {
+            local namespace="$1"
+            local resource="$2"
+            local reconcile_at="$3"
+            local timeout_seconds="$4"
+            local elapsed=0
+            local handled
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              if [ "${handled}" = "${reconcile_at}" ]; then
+                return 0
+              fi
+
+              sleep 5
+              elapsed=$((elapsed + 5))
+            done
+
+            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
+            kubectl -n "${namespace}" describe "${resource}" || true
+            exit 1
+          }
+
+          reconcile_flux_resource() {
+            local namespace="$1"
+            local resource="$2"
+            local timeout_seconds="$3"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+            wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
+          }
+
+          reconcile_helmrelease() {
+            local release_name="$1"
+            local timeout_seconds="$2"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n flux-system annotate "helmrelease/${release_name}" \
+              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
+              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
+              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
+              --overwrite
+            wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
+          }

           pull_required_image() {
             local image="$1"
             local host_ip="$2"
@@ -594,12 +475,16 @@ jobs:
             local elapsed=0
             local ready
             local stalled
+            local generation
+            local observed_generation

             while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
               ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
               stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+              generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
+              observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"

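               # Ready alone can report stale status from the previous spec;
               # requiring observedGeneration to equal metadata.generation
               # proves the controller has evaluated the current generation.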
-              if [ "${ready}" = "True" ]; then
+              if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
                 return 0
               fi

@@ -626,16 +511,10 @@ jobs:
             local target_namespace="$3"
             local oci_timeout="$4"
             local release_timeout="$5"
-            local reconcile_at
             local artifact_storage

             wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
-            reconcile_at="$(date +%s)"
-            kubectl -n flux-system annotate "helmrelease/${release_name}" \
-              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
-              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
-              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
-              --overwrite
+            reconcile_helmrelease "${release_name}" 300

             if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
               artifact_storage="$(kubectl -n flux-system get "ocirepository/${oci_name}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
@@ -671,7 +550,6 @@ jobs:
             local repo_timeout="$5"
             local chart_timeout="$6"
             local release_timeout="$7"
-            local reconcile_at

             wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
             if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
@@ -680,13 +558,8 @@ jobs:
             fi

             wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
-            reconcile_at="$(date +%s)"
-            kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
-            kubectl -n flux-system annotate "helmrelease/${release_name}" \
-              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
-              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
-              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
-              --overwrite
+            reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
+            reconcile_helmrelease "${release_name}" 300

             for attempt in $(seq 1 6); do
               if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
@@ -695,9 +568,8 @@ jobs:
               fi

               echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
-              reconcile_at="$(date +%s)"
-              kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
-              kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+              reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
+              reconcile_helmrelease "${release_name}" 300
             done

             flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
@@ -705,7 +577,11 @@ jobs:
           }

           kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
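           # Prefer the pinned known_hosts from secrets; ssh-keyscan remains a
           # trust-on-first-use fallback and is not deterministic.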
-          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
+          if [ -n "${FLUX_KNOWN_HOSTS}" ]; then
+            printf '%s\n' "${FLUX_KNOWN_HOSTS}" > /tmp/flux_known_hosts
+          else
+            ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
+          fi
           kubectl -n flux-system create secret generic flux-system \
             --from-file=identity="$HOME/.ssh/id_ed25519" \
             --from-file=known_hosts=/tmp/flux_known_hosts \
@@ -741,18 +617,17 @@ jobs:
           kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
           kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
           kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
           kubectl -n flux-system delete pod --field-selector=status.phase!=Running || true
           flux_rollout_status source-controller
           flux_rollout_status kustomize-controller
           flux_rollout_status helm-controller
           kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
-          kubectl -n flux-system annotate kustomization/addon-cert-manager reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
+          reconcile_flux_resource flux-system kustomization/addon-cert-manager 300
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=1200s
           kubectl -n flux-system wait --for=condition=Ready helmrelease/cert-manager --timeout=1200s
           # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
-          kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
+          reconcile_flux_resource flux-system kustomization/addon-external-secrets 300
           import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}"
           wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
           wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
@@ -764,35 +639,68 @@ jobs:
           wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
           wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
           kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
           # Create Doppler ClusterSecretStore now that ESO CRDs are available
           kubectl apply -f - <<'EOF'
           apiVersion: external-secrets.io/v1
           kind: ClusterSecretStore
           metadata:
             name: doppler-hetznerterra
           spec:
             provider:
               doppler:
                 auth:
                   secretRef:
                     dopplerToken:
                       name: doppler-hetznerterra-service-token
                       key: dopplerToken
                       namespace: external-secrets
           EOF
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets-store 600
+          reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 300
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s
           # Wait for the storage layer and private access components
           import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}"
           import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}"
-          kubectl -n flux-system annotate kustomization/addon-tailscale-operator reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
+          reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 300
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=600s
           kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
           import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}"
-          kubectl -n flux-system annotate kustomization/addon-nfs-storage reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
+          reconcile_flux_resource flux-system kustomization/addon-nfs-storage 300
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
           kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
           kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
           kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
           kubectl get storageclass flash-nfs
           import_required_image docker.io/library/busybox:1.31.1 "${PRIMARY_CP_IP}"
           kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true
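           # Smoke test: provision a 1Mi PVC on flash-nfs and write a file from
           # cp-1 to prove dynamic NFS provisioning works end to end.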
           kubectl apply -f - <<'EOF'
           apiVersion: v1
           kind: PersistentVolumeClaim
           metadata:
             name: nfs-smoke
             namespace: kube-system
           spec:
             accessModes:
               - ReadWriteOnce
             storageClassName: flash-nfs
             resources:
               requests:
                 storage: 1Mi
           ---
           apiVersion: v1
           kind: Pod
           metadata:
             name: nfs-smoke
             namespace: kube-system
           spec:
             restartPolicy: Never
             nodeSelector:
               kubernetes.io/hostname: k8s-cluster-cp-1
             tolerations:
               - key: node-role.kubernetes.io/control-plane
                 operator: Exists
                 effect: NoSchedule
             containers:
               - name: smoke
                 image: docker.io/library/busybox:1.31.1
                 command:
                   - sh
                   - -c
                   - echo ok >/data/smoke && test -s /data/smoke && sleep 30
                 volumeMounts:
                   - name: data
                     mountPath: /data
             volumes:
               - name: data
                 persistentVolumeClaim:
                   claimName: nfs-smoke
           EOF
           kubectl -n kube-system wait --for=condition=Ready pod/nfs-smoke --timeout=180s
           kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true --wait=false

       - name: Wait for Rancher
         env:
@@ -823,15 +731,50 @@ jobs:
             done
           }

+          wait_for_reconcile_handled() {
+            local namespace="$1"
+            local resource="$2"
+            local reconcile_at="$3"
+            local timeout_seconds="$4"
+            local elapsed=0
+            local handled
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              if [ "${handled}" = "${reconcile_at}" ]; then
+                return 0
+              fi
+
+              sleep 5
+              elapsed=$((elapsed + 5))
+            done
+
+            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
+            kubectl -n "${namespace}" describe "${resource}" || true
+            exit 1
+          }
+
+          reconcile_flux_resource() {
+            local namespace="$1"
+            local resource="$2"
+            local timeout_seconds="$3"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+            wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
+          }

           reconcile_helmrelease() {
             local release_name="$1"
+            local timeout_seconds="${2:-300}"
             local reconcile_at
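             # %N appends nanoseconds, so each call produces a unique requestedAt value.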
-            reconcile_at="$(date +%s)"
+            reconcile_at="$(date +%s%N)"
             kubectl -n flux-system annotate "helmrelease/${release_name}" \
               reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
               reconcile.fluxcd.io/resetAt="${reconcile_at}" \
               reconcile.fluxcd.io/forceAt="${reconcile_at}" \
               --overwrite
+            wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
           }

           wait_for_helmchart_ready() {
@@ -839,13 +782,11 @@ jobs:
             local release_name="$2"
             local timeout="$3"
             local attempts="$4"
-            local reconcile_at

             wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
             for attempt in $(seq 1 "${attempts}"); do
-              reconcile_at="$(date +%s)"
-              kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
-              kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+              reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
+              reconcile_helmrelease "${release_name}" 300

               if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then
                 return 0
@@ -866,12 +807,16 @@ jobs:
             local elapsed=0
             local ready
             local stalled
+            local generation
+            local observed_generation

             while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
               ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
               stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
+              generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
+              observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"

-              if [ "${ready}" = "True" ]; then
+              if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
                 return 0
               fi

@@ -928,10 +873,13 @@ jobs:
           }

           echo "Waiting for Rancher..."
+          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-secrets 600
+          reconcile_flux_resource flux-system kustomization/addon-rancher-secrets 300
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=600s
           wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
-          kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
+          reconcile_flux_resource flux-system kustomization/addon-rancher 300
           wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
-          reconcile_helmrelease rancher
+          reconcile_helmrelease rancher 300
           wait_for_helmchart_ready flux-system-rancher rancher 180s 5
           wait_for_helmrelease_ready rancher cattle-system 900
           wait_for_resource "" namespace/cattle-system 600
@@ -956,6 +904,66 @@ jobs:
             printf '%s' "$1" | tr '/:' '__'
           }

+          wait_for_resource() {
+            local namespace="$1"
+            local resource="$2"
+            local timeout_seconds="$3"
+            local elapsed=0
+
+            until kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
+              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
+                echo "Timed out waiting for ${resource} to exist" >&2
+                kubectl -n flux-system get kustomizations,helmreleases || true
+                exit 1
+              fi
+
+              sleep 10
+              elapsed=$((elapsed + 10))
+            done
+          }
+
+          wait_for_reconcile_handled() {
+            local resource="$1"
+            local reconcile_at="$2"
+            local timeout_seconds="$3"
+            local elapsed=0
+            local handled
+
+            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
+              handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
+              if [ "${handled}" = "${reconcile_at}" ]; then
+                return 0
+              fi
+
+              sleep 5
+              elapsed=$((elapsed + 5))
+            done
+
+            echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
+            kubectl -n flux-system describe "${resource}" || true
+            exit 1
+          }
+
+          reconcile_flux_resource() {
+            local resource="$1"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
+            wait_for_reconcile_handled "${resource}" "${reconcile_at}" 300
+          }
+
+          reconcile_helmrelease() {
+            local release="$1"
+            local reconcile_at
+            reconcile_at="$(date +%s%N)"
+            kubectl -n flux-system annotate "helmrelease/${release}" \
+              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
+              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
+              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
+              --overwrite
+            wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300
+          }

           import_required_image() {
             local image="$1"
             local host_ip="$2"
@@ -1015,7 +1023,8 @@ jobs:
             done

             if [ "${failed}" = "true" ]; then
-              echo "Warning: failed to import ${image} on one or more nodes; continuing so Kubernetes can use already-seeded nodes or retry pulls" >&2
+              echo "Failed to import required image ${image} on one or more nodes" >&2
+              exit 1
             fi
           }

@@ -1034,13 +1043,10 @@ jobs:
             quay.io/prometheus/node-exporter:v1.8.2; do
             import_required_image_on_all_nodes "${image}"
           done
-          reconcile_at="$(date +%s)"
+          reconcile_flux_resource kustomization/addon-observability
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
           for release in kube-prometheus-stack loki promtail; do
-            kubectl -n flux-system annotate "helmrelease/${release}" \
-              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
-              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
-              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
-              --overwrite
+            reconcile_helmrelease "${release}"
           done
           kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
@@ -1055,11 +1061,14 @@ jobs:
           kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
           kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s