diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml
index 5235a63..439d581 100644
--- a/.gitea/workflows/terraform-apply.yml
+++ b/.gitea/workflows/terraform-apply.yml
@@ -53,7 +53,20 @@ jobs:
 
       - name: Terraform Plan
         working-directory: terraform
-        run: terraform plan -out=tfplan
+        run: |
+          set -euo pipefail
+          for attempt in 1 2; do
+            echo "Terraform plan attempt $attempt/2"
+            if timeout 20m terraform plan -parallelism=1 -out=tfplan; then
+              exit 0
+            fi
+            if [ "$attempt" -eq 1 ]; then
+              echo "Plan attempt failed or timed out; retrying in 20s"
+              sleep 20
+            fi
+          done
+          echo "Terraform plan failed after retries"
+          exit 1
 
       - name: Block accidental destroy
         env:
diff --git a/.gitea/workflows/terraform-destroy.yml b/.gitea/workflows/terraform-destroy.yml
index 5a57a71..4631b4e 100644
--- a/.gitea/workflows/terraform-destroy.yml
+++ b/.gitea/workflows/terraform-destroy.yml
@@ -74,15 +74,16 @@ jobs:
       - name: Terraform Destroy Plan
         working-directory: terraform
         run: |
+          set -euo pipefail
           case "${{ inputs.target }}" in
             all)
-              terraform plan -destroy -out=tfdestroy
+              TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -out=tfdestroy"
               ;;
             control-planes)
-              terraform plan -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy
+              TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy"
               ;;
             workers)
-              terraform plan -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy
+              TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy"
               ;;
             *)
               echo "Invalid destroy target: ${{ inputs.target }}"
@@ -90,6 +91,20 @@ jobs:
               ;;
           esac
 
+          for attempt in 1 2; do
+            echo "Terraform destroy plan attempt $attempt/2"
+            if timeout 20m bash -lc "$TF_PLAN_CMD"; then
+              exit 0
+            fi
+            if [ "$attempt" -eq 1 ]; then
+              echo "Destroy plan attempt failed or timed out; retrying in 20s"
+              sleep 20
+            fi
+          done
+
+          echo "Terraform destroy plan failed after retries"
+          exit 1
+
       - name: Terraform Destroy Apply
         working-directory: terraform
         run: |
diff --git a/.gitea/workflows/terraform-plan.yml b/.gitea/workflows/terraform-plan.yml
index cb5874f..eee7cf2 100644
--- a/.gitea/workflows/terraform-plan.yml
+++ b/.gitea/workflows/terraform-plan.yml
@@ -67,7 +67,20 @@ jobs:
 
       - name: Terraform Plan
         working-directory: terraform
-        run: terraform plan -out=tfplan
+        run: |
+          set -euo pipefail
+          for attempt in 1 2; do
+            echo "Terraform plan attempt $attempt/2"
+            if timeout 20m terraform plan -parallelism=1 -out=tfplan; then
+              exit 0
+            fi
+            if [ "$attempt" -eq 1 ]; then
+              echo "Plan attempt failed or timed out; retrying in 20s"
+              sleep 20
+            fi
+          done
+          echo "Terraform plan failed after retries"
+          exit 1
 
       - name: Block accidental destroy
         env:
diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md
index 34f734b..0b523ec 100644
--- a/nixos/kubeadm/README.md
+++ b/nixos/kubeadm/README.md
@@ -105,6 +105,12 @@ $EDITOR ./scripts/inventory.env
 ./scripts/rebuild-and-bootstrap.sh
 ```
 
+Optional tuning env vars:
+
+```bash
+WORKER_PARALLELISM=2 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
+```
+
 3. If you only want to reset Kubernetes state on existing VMs:
 
 ```bash
diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
index 8d35378..c38431b 100755
--- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -18,6 +18,9 @@ SSH_USER="${SSH_USER:-micqdf}"
 SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
 SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
 SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
+REBUILD_TIMEOUT="${REBUILD_TIMEOUT:-45m}"
+REBUILD_RETRIES="${REBUILD_RETRIES:-2}"
+WORKER_PARALLELISM="${WORKER_PARALLELISM:-2}"
 
 declare -A NODE_IPS=()
 declare -a CP_NAMES=()
@@ -126,12 +129,36 @@ rebuild_node() {
   local node_ip="$2"
 
   echo "==> Rebuilding $node_name on $node_ip"
-  nixos-rebuild switch \
+  timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
     --flake "$FLAKE_DIR#$node_name" \
     --target-host "$ACTIVE_SSH_USER@$node_ip" \
     --use-remote-sudo
 }
 
+rebuild_node_with_retry() {
+  local node_name="$1"
+  local node_ip="$2"
+  local attempt=1
+  local max_attempts=$((REBUILD_RETRIES + 1))
+
+  while [ "$attempt" -le "$max_attempts" ]; do
+    echo "==> Rebuild attempt $attempt/$max_attempts for $node_name"
+    if rebuild_node "$node_name" "$node_ip"; then
+      return 0
+    fi
+
+    if [ "$attempt" -lt "$max_attempts" ]; then
+      echo "==> Rebuild failed for $node_name, retrying after 20s"
+      sleep 20
+    fi
+
+    attempt=$((attempt + 1))
+  done
+
+  echo "==> Rebuild failed permanently for $node_name"
+  return 1
+}
+
 prepare_remote_nix_trust() {
   local node_ip="$1"
   echo "==> Ensuring nix trusted-users on $node_ip"
@@ -164,15 +191,42 @@ detect_ssh_user "$PRIMARY_CP_IP"
 for node in "${CP_NAMES[@]}"; do
   prepare_remote_nix_trust "${NODE_IPS[$node]}"
   prepare_remote_space "${NODE_IPS[$node]}"
-  rebuild_node "$node" "${NODE_IPS[$node]}"
+  rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
 done
 
+worker_failures=0
 for node in "${WK_NAMES[@]}"; do
   prepare_remote_nix_trust "${NODE_IPS[$node]}"
   prepare_remote_space "${NODE_IPS[$node]}"
-  rebuild_node "$node" "${NODE_IPS[$node]}"
 done
 
+active_jobs=0
+for node in "${WK_NAMES[@]}"; do
+  (
+    rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
+  ) &
+
+  active_jobs=$((active_jobs + 1))
+  if [ "$active_jobs" -ge "$WORKER_PARALLELISM" ]; then
+    if ! wait -n; then
+      worker_failures=$((worker_failures + 1))
+    fi
+    active_jobs=$((active_jobs - 1))
+  fi
+done
+
+while [ "$active_jobs" -gt 0 ]; do
+  if ! wait -n; then
+    worker_failures=$((worker_failures + 1))
+  fi
+  active_jobs=$((active_jobs - 1))
+done
+
+if [ "$worker_failures" -gt 0 ]; then
+  echo "==> $worker_failures worker rebuild job(s) failed"
+  exit 1
+fi
+
 echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
 if cluster_ready; then
   echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"