Merge pull request 'stage' (#59) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Has been cancelled
Some checks failed
Terraform Apply / Terraform Apply (push) Has been cancelled
Reviewed-on: #59
This commit was merged in pull request #59.
This commit is contained in:
@@ -53,7 +53,20 @@ jobs:
|
|||||||
|
|
||||||
- name: Terraform Plan
|
- name: Terraform Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform plan -out=tfplan
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform plan attempt $attempt/2"
|
||||||
|
if timeout 20m terraform plan -parallelism=1 -out=tfplan; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo "Terraform plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Block accidental destroy
|
- name: Block accidental destroy
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -74,15 +74,16 @@ jobs:
|
|||||||
- name: Terraform Destroy Plan
|
- name: Terraform Destroy Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
case "${{ inputs.target }}" in
|
case "${{ inputs.target }}" in
|
||||||
all)
|
all)
|
||||||
terraform plan -destroy -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
control-planes)
|
control-planes)
|
||||||
terraform plan -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -target=proxmox_vm_qemu.control_planes -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
workers)
|
workers)
|
||||||
terraform plan -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy
|
TF_PLAN_CMD="terraform plan -parallelism=1 -destroy -target=proxmox_vm_qemu.workers -out=tfdestroy"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Invalid destroy target: ${{ inputs.target }}"
|
echo "Invalid destroy target: ${{ inputs.target }}"
|
||||||
@@ -90,6 +91,20 @@ jobs:
|
|||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform destroy plan attempt $attempt/2"
|
||||||
|
if timeout 20m bash -lc "$TF_PLAN_CMD"; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Destroy plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Terraform destroy plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Terraform Destroy Apply
|
- name: Terraform Destroy Apply
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -67,7 +67,20 @@ jobs:
|
|||||||
|
|
||||||
- name: Terraform Plan
|
- name: Terraform Plan
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: terraform plan -out=tfplan
|
run: |
|
||||||
|
set -euo pipefail
|
||||||
|
for attempt in 1 2; do
|
||||||
|
echo "Terraform plan attempt $attempt/2"
|
||||||
|
if timeout 20m terraform plan -parallelism=1 -out=tfplan; then
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
|
if [ "$attempt" -eq 1 ]; then
|
||||||
|
echo "Plan attempt failed or timed out; retrying in 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
echo "Terraform plan failed after retries"
|
||||||
|
exit 1
|
||||||
|
|
||||||
- name: Block accidental destroy
|
- name: Block accidental destroy
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -105,6 +105,12 @@ $EDITOR ./scripts/inventory.env
|
|||||||
./scripts/rebuild-and-bootstrap.sh
|
./scripts/rebuild-and-bootstrap.sh
|
||||||
```
|
```
|
||||||
|
|
||||||
|
Optional tuning env vars:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
WORKER_PARALLELISM=2 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
|
||||||
|
```
|
||||||
|
|
||||||
3. If you only want to reset Kubernetes state on existing VMs:
|
3. If you only want to reset Kubernetes state on existing VMs:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -18,6 +18,9 @@ SSH_USER="${SSH_USER:-micqdf}"
|
|||||||
SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
|
SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
|
||||||
SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
|
SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
|
||||||
SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
|
SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
|
||||||
|
REBUILD_TIMEOUT="${REBUILD_TIMEOUT:-45m}"
|
||||||
|
REBUILD_RETRIES="${REBUILD_RETRIES:-2}"
|
||||||
|
WORKER_PARALLELISM="${WORKER_PARALLELISM:-2}"
|
||||||
|
|
||||||
declare -A NODE_IPS=()
|
declare -A NODE_IPS=()
|
||||||
declare -a CP_NAMES=()
|
declare -a CP_NAMES=()
|
||||||
@@ -126,12 +129,36 @@ rebuild_node() {
|
|||||||
local node_ip="$2"
|
local node_ip="$2"
|
||||||
|
|
||||||
echo "==> Rebuilding $node_name on $node_ip"
|
echo "==> Rebuilding $node_name on $node_ip"
|
||||||
nixos-rebuild switch \
|
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
|
||||||
--flake "$FLAKE_DIR#$node_name" \
|
--flake "$FLAKE_DIR#$node_name" \
|
||||||
--target-host "$ACTIVE_SSH_USER@$node_ip" \
|
--target-host "$ACTIVE_SSH_USER@$node_ip" \
|
||||||
--use-remote-sudo
|
--use-remote-sudo
|
||||||
}
|
}
|
||||||
|
|
||||||
|
rebuild_node_with_retry() {
|
||||||
|
local node_name="$1"
|
||||||
|
local node_ip="$2"
|
||||||
|
local attempt=1
|
||||||
|
local max_attempts=$((REBUILD_RETRIES + 1))
|
||||||
|
|
||||||
|
while [ "$attempt" -le "$max_attempts" ]; do
|
||||||
|
echo "==> Rebuild attempt $attempt/$max_attempts for $node_name"
|
||||||
|
if rebuild_node "$node_name" "$node_ip"; then
|
||||||
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [ "$attempt" -lt "$max_attempts" ]; then
|
||||||
|
echo "==> Rebuild failed for $node_name, retrying after 20s"
|
||||||
|
sleep 20
|
||||||
|
fi
|
||||||
|
|
||||||
|
attempt=$((attempt + 1))
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "==> Rebuild failed permanently for $node_name"
|
||||||
|
return 1
|
||||||
|
}
|
||||||
|
|
||||||
prepare_remote_nix_trust() {
|
prepare_remote_nix_trust() {
|
||||||
local node_ip="$1"
|
local node_ip="$1"
|
||||||
echo "==> Ensuring nix trusted-users on $node_ip"
|
echo "==> Ensuring nix trusted-users on $node_ip"
|
||||||
@@ -164,15 +191,42 @@ detect_ssh_user "$PRIMARY_CP_IP"
|
|||||||
for node in "${CP_NAMES[@]}"; do
|
for node in "${CP_NAMES[@]}"; do
|
||||||
prepare_remote_nix_trust "${NODE_IPS[$node]}"
|
prepare_remote_nix_trust "${NODE_IPS[$node]}"
|
||||||
prepare_remote_space "${NODE_IPS[$node]}"
|
prepare_remote_space "${NODE_IPS[$node]}"
|
||||||
rebuild_node "$node" "${NODE_IPS[$node]}"
|
rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
|
||||||
done
|
done
|
||||||
|
|
||||||
|
worker_failures=0
|
||||||
for node in "${WK_NAMES[@]}"; do
|
for node in "${WK_NAMES[@]}"; do
|
||||||
prepare_remote_nix_trust "${NODE_IPS[$node]}"
|
prepare_remote_nix_trust "${NODE_IPS[$node]}"
|
||||||
prepare_remote_space "${NODE_IPS[$node]}"
|
prepare_remote_space "${NODE_IPS[$node]}"
|
||||||
rebuild_node "$node" "${NODE_IPS[$node]}"
|
|
||||||
done
|
done
|
||||||
|
|
||||||
|
active_jobs=0
|
||||||
|
for node in "${WK_NAMES[@]}"; do
|
||||||
|
(
|
||||||
|
rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
|
||||||
|
) &
|
||||||
|
|
||||||
|
active_jobs=$((active_jobs + 1))
|
||||||
|
if [ "$active_jobs" -ge "$WORKER_PARALLELISM" ]; then
|
||||||
|
if ! wait -n; then
|
||||||
|
worker_failures=$((worker_failures + 1))
|
||||||
|
fi
|
||||||
|
active_jobs=$((active_jobs - 1))
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
|
||||||
|
while [ "$active_jobs" -gt 0 ]; do
|
||||||
|
if ! wait -n; then
|
||||||
|
worker_failures=$((worker_failures + 1))
|
||||||
|
fi
|
||||||
|
active_jobs=$((active_jobs - 1))
|
||||||
|
done
|
||||||
|
|
||||||
|
if [ "$worker_failures" -gt 0 ]; then
|
||||||
|
echo "==> $worker_failures worker rebuild job(s) failed"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
|
echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
|
||||||
if cluster_ready; then
|
if cluster_ready; then
|
||||||
echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"
|
echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"
|
||||||
|
|||||||
Reference in New Issue
Block a user