feat: parallelize worker rebuilds with retry and timeout
Some checks failed
Terraform Plan / Terraform Plan (push) Has been cancelled

This commit is contained in:
2026-02-28 22:15:40 +00:00
parent 23a85cc099
commit f5d9eba9d0
2 changed files with 63 additions and 3 deletions

View File

@@ -18,6 +18,9 @@ SSH_USER="${SSH_USER:-micqdf}"
# Tunables: each one keeps a non-empty value inherited from the environment and
# otherwise falls back to the default given here.

# Private key path; it ends up as the -i argument inside the default SSH_OPTS.
: "${SSH_KEY_PATH:=$HOME/.ssh/id_ed25519}"
# Default ssh flags. $SSH_KEY_PATH is expanded right here, so changing
# SSH_KEY_PATH after this line does not alter SSH_OPTS.
: "${SSH_OPTS:=-o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
# Space-separated login names: "root" first, then $SSH_USER.
: "${SSH_USER_CANDIDATES:=root $SSH_USER}"
# Duration handed to timeout(1) around each nixos-rebuild invocation.
: "${REBUILD_TIMEOUT:=45m}"
# Extra rebuild attempts after the first one (total = REBUILD_RETRIES + 1).
: "${REBUILD_RETRIES:=2}"
# Number of in-flight worker rebuild jobs at which the script waits for one to
# finish before launching the next.
: "${WORKER_PARALLELISM:=2}"

# Node name -> IP address lookup table.
declare -A NODE_IPS=()
# Names of the control-plane nodes.
declare -a CP_NAMES=()
@@ -126,12 +129,36 @@ rebuild_node() {
local node_ip="$2"
echo "==> Rebuilding $node_name on $node_ip"
nixos-rebuild switch \
timeout "$REBUILD_TIMEOUT" nixos-rebuild switch \
--flake "$FLAKE_DIR#$node_name" \
--target-host "$ACTIVE_SSH_USER@$node_ip" \
--use-remote-sudo
}
#######################################
# Rebuild one node via rebuild_node, retrying after a pause when it fails.
# Globals:
#   REBUILD_RETRIES      (read) extra attempts after the first one; must be a
#                        non-negative decimal integer. Unset/empty means 2;
#                        any other invalid value means "no retries" so that
#                        at least one attempt is always made.
#   REBUILD_RETRY_DELAY  (read, optional) pause between attempts, passed to
#                        sleep as-is; defaults to 20 (seconds).
# Arguments:
#   $1 - node name (flake attribute / log label)
#   $2 - node IP address
# Outputs:
#   Progress lines on stdout; a warning on stderr if REBUILD_RETRIES is invalid.
# Returns:
#   0 as soon as one attempt succeeds, 1 once every attempt has failed.
#######################################
rebuild_node_with_retry() {
  local node_name="$1"
  local node_ip="$2"
  local retries="${REBUILD_RETRIES:-2}"
  local retry_delay="${REBUILD_RETRY_DELAY:-20}"
  local attempt=1
  local max_attempts

  # A negative or non-numeric value would otherwise yield zero attempts or an
  # arithmetic error; degrade to a single attempt instead.
  if ! [[ "$retries" =~ ^[0-9]+$ ]]; then
    echo "==> Invalid REBUILD_RETRIES='$retries'; not retrying" >&2
    retries=0
  fi
  # 10# forces base 10 so zero-padded values such as "08" are not read as octal.
  max_attempts=$((10#$retries + 1))

  while [ "$attempt" -le "$max_attempts" ]; do
    echo "==> Rebuild attempt $attempt/$max_attempts for $node_name"
    if rebuild_node "$node_name" "$node_ip"; then
      return 0
    fi

    # Only pause when another attempt is actually going to happen.
    if [ "$attempt" -lt "$max_attempts" ]; then
      echo "==> Rebuild failed for $node_name, retrying after ${retry_delay}s"
      sleep "$retry_delay"
    fi

    attempt=$((attempt + 1))
  done

  echo "==> Rebuild failed permanently for $node_name"
  return 1
}
prepare_remote_nix_trust() {
local node_ip="$1"
echo "==> Ensuring nix trusted-users on $node_ip"
@@ -164,15 +191,42 @@ detect_ssh_user "$PRIMARY_CP_IP"
for node in "${CP_NAMES[@]}"; do
prepare_remote_nix_trust "${NODE_IPS[$node]}"
prepare_remote_space "${NODE_IPS[$node]}"
rebuild_node "$node" "${NODE_IPS[$node]}"
rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
done
worker_failures=0
for node in "${WK_NAMES[@]}"; do
prepare_remote_nix_trust "${NODE_IPS[$node]}"
prepare_remote_space "${NODE_IPS[$node]}"
rebuild_node "$node" "${NODE_IPS[$node]}"
done
# Rebuild the worker nodes concurrently.
# active_jobs counts background subshells that were started but not yet reaped
# by wait; worker_failures (initialised earlier in the script) counts how many
# reaped jobs exited non-zero. Requires bash 4.3+ for `wait -n`.
active_jobs=0
for node in "${WK_NAMES[@]}"; do
# Each rebuild (including its retries) runs in its own background subshell.
(
rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
) &
active_jobs=$((active_jobs + 1))
# Once WORKER_PARALLELISM jobs are in flight, block until any one of them
# finishes before the loop starts the next node.
if [ "$active_jobs" -ge "$WORKER_PARALLELISM" ]; then
# wait -n returns the exit status of the job it reaped.
if ! wait -n; then
worker_failures=$((worker_failures + 1))
fi
active_jobs=$((active_jobs - 1))
fi
done
# Drain: reap the jobs still running after the last node was launched.
while [ "$active_jobs" -gt 0 ]; do
if ! wait -n; then
worker_failures=$((worker_failures + 1))
fi
active_jobs=$((active_jobs - 1))
done
# All jobs have been reaped at this point; abort the script if any failed.
if [ "$worker_failures" -gt 0 ]; then
echo "==> $worker_failures worker rebuild job(s) failed"
exit 1
fi
echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
if cluster_ready; then
echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"