diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md
index 34f734b..0b523ec 100644
--- a/nixos/kubeadm/README.md
+++ b/nixos/kubeadm/README.md
@@ -105,6 +105,16 @@ $EDITOR ./scripts/inventory.env
 ./scripts/rebuild-and-bootstrap.sh
 ```
 
+Optional tuning env vars (defaults shown):
+
+```bash
+WORKER_PARALLELISM=2 REBUILD_TIMEOUT=45m REBUILD_RETRIES=2 ./scripts/rebuild-and-bootstrap.sh
+```
+
+- `WORKER_PARALLELISM`: how many worker rebuilds run at the same time.
+- `REBUILD_TIMEOUT`: time limit for one `nixos-rebuild` attempt (a `timeout` duration such as `45m`).
+- `REBUILD_RETRIES`: additional attempts after a failed rebuild.
+
 3. If you only want to reset Kubernetes state on existing VMs:
 
 ```bash
diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
index 8d35378..c38431b 100755
--- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -18,6 +18,26 @@ SSH_USER="${SSH_USER:-micqdf}"
 SSH_KEY_PATH="${SSH_KEY_PATH:-$HOME/.ssh/id_ed25519}"
 SSH_OPTS="${SSH_OPTS:--o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
 SSH_USER_CANDIDATES="${SSH_USER_CANDIDATES:-root $SSH_USER}"
+# Per-attempt time limit handed to timeout(1) for each nixos-rebuild run.
+REBUILD_TIMEOUT="${REBUILD_TIMEOUT:-45m}"
+# Extra rebuild attempts after the first failure (total = REBUILD_RETRIES + 1).
+REBUILD_RETRIES="${REBUILD_RETRIES:-2}"
+# Upper bound on worker rebuilds running concurrently (0 behaves like 1).
+WORKER_PARALLELISM="${WORKER_PARALLELISM:-2}"
+
+# Both counters feed shell arithmetic and numeric tests below; reject anything
+# that is not a plain non-negative integer before any node is touched.
+for tunable in REBUILD_RETRIES WORKER_PARALLELISM; do
+  case "${!tunable}" in
+    '' | *[!0-9]*)
+      echo "==> $tunable must be a non-negative integer, got '${!tunable}'" >&2
+      exit 1
+      ;;
+  esac
+  # Force base 10 so a value such as "08" is not rejected as invalid octal.
+  printf -v "$tunable" '%d' "$((10#${!tunable}))"
+done
+unset tunable
 
 declare -A NODE_IPS=()
 declare -a CP_NAMES=()
@@ -126,12 +146,41 @@ rebuild_node() {
   local node_ip="$2"
 
   echo "==> Rebuilding $node_name on $node_ip"
-  nixos-rebuild switch \
+  # Bound each attempt; --kill-after escalates to SIGKILL if nixos-rebuild
+  # ignores the initial SIGTERM, so the time limit is always enforced.
+  timeout --kill-after=30s "$REBUILD_TIMEOUT" nixos-rebuild switch \
     --flake "$FLAKE_DIR#$node_name" \
     --target-host "$ACTIVE_SSH_USER@$node_ip" \
     --use-remote-sudo
 }
 
+# Rebuild a node, retrying after failed attempts.
+# Globals:   REBUILD_RETRIES (read) - retries allowed after the first attempt
+# Arguments: $1 - node name (flake attribute), $2 - node IP address
+# Returns:   0 once an attempt succeeds, 1 after every attempt has failed
+rebuild_node_with_retry() {
+  local node_name="$1"
+  local node_ip="$2"
+  local attempt
+  local max_attempts=$((REBUILD_RETRIES + 1))
+
+  for ((attempt = 1; attempt <= max_attempts; attempt++)); do
+    echo "==> Rebuild attempt $attempt/$max_attempts for $node_name"
+    if rebuild_node "$node_name" "$node_ip"; then
+      return 0
+    fi
+
+    # Pause between attempts only; no point sleeping after the final failure.
+    if ((attempt < max_attempts)); then
+      echo "==> Rebuild failed for $node_name, retrying after 20s"
+      sleep 20
+    fi
+  done
+
+  echo "==> Rebuild failed permanently for $node_name"
+  return 1
+}
+
 prepare_remote_nix_trust() {
   local node_ip="$1"
   echo "==> Ensuring nix trusted-users on $node_ip"
@@ -164,15 +213,50 @@ detect_ssh_user "$PRIMARY_CP_IP"
 for node in "${CP_NAMES[@]}"; do
   prepare_remote_nix_trust "${NODE_IPS[$node]}"
   prepare_remote_space "${NODE_IPS[$node]}"
-  rebuild_node "$node" "${NODE_IPS[$node]}"
+  # Fail fast explicitly rather than relying on errexit being enabled.
+  if ! rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"; then
+    echo "==> Control plane rebuild failed for $node; aborting"
+    exit 1
+  fi
 done
 
+worker_failures=0
 for node in "${WK_NAMES[@]}"; do
   prepare_remote_nix_trust "${NODE_IPS[$node]}"
   prepare_remote_space "${NODE_IPS[$node]}"
-  rebuild_node "$node" "${NODE_IPS[$node]}"
 done
 
+# Reap whichever background rebuild finishes next and record its outcome.
+# Needs bash >= 4.3 for `wait -n`.
+reap_worker_job() {
+  if ! wait -n; then
+    worker_failures=$((worker_failures + 1))
+  fi
+  active_jobs=$((active_jobs - 1))
+}
+
+# Rebuild workers in the background, never keeping more than
+# WORKER_PARALLELISM jobs in flight.
+active_jobs=0
+for node in "${WK_NAMES[@]}"; do
+  rebuild_node_with_retry "$node" "${NODE_IPS[$node]}" &
+
+  active_jobs=$((active_jobs + 1))
+  if [ "$active_jobs" -ge "$WORKER_PARALLELISM" ]; then
+    reap_worker_job
+  fi
+done
+
+# Drain the jobs still running once every worker has been launched.
+while [ "$active_jobs" -gt 0 ]; do
+  reap_worker_job
+done
+
+if [ "$worker_failures" -gt 0 ]; then
+  echo "==> $worker_failures worker rebuild job(s) failed"
+  exit 1
+fi
+
 echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
 if cluster_ready; then
   echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"