2026-02-28 16:24:45 +00:00
|
|
|
#!/usr/bin/env bash
#
# Bootstrap a kubeadm-based Kubernetes cluster on NixOS hosts described by an
# inventory file.
#
# Usage: <script> [inventory-file]
#   inventory-file defaults to <script-dir>/inventory.env.
set -euo pipefail

# Directory containing this script; used to locate the default inventory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Flake root defaults to the parent of the script directory.
FLAKE_DIR="${FLAKE_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)}"

# Inventory file: first CLI argument, falling back to inventory.env.
INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"

if [ ! -f "$INVENTORY_FILE" ]; then
  # Diagnostics belong on stderr so stdout stays clean for pipelines.
  echo "Missing inventory file: $INVENTORY_FILE" >&2
  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings." >&2
  exit 1
fi

# shellcheck disable=SC1090
source "$INVENTORY_FILE"
|
|
|
|
|
|
|
|
|
|
# Connection and rebuild tunables. All may be overridden from the environment
# or the sourced inventory; the := expansions only fill in defaults when a
# variable is unset or empty.
: "${SSH_USER:=micqdf}"
: "${SSH_KEY_PATH:=$HOME/.ssh/id_ed25519}"
# NOTE: SSH_OPTS is a flat string on purpose; call sites expand it unquoted
# so it word-splits into individual ssh flags.
: "${SSH_OPTS:=-o BatchMode=yes -o IdentitiesOnly=yes -o StrictHostKeyChecking=accept-new -i $SSH_KEY_PATH}"
# Users tried in order when probing SSH access.
: "${SSH_USER_CANDIDATES:=root $SSH_USER}"
: "${REBUILD_TIMEOUT:=45m}"       # passed to timeout(1) around nixos-rebuild
: "${REBUILD_RETRIES:=2}"         # extra attempts after the first failure
: "${WORKER_PARALLELISM:=3}"      # max concurrent worker rebuild jobs
: "${FAST_MODE:=1}"               # 1 = skip remote disk-space reclamation
|
2026-02-28 16:24:45 +00:00
|
|
|
|
2026-02-28 16:43:22 +00:00
|
|
|
# Inventory registries, populated by add_node_pair():
#   NODE_IPS - node name -> IP address
#   CP_NAMES - control-plane node names, in inventory order
#   WK_NAMES - worker node names, in inventory order
declare -A NODE_IPS=()
declare -a CP_NAMES=() WK_NAMES=()
|
|
|
|
|
|
|
|
|
|
# Register one inventory entry in the node registries.
# Arguments:
#   $1 - role: "cp" for control plane, anything else for worker
#   $2 - pair in the form "name=ip"
# Globals: writes NODE_IPS; appends to CP_NAMES or WK_NAMES.
# Exits the whole script on a malformed pair.
add_node_pair() {
  local role="$1"
  local pair="$2"
  local name="${pair%%=*}"
  local ip="${pair#*=}"

  # When "=" is absent both expansions yield the full string, so
  # name == ip also catches a missing separator.
  if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
    echo "Invalid node pair '$pair' (expected name=ip)." >&2
    exit 1
  fi

  NODE_IPS["$name"]="$ip"
  if [ "$role" = "cp" ]; then
    CP_NAMES+=("$name")
  else
    WK_NAMES+=("$name")
  fi
}
|
|
|
|
|
|
2026-02-28 16:43:22 +00:00
|
|
|
# Expand one role's inventory into the node registries.
# Arguments:
#   $1 - role passed through to add_node_pair ("cp" or "wk")
#   $2 - name of the space-separated "name=ip" list variable
#        (CONTROL_PLANES or WORKERS)
#   $3 - per-node variable prefix used as fallback (CP_ / WK_)
# When the list variable is unset/empty, falls back to numbered variables
# (e.g. CP_1=10.0.0.1 becomes node "cp-1").
_populate_role() {
  local role="$1" list_var="$2" prefix="$3"
  local pair var_name idx

  if [ -n "${!list_var:-}" ]; then
    # Intentionally unquoted: the list is space separated.
    for pair in ${!list_var}; do
      add_node_pair "$role" "$pair"
    done
  else
    while IFS= read -r var_name; do
      idx="${var_name#"$prefix"}"
      add_node_pair "$role" "$role-$idx=${!var_name}"
    done < <(compgen -A variable | grep -E "^${prefix}[0-9]+$" | sort -V)
  fi
}

# Build NODE_IPS/CP_NAMES/WK_NAMES from the sourced inventory and verify
# that at least one node of each role exists. Exits on an empty role.
populate_nodes() {
  _populate_role "cp" CONTROL_PLANES CP_
  _populate_role "wk" WORKERS WK_

  if [ "${#CP_NAMES[@]}" -eq 0 ]; then
    echo "No control planes found in inventory." >&2
    exit 1
  fi

  if [ "${#WK_NAMES[@]}" -eq 0 ]; then
    echo "No workers found in inventory." >&2
    exit 1
  fi
}
|
|
|
|
|
|
2026-02-28 16:24:45 +00:00
|
|
|
# Execute a command string on a remote host inside a login shell.
# Arguments:
#   $1 - target IP address
#   $2 - command string to run remotely
# Globals: SSH_OPTS (word-split on purpose), ACTIVE_SSH_USER.
# Returns the remote command's exit status.
remote() {
  local target="$1" command_string="$2" escaped

  # %q-quote so the string survives the remote shell's re-parsing intact.
  escaped="$(printf '%q' "$command_string")"

  # shellcheck disable=SC2086  # SSH_OPTS must split into separate flags
  ssh $SSH_OPTS "${ACTIVE_SSH_USER}@${target}" "bash -lc $escaped"
}
|
|
|
|
|
|
|
|
|
|
# Find the first SSH user able to authenticate to a host.
# Arguments:
#   $1 - IP address to probe
# Globals: reads SSH_OPTS and SSH_USER_CANDIDATES (space separated);
#          on success sets ACTIVE_SSH_USER.
# Returns 0 when a candidate works, 1 (with a message) otherwise.
detect_ssh_user() {
  local probe_ip="$1" candidate

  # Intentionally unquoted: candidates are space separated.
  for candidate in $SSH_USER_CANDIDATES; do
    # shellcheck disable=SC2086  # SSH_OPTS must split into separate flags
    ssh $SSH_OPTS "$candidate@$probe_ip" "true" >/dev/null 2>&1 || continue
    ACTIVE_SSH_USER="$candidate"
    echo "==> Using SSH user '$ACTIVE_SSH_USER'"
    return 0
  done

  echo "Unable to authenticate to $probe_ip with candidates: $SSH_USER_CANDIDATES"
  return 1
}
|
|
|
|
|
|
2026-02-28 17:07:43 +00:00
|
|
|
# Prime ~/.ssh/known_hosts with fresh host keys for every inventory node so
# the later batch-mode SSH connections do not stall on host-key prompts.
# Globals: NODE_IPS (read), HOME.
prepare_known_hosts() {
  local ssh_dir="$HOME/.ssh"
  local known_hosts="$ssh_dir/known_hosts"
  local node ip

  mkdir -p "$ssh_dir"
  chmod 700 "$ssh_dir"
  touch "$known_hosts"
  chmod 600 "$known_hosts"

  for node in "${!NODE_IPS[@]}"; do
    ip="${NODE_IPS[$node]}"
    # Drop any stale entry, then record the current key. Both steps are
    # best-effort: a node may be unreachable at this point.
    ssh-keygen -R "$ip" >/dev/null 2>&1 || true
    ssh-keyscan -H "$ip" >> "$known_hosts" 2>/dev/null || true
  done
}
|
|
|
|
|
|
2026-02-28 16:43:22 +00:00
|
|
|
# Check whether a node is already registered in the cluster.
# Arguments:
#   $1 - kubernetes node name
# Globals: PRIMARY_CP_IP.
# Returns the exit status of the remote kubectl lookup.
cluster_has_node() {
  local wanted="$1"
  local lookup="sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node $wanted >/dev/null 2>&1"
  remote "$PRIMARY_CP_IP" "$lookup"
}
|
|
|
|
|
|
|
|
|
|
# Probe the primary control plane for an already-initialized cluster:
# admin.conf must exist and kubectl must be able to list nodes.
# Globals: PRIMARY_CP_IP.
cluster_ready() {
  local probe="test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes >/dev/null 2>&1"
  remote "$PRIMARY_CP_IP" "$probe"
}
|
|
|
|
|
|
2026-02-28 16:24:45 +00:00
|
|
|
# Deploy the flake configuration for one node via nixos-rebuild, bounded by
# REBUILD_TIMEOUT.
# Arguments:
#   $1 - node / flake attribute name
#   $2 - node IP address
# Globals: REBUILD_TIMEOUT, FLAKE_DIR, ACTIVE_SSH_USER.
rebuild_node() {
  local name="$1" ip="$2"
  local -a rebuild_args=(
    switch
    --flake "$FLAKE_DIR#$name"
    --target-host "$ACTIVE_SSH_USER@$ip"
    --use-remote-sudo
  )

  echo "==> Rebuilding $name on $ip"
  timeout "$REBUILD_TIMEOUT" nixos-rebuild "${rebuild_args[@]}"
}
|
|
|
|
|
|
2026-02-28 22:15:40 +00:00
|
|
|
# Run rebuild_node with up to REBUILD_RETRIES retries, pausing 20s between
# attempts.
# Arguments:
#   $1 - node name
#   $2 - node IP address
# Returns 0 on the first successful attempt, 1 after all attempts fail.
rebuild_node_with_retry() {
  local name="$1" ip="$2"
  local total=$((REBUILD_RETRIES + 1))
  local try

  for (( try = 1; try <= total; try++ )); do
    echo "==> Rebuild attempt $try/$total for $name"
    if rebuild_node "$name" "$ip"; then
      return 0
    fi
    # Pause before every attempt except the last.
    if (( try < total )); then
      echo "==> Rebuild failed for $name, retrying after 20s"
      sleep 20
    fi
  done

  echo "==> Rebuild failed permanently for $name"
  return 1
}
|
|
|
|
|
|
2026-02-28 20:25:50 +00:00
|
|
|
# Ensure root and the deploy user are nix trusted users on a node so that
# nixos-rebuild can push store paths to it.
# Arguments:
#   $1 - node IP address
# Globals: SSH_USER (granted trust alongside root).
prepare_remote_nix_trust() {
  local node_ip="$1"
  echo "==> Ensuring nix trusted-users on $node_ip"
  remote "$node_ip" "sudo mkdir -p /etc/nix"
  # Replace (rather than duplicate) any existing trusted-users line.
  remote "$node_ip" "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi"
  # Was hard-coded to 'micqdf'; use the configured deploy user instead
  # (SSH_USER defaults to micqdf, so default behavior is unchanged).
  remote "$node_ip" "echo 'trusted-users = root $SSH_USER' | sudo tee -a /etc/nix/nix.conf >/dev/null"
  # Best-effort restart; the daemon unit may not exist on every host.
  remote "$node_ip" "sudo systemctl restart nix-daemon 2>/dev/null || true"
}
|
|
|
|
|
|
2026-02-28 21:24:16 +00:00
|
|
|
# Best-effort disk cleanup on a node before a rebuild: collect nix garbage
# and clear rebuild temp files. Individual failures are ignored on purpose.
# Arguments:
#   $1 - node IP address
prepare_remote_space() {
  local node_ip="$1"
  local step

  echo "==> Reclaiming disk space on $node_ip"
  for step in \
    "sudo nix-collect-garbage -d || true" \
    "sudo nix --extra-experimental-features nix-command store gc || true" \
    "sudo rm -rf /tmp/nix* /tmp/nixos-rebuild* || true"; do
    remote "$node_ip" "$step"
  done
}
|
|
|
|
|
|
2026-02-28 16:43:22 +00:00
|
|
|
# --- Main orchestration ----------------------------------------------------

# Build the node registries from the inventory and make every node's host
# key known before any batch-mode SSH happens.
populate_nodes
prepare_known_hosts

# nixos-rebuild passes these options to its internal ssh invocations.
export NIX_SSHOPTS="$SSH_OPTS"

# Pick the primary control plane: the configured name if present in the
# inventory, otherwise the first control plane listed.
PRIMARY_CONTROL_PLANE="${PRIMARY_CONTROL_PLANE:-cp-1}"
if [ -z "${NODE_IPS[$PRIMARY_CONTROL_PLANE]:-}" ]; then
  PRIMARY_CONTROL_PLANE="${CP_NAMES[0]}"
fi
PRIMARY_CP_IP="${NODE_IPS[$PRIMARY_CONTROL_PLANE]}"

# Seed with the configured user, then probe which user can actually log in;
# detect_ssh_user overwrites ACTIVE_SSH_USER on success and, via set -e,
# aborts the script if no candidate authenticates.
ACTIVE_SSH_USER="$SSH_USER"
detect_ssh_user "$PRIMARY_CP_IP"
|
2026-02-28 16:43:22 +00:00
|
|
|
|
|
|
|
|
# Rebuild control planes sequentially; a permanently failed control-plane
# rebuild aborts the script (set -e) before the cluster is touched further.
for node in "${CP_NAMES[@]}"; do
  prepare_remote_nix_trust "${NODE_IPS[$node]}"
  # FAST_MODE=1 (the default) skips the slow remote garbage-collection pass.
  if [ "$FAST_MODE" != "1" ]; then
    prepare_remote_space "${NODE_IPS[$node]}"
  fi
  rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
done
|
|
|
|
|
|
2026-02-28 22:15:40 +00:00
|
|
|
# Tally of failed worker rebuild jobs; workers are rebuilt in parallel
# further down, so failures are counted instead of aborting immediately.
worker_failures=0

# Prepare every worker serially before fanning out the rebuilds.
for node in "${WK_NAMES[@]}"; do
  prepare_remote_nix_trust "${NODE_IPS[$node]}"
  # FAST_MODE=1 (the default) skips the slow remote garbage-collection pass.
  if [ "$FAST_MODE" != "1" ]; then
    prepare_remote_space "${NODE_IPS[$node]}"
  fi
done
|
|
|
|
|
|
2026-02-28 22:15:40 +00:00
|
|
|
# Fan out worker rebuilds with at most WORKER_PARALLELISM concurrent jobs.
# Each rebuild runs in a background subshell; a failing job does not trip
# set -e because its status is collected explicitly via `wait -n`
# (bash 4.3+), which reaps whichever background job finishes next.
active_jobs=0
for node in "${WK_NAMES[@]}"; do
  (
    rebuild_node_with_retry "$node" "${NODE_IPS[$node]}"
  ) &

  active_jobs=$((active_jobs + 1))
  # Throttle: once the window is full, wait for one job to finish before
  # launching the next.
  if [ "$active_jobs" -ge "$WORKER_PARALLELISM" ]; then
    if ! wait -n; then
      worker_failures=$((worker_failures + 1))
    fi
    active_jobs=$((active_jobs - 1))
  fi
done

# Drain the remaining background jobs, still counting failures.
while [ "$active_jobs" -gt 0 ]; do
  if ! wait -n; then
    worker_failures=$((worker_failures + 1))
  fi
  active_jobs=$((active_jobs - 1))
done

# Fail only after every job has been reaped so all failures surface at once.
if [ "$worker_failures" -gt 0 ]; then
  echo "==> $worker_failures worker rebuild job(s) failed"
  exit 1
fi
|
|
|
|
|
|
2026-02-28 16:43:22 +00:00
|
|
|
echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
# Idempotence: skip kubeadm init (and the one-time Cilium install) when the
# primary control plane already serves a working cluster.
if cluster_ready; then
  echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"
else
  # th-kubeadm-init is a host-side wrapper provided by the NixOS config.
  remote "$PRIMARY_CP_IP" "sudo th-kubeadm-init"

  echo "==> Installing Cilium on $PRIMARY_CONTROL_PLANE"
  remote "$PRIMARY_CP_IP" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true"
  remote "$PRIMARY_CP_IP" "helm repo update >/dev/null"
  # NOTE(review): unlike cluster_ready/cluster_has_node, these kubectl/helm
  # calls use the remote user's default kubeconfig rather than
  # /etc/kubernetes/admin.conf — confirm th-kubeadm-init sets that up for
  # ACTIVE_SSH_USER.
  remote "$PRIMARY_CP_IP" "kubectl create namespace kube-system >/dev/null 2>&1 || true"
  remote "$PRIMARY_CP_IP" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true"
fi
|
2026-02-28 16:24:45 +00:00
|
|
|
|
|
|
|
|
echo "==> Building kubeadm join commands"
# Worker join command: fresh bootstrap token + discovery info from the
# primary control plane.
JOIN_CMD="$(remote "$PRIMARY_CP_IP" "sudo kubeadm token create --print-join-command")"
# Control-plane joins additionally need the certificate key; upload-certs
# prints it as the last line of its output.
CERT_KEY="$(remote "$PRIMARY_CP_IP" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")"
CP_JOIN_CMD="$JOIN_CMD --control-plane --certificate-key $CERT_KEY"
|
|
|
|
|
|
|
|
|
|
# Join a node to the cluster as an additional control plane.
# Arguments:
#   $1 - node IP address
# Globals: CP_JOIN_CMD.
# The join command is base64-encoded locally and decoded remotely so its
# embedded spaces/quotes survive the ssh + bash -lc round trip.
join_control_plane() {
  local node_ip="$1"
  local encoded
  # 'base64 | tr -d "\n"' instead of GNU-only 'base64 -w0' so the script
  # also runs from macOS/BSD control machines.
  encoded="$(printf '%s' "$CP_JOIN_CMD" | base64 | tr -d '\n')"
  remote "$node_ip" "sudo th-kubeadm-join-control-plane \"\$(echo $encoded | base64 -d)\""
}
|
|
|
|
|
|
|
|
|
|
# Join a node to the cluster as a worker.
# Arguments:
#   $1 - node IP address
# Globals: JOIN_CMD.
# The join command is base64-encoded locally and decoded remotely so its
# embedded spaces/quotes survive the ssh + bash -lc round trip.
join_worker() {
  local node_ip="$1"
  local encoded
  # 'base64 | tr -d "\n"' instead of GNU-only 'base64 -w0' so the script
  # also runs from macOS/BSD control machines.
  encoded="$(printf '%s' "$JOIN_CMD" | base64 | tr -d '\n')"
  remote "$node_ip" "sudo th-kubeadm-join-worker \"\$(echo $encoded | base64 -d)\""
}
|
|
|
|
|
|
|
|
|
|
echo "==> Joining remaining control planes"
for node in "${CP_NAMES[@]}"; do
  # The primary was initialized above; it is never joined.
  if [ "$node" = "$PRIMARY_CONTROL_PLANE" ]; then
    continue
  fi

  # Idempotence: re-running the script skips nodes already in the cluster.
  if cluster_has_node "$node"; then
    echo "$node already joined; skipping"
  else
    join_control_plane "${NODE_IPS[$node]}"
  fi
done
|
2026-02-28 16:24:45 +00:00
|
|
|
|
|
|
|
|
echo "==> Joining workers"
for node in "${WK_NAMES[@]}"; do
  # Idempotence: re-running the script skips nodes already in the cluster.
  if cluster_has_node "$node"; then
    echo "$node already joined; skipping"
  else
    join_worker "${NODE_IPS[$node]}"
  fi
done
|
2026-02-28 16:24:45 +00:00
|
|
|
|
|
|
|
|
echo "==> Final node list"
# Use the same sudo + admin.conf invocation as the rest of the script; a
# plain 'kubectl' would fail when ACTIVE_SSH_USER has no default kubeconfig.
remote "$PRIMARY_CP_IP" "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide"
|