From 9fe845b53d936b289b17aadafa2722334c23e7f2 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 28 Feb 2026 16:24:45 +0000 Subject: [PATCH] feat: add repeatable kubeadm rebuild and reset scripts --- nixos/kubeadm/README.md | 25 ++++++ nixos/kubeadm/scripts/inventory.example.env | 11 +++ .../kubeadm/scripts/rebuild-and-bootstrap.sh | 89 +++++++++++++++++++ nixos/kubeadm/scripts/reset-cluster-nodes.sh | 37 ++++++++ 4 files changed, 162 insertions(+) create mode 100644 nixos/kubeadm/scripts/inventory.example.env create mode 100755 nixos/kubeadm/scripts/rebuild-and-bootstrap.sh create mode 100755 nixos/kubeadm/scripts/reset-cluster-nodes.sh diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md index 38f5303..794bf82 100644 --- a/nixos/kubeadm/README.md +++ b/nixos/kubeadm/README.md @@ -90,6 +90,31 @@ kubectl get nodes -o wide kubectl -n kube-system get pods -o wide ``` +## Repeatable rebuild flow (recommended) + +1. Copy and edit inventory: + +```bash +cp ./scripts/inventory.example.env ./scripts/inventory.env +$EDITOR ./scripts/inventory.env +``` + +2. Rebuild all nodes and bootstrap cluster: + +```bash +./scripts/rebuild-and-bootstrap.sh +``` + +3. If you only want to reset Kubernetes state on existing VMs: + +```bash +./scripts/reset-cluster-nodes.sh +``` + +For a full nuke/recreate lifecycle: +- run Terraform destroy/apply for VMs first, +- then run `./scripts/rebuild-and-bootstrap.sh` again. + ## Notes - Scripts are intentionally manual-triggered (predictable for homelab bring-up). 
diff --git a/nixos/kubeadm/scripts/inventory.example.env b/nixos/kubeadm/scripts/inventory.example.env
new file mode 100644
index 0000000..9501588
--- /dev/null
+++ b/nixos/kubeadm/scripts/inventory.example.env
@@ -0,0 +1,11 @@
+SSH_USER=micqdf
+
+# Control planes
+CP_1=192.168.1.101
+CP_2=192.168.1.102
+CP_3=192.168.1.103
+
+# Workers
+WK_1=192.168.1.111
+WK_2=192.168.1.112
+WK_3=192.168.1.113
diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
new file mode 100755
index 0000000..d38c2bc
--- /dev/null
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+FLAKE_DIR="${FLAKE_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)}"
+INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
+
+if [ ! -f "$INVENTORY_FILE" ]; then
+  echo "Missing inventory file: $INVENTORY_FILE"
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  exit 1
+fi
+
+# shellcheck disable=SC1090
+source "$INVENTORY_FILE"
+
+SSH_USER="${SSH_USER:-micqdf}"
+SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"
+
+required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
+for key in "${required[@]}"; do
+  if [ -z "${!key:-}" ]; then
+    echo "Missing required inventory variable: $key"
+    exit 1
+  fi
+done
+
+remote() {
+  local host_ip="$1"
+  local cmd="$2"
+  ssh $SSH_OPTS "$SSH_USER@$host_ip" "$cmd"
+}
+
+rebuild_node() {
+  local node_name="$1"
+  local node_ip="$2"
+
+  echo "==> Rebuilding $node_name on $node_ip"
+  nixos-rebuild switch \
+    --flake "$FLAKE_DIR#$node_name" \
+    --target-host "$SSH_USER@$node_ip" \
+    --use-remote-sudo
+}
+
+for node in cp-1 cp-2 cp-3 wk-1 wk-2 wk-3; do
+  key="${node^^}"
+  key="${key//-/_}"
+  rebuild_node "$node" "${!key}"
+done
+
+echo "==> Initializing control plane on cp-1"
+remote "$CP_1" "sudo th-kubeadm-init"
+
+echo "==> Installing Cilium on cp-1"
+remote "$CP_1" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true"
+remote "$CP_1" "helm repo update >/dev/null"
+remote "$CP_1" "kubectl create namespace kube-system >/dev/null 2>&1 || true"
+remote "$CP_1" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true"
+
+echo "==> Building kubeadm join commands"
+JOIN_CMD="$(remote "$CP_1" "sudo kubeadm token create --print-join-command")"
+CERT_KEY="$(remote "$CP_1" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")"
+CP_JOIN_CMD="$JOIN_CMD --control-plane --certificate-key $CERT_KEY"
+
+join_control_plane() {
+  local node_ip="$1"
+  local encoded
+  encoded="$(printf '%s' "$CP_JOIN_CMD" | base64 | tr -d '\n')"
+  remote "$node_ip" "sudo th-kubeadm-join-control-plane \"\$(echo $encoded | base64 -d)\""
+}
+
+join_worker() {
+  local node_ip="$1"
+  local encoded
+  encoded="$(printf '%s' "$JOIN_CMD" | base64 | tr -d '\n')"
+  remote "$node_ip" "sudo th-kubeadm-join-worker \"\$(echo $encoded | base64 -d)\""
+}
+
+echo "==> Joining remaining control planes"
+join_control_plane "$CP_2"
+join_control_plane "$CP_3"
+
+echo "==> Joining workers"
+join_worker "$WK_1"
+join_worker "$WK_2"
+join_worker "$WK_3"
+
+echo "==> Final node list"
+remote "$CP_1" "kubectl get nodes -o wide"
diff --git a/nixos/kubeadm/scripts/reset-cluster-nodes.sh b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
new file mode 100755
index 0000000..733c635
--- /dev/null
+++ b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
+
+if [ ! -f "$INVENTORY_FILE" ]; then
+  echo "Missing inventory file: $INVENTORY_FILE"
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  exit 1
+fi
+
+# shellcheck disable=SC1090
+source "$INVENTORY_FILE"
+
+SSH_USER="${SSH_USER:-micqdf}"
+SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"
+
+required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
+for key in "${required[@]}"; do
+  if [ -z "${!key:-}" ]; then
+    echo "Missing required inventory variable: $key"
+    exit 1
+  fi
+done
+
+reset_node() {
+  local node_ip="$1"
+  echo "==> Resetting $node_ip"
+  ssh $SSH_OPTS "$SSH_USER@$node_ip" "sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
+}
+
+for key in CP_1 CP_2 CP_3 WK_1 WK_2 WK_3; do
+  reset_node "${!key}"
+done
+
+echo "Cluster components reset on all listed nodes."