From 9fe845b53d936b289b17aadafa2722334c23e7f2 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 28 Feb 2026 16:24:45 +0000 Subject: [PATCH] feat: add repeatable kubeadm rebuild and reset scripts --- nixos/kubeadm/README.md | 25 ++++++ nixos/kubeadm/scripts/inventory.example.env | 11 +++ .../kubeadm/scripts/rebuild-and-bootstrap.sh | 89 +++++++++++++++++++ nixos/kubeadm/scripts/reset-cluster-nodes.sh | 37 ++++++++ 4 files changed, 162 insertions(+) create mode 100644 nixos/kubeadm/scripts/inventory.example.env create mode 100755 nixos/kubeadm/scripts/rebuild-and-bootstrap.sh create mode 100755 nixos/kubeadm/scripts/reset-cluster-nodes.sh diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md index 38f5303..794bf82 100644 --- a/nixos/kubeadm/README.md +++ b/nixos/kubeadm/README.md @@ -90,6 +90,31 @@ kubectl get nodes -o wide kubectl -n kube-system get pods -o wide ``` +## Repeatable rebuild flow (recommended) + +1. Copy and edit inventory: + +```bash +cp ./scripts/inventory.example.env ./scripts/inventory.env +$EDITOR ./scripts/inventory.env +``` + +2. Rebuild all nodes and bootstrap cluster: + +```bash +./scripts/rebuild-and-bootstrap.sh +``` + +3. If you only want to reset Kubernetes state on existing VMs: + +```bash +./scripts/reset-cluster-nodes.sh +``` + +For a full nuke/recreate lifecycle: +- run Terraform destroy/apply for VMs first, +- then run `./scripts/rebuild-and-bootstrap.sh` again. + ## Notes - Scripts are intentionally manual-triggered (predictable for homelab bring-up). 
diff --git a/nixos/kubeadm/scripts/inventory.example.env b/nixos/kubeadm/scripts/inventory.example.env
new file mode 100644
index 0000000..9501588
--- /dev/null
+++ b/nixos/kubeadm/scripts/inventory.example.env
@@ -0,0 +1,11 @@
+SSH_USER=micqdf
+
+# Control planes
+CP_1=192.168.1.101
+CP_2=192.168.1.102
+CP_3=192.168.1.103
+
+# Workers
+WK_1=192.168.1.111
+WK_2=192.168.1.112
+WK_3=192.168.1.113
diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
new file mode 100755
index 0000000..d38c2bc
--- /dev/null
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+FLAKE_DIR="${FLAKE_DIR:-$(cd "$SCRIPT_DIR/.." && pwd)}"
+INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
+
+if [ ! -f "$INVENTORY_FILE" ]; then
+  echo "Missing inventory file: $INVENTORY_FILE"
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  exit 1
+fi
+
+# shellcheck disable=SC1090
+source "$INVENTORY_FILE"
+
+SSH_USER="${SSH_USER:-micqdf}"
+SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"
+
+required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
+for key in "${required[@]}"; do
+  if [ -z "${!key:-}" ]; then
+    echo "Missing required inventory variable: $key"
+    exit 1
+  fi
+done
+
+remote() {
+  local host_ip="$1"
+  local cmd="$2"
+  ssh $SSH_OPTS "$SSH_USER@$host_ip" "$cmd"
+}
+
+rebuild_node() {
+  local node_name="$1"
+  local node_ip="$2"
+
+  echo "==> Rebuilding $node_name on $node_ip"
+  nixos-rebuild switch \
+    --flake "$FLAKE_DIR#$node_name" \
+    --target-host "$SSH_USER@$node_ip" \
+    --use-remote-sudo
+}
+
+for node in cp-1 cp-2 cp-3 wk-1 wk-2 wk-3; do
+  key="${node^^}"
+  key="${key//-/_}"
+  rebuild_node "$node" "${!key}"
+done
+
+echo "==> Initializing control plane on cp-1"
+remote "$CP_1" "sudo th-kubeadm-init"
+
+echo "==> Installing Cilium on cp-1"
+remote "$CP_1" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true"
+remote "$CP_1" "helm repo update >/dev/null"
+remote "$CP_1" "kubectl create namespace kube-system >/dev/null 2>&1 || true"
+remote "$CP_1" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true"
+
+echo "==> Building kubeadm join commands"
+JOIN_CMD="$(remote "$CP_1" "sudo kubeadm token create --print-join-command")"
+CERT_KEY="$(remote "$CP_1" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")"
+CP_JOIN_CMD="$JOIN_CMD --control-plane --certificate-key $CERT_KEY"
+
+join_control_plane() {
+  local node_ip="$1"
+  local encoded
+  encoded="$(printf '%s' "$CP_JOIN_CMD" | base64 | tr -d '\n')"
+  remote "$node_ip" "sudo th-kubeadm-join-control-plane \"\$(echo $encoded | base64 -d)\""
+}
+
+join_worker() {
+  local node_ip="$1"
+  local encoded
+  encoded="$(printf '%s' "$JOIN_CMD" | base64 | tr -d '\n')"
+  remote "$node_ip" "sudo th-kubeadm-join-worker \"\$(echo $encoded | base64 -d)\""
+}
+
+echo "==> Joining remaining control planes"
+join_control_plane "$CP_2"
+join_control_plane "$CP_3"
+
+echo "==> Joining workers"
+join_worker "$WK_1"
+join_worker "$WK_2"
+join_worker "$WK_3"
+
+echo "==> Final node list"
+remote "$CP_1" "kubectl get nodes -o wide"
diff --git a/nixos/kubeadm/scripts/reset-cluster-nodes.sh b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
new file mode 100755
index 0000000..733c635
--- /dev/null
+++ b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"
+
+if [ ! -f "$INVENTORY_FILE" ]; then
+  echo "Missing inventory file: $INVENTORY_FILE"
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  exit 1
+fi
+
+# shellcheck disable=SC1090
+source "$INVENTORY_FILE"
+
+SSH_USER="${SSH_USER:-micqdf}"
+SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"
+
+required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
+for key in "${required[@]}"; do
+  if [ -z "${!key:-}" ]; then
+    echo "Missing required inventory variable: $key"
+    exit 1
+  fi
+done
+
+reset_node() {
+  local node_ip="$1"
+  echo "==> Resetting $node_ip"
+  ssh $SSH_OPTS "$SSH_USER@$node_ip" "sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
+}
+
+for key in CP_1 CP_2 CP_3 WK_1 WK_2 WK_3; do
+  reset_node "${!key}"
+done
+
+echo "Cluster components reset on all listed nodes."