diff --git a/.gitea/workflows/kubeadm-bootstrap.yml b/.gitea/workflows/kubeadm-bootstrap.yml index e2f85cb..c286b3b 100644 --- a/.gitea/workflows/kubeadm-bootstrap.yml +++ b/.gitea/workflows/kubeadm-bootstrap.yml @@ -80,30 +80,11 @@ jobs: run: terraform init -reconfigure -backend-config=backend.hcl - name: Create kubeadm inventory + env: + KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} run: | TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" - - CP_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-1"])' <<< "$TF_OUTPUT_JSON")" - CP_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-2"])' <<< "$TF_OUTPUT_JSON")" - CP_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-3"])' <<< "$TF_OUTPUT_JSON")" - WK_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-1"])' <<< "$TF_OUTPUT_JSON")" - WK_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-2"])' <<< "$TF_OUTPUT_JSON")" - WK_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-3"])' <<< "$TF_OUTPUT_JSON")" - - SSH_USER="$(printf '%s' "${{ secrets.KUBEADM_SSH_USER }}")" - if [ -z "$SSH_USER" ]; then - SSH_USER="micqdf" - fi - - cat > nixos/kubeadm/scripts/inventory.env << EOF - SSH_USER=$SSH_USER - CP_1=$CP_1 - CP_2=$CP_2 - CP_3=$CP_3 - WK_1=$WK_1 - WK_2=$WK_2 - WK_3=$WK_3 - EOF + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env - name: Validate nix installation run: | diff --git a/.gitea/workflows/kubeadm-reset.yml b/.gitea/workflows/kubeadm-reset.yml index a7b04c5..5e2b783 100644 --- a/.gitea/workflows/kubeadm-reset.yml +++ b/.gitea/workflows/kubeadm-reset.yml @@ -80,30 +80,11 @@ jobs: run: terraform init -reconfigure -backend-config=backend.hcl - name: Create kubeadm inventory + env: + KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} run: | TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" - - CP_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-1"])' <<< "$TF_OUTPUT_JSON")" - CP_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-2"])' <<< "$TF_OUTPUT_JSON")" - CP_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-3"])' <<< "$TF_OUTPUT_JSON")" - WK_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-1"])' <<< "$TF_OUTPUT_JSON")" - WK_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-2"])' <<< "$TF_OUTPUT_JSON")" - WK_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-3"])' <<< "$TF_OUTPUT_JSON")" - - SSH_USER="$(printf '%s' "${{ secrets.KUBEADM_SSH_USER }}")" - if [ -z "$SSH_USER" ]; then - SSH_USER="micqdf" - fi - - cat > nixos/kubeadm/scripts/inventory.env << EOF - SSH_USER=$SSH_USER - CP_1=$CP_1 - CP_2=$CP_2 - CP_3=$CP_3 - WK_1=$WK_1 - WK_2=$WK_2 - WK_3=$WK_3 - EOF + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env - name: Run cluster reset run: | diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml index e7696ff..a66acca 100644 --- a/.gitea/workflows/terraform-apply.yml +++ b/.gitea/workflows/terraform-apply.yml @@ -89,30 +89,11 @@ jobs: chmod 0600 ~/.ssh/id_ed25519 - name: Create kubeadm inventory from Terraform outputs + env: + KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} run: | TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" - - CP_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-1"])' <<< "$TF_OUTPUT_JSON")" - CP_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-2"])' <<< "$TF_OUTPUT_JSON")" - CP_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-3"])' <<< "$TF_OUTPUT_JSON")" - WK_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-1"])' <<< "$TF_OUTPUT_JSON")" - WK_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-2"])' <<< "$TF_OUTPUT_JSON")" - WK_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-3"])' <<< "$TF_OUTPUT_JSON")" - - SSH_USER="$(printf '%s' "${{ secrets.KUBEADM_SSH_USER }}")" - if [ -z "$SSH_USER" ]; then - SSH_USER="micqdf" - fi - - cat > nixos/kubeadm/scripts/inventory.env << EOF - SSH_USER=$SSH_USER - CP_1=$CP_1 - CP_2=$CP_2 - CP_3=$CP_3 - WK_1=$WK_1 - WK_2=$WK_2 - WK_3=$WK_3 - EOF + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env - name: Ensure nix and nixos-rebuild env: diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md index 79e5f3e..34f734b 100644 --- a/nixos/kubeadm/README.md +++ b/nixos/kubeadm/README.md @@ -99,7 +99,7 @@ cp ./scripts/inventory.example.env ./scripts/inventory.env $EDITOR ./scripts/inventory.env ``` -2. Rebuild all nodes and bootstrap cluster: +2. Rebuild all nodes and bootstrap/reconcile cluster: ```bash ./scripts/rebuild-and-bootstrap.sh @@ -115,6 +115,9 @@ For a full nuke/recreate lifecycle: - run Terraform destroy/apply for VMs first, - then run `./scripts/rebuild-and-bootstrap.sh` again. +Node lists are discovered from Terraform outputs, so adding new workers/control +planes in Terraform is picked up automatically by the bootstrap/reconcile flow. + ## Optional Gitea workflow automation Primary flow: diff --git a/nixos/kubeadm/scripts/inventory.example.env b/nixos/kubeadm/scripts/inventory.example.env index 9501588..c37a16c 100644 --- a/nixos/kubeadm/scripts/inventory.example.env +++ b/nixos/kubeadm/scripts/inventory.example.env @@ -1,11 +1,7 @@ SSH_USER=micqdf +PRIMARY_CONTROL_PLANE=cp-1 -# Control planes -CP_1=192.168.1.101 -CP_2=192.168.1.102 -CP_3=192.168.1.103 +# Name=IP pairs (space-separated) +CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103" -# Workers -WK_1=192.168.1.111 -WK_2=192.168.1.112 -WK_3=192.168.1.113 +WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113" diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh index 8992b5a..944ffc0 100755 --- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh +++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh @@ -7,7 +7,7 @@ INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}" if [ ! -f "$INVENTORY_FILE" ]; then echo "Missing inventory file: $INVENTORY_FILE" - echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs." + echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings." exit 1 fi @@ -17,21 +17,61 @@ source "$INVENTORY_FILE" SSH_USER="${SSH_USER:-micqdf}" SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }" -required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3) -for key in "${required[@]}"; do - if [ -z "${!key:-}" ]; then - echo "Missing required inventory variable: $key" +declare -A NODE_IPS=() +declare -a CP_NAMES=() +declare -a WK_NAMES=() + +add_node_pair() { + local role="$1" + local pair="$2" + local name="${pair%%=*}" + local ip="${pair#*=}" + + if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then + echo "Invalid node pair '$pair' (expected name=ip)." exit 1 fi -done -cluster_has_node() { - local node_name="$1" - remote "$CP_1" "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node $node_name >/dev/null 2>&1" + NODE_IPS["$name"]="$ip" + if [ "$role" = "cp" ]; then + CP_NAMES+=("$name") + else + WK_NAMES+=("$name") + fi } -cluster_ready() { - remote "$CP_1" "test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes >/dev/null 2>&1" +populate_nodes() { + if [ -n "${CONTROL_PLANES:-}" ]; then + for pair in $CONTROL_PLANES; do + add_node_pair "cp" "$pair" + done + else + while IFS= read -r var_name; do + idx="${var_name#CP_}" + add_node_pair "cp" "cp-$idx=${!var_name}" + done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V) + fi + + if [ -n "${WORKERS:-}" ]; then + for pair in $WORKERS; do + add_node_pair "wk" "$pair" + done + else + while IFS= read -r var_name; do + idx="${var_name#WK_}" + add_node_pair "wk" "wk-$idx=${!var_name}" + done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V) + fi + + if [ "${#CP_NAMES[@]}" -eq 0 ]; then + echo "No control planes found in inventory." + exit 1 + fi + + if [ "${#WK_NAMES[@]}" -eq 0 ]; then + echo "No workers found in inventory." + exit 1 + fi } remote() { @@ -40,6 +80,15 @@ remote() { ssh $SSH_OPTS "$SSH_USER@$host_ip" "$cmd" } +cluster_has_node() { + local node_name="$1" + remote "$PRIMARY_CP_IP" "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node $node_name >/dev/null 2>&1" +} + +cluster_ready() { + remote "$PRIMARY_CP_IP" "test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes >/dev/null 2>&1" +} + rebuild_node() { local node_name="$1" local node_ip="$2" @@ -51,28 +100,38 @@ rebuild_node() { --use-remote-sudo } -for node in cp-1 cp-2 cp-3 wk-1 wk-2 wk-3; do - key="${node^^}" - key="${key//-/_}" - rebuild_node "$node" "${!key}" +populate_nodes + +PRIMARY_CONTROL_PLANE="${PRIMARY_CONTROL_PLANE:-cp-1}" +if [ -z "${NODE_IPS[$PRIMARY_CONTROL_PLANE]:-}" ]; then + PRIMARY_CONTROL_PLANE="${CP_NAMES[0]}" +fi +PRIMARY_CP_IP="${NODE_IPS[$PRIMARY_CONTROL_PLANE]}" + +for node in "${CP_NAMES[@]}"; do + rebuild_node "$node" "${NODE_IPS[$node]}" done -echo "==> Initializing control plane on cp-1" -if cluster_ready; then - echo "==> Existing cluster detected on cp-1; skipping kubeadm init" -else - remote "$CP_1" "sudo th-kubeadm-init" +for node in "${WK_NAMES[@]}"; do + rebuild_node "$node" "${NODE_IPS[$node]}" +done - echo "==> Installing Cilium on cp-1" - remote "$CP_1" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true" - remote "$CP_1" "helm repo update >/dev/null" - remote "$CP_1" "kubectl create namespace kube-system >/dev/null 2>&1 || true" - remote "$CP_1" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true" +echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE" +if cluster_ready; then + echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init" +else + remote "$PRIMARY_CP_IP" "sudo th-kubeadm-init" + + echo "==> Installing Cilium on $PRIMARY_CONTROL_PLANE" + remote "$PRIMARY_CP_IP" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true" + remote "$PRIMARY_CP_IP" "helm repo update >/dev/null" + remote "$PRIMARY_CP_IP" "kubectl create namespace kube-system >/dev/null 2>&1 || true" + remote "$PRIMARY_CP_IP" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true" fi echo "==> Building kubeadm join commands" -JOIN_CMD="$(remote "$CP_1" "sudo kubeadm token create --print-join-command")" -CERT_KEY="$(remote "$CP_1" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")" +JOIN_CMD="$(remote "$PRIMARY_CP_IP" "sudo kubeadm token create --print-join-command")" +CERT_KEY="$(remote "$PRIMARY_CP_IP" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")" CP_JOIN_CMD="$JOIN_CMD --control-plane --certificate-key $CERT_KEY" join_control_plane() { @@ -90,36 +149,26 @@ join_worker() { } echo "==> Joining remaining control planes" -if cluster_has_node "cp-2"; then - echo "cp-2 already joined; skipping" -else - join_control_plane "$CP_2" -fi +for node in "${CP_NAMES[@]}"; do + if [ "$node" = "$PRIMARY_CONTROL_PLANE" ]; then + continue + fi -if cluster_has_node "cp-3"; then - echo "cp-3 already joined; skipping" -else - join_control_plane "$CP_3" -fi + if cluster_has_node "$node"; then + echo "$node already joined; skipping" + else + join_control_plane "${NODE_IPS[$node]}" + fi +done echo "==> Joining workers" -if cluster_has_node "wk-1"; then - echo "wk-1 already joined; skipping" -else - join_worker "$WK_1" -fi - -if cluster_has_node "wk-2"; then - echo "wk-2 already joined; skipping" -else - join_worker "$WK_2" -fi - -if cluster_has_node "wk-3"; then - echo "wk-3 already joined; skipping" -else - join_worker "$WK_3" -fi +for node in "${WK_NAMES[@]}"; do + if cluster_has_node "$node"; then + echo "$node already joined; skipping" + else + join_worker "${NODE_IPS[$node]}" + fi +done echo "==> Final node list" -remote "$CP_1" "kubectl get nodes -o wide" +remote "$PRIMARY_CP_IP" "kubectl get nodes -o wide" diff --git a/nixos/kubeadm/scripts/render-inventory-from-tf-output.py b/nixos/kubeadm/scripts/render-inventory-from-tf-output.py new file mode 100755 index 0000000..b911877 --- /dev/null +++ b/nixos/kubeadm/scripts/render-inventory-from-tf-output.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import json +import os +import re +import sys + + +def natural_key(name: str): + m = re.match(r"^([a-zA-Z-]+)-(\d+)$", name) + if m: + return (m.group(1), int(m.group(2))) + return (name, 0) + + +def map_to_pairs(items: dict[str, str]) -> str: + ordered = sorted(items.items(), key=lambda kv: natural_key(kv[0])) + return " ".join(f"{k}={v}" for k, v in ordered) + + +def main() -> int: + payload = json.load(sys.stdin) + + cp_map = payload.get("control_plane_vm_ipv4", {}).get("value", {}) + wk_map = payload.get("worker_vm_ipv4", {}).get("value", {}) + + if not cp_map or not wk_map: + raise SystemExit("Missing control_plane_vm_ipv4 or worker_vm_ipv4 in terraform output") + + ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf" + + print(f"SSH_USER={ssh_user}") + print("PRIMARY_CONTROL_PLANE=cp-1") + print(f"CONTROL_PLANES=\"{map_to_pairs(cp_map)}\"") + print(f"WORKERS=\"{map_to_pairs(wk_map)}\"") + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/nixos/kubeadm/scripts/reset-cluster-nodes.sh b/nixos/kubeadm/scripts/reset-cluster-nodes.sh index 733c635..44d5b61 100755 --- a/nixos/kubeadm/scripts/reset-cluster-nodes.sh +++ b/nixos/kubeadm/scripts/reset-cluster-nodes.sh @@ -6,7 +6,7 @@ INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}" if [ ! -f "$INVENTORY_FILE" ]; then echo "Missing inventory file: $INVENTORY_FILE" - echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs." + echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings." exit 1 fi @@ -16,22 +16,57 @@ source "$INVENTORY_FILE" SSH_USER="${SSH_USER:-micqdf}" SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }" -required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3) -for key in "${required[@]}"; do - if [ -z "${!key:-}" ]; then - echo "Missing required inventory variable: $key" +declare -A NODE_IPS=() + +add_pair() { + local pair="$1" + local name="${pair%%=*}" + local ip="${pair#*=}" + + if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then + echo "Invalid node pair '$pair' (expected name=ip)." exit 1 fi -done + + NODE_IPS["$name"]="$ip" +} + +if [ -n "${CONTROL_PLANES:-}" ]; then + for pair in $CONTROL_PLANES; do + add_pair "$pair" + done +else + while IFS= read -r var_name; do + idx="${var_name#CP_}" + add_pair "cp-$idx=${!var_name}" + done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V) +fi + +if [ -n "${WORKERS:-}" ]; then + for pair in $WORKERS; do + add_pair "$pair" + done +else + while IFS= read -r var_name; do + idx="${var_name#WK_}" + add_pair "wk-$idx=${!var_name}" + done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V) +fi + +if [ "${#NODE_IPS[@]}" -eq 0 ]; then + echo "No nodes found in inventory." + exit 1 +fi reset_node() { - local node_ip="$1" - echo "==> Resetting $node_ip" + local node_name="$1" + local node_ip="$2" + echo "==> Resetting $node_name ($node_ip)" ssh $SSH_OPTS "$SSH_USER@$node_ip" "sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d" } -for key in CP_1 CP_2 CP_3 WK_1 WK_2 WK_3; do - reset_node "${!key}" -done +while IFS= read -r node_name; do + reset_node "$node_name" "${NODE_IPS[$node_name]}" +done < <(printf '%s\n' "${!NODE_IPS[@]}" | sort -V) echo "Cluster components reset on all listed nodes."