diff --git a/.gitea/workflows/kubeadm-bootstrap.yml b/.gitea/workflows/kubeadm-bootstrap.yml
index 7da5ae2..c286b3b 100644
--- a/.gitea/workflows/kubeadm-bootstrap.yml
+++ b/.gitea/workflows/kubeadm-bootstrap.yml
@@ -80,40 +80,50 @@ jobs:
         run: terraform init -reconfigure -backend-config=backend.hcl

       - name: Create kubeadm inventory
+        env:
+          KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
         run: |
           TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
-
-          CP_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-1"])' <<< "$TF_OUTPUT_JSON")"
-          CP_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-2"])' <<< "$TF_OUTPUT_JSON")"
-          CP_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-3"])' <<< "$TF_OUTPUT_JSON")"
-          WK_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-1"])' <<< "$TF_OUTPUT_JSON")"
-          WK_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-2"])' <<< "$TF_OUTPUT_JSON")"
-          WK_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-3"])' <<< "$TF_OUTPUT_JSON")"
-
-          SSH_USER="$(printf '%s' "${{ secrets.KUBEADM_SSH_USER }}")"
-          if [ -z "$SSH_USER" ]; then
-            SSH_USER="micqdf"
-          fi
-
-          cat > nixos/kubeadm/scripts/inventory.env << EOF
-          SSH_USER=$SSH_USER
-          CP_1=$CP_1
-          CP_2=$CP_2
-          CP_3=$CP_3
-          WK_1=$WK_1
-          WK_2=$WK_2
-          WK_3=$WK_3
-          EOF
+          printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env

       - name: Validate nix installation
         run: |
-          if [ ! -x /nix/var/nix/profiles/default/bin/nix ]; then
-            echo "Nix not found at /nix/var/nix/profiles/default/bin/nix"
-            exit 1
+          if [ -x /nix/var/nix/profiles/default/bin/nix ]; then
+            /nix/var/nix/profiles/default/bin/nix --version
+            exit 0
           fi
+
+          if command -v nix >/dev/null 2>&1; then
+            nix --version
+            exit 0
+          fi
+
+          echo "Nix missing; installing no-daemon Nix for this runner job"
+          sh <(curl -L https://nixos.org/nix/install) --no-daemon
+
+          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+
+          nix --version
+
+      - name: Install nixos-rebuild tool
+        env:
+          NIX_CONFIG: experimental-features = nix-command flakes
+        run: |
+          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+
+          nix profile install nixpkgs#nixos-rebuild
+
       - name: Run cluster rebuild and bootstrap
         env:
-          PATH: /nix/var/nix/profiles/default/bin:${{ env.PATH }}
+          NIX_CONFIG: experimental-features = nix-command flakes
+          PATH: $HOME/.nix-profile/bin:/nix/var/nix/profiles/default/bin:${{ env.PATH }}
         run: |
+          if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then
+            . "$HOME/.nix-profile/etc/profile.d/nix.sh"
+          fi
+
           ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
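Both manual dispatch workflows and the apply workflow below now delegate inventory rendering to the new helper script. Outside CI, the same step can be reproduced by hand to sanity-check the generated file; this sketch assumes local Terraform credentials and an initialized backend:

```bash
# Render the inventory locally, effectively what the workflow step does
# (requires terraform init and access to the remote state).
terraform -chdir=terraform output -json \
  | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py \
  > nixos/kubeadm/scripts/inventory.env
cat nixos/kubeadm/scripts/inventory.env
```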
"$HOME/.nix-profile/etc/profile.d/nix.sh" + fi + ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh diff --git a/.gitea/workflows/kubeadm-reset.yml b/.gitea/workflows/kubeadm-reset.yml index a7b04c5..5e2b783 100644 --- a/.gitea/workflows/kubeadm-reset.yml +++ b/.gitea/workflows/kubeadm-reset.yml @@ -80,30 +80,11 @@ jobs: run: terraform init -reconfigure -backend-config=backend.hcl - name: Create kubeadm inventory + env: + KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} run: | TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" - - CP_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-1"])' <<< "$TF_OUTPUT_JSON")" - CP_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-2"])' <<< "$TF_OUTPUT_JSON")" - CP_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["control_plane_vm_ipv4"]["value"]["cp-3"])' <<< "$TF_OUTPUT_JSON")" - WK_1="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-1"])' <<< "$TF_OUTPUT_JSON")" - WK_2="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-2"])' <<< "$TF_OUTPUT_JSON")" - WK_3="$(python3 -c 'import json,sys; d=json.loads(sys.stdin.read()); print(d["worker_vm_ipv4"]["value"]["wk-3"])' <<< "$TF_OUTPUT_JSON")" - - SSH_USER="$(printf '%s' "${{ secrets.KUBEADM_SSH_USER }}")" - if [ -z "$SSH_USER" ]; then - SSH_USER="micqdf" - fi - - cat > nixos/kubeadm/scripts/inventory.env << EOF - SSH_USER=$SSH_USER - CP_1=$CP_1 - CP_2=$CP_2 - CP_3=$CP_3 - WK_1=$WK_1 - WK_2=$WK_2 - WK_3=$WK_3 - EOF + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env - name: Run cluster reset run: | diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml index a3d2ab6..a66acca 100644 --- a/.gitea/workflows/terraform-apply.yml +++ b/.gitea/workflows/terraform-apply.yml @@ -71,3 +71,52 @@ jobs: - name: Terraform Apply working-directory: terraform run: terraform apply -auto-approve tfplan + + - name: Create SSH key + run: | + install -m 0700 -d ~/.ssh + KEY_CONTENT="$(printf '%s' "${{ secrets.KUBEADM_SSH_PRIVATE_KEY }}")" + if [ -z "$KEY_CONTENT" ]; then + KEY_CONTENT="$(printf '%s' "${{ secrets.SSH_KEY_PRIVATE }}")" + fi + + if [ -z "$KEY_CONTENT" ]; then + echo "Missing SSH private key secret. Set KUBEADM_SSH_PRIVATE_KEY or SSH_KEY_PRIVATE." + exit 1 + fi + + printf '%s\n' "$KEY_CONTENT" > ~/.ssh/id_ed25519 + chmod 0600 ~/.ssh/id_ed25519 + + - name: Create kubeadm inventory from Terraform outputs + env: + KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }} + run: | + TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)" + printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env + + - name: Ensure nix and nixos-rebuild + env: + NIX_CONFIG: experimental-features = nix-command flakes + run: | + if [ ! -x /nix/var/nix/profiles/default/bin/nix ] && ! command -v nix >/dev/null 2>&1; then + sh <(curl -L https://nixos.org/nix/install) --no-daemon + fi + + if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then + . 
"$HOME/.nix-profile/etc/profile.d/nix.sh" + fi + + nix --version + nix profile install nixpkgs#nixos-rebuild + + - name: Rebuild and bootstrap/reconcile kubeadm cluster + env: + NIX_CONFIG: experimental-features = nix-command flakes + PATH: $HOME/.nix-profile/bin:/nix/var/nix/profiles/default/bin:${{ env.PATH }} + run: | + if [ -f "$HOME/.nix-profile/etc/profile.d/nix.sh" ]; then + . "$HOME/.nix-profile/etc/profile.d/nix.sh" + fi + + ./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md index 30e2873..34f734b 100644 --- a/nixos/kubeadm/README.md +++ b/nixos/kubeadm/README.md @@ -99,7 +99,7 @@ cp ./scripts/inventory.example.env ./scripts/inventory.env $EDITOR ./scripts/inventory.env ``` -2. Rebuild all nodes and bootstrap cluster: +2. Rebuild all nodes and bootstrap/reconcile cluster: ```bash ./scripts/rebuild-and-bootstrap.sh @@ -115,8 +115,16 @@ For a full nuke/recreate lifecycle: - run Terraform destroy/apply for VMs first, - then run `./scripts/rebuild-and-bootstrap.sh` again. +Node lists are discovered from Terraform outputs, so adding new workers/control +planes in Terraform is picked up automatically by the bootstrap/reconcile flow. + ## Optional Gitea workflow automation +Primary flow: + +- Push to `master` triggers `.gitea/workflows/terraform-apply.yml` +- That workflow now does Terraform apply and then runs kubeadm rebuild/bootstrap reconciliation automatically + Manual dispatch workflows are available: - `.gitea/workflows/kubeadm-bootstrap.yml` diff --git a/nixos/kubeadm/scripts/inventory.example.env b/nixos/kubeadm/scripts/inventory.example.env index 9501588..c37a16c 100644 --- a/nixos/kubeadm/scripts/inventory.example.env +++ b/nixos/kubeadm/scripts/inventory.example.env @@ -1,11 +1,7 @@ SSH_USER=micqdf +PRIMARY_CONTROL_PLANE=cp-1 -# Control planes -CP_1=192.168.1.101 -CP_2=192.168.1.102 -CP_3=192.168.1.103 +# Name=IP pairs (space-separated) +CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103" -# Workers -WK_1=192.168.1.111 -WK_2=192.168.1.112 -WK_3=192.168.1.113 +WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113" diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh index d38c2bc..944ffc0 100755 --- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh +++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh @@ -7,7 +7,7 @@ INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}" if [ ! -f "$INVENTORY_FILE" ]; then echo "Missing inventory file: $INVENTORY_FILE" - echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs." + echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings." exit 1 fi @@ -17,13 +17,62 @@ source "$INVENTORY_FILE" SSH_USER="${SSH_USER:-micqdf}" SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }" -required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3) -for key in "${required[@]}"; do - if [ -z "${!key:-}" ]; then - echo "Missing required inventory variable: $key" +declare -A NODE_IPS=() +declare -a CP_NAMES=() +declare -a WK_NAMES=() + +add_node_pair() { + local role="$1" + local pair="$2" + local name="${pair%%=*}" + local ip="${pair#*=}" + + if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then + echo "Invalid node pair '$pair' (expected name=ip)." 
diff --git a/nixos/kubeadm/scripts/inventory.example.env b/nixos/kubeadm/scripts/inventory.example.env
index 9501588..c37a16c 100644
--- a/nixos/kubeadm/scripts/inventory.example.env
+++ b/nixos/kubeadm/scripts/inventory.example.env
@@ -1,11 +1,7 @@
 SSH_USER=micqdf
+PRIMARY_CONTROL_PLANE=cp-1

-# Control planes
-CP_1=192.168.1.101
-CP_2=192.168.1.102
-CP_3=192.168.1.103
+# Name=IP pairs (space-separated)
+CONTROL_PLANES="cp-1=192.168.1.101 cp-2=192.168.1.102 cp-3=192.168.1.103"

-# Workers
-WK_1=192.168.1.111
-WK_2=192.168.1.112
-WK_3=192.168.1.113
+WORKERS="wk-1=192.168.1.111 wk-2=192.168.1.112 wk-3=192.168.1.113"
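The quoting matters here: `CONTROL_PLANES` and `WORKERS` are single space-separated strings, and both scripts split each entry with the same parameter expansions, roughly as in this sketch:

```bash
# Minimal sketch of how add_node_pair/add_pair split one "name=ip" entry.
pair="cp-2=192.168.1.102"
name="${pair%%=*}"   # everything before the first '=' -> cp-2
ip="${pair#*=}"      # everything after the first '='  -> 192.168.1.102
printf '%s -> %s\n' "$name" "$ip"
```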
diff --git a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
index d38c2bc..944ffc0 100755
--- a/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
+++ b/nixos/kubeadm/scripts/rebuild-and-bootstrap.sh
@@ -7,7 +7,7 @@ INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"

 if [ ! -f "$INVENTORY_FILE" ]; then
   echo "Missing inventory file: $INVENTORY_FILE"
-  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
   exit 1
 fi

@@ -17,13 +17,62 @@ source "$INVENTORY_FILE"
 SSH_USER="${SSH_USER:-micqdf}"
 SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"

-required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
-for key in "${required[@]}"; do
-  if [ -z "${!key:-}" ]; then
-    echo "Missing required inventory variable: $key"
+declare -A NODE_IPS=()
+declare -a CP_NAMES=()
+declare -a WK_NAMES=()
+
+add_node_pair() {
+  local role="$1"
+  local pair="$2"
+  local name="${pair%%=*}"
+  local ip="${pair#*=}"
+
+  if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
+    echo "Invalid node pair '$pair' (expected name=ip)."
     exit 1
   fi
-done
+
+  NODE_IPS["$name"]="$ip"
+  if [ "$role" = "cp" ]; then
+    CP_NAMES+=("$name")
+  else
+    WK_NAMES+=("$name")
+  fi
+}
+
+populate_nodes() {
+  if [ -n "${CONTROL_PLANES:-}" ]; then
+    for pair in $CONTROL_PLANES; do
+      add_node_pair "cp" "$pair"
+    done
+  else
+    while IFS= read -r var_name; do
+      idx="${var_name#CP_}"
+      add_node_pair "cp" "cp-$idx=${!var_name}"
+    done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V)
+  fi
+
+  if [ -n "${WORKERS:-}" ]; then
+    for pair in $WORKERS; do
+      add_node_pair "wk" "$pair"
+    done
+  else
+    while IFS= read -r var_name; do
+      idx="${var_name#WK_}"
+      add_node_pair "wk" "wk-$idx=${!var_name}"
+    done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V)
+  fi
+
+  if [ "${#CP_NAMES[@]}" -eq 0 ]; then
+    echo "No control planes found in inventory."
+    exit 1
+  fi
+
+  if [ "${#WK_NAMES[@]}" -eq 0 ]; then
+    echo "No workers found in inventory."
+    exit 1
+  fi
+}

 remote() {
   local host_ip="$1"
@@ -31,6 +80,15 @@ remote() {
   ssh $SSH_OPTS "$SSH_USER@$host_ip" "$cmd"
 }

+cluster_has_node() {
+  local node_name="$1"
+  remote "$PRIMARY_CP_IP" "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node $node_name >/dev/null 2>&1"
+}
+
+cluster_ready() {
+  remote "$PRIMARY_CP_IP" "test -f /etc/kubernetes/admin.conf && sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes >/dev/null 2>&1"
+}
+
 rebuild_node() {
   local node_name="$1"
   local node_ip="$2"
@@ -42,24 +100,38 @@
     --use-remote-sudo
 }

-for node in cp-1 cp-2 cp-3 wk-1 wk-2 wk-3; do
-  key="${node^^}"
-  key="${key//-/_}"
-  rebuild_node "$node" "${!key}"
+populate_nodes
+
+PRIMARY_CONTROL_PLANE="${PRIMARY_CONTROL_PLANE:-cp-1}"
+if [ -z "${NODE_IPS[$PRIMARY_CONTROL_PLANE]:-}" ]; then
+  PRIMARY_CONTROL_PLANE="${CP_NAMES[0]}"
+fi
+PRIMARY_CP_IP="${NODE_IPS[$PRIMARY_CONTROL_PLANE]}"
+
+for node in "${CP_NAMES[@]}"; do
+  rebuild_node "$node" "${NODE_IPS[$node]}"
 done

-echo "==> Initializing control plane on cp-1"
-remote "$CP_1" "sudo th-kubeadm-init"
+for node in "${WK_NAMES[@]}"; do
+  rebuild_node "$node" "${NODE_IPS[$node]}"
+done

-echo "==> Installing Cilium on cp-1"
-remote "$CP_1" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true"
-remote "$CP_1" "helm repo update >/dev/null"
-remote "$CP_1" "kubectl create namespace kube-system >/dev/null 2>&1 || true"
-remote "$CP_1" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true"
+echo "==> Initializing control plane on $PRIMARY_CONTROL_PLANE"
+if cluster_ready; then
+  echo "==> Existing cluster detected on $PRIMARY_CONTROL_PLANE; skipping kubeadm init"
+else
+  remote "$PRIMARY_CP_IP" "sudo th-kubeadm-init"
+
+  echo "==> Installing Cilium on $PRIMARY_CONTROL_PLANE"
+  remote "$PRIMARY_CP_IP" "helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true"
+  remote "$PRIMARY_CP_IP" "helm repo update >/dev/null"
+  remote "$PRIMARY_CP_IP" "kubectl create namespace kube-system >/dev/null 2>&1 || true"
+  remote "$PRIMARY_CP_IP" "helm upgrade --install cilium cilium/cilium --namespace kube-system --set kubeProxyReplacement=true"
+fi

 echo "==> Building kubeadm join commands"
-JOIN_CMD="$(remote "$CP_1" "sudo kubeadm token create --print-join-command")"
-CERT_KEY="$(remote "$CP_1" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")"
+JOIN_CMD="$(remote "$PRIMARY_CP_IP" "sudo kubeadm token create --print-join-command")"
+CERT_KEY="$(remote "$PRIMARY_CP_IP" "sudo kubeadm init phase upload-certs --upload-certs | tail -n 1")"
 CP_JOIN_CMD="$JOIN_CMD --control-plane --certificate-key $CERT_KEY"

 join_control_plane() {
@@ -77,13 +149,26 @@ join_worker() {
 }

 echo "==> Joining remaining control planes"
-join_control_plane "$CP_2"
-join_control_plane "$CP_3"
+for node in "${CP_NAMES[@]}"; do
+  if [ "$node" = "$PRIMARY_CONTROL_PLANE" ]; then
+    continue
+  fi
+
+  if cluster_has_node "$node"; then
+    echo "$node already joined; skipping"
+  else
+    join_control_plane "${NODE_IPS[$node]}"
+  fi
+done

 echo "==> Joining workers"
-join_worker "$WK_1"
-join_worker "$WK_2"
-join_worker "$WK_3"
+for node in "${WK_NAMES[@]}"; do
+  if cluster_has_node "$node"; then
+    echo "$node already joined; skipping"
+  else
+    join_worker "${NODE_IPS[$node]}"
+  fi
+done

 echo "==> Final node list"
-remote "$CP_1" "kubectl get nodes -o wide"
+remote "$PRIMARY_CP_IP" "kubectl get nodes -o wide"
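With the `cluster_ready`/`cluster_has_node` guards, this script is intended to be safe to re-run as a reconcile pass: `kubeadm init`, the Cilium install, and already-joined nodes are skipped, while every node still gets the `nixos-rebuild`. An alternate inventory can be passed as the first argument; the path below is illustrative:

```bash
# Re-running reconciles rather than re-initializing; pass a custom inventory if needed.
./nixos/kubeadm/scripts/rebuild-and-bootstrap.sh /tmp/staging-inventory.env
```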
diff --git a/nixos/kubeadm/scripts/render-inventory-from-tf-output.py b/nixos/kubeadm/scripts/render-inventory-from-tf-output.py
new file mode 100755
index 0000000..b911877
--- /dev/null
+++ b/nixos/kubeadm/scripts/render-inventory-from-tf-output.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import json
+import os
+import re
+import sys
+
+
+def natural_key(name: str):
+    m = re.match(r"^([a-zA-Z-]+)-(\d+)$", name)
+    if m:
+        return (m.group(1), int(m.group(2)))
+    return (name, 0)
+
+
+def map_to_pairs(items: dict[str, str]) -> str:
+    ordered = sorted(items.items(), key=lambda kv: natural_key(kv[0]))
+    return " ".join(f"{k}={v}" for k, v in ordered)
+
+
+def main() -> int:
+    payload = json.load(sys.stdin)
+
+    cp_map = payload.get("control_plane_vm_ipv4", {}).get("value", {})
+    wk_map = payload.get("worker_vm_ipv4", {}).get("value", {})
+
+    if not cp_map or not wk_map:
+        raise SystemExit("Missing control_plane_vm_ipv4 or worker_vm_ipv4 in terraform output")
+
+    ssh_user = os.environ.get("KUBEADM_SSH_USER", "").strip() or "micqdf"
+
+    print(f"SSH_USER={ssh_user}")
+    print("PRIMARY_CONTROL_PLANE=cp-1")
+    print(f"CONTROL_PLANES=\"{map_to_pairs(cp_map)}\"")
+    print(f"WORKERS=\"{map_to_pairs(wk_map)}\"")
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/nixos/kubeadm/scripts/reset-cluster-nodes.sh b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
index 733c635..44d5b61 100755
--- a/nixos/kubeadm/scripts/reset-cluster-nodes.sh
+++ b/nixos/kubeadm/scripts/reset-cluster-nodes.sh
@@ -6,7 +6,7 @@ INVENTORY_FILE="${1:-$SCRIPT_DIR/inventory.env}"

 if [ ! -f "$INVENTORY_FILE" ]; then
   echo "Missing inventory file: $INVENTORY_FILE"
-  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit IPs."
+  echo "Copy $SCRIPT_DIR/inventory.example.env to $SCRIPT_DIR/inventory.env and edit node mappings."
   exit 1
 fi

@@ -16,22 +16,57 @@ source "$INVENTORY_FILE"
 SSH_USER="${SSH_USER:-micqdf}"
 SSH_OPTS="${SSH_OPTS:- -o BatchMode=yes -o StrictHostKeyChecking=accept-new }"

-required=(CP_1 CP_2 CP_3 WK_1 WK_2 WK_3)
-for key in "${required[@]}"; do
-  if [ -z "${!key:-}" ]; then
-    echo "Missing required inventory variable: $key"
+declare -A NODE_IPS=()
+
+add_pair() {
+  local pair="$1"
+  local name="${pair%%=*}"
+  local ip="${pair#*=}"
+
+  if [ -z "$name" ] || [ -z "$ip" ] || [ "$name" = "$ip" ]; then
+    echo "Invalid node pair '$pair' (expected name=ip)."
     exit 1
   fi
-done
+
+  NODE_IPS["$name"]="$ip"
+}
+
+if [ -n "${CONTROL_PLANES:-}" ]; then
+  for pair in $CONTROL_PLANES; do
+    add_pair "$pair"
+  done
+else
+  while IFS= read -r var_name; do
+    idx="${var_name#CP_}"
+    add_pair "cp-$idx=${!var_name}"
+  done < <(compgen -A variable | grep -E '^CP_[0-9]+$' | sort -V)
+fi
+
+if [ -n "${WORKERS:-}" ]; then
+  for pair in $WORKERS; do
+    add_pair "$pair"
+  done
+else
+  while IFS= read -r var_name; do
+    idx="${var_name#WK_}"
+    add_pair "wk-$idx=${!var_name}"
+  done < <(compgen -A variable | grep -E '^WK_[0-9]+$' | sort -V)
+fi
+
+if [ "${#NODE_IPS[@]}" -eq 0 ]; then
+  echo "No nodes found in inventory."
+  exit 1
+fi

 reset_node() {
-  local node_ip="$1"
-  echo "==> Resetting $node_ip"
+  local node_name="$1"
+  local node_ip="$2"
+  echo "==> Resetting $node_name ($node_ip)"
   ssh $SSH_OPTS "$SSH_USER@$node_ip" "sudo kubeadm reset -f && sudo systemctl stop kubelet && sudo rm -rf /etc/kubernetes /var/lib/etcd /var/lib/cni /etc/cni/net.d"
 }

-for key in CP_1 CP_2 CP_3 WK_1 WK_2 WK_3; do
-  reset_node "${!key}"
-done
+while IFS= read -r node_name; do
+  reset_node "$node_name" "${NODE_IPS[$node_name]}"
+done < <(printf '%s\n' "${!NODE_IPS[@]}" | sort -V)

 echo "Cluster components reset on all listed nodes."
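One detail worth noting about the `CP_N`/`WK_N` fallbacks and the reset loop: ordering relies on `sort -V`, which keeps multi-digit node names in natural order, e.g.:

```bash
# Why sort -V is used for node ordering (plain sort would put cp-10 before cp-2).
printf '%s\n' cp-1 cp-10 cp-2 cp-9 | sort -V
# cp-1
# cp-2
# cp-9
# cp-10
```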