diff --git a/.gitea/workflows/kubeadm-bootstrap.yml b/.gitea/workflows/kubeadm-bootstrap.yml
index 99371b8..f0b5978 100644
--- a/.gitea/workflows/kubeadm-bootstrap.yml
+++ b/.gitea/workflows/kubeadm-bootstrap.yml
@@ -103,25 +103,9 @@ jobs:
       - name: Create kubeadm inventory
         env:
           KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
-          KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
         run: |
           set -euo pipefail
-          TF_OUTPUT_JSON=""
-          for attempt in 1 2 3 4 5 6; do
-            echo "Inventory render attempt $attempt/6"
-            TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
-            if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
-              exit 0
-            fi
-
-            if [ "$attempt" -lt 6 ]; then
-              echo "VM IPv4s not available yet; waiting 30s before retry"
-              sleep 30
-            fi
-          done
-
-          echo "Falling back to SSH-based inventory discovery"
-          printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
+          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
 
       - name: Validate nix installation
         run: |
diff --git a/.gitea/workflows/kubeadm-reset.yml b/.gitea/workflows/kubeadm-reset.yml
index 054e5b2..939d3a1 100644
--- a/.gitea/workflows/kubeadm-reset.yml
+++ b/.gitea/workflows/kubeadm-reset.yml
@@ -103,25 +103,9 @@ jobs:
       - name: Create kubeadm inventory
         env:
           KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
-          KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
         run: |
           set -euo pipefail
-          TF_OUTPUT_JSON=""
-          for attempt in 1 2 3 4 5 6; do
-            echo "Inventory render attempt $attempt/6"
-            TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
-            if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
-              exit 0
-            fi
-
-            if [ "$attempt" -lt 6 ]; then
-              echo "VM IPv4s not available yet; waiting 30s before retry"
-              sleep 30
-            fi
-          done
-
-          echo "Falling back to SSH-based inventory discovery"
-          printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
+          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
 
       - name: Run cluster reset
         run: |
diff --git a/.gitea/workflows/terraform-apply.yml b/.gitea/workflows/terraform-apply.yml
index 0095094..ba368ac 100644
--- a/.gitea/workflows/terraform-apply.yml
+++ b/.gitea/workflows/terraform-apply.yml
@@ -151,25 +151,10 @@ jobs:
       - name: Create kubeadm inventory from Terraform outputs
         env:
           KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
-          KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
         run: |
           set -euo pipefail
-          TF_OUTPUT_JSON=""
-          for attempt in 1 2 3 4 5 6; do
-            echo "Inventory render attempt $attempt/6"
-            TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
-            if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
-              exit 0
-            fi
-
-            if [ "$attempt" -lt 6 ]; then
-              echo "VM IPv4s not available yet; waiting 30s before retry"
-              sleep 30
-            fi
-          done
-
-          echo "Falling back to SSH-based inventory discovery"
-          printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
+          terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
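+          # Node IPs are static Terraform outputs now, so no retry loop or SSH-based discovery fallback is needed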
 
       - name: Ensure nix and nixos-rebuild
         env:
diff --git a/nixos/kubeadm/README.md b/nixos/kubeadm/README.md
index 5ff6330..33c9be0 100644
--- a/nixos/kubeadm/README.md
+++ b/nixos/kubeadm/README.md
@@ -50,7 +50,7 @@ sudo nixos-rebuild switch --flake .#cp-1
 For remote target-host workflows, use your preferred deploy wrapper later
 (`nixos-rebuild --target-host ...` or deploy-rs/colmena).
 
-## Bootstrap runbook (kubeadm + kube-vip + Cilium)
+## Bootstrap runbook (kubeadm + kube-vip + Flannel)
 
 1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
 2. On `cp-1`, run:
@@ -62,14 +62,10 @@ sudo th-kubeadm-init
 ```
 
 This infers the control-plane VIP as `.250` on `eth0`, creates the kube-vip
 static pod manifest, and runs `kubeadm init`.
 
-3. Install Cilium from `cp-1`:
+3. Install Flannel from `cp-1`:
 
 ```bash
-helm repo add cilium https://helm.cilium.io
-helm repo update
-helm upgrade --install cilium cilium/cilium \
-  --namespace kube-system \
-  --set kubeProxyReplacement=true
+kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml
 ```
 
 4. Generate join commands on `cp-1`:
@@ -98,7 +94,7 @@ kubectl get nodes -o wide
 kubectl -n kube-system get pods -o wide
 ```
 
-## Repeatable rebuild flow (recommended)
+## Fresh bootstrap flow (recommended)
 
 1. Copy and edit inventory:
 
@@ -107,7 +103,7 @@ cp ./scripts/inventory.example.env ./scripts/inventory.env
 $EDITOR ./scripts/inventory.env
 ```
 
-2. Rebuild all nodes and bootstrap/reconcile cluster:
+2. Rebuild all nodes and bootstrap a fresh cluster:
 
 ```bash
 ./scripts/rebuild-and-bootstrap.sh
@@ -141,15 +137,15 @@ For a full nuke/recreate lifecycle:
 
 - run Terraform destroy/apply for VMs first,
 - then run `./scripts/rebuild-and-bootstrap.sh` again.
 
-Node lists are discovered from Terraform outputs, so adding new workers/control
-planes in Terraform is picked up automatically by the bootstrap/reconcile flow.
+Node lists now come directly from static Terraform outputs, so bootstrap no longer
+depends on Proxmox guest-agent IP discovery or SSH subnet scanning.
 
 ## Optional Gitea workflow automation
 
 Primary flow:
 
 - Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
-- That workflow now does Terraform apply and then runs kubeadm rebuild/bootstrap reconciliation automatically
+- That workflow runs Terraform apply and then a fresh kubeadm bootstrap automatically
 
 Manual dispatch workflows are available:
 
@@ -164,9 +160,18 @@ Required repository secrets:
 
 Optional secrets:
 
 - `KUBEADM_SSH_USER` (defaults to `micqdf`)
-- `KUBEADM_SUBNET_PREFIX` (optional, e.g. `10.27.27`; used for SSH-based IP discovery fallback)
-
-Node IPs are auto-discovered from Terraform state outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets.
+
+Node IPs are rendered directly from static Terraform outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets or SSH discovery fallbacks.
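+
+For reference, the renderer reads the standard `terraform output -json` document;
+a trimmed, illustrative sketch of the two outputs it consumes (other fields omitted):
+
+```json
+{
+  "control_plane_vm_ipv4": { "value": { "cp-1": "10.27.27.50" } },
+  "worker_vm_ipv4": { "value": { "wk-1": "10.27.27.47" } }
+}
+```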
 
 ## Notes
diff --git a/nixos/kubeadm/bootstrap/controller.py b/nixos/kubeadm/bootstrap/controller.py
index c7eb17d..b91d336 100755
--- a/nixos/kubeadm/bootstrap/controller.py
+++ b/nixos/kubeadm/bootstrap/controller.py
@@ -11,9 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from pathlib import Path
 
-REMOTE_STATE_PATH = "/var/lib/terrahome/bootstrap-state.json"
-
-
 def run_local(cmd, check=True, capture=False):
     if isinstance(cmd, str):
         shell = True
@@ -102,7 +99,6 @@ class Controller:
         self.script_dir = Path(__file__).resolve().parent
         self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
-        self.local_state_path = self.script_dir / "bootstrap-state-last.json"
         self.ssh_user = self.env.get("SSH_USER", "micqdf")
         self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
 
@@ -124,8 +120,7 @@ class Controller:
         self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
         self.fast_mode = self.env.get("FAST_MODE", "1")
         self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
-        self.force_reinit = False
-        self.cilium_kpr = self.env.get("CILIUM_KUBE_PROXY_REPLACEMENT", "false")
+        self.force_reinit = True
 
     def log(self, msg):
         print(f"==> {msg}")
@@ -171,45 +166,6 @@ class Controller:
         run_local(["ssh-keygen", "-R", ip], check=False)
         run_local(f"ssh-keyscan -H {shlex.quote(ip)} >> {shlex.quote(str(ssh_dir / 'known_hosts'))}", check=False)
 
-    def get_state(self):
-        proc = self.remote(
-            self.primary_ip,
-            "sudo test -f /var/lib/terrahome/bootstrap-state.json && sudo cat /var/lib/terrahome/bootstrap-state.json || echo '{}'",
-        )
-        try:
-            state = json.loads(proc.stdout.strip() or "{}")
-        except Exception:
-            state = {}
-        return state
-
-    def set_state(self, state):
-        payload = json.dumps(state, sort_keys=True)
-        b64 = base64.b64encode(payload.encode()).decode()
-        self.remote(
-            self.primary_ip,
-            (
-                "sudo mkdir -p /var/lib/terrahome && "
-                f"echo {shlex.quote(b64)} | base64 -d | sudo tee {REMOTE_STATE_PATH} >/dev/null"
-            ),
-        )
-        self.local_state_path.write_text(payload + "\n", encoding="utf-8")
-
-    def mark_done(self, key):
-        state = self.get_state()
-        state[key] = True
-        state["updated_at"] = int(time.time())
-        self.set_state(state)
-
-    def clear_done(self, keys):
-        state = self.get_state()
-        for key in keys:
-            state.pop(key, None)
-        state["updated_at"] = int(time.time())
-        self.set_state(state)
-
-    def stage_done(self, key):
-        return bool(self.get_state().get(key))
-
     def prepare_remote_nix(self, ip):
         self.remote(ip, "sudo mkdir -p /etc/nix")
         self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
@@ -258,15 +214,11 @@ class Controller:
             raise RuntimeError(f"Rebuild failed permanently for {name}")
 
     def stage_preflight(self):
-        if self.stage_done("preflight_done"):
-            self.log("Preflight already complete")
-            return
         self.prepare_known_hosts()
         self.detect_user(self.primary_ip)
-        self.mark_done("preflight_done")
 
     def stage_rebuild(self):
-        if self.skip_rebuild and self.stage_done("nodes_rebuilt"):
-            self.log("Node rebuild already complete")
+        if self.skip_rebuild:
+            self.log("Skipping node rebuild (SKIP_REBUILD=1)")
             return
 
@@ -300,17 +252,8 @@ class Controller:
         if failures:
             raise RuntimeError(f"Worker rebuild failures: {failures}")
 
-        # Rebuild can invalidate prior bootstrap stages; force reconciliation.
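+        # A rebuild now always flows into a fresh kubeadm init below; there is
+        # no persisted stage state left to invalidate or reconcile.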
-        self.force_reinit = True
-        self.clear_done([
-            "primary_initialized",
-            "cni_installed",
-            "control_planes_joined",
-            "workers_joined",
-            "verified",
-        ])
-        self.mark_done("nodes_rebuilt")
-
 
     def has_admin_conf(self):
         return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0
 
@@ -319,44 +260,21 @@
         return self.remote(self.primary_ip, cmd, check=False).returncode == 0
 
     def stage_init_primary(self):
-        if (not self.force_reinit) and self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
-            self.log("Primary control plane init already complete")
-            return
-        if (not self.force_reinit) and self.has_admin_conf() and self.cluster_ready():
-            self.log("Existing cluster detected on primary control plane")
-        else:
-            self.log(f"Initializing primary control plane on {self.primary_cp}")
-            self.remote(self.primary_ip, "sudo th-kubeadm-init")
-        self.mark_done("primary_initialized")
+        self.log(f"Initializing primary control plane on {self.primary_cp}")
+        self.remote(self.primary_ip, "sudo th-kubeadm-init")
 
     def stage_install_cni(self):
-        if self.stage_done("cni_installed") and self.cluster_ready():
-            self.log("CNI install already complete")
-            return
-        self.log("Installing or upgrading Cilium")
-        self.remote(self.primary_ip, "sudo helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true")
-        self.remote(self.primary_ip, "sudo helm repo update >/dev/null")
-        self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf create namespace kube-system >/dev/null 2>&1 || true")
+        self.log("Installing Flannel")
         self.remote(
             self.primary_ip,
-            (
-                "sudo KUBECONFIG=/etc/kubernetes/admin.conf "
-                "helm upgrade --install cilium cilium/cilium "
-                "--namespace kube-system "
-                f"--set k8sServiceHost={shlex.quote(self.primary_ip)} "
-                "--set k8sServicePort=6443 "
-                f"--set kubeProxyReplacement={shlex.quote(self.cilium_kpr)}"
-            ),
+            "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml",
         )
-        self.mark_done("cni_installed")
 
     def cluster_has_node(self, name):
         cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
         return self.remote(self.primary_ip, cmd, check=False).returncode == 0
 
     def build_join_cmds(self):
-        if not self.has_admin_conf():
-            self.remote(self.primary_ip, "sudo th-kubeadm-init")
         join_cmd = self.remote(
             self.primary_ip,
             "sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
@@ -369,9 +287,6 @@
         return join_cmd, cp_join
 
     def stage_join_control_planes(self):
-        if self.stage_done("control_planes_joined"):
-            self.log("Control-plane join already complete")
-            return
         _, cp_join = self.build_join_cmds()
         for node in self.cp_names:
             if node == self.primary_cp:
@@ -383,12 +298,8 @@
             ip = self.node_ips[node]
             node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
             self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
-        self.mark_done("control_planes_joined")
 
     def stage_join_workers(self):
-        if self.stage_done("workers_joined"):
-            self.log("Worker join already complete")
-            return
         join_cmd, _ = self.build_join_cmds()
         for node in self.wk_names:
             if self.cluster_has_node(node):
@@ -398,35 +309,33 @@
             ip = self.node_ips[node]
             node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
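+            # --node-name keeps the kubelet's registered node name aligned with
+            # the inventory name that cluster_has_node() checked above.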
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}") - self.mark_done("workers_joined") def stage_verify(self): - if self.stage_done("verified"): - self.log("Verification already complete") - return self.log("Final node verification") try: self.remote( self.primary_ip, - "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system rollout status ds/cilium --timeout=10m", + "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel rollout status ds/kube-flannel-ds --timeout=10m", ) except Exception: - self.log("Cilium rollout failed; collecting diagnostics") + self.log("Flannel rollout failed; collecting diagnostics") proc = self.remote( self.primary_ip, - "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get ds cilium -o wide || true", + "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get ds -o wide || true", check=False, ) print(proc.stdout) proc = self.remote( self.primary_ip, - "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o wide || true", + "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o wide || true", check=False, ) print(proc.stdout) proc = self.remote( self.primary_ip, - "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system logs --tail=120 $p || true; done", + "for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs --tail=120 $p || true; done", check=False, ) print(proc.stdout) @@ -437,7 +344,6 @@ class Controller: ) proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide") print(proc.stdout) - self.mark_done("verified") def reconcile(self): self.stage_preflight() diff --git a/terraform/main.tf b/terraform/main.tf index 90fd52a..466fae6 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -9,6 +9,15 @@ terraform { } } +locals { + control_plane_ipconfig = [ + for ip in var.control_plane_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}" + ] + worker_ipconfig = [ + for ip in var.worker_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}" + ] +} + provider "proxmox" { pm_api_url = var.pm_api_url pm_api_token_id = var.pm_api_token_id @@ -35,7 +44,7 @@ resource "proxmox_vm_qemu" "control_planes" { scsihw = "virtio-scsi-pci" boot = "order=scsi0" bootdisk = "scsi0" - ipconfig0 = "ip=dhcp" + ipconfig0 = local.control_plane_ipconfig[count.index] ciuser = "micqdf" sshkeys = var.SSH_KEY_PUBLIC @@ -90,7 +99,7 @@ resource "proxmox_vm_qemu" "workers" { scsihw = "virtio-scsi-pci" boot = "order=scsi0" bootdisk = "scsi0" - ipconfig0 = "ip=dhcp" + ipconfig0 = local.worker_ipconfig[count.index] ciuser = "micqdf" sshkeys = var.SSH_KEY_PUBLIC diff --git a/terraform/outputs.tf b/terraform/outputs.tf index c817350..39e3338 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -11,8 +11,8 @@ output "control_plane_vm_names" { output "control_plane_vm_ipv4" { value = { - for vm in proxmox_vm_qemu.control_planes : - vm.name => vm.default_ipv4_address + for i in range(var.control_plane_count) : + proxmox_vm_qemu.control_planes[i].name => var.control_plane_ips[i] } } @@ -29,7 +29,7 @@ output "worker_vm_names" { output "worker_vm_ipv4" { value = { - for vm in 
-    vm.name => vm.default_ipv4_address
+    for i in range(var.worker_count) :
+    proxmox_vm_qemu.workers[i].name => var.worker_ips[i]
   }
 }
diff --git a/terraform/terraform.tfvars b/terraform/terraform.tfvars
index c2ed336..79beabe 100644
--- a/terraform/terraform.tfvars
+++ b/terraform/terraform.tfvars
@@ -17,3 +17,9 @@ control_plane_disk_size = "80G"
 worker_cores     = [4, 4, 4]
 worker_memory_mb = [12288, 12288, 12288]
 worker_disk_size = "120G"
+
+network_prefix_length = 24
+network_gateway       = "10.27.27.1"
+
+control_plane_ips = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
+worker_ips        = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
diff --git a/terraform/variables.tf b/terraform/variables.tf
index 2df61d9..17cf482 100644
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -87,6 +87,40 @@ variable "worker_disk_size" {
   description = "Disk size for worker VMs"
 }
 
+variable "network_prefix_length" {
+  type        = number
+  default     = 24
+  description = "CIDR prefix length for static VM addresses"
+}
+
+variable "network_gateway" {
+  type        = string
+  default     = "10.27.27.1"
+  description = "Gateway for static VM addresses"
+}
+
+variable "control_plane_ips" {
+  type        = list(string)
+  default     = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
+  description = "Static IPv4 addresses for control plane VMs"
+
+  validation {
+    condition     = length(var.control_plane_ips) == var.control_plane_count
+    error_message = "control_plane_ips length must match control_plane_count."
+  }
+}
+
+variable "worker_ips" {
+  type        = list(string)
+  default     = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
+  description = "Static IPv4 addresses for worker VMs"
+
+  validation {
+    condition     = length(var.worker_ips) == var.worker_count
+    error_message = "worker_ips length must match worker_count."
+  }
+}
+
 variable "bridge" {
   type = string
 }