Merge pull request 'stage' (#121) from stage into master
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 30m28s
Some checks failed
Terraform Apply / Terraform Apply (push) Failing after 30m28s
Reviewed-on: #121
This commit was merged in pull request #121.
This commit is contained in:
@@ -103,25 +103,9 @@ jobs:
|
|||||||
- name: Create kubeadm inventory
|
- name: Create kubeadm inventory
|
||||||
env:
|
env:
|
||||||
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
|
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
TF_OUTPUT_JSON=""
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
for attempt in 1 2 3 4 5 6; do
|
|
||||||
echo "Inventory render attempt $attempt/6"
|
|
||||||
TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
|
|
||||||
if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$attempt" -lt 6 ]; then
|
|
||||||
echo "VM IPv4s not available yet; waiting 30s before retry"
|
|
||||||
sleep 30
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Falling back to SSH-based inventory discovery"
|
|
||||||
printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
|
|
||||||
|
|
||||||
- name: Validate nix installation
|
- name: Validate nix installation
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -103,25 +103,9 @@ jobs:
|
|||||||
- name: Create kubeadm inventory
|
- name: Create kubeadm inventory
|
||||||
env:
|
env:
|
||||||
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
|
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
TF_OUTPUT_JSON=""
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
for attempt in 1 2 3 4 5 6; do
|
|
||||||
echo "Inventory render attempt $attempt/6"
|
|
||||||
TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
|
|
||||||
if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$attempt" -lt 6 ]; then
|
|
||||||
echo "VM IPv4s not available yet; waiting 30s before retry"
|
|
||||||
sleep 30
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Falling back to SSH-based inventory discovery"
|
|
||||||
printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
|
|
||||||
|
|
||||||
- name: Run cluster reset
|
- name: Run cluster reset
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
@@ -151,25 +151,9 @@ jobs:
|
|||||||
- name: Create kubeadm inventory from Terraform outputs
|
- name: Create kubeadm inventory from Terraform outputs
|
||||||
env:
|
env:
|
||||||
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
KUBEADM_SSH_USER: ${{ secrets.KUBEADM_SSH_USER }}
|
||||||
KUBEADM_SUBNET_PREFIX: ${{ secrets.KUBEADM_SUBNET_PREFIX }}
|
|
||||||
run: |
|
run: |
|
||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
TF_OUTPUT_JSON=""
|
terraform -chdir=terraform output -json | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env
|
||||||
for attempt in 1 2 3 4 5 6; do
|
|
||||||
echo "Inventory render attempt $attempt/6"
|
|
||||||
TF_OUTPUT_JSON="$(terraform -chdir=terraform output -json)"
|
|
||||||
if printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/render-inventory-from-tf-output.py > nixos/kubeadm/scripts/inventory.env; then
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ "$attempt" -lt 6 ]; then
|
|
||||||
echo "VM IPv4s not available yet; waiting 30s before retry"
|
|
||||||
sleep 30
|
|
||||||
fi
|
|
||||||
done
|
|
||||||
|
|
||||||
echo "Falling back to SSH-based inventory discovery"
|
|
||||||
printf '%s' "$TF_OUTPUT_JSON" | ./nixos/kubeadm/scripts/discover-inventory-from-ssh.py > nixos/kubeadm/scripts/inventory.env
|
|
||||||
|
|
||||||
- name: Ensure nix and nixos-rebuild
|
- name: Ensure nix and nixos-rebuild
|
||||||
env:
|
env:
|
||||||
|
|||||||
@@ -50,7 +50,7 @@ sudo nixos-rebuild switch --flake .#cp-1
|
|||||||
For remote target-host workflows, use your preferred deploy wrapper later
|
For remote target-host workflows, use your preferred deploy wrapper later
|
||||||
(`nixos-rebuild --target-host ...` or deploy-rs/colmena).
|
(`nixos-rebuild --target-host ...` or deploy-rs/colmena).
|
||||||
|
|
||||||
## Bootstrap runbook (kubeadm + kube-vip + Cilium)
|
## Bootstrap runbook (kubeadm + kube-vip + Flannel)
|
||||||
|
|
||||||
1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
|
1. Apply Nix config on all nodes (`cp-*`, then `wk-*`).
|
||||||
2. On `cp-1`, run:
|
2. On `cp-1`, run:
|
||||||
@@ -62,14 +62,10 @@ sudo th-kubeadm-init
|
|||||||
This infers the control-plane VIP as `<node-subnet>.250` on `eth0`, creates the
|
This infers the control-plane VIP as `<node-subnet>.250` on `eth0`, creates the
|
||||||
kube-vip static pod manifest, and runs `kubeadm init`.
|
kube-vip static pod manifest, and runs `kubeadm init`.
|
||||||
|
|
||||||
3. Install Cilium from `cp-1`:
|
3. Install Flannel from `cp-1`:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
helm repo add cilium https://helm.cilium.io
|
kubectl apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml
|
||||||
helm repo update
|
|
||||||
helm upgrade --install cilium cilium/cilium \
|
|
||||||
--namespace kube-system \
|
|
||||||
--set kubeProxyReplacement=true
|
|
||||||
```
|
```
|
||||||
|
|
||||||
4. Generate join commands on `cp-1`:
|
4. Generate join commands on `cp-1`:
|
||||||
@@ -98,7 +94,7 @@ kubectl get nodes -o wide
|
|||||||
kubectl -n kube-system get pods -o wide
|
kubectl -n kube-system get pods -o wide
|
||||||
```
|
```
|
||||||
|
|
||||||
## Repeatable rebuild flow (recommended)
|
## Fresh bootstrap flow (recommended)
|
||||||
|
|
||||||
1. Copy and edit inventory:
|
1. Copy and edit inventory:
|
||||||
|
|
||||||
@@ -107,7 +103,7 @@ cp ./scripts/inventory.example.env ./scripts/inventory.env
|
|||||||
$EDITOR ./scripts/inventory.env
|
$EDITOR ./scripts/inventory.env
|
||||||
```
|
```
|
||||||
|
|
||||||
2. Rebuild all nodes and bootstrap/reconcile cluster:
|
2. Rebuild all nodes and bootstrap a fresh cluster:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
./scripts/rebuild-and-bootstrap.sh
|
./scripts/rebuild-and-bootstrap.sh
|
||||||
@@ -141,15 +137,15 @@ For a full nuke/recreate lifecycle:
|
|||||||
- run Terraform destroy/apply for VMs first,
|
- run Terraform destroy/apply for VMs first,
|
||||||
- then run `./scripts/rebuild-and-bootstrap.sh` again.
|
- then run `./scripts/rebuild-and-bootstrap.sh` again.
|
||||||
|
|
||||||
Node lists are discovered from Terraform outputs, so adding new workers/control
|
Node lists now come directly from static Terraform outputs, so bootstrap no longer
|
||||||
planes in Terraform is picked up automatically by the bootstrap/reconcile flow.
|
depends on Proxmox guest-agent IP discovery or SSH subnet scanning.
|
||||||
|
|
||||||
## Optional Gitea workflow automation
|
## Optional Gitea workflow automation
|
||||||
|
|
||||||
Primary flow:
|
Primary flow:
|
||||||
|
|
||||||
- Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
|
- Push to `master` triggers `.gitea/workflows/terraform-apply.yml`
|
||||||
- That workflow now does Terraform apply and then runs kubeadm rebuild/bootstrap reconciliation automatically
|
- That workflow now does Terraform apply and then runs a fresh kubeadm bootstrap automatically
|
||||||
|
|
||||||
Manual dispatch workflows are available:
|
Manual dispatch workflows are available:
|
||||||
|
|
||||||
@@ -164,9 +160,7 @@ Required repository secrets:
|
|||||||
Optional secrets:
|
Optional secrets:
|
||||||
|
|
||||||
- `KUBEADM_SSH_USER` (defaults to `micqdf`)
|
- `KUBEADM_SSH_USER` (defaults to `micqdf`)
|
||||||
- `KUBEADM_SUBNET_PREFIX` (optional, e.g. `10.27.27`; used for SSH-based IP discovery fallback)
|
Node IPs are rendered directly from static Terraform outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets or SSH discovery fallbacks.
|
||||||
|
|
||||||
Node IPs are auto-discovered from Terraform state outputs (`control_plane_vm_ipv4`, `worker_vm_ipv4`), so you do not need per-node IP secrets.
|
|
||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
|
|||||||
@@ -11,9 +11,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
REMOTE_STATE_PATH = "/var/lib/terrahome/bootstrap-state.json"
|
|
||||||
|
|
||||||
|
|
||||||
def run_local(cmd, check=True, capture=False):
|
def run_local(cmd, check=True, capture=False):
|
||||||
if isinstance(cmd, str):
|
if isinstance(cmd, str):
|
||||||
shell = True
|
shell = True
|
||||||
@@ -102,7 +99,6 @@ class Controller:
|
|||||||
|
|
||||||
self.script_dir = Path(__file__).resolve().parent
|
self.script_dir = Path(__file__).resolve().parent
|
||||||
self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
|
self.flake_dir = Path(self.env.get("FLAKE_DIR") or (self.script_dir.parent)).resolve()
|
||||||
self.local_state_path = self.script_dir / "bootstrap-state-last.json"
|
|
||||||
|
|
||||||
self.ssh_user = self.env.get("SSH_USER", "micqdf")
|
self.ssh_user = self.env.get("SSH_USER", "micqdf")
|
||||||
self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
|
self.ssh_candidates = self.env.get("SSH_USER_CANDIDATES", f"root {self.ssh_user}").split()
|
||||||
@@ -124,8 +120,7 @@ class Controller:
|
|||||||
self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
|
self.worker_parallelism = int(self.env.get("WORKER_PARALLELISM", "3"))
|
||||||
self.fast_mode = self.env.get("FAST_MODE", "1")
|
self.fast_mode = self.env.get("FAST_MODE", "1")
|
||||||
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
self.skip_rebuild = self.env.get("SKIP_REBUILD", "0") == "1"
|
||||||
self.force_reinit = False
|
self.force_reinit = True
|
||||||
self.cilium_kpr = self.env.get("CILIUM_KUBE_PROXY_REPLACEMENT", "false")
|
|
||||||
|
|
||||||
def log(self, msg):
|
def log(self, msg):
|
||||||
print(f"==> {msg}")
|
print(f"==> {msg}")
|
||||||
@@ -171,45 +166,6 @@ class Controller:
|
|||||||
run_local(["ssh-keygen", "-R", ip], check=False)
|
run_local(["ssh-keygen", "-R", ip], check=False)
|
||||||
run_local(f"ssh-keyscan -H {shlex.quote(ip)} >> {shlex.quote(str(ssh_dir / 'known_hosts'))}", check=False)
|
run_local(f"ssh-keyscan -H {shlex.quote(ip)} >> {shlex.quote(str(ssh_dir / 'known_hosts'))}", check=False)
|
||||||
|
|
||||||
def get_state(self):
|
|
||||||
proc = self.remote(
|
|
||||||
self.primary_ip,
|
|
||||||
"sudo test -f /var/lib/terrahome/bootstrap-state.json && sudo cat /var/lib/terrahome/bootstrap-state.json || echo '{}'",
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
state = json.loads(proc.stdout.strip() or "{}")
|
|
||||||
except Exception:
|
|
||||||
state = {}
|
|
||||||
return state
|
|
||||||
|
|
||||||
def set_state(self, state):
|
|
||||||
payload = json.dumps(state, sort_keys=True)
|
|
||||||
b64 = base64.b64encode(payload.encode()).decode()
|
|
||||||
self.remote(
|
|
||||||
self.primary_ip,
|
|
||||||
(
|
|
||||||
"sudo mkdir -p /var/lib/terrahome && "
|
|
||||||
f"echo {shlex.quote(b64)} | base64 -d | sudo tee {REMOTE_STATE_PATH} >/dev/null"
|
|
||||||
),
|
|
||||||
)
|
|
||||||
self.local_state_path.write_text(payload + "\n", encoding="utf-8")
|
|
||||||
|
|
||||||
def mark_done(self, key):
|
|
||||||
state = self.get_state()
|
|
||||||
state[key] = True
|
|
||||||
state["updated_at"] = int(time.time())
|
|
||||||
self.set_state(state)
|
|
||||||
|
|
||||||
def clear_done(self, keys):
|
|
||||||
state = self.get_state()
|
|
||||||
for key in keys:
|
|
||||||
state.pop(key, None)
|
|
||||||
state["updated_at"] = int(time.time())
|
|
||||||
self.set_state(state)
|
|
||||||
|
|
||||||
def stage_done(self, key):
|
|
||||||
return bool(self.get_state().get(key))
|
|
||||||
|
|
||||||
def prepare_remote_nix(self, ip):
|
def prepare_remote_nix(self, ip):
|
||||||
self.remote(ip, "sudo mkdir -p /etc/nix")
|
self.remote(ip, "sudo mkdir -p /etc/nix")
|
||||||
self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
|
self.remote(ip, "if [ -f /etc/nix/nix.conf ]; then sudo sed -i '/^trusted-users[[:space:]]*=/d' /etc/nix/nix.conf; fi")
|
||||||
@@ -258,15 +214,11 @@ class Controller:
|
|||||||
raise RuntimeError(f"Rebuild failed permanently for {name}")
|
raise RuntimeError(f"Rebuild failed permanently for {name}")
|
||||||
|
|
||||||
def stage_preflight(self):
|
def stage_preflight(self):
|
||||||
if self.stage_done("preflight_done"):
|
|
||||||
self.log("Preflight already complete")
|
|
||||||
return
|
|
||||||
self.prepare_known_hosts()
|
self.prepare_known_hosts()
|
||||||
self.detect_user(self.primary_ip)
|
self.detect_user(self.primary_ip)
|
||||||
self.mark_done("preflight_done")
|
|
||||||
|
|
||||||
def stage_rebuild(self):
|
def stage_rebuild(self):
|
||||||
if self.skip_rebuild and self.stage_done("nodes_rebuilt"):
|
if self.skip_rebuild:
|
||||||
self.log("Node rebuild already complete")
|
self.log("Node rebuild already complete")
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -300,17 +252,6 @@ class Controller:
|
|||||||
if failures:
|
if failures:
|
||||||
raise RuntimeError(f"Worker rebuild failures: {failures}")
|
raise RuntimeError(f"Worker rebuild failures: {failures}")
|
||||||
|
|
||||||
# Rebuild can invalidate prior bootstrap stages; force reconciliation.
|
|
||||||
self.force_reinit = True
|
|
||||||
self.clear_done([
|
|
||||||
"primary_initialized",
|
|
||||||
"cni_installed",
|
|
||||||
"control_planes_joined",
|
|
||||||
"workers_joined",
|
|
||||||
"verified",
|
|
||||||
])
|
|
||||||
self.mark_done("nodes_rebuilt")
|
|
||||||
|
|
||||||
def has_admin_conf(self):
|
def has_admin_conf(self):
|
||||||
return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0
|
return self.remote(self.primary_ip, "sudo test -f /etc/kubernetes/admin.conf", check=False).returncode == 0
|
||||||
|
|
||||||
@@ -319,44 +260,21 @@ class Controller:
|
|||||||
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||||
|
|
||||||
def stage_init_primary(self):
|
def stage_init_primary(self):
|
||||||
if (not self.force_reinit) and self.stage_done("primary_initialized") and self.has_admin_conf() and self.cluster_ready():
|
self.log(f"Initializing primary control plane on {self.primary_cp}")
|
||||||
self.log("Primary control plane init already complete")
|
self.remote(self.primary_ip, "sudo th-kubeadm-init")
|
||||||
return
|
|
||||||
if (not self.force_reinit) and self.has_admin_conf() and self.cluster_ready():
|
|
||||||
self.log("Existing cluster detected on primary control plane")
|
|
||||||
else:
|
|
||||||
self.log(f"Initializing primary control plane on {self.primary_cp}")
|
|
||||||
self.remote(self.primary_ip, "sudo th-kubeadm-init")
|
|
||||||
self.mark_done("primary_initialized")
|
|
||||||
|
|
||||||
def stage_install_cni(self):
|
def stage_install_cni(self):
|
||||||
if self.stage_done("cni_installed") and self.cluster_ready():
|
self.log("Installing Flannel")
|
||||||
self.log("CNI install already complete")
|
|
||||||
return
|
|
||||||
self.log("Installing or upgrading Cilium")
|
|
||||||
self.remote(self.primary_ip, "sudo helm repo add cilium https://helm.cilium.io >/dev/null 2>&1 || true")
|
|
||||||
self.remote(self.primary_ip, "sudo helm repo update >/dev/null")
|
|
||||||
self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf create namespace kube-system >/dev/null 2>&1 || true")
|
|
||||||
self.remote(
|
self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
(
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf apply -f https://raw.githubusercontent.com/flannel-io/flannel/v0.25.5/Documentation/kube-flannel.yml",
|
||||||
"sudo KUBECONFIG=/etc/kubernetes/admin.conf "
|
|
||||||
"helm upgrade --install cilium cilium/cilium "
|
|
||||||
"--namespace kube-system "
|
|
||||||
f"--set k8sServiceHost={shlex.quote(self.primary_ip)} "
|
|
||||||
"--set k8sServicePort=6443 "
|
|
||||||
f"--set kubeProxyReplacement={shlex.quote(self.cilium_kpr)}"
|
|
||||||
),
|
|
||||||
)
|
)
|
||||||
self.mark_done("cni_installed")
|
|
||||||
|
|
||||||
def cluster_has_node(self, name):
|
def cluster_has_node(self, name):
|
||||||
cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
|
cmd = f"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get node {shlex.quote(name)} >/dev/null 2>&1"
|
||||||
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
return self.remote(self.primary_ip, cmd, check=False).returncode == 0
|
||||||
|
|
||||||
def build_join_cmds(self):
|
def build_join_cmds(self):
|
||||||
if not self.has_admin_conf():
|
|
||||||
self.remote(self.primary_ip, "sudo th-kubeadm-init")
|
|
||||||
join_cmd = self.remote(
|
join_cmd = self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
|
"sudo KUBECONFIG=/etc/kubernetes/admin.conf kubeadm token create --print-join-command",
|
||||||
@@ -369,9 +287,6 @@ class Controller:
|
|||||||
return join_cmd, cp_join
|
return join_cmd, cp_join
|
||||||
|
|
||||||
def stage_join_control_planes(self):
|
def stage_join_control_planes(self):
|
||||||
if self.stage_done("control_planes_joined"):
|
|
||||||
self.log("Control-plane join already complete")
|
|
||||||
return
|
|
||||||
_, cp_join = self.build_join_cmds()
|
_, cp_join = self.build_join_cmds()
|
||||||
for node in self.cp_names:
|
for node in self.cp_names:
|
||||||
if node == self.primary_cp:
|
if node == self.primary_cp:
|
||||||
@@ -383,12 +298,8 @@ class Controller:
|
|||||||
ip = self.node_ips[node]
|
ip = self.node_ips[node]
|
||||||
node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
|
node_join = f"{cp_join} --node-name {node} --ignore-preflight-errors=NumCPU,HTTPProxyCIDR"
|
||||||
self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
|
self.remote(ip, f"sudo th-kubeadm-join-control-plane {shlex.quote(node_join)}")
|
||||||
self.mark_done("control_planes_joined")
|
|
||||||
|
|
||||||
def stage_join_workers(self):
|
def stage_join_workers(self):
|
||||||
if self.stage_done("workers_joined"):
|
|
||||||
self.log("Worker join already complete")
|
|
||||||
return
|
|
||||||
join_cmd, _ = self.build_join_cmds()
|
join_cmd, _ = self.build_join_cmds()
|
||||||
for node in self.wk_names:
|
for node in self.wk_names:
|
||||||
if self.cluster_has_node(node):
|
if self.cluster_has_node(node):
|
||||||
@@ -398,35 +309,31 @@ class Controller:
|
|||||||
ip = self.node_ips[node]
|
ip = self.node_ips[node]
|
||||||
node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
|
node_join = f"{join_cmd} --node-name {node} --ignore-preflight-errors=HTTPProxyCIDR"
|
||||||
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
|
self.remote(ip, f"sudo th-kubeadm-join-worker {shlex.quote(node_join)}")
|
||||||
self.mark_done("workers_joined")
|
|
||||||
|
|
||||||
def stage_verify(self):
|
def stage_verify(self):
|
||||||
if self.stage_done("verified"):
|
|
||||||
self.log("Verification already complete")
|
|
||||||
return
|
|
||||||
self.log("Final node verification")
|
self.log("Final node verification")
|
||||||
try:
|
try:
|
||||||
self.remote(
|
self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system rollout status ds/cilium --timeout=10m",
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel rollout status ds/kube-flannel-ds --timeout=10m",
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log("Cilium rollout failed; collecting diagnostics")
|
self.log("Flannel rollout failed; collecting diagnostics")
|
||||||
proc = self.remote(
|
proc = self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get ds cilium -o wide || true",
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get ds -o wide || true",
|
||||||
check=False,
|
check=False,
|
||||||
)
|
)
|
||||||
print(proc.stdout)
|
print(proc.stdout)
|
||||||
proc = self.remote(
|
proc = self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o wide || true",
|
"sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o wide || true",
|
||||||
check=False,
|
check=False,
|
||||||
)
|
)
|
||||||
print(proc.stdout)
|
print(proc.stdout)
|
||||||
proc = self.remote(
|
proc = self.remote(
|
||||||
self.primary_ip,
|
self.primary_ip,
|
||||||
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system get pods -l k8s-app=cilium -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-system logs --tail=120 $p || true; done",
|
"for p in $(sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel get pods -o name 2>/dev/null); do sudo kubectl --kubeconfig /etc/kubernetes/admin.conf -n kube-flannel logs --tail=120 $p || true; done",
|
||||||
check=False,
|
check=False,
|
||||||
)
|
)
|
||||||
print(proc.stdout)
|
print(proc.stdout)
|
||||||
@@ -437,7 +344,6 @@ class Controller:
|
|||||||
)
|
)
|
||||||
proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide")
|
proc = self.remote(self.primary_ip, "sudo kubectl --kubeconfig /etc/kubernetes/admin.conf get nodes -o wide")
|
||||||
print(proc.stdout)
|
print(proc.stdout)
|
||||||
self.mark_done("verified")
|
|
||||||
|
|
||||||
def reconcile(self):
|
def reconcile(self):
|
||||||
self.stage_preflight()
|
self.stage_preflight()
|
||||||
|
|||||||
@@ -9,6 +9,15 @@ terraform {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
locals {
|
||||||
|
control_plane_ipconfig = [
|
||||||
|
for ip in var.control_plane_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
|
||||||
|
]
|
||||||
|
worker_ipconfig = [
|
||||||
|
for ip in var.worker_ips : "ip=${ip}/${var.network_prefix_length},gw=${var.network_gateway}"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
provider "proxmox" {
|
provider "proxmox" {
|
||||||
pm_api_url = var.pm_api_url
|
pm_api_url = var.pm_api_url
|
||||||
pm_api_token_id = var.pm_api_token_id
|
pm_api_token_id = var.pm_api_token_id
|
||||||
@@ -35,7 +44,7 @@ resource "proxmox_vm_qemu" "control_planes" {
|
|||||||
scsihw = "virtio-scsi-pci"
|
scsihw = "virtio-scsi-pci"
|
||||||
boot = "order=scsi0"
|
boot = "order=scsi0"
|
||||||
bootdisk = "scsi0"
|
bootdisk = "scsi0"
|
||||||
ipconfig0 = "ip=dhcp"
|
ipconfig0 = local.control_plane_ipconfig[count.index]
|
||||||
ciuser = "micqdf"
|
ciuser = "micqdf"
|
||||||
sshkeys = var.SSH_KEY_PUBLIC
|
sshkeys = var.SSH_KEY_PUBLIC
|
||||||
|
|
||||||
@@ -90,7 +99,7 @@ resource "proxmox_vm_qemu" "workers" {
|
|||||||
scsihw = "virtio-scsi-pci"
|
scsihw = "virtio-scsi-pci"
|
||||||
boot = "order=scsi0"
|
boot = "order=scsi0"
|
||||||
bootdisk = "scsi0"
|
bootdisk = "scsi0"
|
||||||
ipconfig0 = "ip=dhcp"
|
ipconfig0 = local.worker_ipconfig[count.index]
|
||||||
ciuser = "micqdf"
|
ciuser = "micqdf"
|
||||||
sshkeys = var.SSH_KEY_PUBLIC
|
sshkeys = var.SSH_KEY_PUBLIC
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ output "control_plane_vm_names" {
|
|||||||
|
|
||||||
output "control_plane_vm_ipv4" {
|
output "control_plane_vm_ipv4" {
|
||||||
value = {
|
value = {
|
||||||
for vm in proxmox_vm_qemu.control_planes :
|
for i in range(var.control_plane_count) :
|
||||||
vm.name => vm.default_ipv4_address
|
proxmox_vm_qemu.control_planes[i].name => var.control_plane_ips[i]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -29,7 +29,7 @@ output "worker_vm_names" {
|
|||||||
|
|
||||||
output "worker_vm_ipv4" {
|
output "worker_vm_ipv4" {
|
||||||
value = {
|
value = {
|
||||||
for vm in proxmox_vm_qemu.workers :
|
for i in range(var.worker_count) :
|
||||||
vm.name => vm.default_ipv4_address
|
proxmox_vm_qemu.workers[i].name => var.worker_ips[i]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,3 +17,9 @@ control_plane_disk_size = "80G"
|
|||||||
worker_cores = [4, 4, 4]
|
worker_cores = [4, 4, 4]
|
||||||
worker_memory_mb = [12288, 12288, 12288]
|
worker_memory_mb = [12288, 12288, 12288]
|
||||||
worker_disk_size = "120G"
|
worker_disk_size = "120G"
|
||||||
|
|
||||||
|
network_prefix_length = 10
|
||||||
|
network_gateway = "10.27.27.1"
|
||||||
|
|
||||||
|
control_plane_ips = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
|
||||||
|
worker_ips = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
|
||||||
|
|||||||
@@ -87,6 +87,40 @@ variable "worker_disk_size" {
|
|||||||
description = "Disk size for worker VMs"
|
description = "Disk size for worker VMs"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
variable "network_prefix_length" {
|
||||||
|
type = number
|
||||||
|
default = 10
|
||||||
|
description = "CIDR prefix length for static VM addresses"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "network_gateway" {
|
||||||
|
type = string
|
||||||
|
default = "10.27.27.1"
|
||||||
|
description = "Gateway for static VM addresses"
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "control_plane_ips" {
|
||||||
|
type = list(string)
|
||||||
|
default = ["10.27.27.50", "10.27.27.51", "10.27.27.49"]
|
||||||
|
description = "Static IPv4 addresses for control plane VMs"
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.control_plane_ips) == 3
|
||||||
|
error_message = "control_plane_ips must contain exactly 3 IPs."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
variable "worker_ips" {
|
||||||
|
type = list(string)
|
||||||
|
default = ["10.27.27.47", "10.27.27.46", "10.27.27.48"]
|
||||||
|
description = "Static IPv4 addresses for worker VMs"
|
||||||
|
|
||||||
|
validation {
|
||||||
|
condition = length(var.worker_ips) == 3
|
||||||
|
error_message = "worker_ips must contain exactly 3 IPs."
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
variable "bridge" {
|
variable "bridge" {
|
||||||
type = string
|
type = string
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user