diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 95e30f3..39600c8 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -88,8 +88,11 @@ jobs: } ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1' + ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2' + ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3' ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1' ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2' + ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3' - name: Terraform Plan id: plan diff --git a/STABLE_BASELINE.md b/STABLE_BASELINE.md index 97a8fd4..1e953c0 100644 --- a/STABLE_BASELINE.md +++ b/STABLE_BASELINE.md @@ -4,8 +4,9 @@ This document defines the current engineering target for this repository. ## Topology -- 1 control plane -- 2 workers +- 3 control planes (HA etcd cluster) +- 3 workers +- Hetzner Load Balancer for Kubernetes API - private Hetzner network - Tailscale operator access @@ -13,6 +14,8 @@ This document defines the current engineering target for this repository. - Terraform infrastructure bootstrap - Ansible k3s bootstrap with external cloud provider +- **HA control plane (3 nodes with etcd quorum)** +- **Hetzner Load Balancer for Kubernetes API** - **Hetzner CCM deployed via Ansible (before workers join)** - **Hetzner CSI for persistent volumes (via Flux)** - Flux core reconciliation @@ -26,7 +29,6 @@ This document defines the current engineering target for this repository. ## Out of Scope -- HA control plane - public ingress or DNS - public TLS - app workloads @@ -35,21 +37,28 @@ This document defines the current engineering target for this repository. ## Phase Gates -1. Terraform apply completes for the default topology. -2. k3s server bootstrap completes with external cloud provider enabled. -3. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue). -4. 
Workers join successfully and all nodes show proper `providerID`.
-5. Flux source and infrastructure reconciliation are healthy.
-6. **CSI deploys and creates `hcloud-volumes` StorageClass**.
-7. **PVC provisioning tested and working** (validated with test pod).
-8. External Secrets sync required secrets.
-9. Tailscale private access works.
-10. Terraform destroy succeeds cleanly or via workflow retry.
+1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
+2. Load Balancer is healthy with all 3 control plane targets.
+3. Primary control plane bootstraps with `--cluster-init`.
+4. Secondary control planes join via Load Balancer endpoint.
+5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
+6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
+7. etcd reports 3 healthy members.
+8. Flux source and infrastructure reconciliation are healthy.
+9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
+10. **PVC provisioning tested and working**.
+11. External Secrets sync required secrets.
+12. Tailscale private access works.
+13. Terraform destroy succeeds cleanly or via workflow retry.
 
 ## Success Criteria
 
-✅ **ACHIEVED** - Two consecutive fresh rebuilds passed all phase gates with no manual fixes:
+✅ **ACHIEVED** - CCM/CSI integration (validated on the previous 1 CP / 2 worker topology):
 - Build 1: Initial CCM/CSI deployment and validation (2026-03-23)
 - Build 2: Full destroy/rebuild cycle successful (2026-03-23)
 
-The platform is now stable with cloud provider integration and persistent volume support.
+🔄 **IN PROGRESS** - HA Control Plane Validation:
+- Build 3: Deploy 3-3 topology with Load Balancer
+- Build 4: Destroy/rebuild to validate HA configuration
+
+Success requires two consecutive HA rebuilds passing all phase gates with no manual fixes.
 
diff --git a/ansible/generate_inventory.py b/ansible/generate_inventory.py index dc931b2..b209949 100644 --- a/ansible/generate_inventory.py +++ b/ansible/generate_inventory.py @@ -32,6 +32,7 @@ def main(): worker_names = outputs["worker_names"]["value"] worker_ips = outputs["worker_ips"]["value"] worker_private_ips = outputs["worker_private_ips"]["value"] + kube_api_lb_ip = outputs.get("kube_api_lb_ip", {}).get("value", control_plane_ips[0]) control_planes = [ { @@ -59,6 +60,7 @@ def main(): "control_planes": control_planes, "workers": workers, "private_key_file": outputs["ssh_private_key_path"]["value"], + "kube_api_lb_ip": kube_api_lb_ip, } env = Environment(loader=FileSystemLoader(".")) diff --git a/ansible/inventory.tmpl b/ansible/inventory.tmpl index 5e89075..b4818f0 100644 --- a/ansible/inventory.tmpl +++ b/ansible/inventory.tmpl @@ -17,3 +17,4 @@ ansible_user=root ansible_python_interpreter=/usr/bin/python3 ansible_ssh_private_key_file={{ private_key_file }} k3s_version=latest +kube_api_endpoint={{ kube_api_lb_ip }} diff --git a/ansible/requirements.yml b/ansible/requirements.yml index 11cfdb6..cdcfbb4 100644 --- a/ansible/requirements.yml +++ b/ansible/requirements.yml @@ -3,3 +3,5 @@ collections: version: ">=2.4.0" - name: community.general version: ">=8.0.0" + - name: community.network + version: ">=5.0.0" diff --git a/ansible/roles/k3s-server/tasks/main.yml b/ansible/roles/k3s-server/tasks/main.yml index f1368c6..bae699c 100644 --- a/ansible/roles/k3s-server/tasks/main.yml +++ b/ansible/roles/k3s-server/tasks/main.yml @@ -15,9 +15,9 @@ set_fact: k3s_install_needed: "{{ (not k3s_service.stat.exists) or ((k3s_service_state.stdout | default('')) != 'active') }}" -- name: Wait for primary API on 6443 (secondary only) +- name: Wait for API endpoint on 6443 (secondary only) wait_for: - host: "{{ k3s_primary_ip }}" + host: "{{ k3s_join_endpoint | default(k3s_primary_ip) }}" port: 6443 state: started timeout: 120 @@ -81,7 +81,7 @@ K3S_TOKEN: "{{ k3s_token }}" 
command: >- /tmp/install-k3s.sh server - --server https://{{ k3s_primary_ip }}:6443 + --server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443 --advertise-address={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }} {% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %} diff --git a/ansible/site.yml b/ansible/site.yml index 01cbbe1..b9b1bad 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -73,6 +73,8 @@ k3s_primary_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}" k3s_primary_public_ip: "{{ hostvars[groups['control_plane'][0]]['k3s_primary_public_ip'] }}" k3s_node_ip: "{{ k3s_private_ip }}" + # Use Load Balancer for HA - all control planes join via LB endpoint + k3s_join_endpoint: "{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}" roles: - k3s-server @@ -83,7 +85,8 @@ vars: k3s_token: "{{ hostvars[groups['control_plane'][0]]['k3s_token'] }}" - k3s_server_url: "https://{{ hostvars[groups['control_plane'][0]]['k3s_primary_private_ip'] }}:6443" + # Use Load Balancer for HA - workers join via LB endpoint + k3s_server_url: "https://{{ kube_api_endpoint | default(hostvars[groups['control_plane'][0]]['k3s_primary_private_ip']) }}:6443" k3s_node_ip: "{{ k3s_private_ip }}" roles: diff --git a/terraform/loadbalancer.tf b/terraform/loadbalancer.tf new file mode 100644 index 0000000..7a92a58 --- /dev/null +++ b/terraform/loadbalancer.tf @@ -0,0 +1,43 @@ +# Load Balancer for Kubernetes API High Availability +# Provides a single endpoint for all control planes + +resource "hcloud_load_balancer" "kube_api" { + name = "${var.cluster_name}-api" + load_balancer_type = "lb11" # Cheapest tier: €5.39/month + location = var.location + + labels = { + cluster = var.cluster_name + role = "kube-api" + } +} + +# Attach all control plane servers as targets +resource "hcloud_load_balancer_target" "kube_api_targets" { + count = var.control_plane_count + type = "server" + 
load_balancer_id = hcloud_load_balancer.kube_api.id + server_id = hcloud_server.control_plane[count.index].id + use_private_ip = true + + depends_on = [hcloud_server.control_plane] +} + +# Kubernetes API service on port 6443 +resource "hcloud_load_balancer_service" "kube_api" { + load_balancer_id = hcloud_load_balancer.kube_api.id + protocol = "tcp" + listen_port = 6443 + destination_port = 6443 + + health_check { + protocol = "tcp" + port = 6443 + interval = 15 + timeout = 10 + retries = 3 + } +} + +# Firewall rule to allow LB access to control planes on 6443 +# This is added to the existing cluster firewall diff --git a/terraform/outputs.tf b/terraform/outputs.tf index 4a96d28..55b3839 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -63,3 +63,8 @@ output "kubeconfig_command" { description = "Command to fetch kubeconfig" value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig" } + +output "kube_api_lb_ip" { + description = "Load Balancer IP for Kubernetes API" + value = hcloud_load_balancer.kube_api.ipv4_address +} diff --git a/terraform/variables.tf b/terraform/variables.tf index abfc326..24d37ae 100644 --- a/terraform/variables.tf +++ b/terraform/variables.tf @@ -25,7 +25,7 @@ variable "cluster_name" { variable "control_plane_count" { description = "Number of control plane nodes" type = number - default = 1 + default = 3 } variable "control_plane_type" { @@ -37,7 +37,7 @@ variable "control_plane_type" { variable "worker_count" { description = "Number of worker nodes" type = number - default = 2 + default = 3 } variable "worker_type" {