feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s

Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox
VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap,
Flux addons, CI workflows, and docs to target the new private Proxmox
baseline while preserving the existing Tailscale, Doppler, Flux, Rancher,
and B2 backup flows.
This commit is contained in:
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
-118
View File
@@ -1,118 +0,0 @@
locals {
  # When the tailnet restriction is enabled, prepend the tailnet CIDR to the
  # explicitly allowed source lists; otherwise use the explicit lists as-is.
  # concat with a conditional empty list is equivalent to the original
  # ternary-around-concat form.
  ssh_source_ips = concat(
    var.restrict_api_ssh_to_tailnet ? [var.tailnet_cidr] : [],
    var.allowed_ssh_ips
  )
  api_source_ips = concat(
    var.restrict_api_ssh_to_tailnet ? [var.tailnet_cidr] : [],
    var.allowed_api_ips
  )
}
# Cluster firewall, applied by label selector to every server labeled with
# this cluster name. Internal control-plane ports (k3s supervisor, etcd,
# flannel VXLAN, kubelet) are restricted to the private subnet; public-facing
# rules are dual-stack ("0.0.0.0/0" + "::/0") because the servers in this
# module are created with ipv6_enabled = true — IPv4-only rules would
# silently drop IPv6 traffic to those ports.
resource "hcloud_firewall" "cluster" {
  name = "${var.cluster_name}-firewall"

  rule {
    description = "SSH"
    direction   = "in"
    protocol    = "tcp"
    port        = "22"
    source_ips  = local.ssh_source_ips
  }

  rule {
    description = "Kubernetes API"
    direction   = "in"
    protocol    = "tcp"
    port        = "6443"
    source_ips  = local.api_source_ips
  }

  rule {
    description = "Tailscale WireGuard"
    direction   = "in"
    protocol    = "udp"
    port        = "41641"
    source_ips  = ["0.0.0.0/0", "::/0"]
  }

  # Same API port as above but open to the private subnet, so nodes can join
  # through the internal load balancer regardless of the tailnet restriction.
  rule {
    description = "Kubernetes API (internal)"
    direction   = "in"
    protocol    = "tcp"
    port        = "6443"
    source_ips  = [var.subnet_cidr]
  }

  rule {
    description = "k3s Supervisor"
    direction   = "in"
    protocol    = "tcp"
    port        = "9345"
    source_ips  = [var.subnet_cidr]
  }

  rule {
    description = "etcd Client"
    direction   = "in"
    protocol    = "tcp"
    port        = "2379"
    source_ips  = [var.subnet_cidr]
  }

  rule {
    description = "etcd Peer"
    direction   = "in"
    protocol    = "tcp"
    port        = "2380"
    source_ips  = [var.subnet_cidr]
  }

  rule {
    description = "Flannel VXLAN"
    direction   = "in"
    protocol    = "udp"
    port        = "8472"
    source_ips  = [var.subnet_cidr]
  }

  rule {
    description = "Kubelet"
    direction   = "in"
    protocol    = "tcp"
    port        = "10250"
    source_ips  = [var.subnet_cidr]
  }

  # Optional: expose the NodePort range publicly (off by default).
  dynamic "rule" {
    for_each = var.enable_nodeport_public ? [1] : []
    content {
      description = "NodePorts"
      direction   = "in"
      protocol    = "tcp"
      port        = "30000-32767"
      source_ips  = ["0.0.0.0/0", "::/0"]
    }
  }

  rule {
    description = "HTTP from Load Balancer"
    direction   = "in"
    protocol    = "tcp"
    port        = "80"
    source_ips  = ["0.0.0.0/0", "::/0"]
  }

  rule {
    description = "HTTPS from Load Balancer"
    direction   = "in"
    protocol    = "tcp"
    port        = "443"
    source_ips  = ["0.0.0.0/0", "::/0"]
  }

  rule {
    description = "ICMP"
    direction   = "in"
    protocol    = "icmp"
    source_ips  = ["0.0.0.0/0", "::/0"]
  }

  # Attach to all servers carrying the cluster label instead of listing
  # server IDs, so new nodes pick up the firewall automatically.
  apply_to {
    label_selector = "cluster=${var.cluster_name}"
  }
}
-50
View File
@@ -1,50 +0,0 @@
# Load Balancer for Kubernetes API High Availability
# Provides a single endpoint for all control planes
# NOTE(review): the lb11 price in the inline comment is a point-in-time
# figure — verify against current Hetzner pricing.
resource "hcloud_load_balancer" "kube_api" {
name = "${var.cluster_name}-api"
load_balancer_type = "lb11" # Cheapest tier: €5.39/month
location = var.location
# Metadata labels identifying this LB's cluster and role.
labels = {
cluster = var.cluster_name
role = "kube-api"
}
}
# Attach Load Balancer to private network (required for use_private_ip)
resource "hcloud_load_balancer_network" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
network_id = hcloud_network.cluster.id
# Pin the LB to a fixed private address inside the server subnet so node
# join configuration can reference a stable IP.
ip = cidrhost(var.subnet_cidr, 5) # 10.0.1.5
}
# Attach all control plane servers as targets
# Registers one target per control-plane server on the API load balancer,
# routed over the private network.
resource "hcloud_load_balancer_target" "kube_api_targets" {
  count            = var.control_plane_count
  type             = "server"
  load_balancer_id = hcloud_load_balancer.kube_api.id
  server_id        = hcloud_server.control_plane[count.index].id
  # Reach the servers via their private IPs; requires the LB to already be
  # attached to the private network, hence the explicit depends_on below.
  use_private_ip = true

  # server_id already creates an implicit dependency on the servers, so only
  # the network attachment needs to be listed explicitly.
  depends_on = [hcloud_load_balancer_network.kube_api]
}
# Kubernetes API service on port 6443
# Plain TCP passthrough (no TLS termination at the LB) — clients negotiate
# TLS directly with the apiservers.
resource "hcloud_load_balancer_service" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
protocol = "tcp"
listen_port = 6443
destination_port = 6443
# TCP health check: a target is marked unhealthy after 3 failed connects
# (15s interval, 10s timeout), removing dead control planes from rotation.
health_check {
protocol = "tcp"
port = 6443
interval = 15
timeout = 10
retries = 3
}
}
# Firewall rule to allow LB access to control planes on 6443
# This is added to the existing cluster firewall
+12 -5
View File
@@ -2,13 +2,20 @@ terraform {
required_version = ">= 1.0"
required_providers {
hcloud = {
source = "hetznercloud/hcloud"
version = "~> 1.45"
local = {
source = "hashicorp/local"
version = "~> 2.5"
}
proxmox = {
source = "bpg/proxmox"
version = ">= 0.60.0"
}
}
}
provider "hcloud" {
token = var.hcloud_token
provider "proxmox" {
endpoint = var.proxmox_endpoint
api_token = "${var.proxmox_api_token_id}=${var.proxmox_api_token_secret}"
insecure = var.proxmox_insecure
}
-11
View File
@@ -1,11 +0,0 @@
# Private network spanning the whole cluster; subnets are carved out of
# var.network_cidr below.
resource "hcloud_network" "cluster" {
name = "${var.cluster_name}-network"
ip_range = var.network_cidr
}
# Server subnet inside the cluster network.
resource "hcloud_network_subnet" "servers" {
network_id = hcloud_network.cluster.id
type = "cloud"
# NOTE(review): network_zone is hard-coded to eu-central while the server
# location is configurable elsewhere — confirm these stay consistent if the
# location variable ever changes.
network_zone = "eu-central"
ip_range = var.subnet_cidr
}
+9 -15
View File
@@ -1,42 +1,36 @@
output "control_plane_ips" {
description = "Public IPs of control plane nodes"
value = [for cp in hcloud_server.control_plane : cp.ipv4_address]
value = var.control_plane_ips
}
output "control_plane_names" {
description = "Control plane hostnames"
value = [for cp in hcloud_server.control_plane : cp.name]
value = [for idx in range(var.control_plane_count) : format("%s-cp-%d", var.cluster_name, idx + 1)]
}
output "control_plane_private_ips" {
description = "Private IPs of control plane nodes"
value = [
for idx, cp in hcloud_server.control_plane :
try(one(cp.network).ip, cidrhost(var.subnet_cidr, 10 + idx))
]
value = var.control_plane_ips
}
output "primary_control_plane_ip" {
description = "Public IP of the primary control plane (first node)"
value = hcloud_server.control_plane[0].ipv4_address
value = var.control_plane_ips[0]
}
output "worker_ips" {
description = "Public IPs of worker nodes"
value = [for worker in hcloud_server.workers : worker.ipv4_address]
value = var.worker_ips
}
output "worker_names" {
description = "Worker hostnames"
value = [for worker in hcloud_server.workers : worker.name]
value = [for idx in range(var.worker_count) : format("%s-worker-%d", var.cluster_name, idx + 1)]
}
output "worker_private_ips" {
description = "Private IPs of worker nodes"
value = [
for idx, worker in hcloud_server.workers :
try(one(worker.network).ip, cidrhost(var.subnet_cidr, 20 + idx))
]
value = var.worker_ips
}
output "ssh_private_key_path" {
@@ -61,10 +55,10 @@ output "network_cidr" {
output "kubeconfig_command" {
description = "Command to fetch kubeconfig"
value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig"
value = "ssh ubuntu@${var.control_plane_ips[0]} 'sudo cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${var.control_plane_ips[0]}/g' kubeconfig"
}
output "kube_api_lb_ip" {
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
value = hcloud_load_balancer_network.kube_api.ip
value = var.kube_api_vip
}
+102 -41
View File
@@ -1,60 +1,121 @@
data "hcloud_image" "ubuntu" {
name = "ubuntu-24.04"
with_status = ["available"]
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
resource "hcloud_server" "control_plane" {
count = var.control_plane_count
locals {
subnet_prefix = split("/", var.subnet_cidr)[1]
name = "${var.cluster_name}-cp-${count.index + 1}"
server_type = var.control_plane_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
role = "control-plane"
control_planes = {
for idx in range(var.control_plane_count) :
format("%s-cp-%d", var.cluster_name, idx + 1) => {
role = "control-plane"
vm_id = var.control_plane_vm_ids[idx]
ip = var.control_plane_ips[idx]
cpu = var.control_plane_cores
memory_mb = var.control_plane_memory_mb
disk_gb = var.control_plane_disk_gb
startup = 1
}
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 10 + count.index)
workers = {
for idx in range(var.worker_count) :
format("%s-worker-%d", var.cluster_name, idx + 1) => {
role = "worker"
vm_id = var.worker_vm_ids[idx]
ip = var.worker_ips[idx]
cpu = var.worker_cores
memory_mb = var.worker_memory_mb
disk_gb = var.worker_disk_gb
startup = 2
}
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
}
firewall_ids = [hcloud_firewall.cluster.id]
nodes = merge(local.control_planes, local.workers)
}
resource "hcloud_server" "workers" {
count = var.worker_count
resource "proxmox_virtual_environment_vm" "nodes" {
for_each = local.nodes
name = "${var.cluster_name}-worker-${count.index + 1}"
server_type = var.worker_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
name = each.key
description = "Managed by Terraform for ${var.cluster_name}"
tags = ["terraform", var.cluster_name, each.value.role]
node_name = var.proxmox_node_name
vm_id = each.value.vm_id
labels = {
cluster = var.cluster_name
role = "worker"
on_boot = true
started = true
stop_on_destroy = true
reboot_after_update = true
timeout_clone = 1800
timeout_create = 1800
timeout_shutdown_vm = 300
timeout_start_vm = 300
scsi_hardware = "virtio-scsi-single"
clone {
vm_id = var.proxmox_template_vm_id
datastore_id = var.proxmox_vm_storage_pool
full = var.proxmox_clone_full
retries = 3
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 20 + count.index)
agent {
enabled = true
trim = true
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
cpu {
cores = each.value.cpu
type = "x86-64-v2-AES"
}
firewall_ids = [hcloud_firewall.cluster.id]
memory {
dedicated = each.value.memory_mb
floating = each.value.memory_mb
}
depends_on = [hcloud_server.control_plane]
startup {
order = tostring(each.value.startup)
up_delay = "20"
down_delay = "20"
}
disk {
datastore_id = var.proxmox_vm_storage_pool
interface = "scsi0"
size = each.value.disk_gb
discard = "on"
iothread = true
ssd = true
}
initialization {
datastore_id = var.proxmox_cloud_init_storage_pool
dns {
servers = var.proxmox_dns_servers
}
ip_config {
ipv4 {
address = "${each.value.ip}/${local.subnet_prefix}"
gateway = var.proxmox_gateway
}
}
user_account {
username = var.proxmox_ssh_username
keys = [trimspace(data.local_file.ssh_public_key.content)]
}
}
network_device {
bridge = var.proxmox_bridge
model = "virtio"
}
operating_system {
type = "l26"
}
}
-7
View File
@@ -1,7 +0,0 @@
# Read the SSH public key from the local filesystem; pathexpand resolves a
# leading "~" in the configured path. Presumably consumed for node SSH
# access elsewhere in the module — verify usage at the reference sites.
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
# Look up an SSH key that must already exist in the Hetzner project.
data "hcloud_ssh_key" "cluster" {
# NOTE(review): the key name "infra" is hard-coded — confirm it exists in
# the target project, or consider promoting it to a variable.
name = "infra"
}
+142 -22
View File
@@ -1,19 +1,13 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token"
type = string
sensitive = true
}
variable "ssh_public_key" {
description = "Path to SSH public key"
type = string
default = "~/.ssh/id_ed25519.pub"
default = "~/.ssh/infra.pub"
}
variable "ssh_private_key" {
description = "Path to SSH private key"
type = string
default = "~/.ssh/id_ed25519"
default = "~/.ssh/infra"
}
variable "cluster_name" {
@@ -28,28 +22,112 @@ variable "control_plane_count" {
default = 3
}
variable "control_plane_type" {
description = "Hetzner server type for control plane"
type = string
default = "cx23"
variable "control_plane_cores" {
description = "vCPU count for control plane VMs"
type = number
default = 2
}
variable "control_plane_memory_mb" {
description = "Dedicated memory for control plane VMs in MiB"
type = number
default = 4096
}
variable "control_plane_disk_gb" {
description = "Disk size for control plane VMs in GiB"
type = number
default = 32
}
variable "worker_count" {
description = "Number of worker nodes"
type = number
default = 3
default = 5
}
variable "worker_type" {
description = "Hetzner server type for workers"
type = string
default = "cx33"
variable "worker_cores" {
description = "vCPU count for worker VMs"
type = number
default = 4
}
variable "location" {
description = "Hetzner datacenter location"
variable "worker_memory_mb" {
description = "Dedicated memory for worker VMs in MiB"
type = number
default = 8192
}
variable "worker_disk_gb" {
description = "Disk size for worker VMs in GiB"
type = number
default = 64
}
variable "proxmox_endpoint" {
description = "Proxmox API endpoint without /api2/json suffix"
type = string
default = "nbg1"
default = "https://100.105.0.115:8006/"
}
variable "proxmox_api_token_id" {
description = "Proxmox API token ID"
type = string
sensitive = true
}
variable "proxmox_api_token_secret" {
description = "Proxmox API token secret"
type = string
sensitive = true
}
variable "proxmox_insecure" {
description = "Skip TLS verification for the Proxmox API"
type = bool
# SECURITY NOTE(review): TLS verification is disabled by default. That may
# be acceptable for a private/self-signed homelab endpoint, but set this to
# false once the Proxmox API serves a trusted certificate.
default = true
}
variable "proxmox_node_name" {
description = "Fixed Proxmox node name for all cluster VMs"
type = string
default = "flex"
}
variable "proxmox_template_vm_id" {
description = "Template VM ID used for linked clones"
type = number
default = 9000
}
variable "proxmox_clone_full" {
description = "Whether to use full clones instead of linked clones"
type = bool
default = false
}
variable "proxmox_vm_storage_pool" {
description = "Proxmox datastore for VM disks"
type = string
default = "Flash"
}
variable "proxmox_cloud_init_storage_pool" {
description = "Proxmox datastore for cloud-init disks"
type = string
default = "Flash"
}
variable "proxmox_bridge" {
description = "Proxmox bridge for cluster VM interfaces"
type = string
default = "vmbr0"
}
variable "proxmox_ssh_username" {
description = "Cloud-init user injected into cloned VMs"
type = string
default = "ubuntu"
}
variable "allowed_ssh_ips" {
@@ -90,13 +168,55 @@ variable "enable_nodeport_public" {
variable "network_cidr" {
description = "CIDR for private network"
type = string
default = "10.0.0.0/16"
default = "10.27.27.0/24"
}
variable "subnet_cidr" {
description = "CIDR for server subnet"
type = string
default = "10.0.1.0/24"
default = "10.27.27.0/24"
}
variable "proxmox_gateway" {
description = "Gateway for cluster VM networking"
type = string
default = "10.27.27.1"
}
variable "proxmox_dns_servers" {
description = "DNS servers configured through cloud-init"
type = list(string)
default = ["1.1.1.1", "8.8.8.8"]
}
variable "control_plane_ips" {
description = "Static IPv4 addresses for control plane VMs"
type = list(string)
default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
}
variable "worker_ips" {
description = "Static IPv4 addresses for worker VMs"
type = list(string)
default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
}
variable "control_plane_vm_ids" {
description = "Fixed VMIDs for control plane VMs"
type = list(number)
default = [200, 201, 202]
}
variable "worker_vm_ids" {
description = "Fixed VMIDs for worker VMs"
type = list(number)
default = [210, 211, 212, 213, 214]
}
variable "kube_api_vip" {
description = "Virtual IP advertised by kube-vip for the Kubernetes API"
type = string
default = "10.27.27.40"
}
variable "s3_access_key" {