feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s

Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox
VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap,
Flux addons, CI workflows, and docs to target the new private Proxmox
baseline while preserving the existing Tailscale, Doppler, Flux, Rancher,
and B2 backup flows.
This commit is contained in:
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
+102 -41
View File
@@ -1,60 +1,121 @@
# NOTE(review): this text looks like a rendered diff without +/- markers, so
# removed and added lines are interleaved. The three hcloud_image lines below
# appear to be the removed side and share the closing brace with the
# local_file block -- confirm against the real file before editing.
data "hcloud_image" "ubuntu" {
name = "ubuntu-24.04"
with_status = ["available"]
# Reads the SSH public key from the path given in var.ssh_public_key.
# pathexpand() resolves a leading "~" to the home directory, so the variable
# may be written as e.g. "~/.ssh/id_ed25519.pub".
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
# NOTE(review): this span interleaves what appear to be removed lines of the
# old hcloud_server.control_plane resource (count, name, server_type, image,
# location, ssh_keys, labels, network, public_net, firewall_ids) with the new
# `locals` block. Only the locals entries are described below -- confirm which
# lines survive in the real file.
resource "hcloud_server" "control_plane" {
count = var.control_plane_count
# Derived values used to describe every cluster VM in one place.
locals {
# The part of var.subnet_cidr after the "/" (e.g. "24" for "10.0.0.0/24");
# kept as a string and appended to each node IP in the VM ip_config.
subnet_prefix = split("/", var.subnet_cidr)[1]
name = "${var.cluster_name}-cp-${count.index + 1}"
server_type = var.control_plane_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
role = "control-plane"
# Map of control-plane node specs keyed by "<cluster_name>-cp-<n>", with n
# starting at 1. VM id and IP are taken positionally from
# var.control_plane_vm_ids and var.control_plane_ips.
# NOTE(review): assumes both lists hold at least var.control_plane_count
# elements; a shorter list would fail on the index lookup -- verify the
# variable validation.
control_planes = {
for idx in range(var.control_plane_count) :
format("%s-cp-%d", var.cluster_name, idx + 1) => {
role = "control-plane"
vm_id = var.control_plane_vm_ids[idx]
ip = var.control_plane_ips[idx]
cpu = var.control_plane_cores
memory_mb = var.control_plane_memory_mb
disk_gb = var.control_plane_disk_gb
# Startup group 1; workers use 2 (see the VM `startup` block).
startup = 1
}
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 10 + count.index)
# Map of worker node specs keyed by "<cluster_name>-worker-<n>", built the
# same way as control_planes but from the var.worker_* inputs.
# NOTE(review): assumes var.worker_vm_ids and var.worker_ips hold at least
# var.worker_count elements -- verify.
workers = {
for idx in range(var.worker_count) :
format("%s-worker-%d", var.cluster_name, idx + 1) => {
role = "worker"
vm_id = var.worker_vm_ids[idx]
ip = var.worker_ips[idx]
cpu = var.worker_cores
memory_mb = var.worker_memory_mb
disk_gb = var.worker_disk_gb
startup = 2
}
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
}
firewall_ids = [hcloud_firewall.cluster.id]
# Single map of all nodes, consumed by the VM resource's for_each. The key
# formats differ ("-cp-" vs "-worker-"), so merge() does not overwrite
# entries from one map with the other.
nodes = merge(local.control_planes, local.workers)
}
# NOTE(review): this span interleaves what appear to be removed lines of the
# old hcloud_server.workers resource (count, name, server_type, image,
# location, ssh_keys, labels, network, public_net, firewall_ids, depends_on)
# with the new Proxmox VM resource. Only the Proxmox arguments are described
# below -- confirm which lines survive in the real file.
resource "hcloud_server" "workers" {
count = var.worker_count
# One Proxmox VM per entry in local.nodes (control planes and workers), each
# cloned from the template VM var.proxmox_template_vm_id.
resource "proxmox_virtual_environment_vm" "nodes" {
for_each = local.nodes
name = "${var.cluster_name}-worker-${count.index + 1}"
server_type = var.worker_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
# The local.nodes map key doubles as the VM name.
name = each.key
description = "Managed by Terraform for ${var.cluster_name}"
tags = ["terraform", var.cluster_name, each.value.role]
node_name = var.proxmox_node_name
vm_id = each.value.vm_id
labels = {
cluster = var.cluster_name
role = "worker"
on_boot = true
started = true
stop_on_destroy = true
reboot_after_update = true
# NOTE(review): timeout values are presumably seconds (30 min for
# clone/create, 5 min for start/shutdown) -- confirm in the provider docs.
timeout_clone = 1800
timeout_create = 1800
timeout_shutdown_vm = 300
timeout_start_vm = 300
scsi_hardware = "virtio-scsi-single"
clone {
vm_id = var.proxmox_template_vm_id
datastore_id = var.proxmox_vm_storage_pool
full = var.proxmox_clone_full
retries = 3
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 20 + count.index)
# NOTE(review): presumably requires the QEMU guest agent to be installed in
# the template image -- verify, since SOURCE does not show the template.
agent {
enabled = true
trim = true
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
cpu {
cores = each.value.cpu
type = "x86-64-v2-AES"
}
firewall_ids = [hcloud_firewall.cluster.id]
# dedicated and floating are both set to the node's memory_mb.
# NOTE(review): check the provider docs for what an equal `floating` value
# means for memory ballooning on these nodes.
memory {
dedicated = each.value.memory_mb
floating = each.value.memory_mb
}
depends_on = [hcloud_server.control_plane]
# Startup group comes from local.nodes: 1 for control planes, 2 for workers.
# NOTE(review): presumably lower order values start first -- confirm.
startup {
order = tostring(each.value.startup)
up_delay = "20"
down_delay = "20"
}
disk {
datastore_id = var.proxmox_vm_storage_pool
interface = "scsi0"
size = each.value.disk_gb
discard = "on"
iothread = true
ssd = true
}
# Cloud-init settings: DNS servers, a static IPv4 address, and the login user.
initialization {
datastore_id = var.proxmox_cloud_init_storage_pool
dns {
servers = var.proxmox_dns_servers
}
ip_config {
ipv4 {
# Node IP plus the prefix length from var.subnet_cidr, e.g. "10.0.0.11/24".
address = "${each.value.ip}/${local.subnet_prefix}"
gateway = var.proxmox_gateway
}
}
user_account {
username = var.proxmox_ssh_username
# trimspace() drops the trailing newline usually present in a .pub file.
keys = [trimspace(data.local_file.ssh_public_key.content)]
}
}
network_device {
bridge = var.proxmox_bridge
model = "virtio"
}
operating_system {
type = "l26"
}
}