feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s

Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox
VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap,
Flux addons, CI workflows, and docs to target the new private Proxmox
baseline while preserving the existing Tailscale, Doppler, Flux, Rancher,
and B2 backup flows.
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
+4 -20
View File
@@ -12,12 +12,15 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
dashboards:
@@ -51,25 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-refresh=false \
-target=hcloud_firewall.cluster \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
+7 -62
View File
@@ -11,12 +11,15 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
@@ -60,40 +63,6 @@ jobs:
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Import existing servers into state (if missing)
working-directory: terraform
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set -e
ensure_import() {
address="$1"
name="$2"
if terraform state show "$address" >/dev/null 2>&1; then
echo "$address already in state"
return
fi
id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty')
if [ -n "$id" ]; then
echo "Importing $address from server $name ($id)"
terraform import "$address" "$id"
else
echo "No existing server found for $name; skipping import"
fi
}
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
- name: Terraform Plan
id: plan
working-directory: terraform
@@ -187,32 +156,11 @@ jobs:
mkdir -p ../outputs
terraform output -json > ../outputs/terraform_outputs.json
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-target=hcloud_firewall.cluster \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
- name: Note runner connectivity mode
run: |
echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API"
- name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml
@@ -224,7 +172,6 @@ jobs:
working-directory: ansible
run: |
ansible-playbook site.yml \
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
@@ -294,9 +241,8 @@ jobs:
key: dopplerToken
namespace: external-secrets
EOF
# Wait for CCM and CSI (Hetzner cloud integration)
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
# Wait for the storage layer and private access components
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
- name: Wait for Rancher and backup operator
@@ -397,10 +343,9 @@ jobs:
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env:
+13 -123
View File
@@ -10,107 +10,22 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNP is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNP namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup job to complete..."
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
kubectl logs job/pgdump-manual -n cnpg-cluster || true
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -120,6 +35,14 @@ jobs:
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
@@ -131,19 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Terraform Destroy
id: destroy
working-directory: terraform
@@ -152,7 +62,6 @@ jobs:
for attempt in 1 2 3; do
echo "Terraform destroy attempt ${attempt}/3"
terraform destroy \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-auto-approve
@@ -164,32 +73,13 @@ jobs:
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
sleep 30
terraform refresh \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
fi
done
exit "$rc"
- name: Hetzner destroy diagnostics
- name: Terraform state diagnostics
if: failure() && steps.destroy.outcome == 'failure'
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set +e
echo "== Terraform state list =="
terraform -chdir=terraform state list || true
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
if [ -z "$network_id" ]; then
network_id="11988935"
fi
echo "== Hetzner network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
echo "== Hetzner servers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
echo "== Hetzner load balancers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
+7 -3
View File
@@ -9,7 +9,9 @@ Repository guide for OpenCode sessions in this repo.
## Current Baseline
- HA private cluster: 3 control planes, 3 workers.
- HA private cluster: 3 control planes, 5 workers on Proxmox.
- Proxmox clones come from template `9000` on node `flex`; API VIP is `10.27.27.40` via kube-vip.
- Storage is `nfs-subdir-external-provisioner` backed by `10.27.27.22:/TheFlash/k8s-nfs` with StorageClass `flash-nfs`.
- Tailscale is the private access path for Rancher and shared services.
- Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed.
- `apps/` is suspended by default.
@@ -20,8 +22,8 @@ Repository guide for OpenCode sessions in this repo.
- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
- Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml`
- Flux/Kustomize: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize clusters/prod/flux-system`
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-public-ip>`
- Tailnet smoke check: `ssh root@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-ip>`
- Tailnet smoke check: `ssh ubuntu@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`
## Workflow Rules
@@ -31,12 +33,14 @@ Repository guide for OpenCode sessions in this repo.
- CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks.
- One object per Kubernetes YAML file; keep filenames kebab-case.
- If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP.
- Bootstrap assumptions that matter: SSH user is `ubuntu`, NIC is `ens18`, API join endpoint is the kube-vip address.
## Repo-Specific Gotchas
- `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR.
- Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted.
- Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it.
- The repo no longer uses a cloud controller manager. If you see `providerID` or Hetzner-specific logic, it is stale.
- Current private URLs:
- Rancher: `https://rancher.silverside-gopher.ts.net/`
- Grafana: `http://grafana.silverside-gopher.ts.net/`
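Given the baseline above (kube-vip VIP `10.27.27.40`, `flash-nfs` as the default StorageClass, services exposed only over Tailscale), a minimal sanity sketch could look like the following. This is not the repo's `scripts/smoke-check-tailnet-services.sh`; it assumes `KUBECONFIG` already points at the cluster and the host running it is on the tailnet.

```bash
#!/usr/bin/env bash
# Minimal post-rebuild sanity check (a sketch, not a repo script).
set -euo pipefail

# kube-vip API VIP answers on 6443 (-k: cluster-internal CA; no -f so an
# anonymous 401/403 response still counts as "reachable").
curl -ks --max-time 5 https://10.27.27.40:6443/version >/dev/null
echo "API VIP reachable"

# NFS-backed default StorageClass exists.
kubectl get storageclass flash-nfs

# Private Tailscale endpoints respond.
curl -fsk --max-time 10 -o /dev/null https://rancher.silverside-gopher.ts.net/
curl -fs  --max-time 10 -o /dev/null http://grafana.silverside-gopher.ts.net/
echo "tailnet services reachable"
```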
+41 -56
View File
@@ -1,30 +1,28 @@
# Hetzner Kubernetes Cluster
# Proxmox Kubernetes Cluster
Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible.
Production-ready private Kubernetes cluster on Proxmox using Terraform, Ansible, and Flux.
## Architecture
| Component | Details |
|-----------|---------|
| **Control Plane** | 3x CX23 (HA) |
| **Workers** | 3x CX33 |
| **Control Plane** | 3x Proxmox VMs (2 vCPU / 4 GiB / 32 GiB) |
| **Workers** | 5x Proxmox VMs (4 vCPU / 8 GiB / 64 GiB) |
| **K8s** | k3s (latest, HA) |
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki |
| **Addons** | NFS provisioner + Prometheus + Grafana + Loki + Rancher |
| **Access** | SSH/API and private services restricted to Tailnet |
| **Bootstrap** | Terraform + Ansible + Flux |
## Prerequisites
### 1. Hetzner Cloud API Token
### 1. Proxmox API Token
1. Go to [Hetzner Cloud Console](https://console.hetzner.com/)
2. Select your project (or create a new one)
3. Navigate to **Security** → **API Tokens**
4. Click **Generate API Token**
5. Set description: `k8s-cluster-terraform`
6. Select permissions: **Read & Write**
7. Click **Generate API Token**
8. **Copy the token immediately** - it won't be shown again!
Create an API token for the Proxmox VE user used by Terraform. The repo expects the `bpg/proxmox` provider with:
- endpoint: `https://100.105.0.115:8006/`
- node: `flex`
- clone source: template `9000` (`ubuntu-2404-k8s-template`)
- auth: API token
### 2. Backblaze B2 Bucket (for Terraform State)
@@ -44,7 +42,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
### 3. SSH Key Pair
```bash
ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s
ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra
```
### 4. Local Tools
@@ -71,10 +69,12 @@ cp terraform.tfvars.example terraform.tfvars
Edit `terraform.tfvars`:
```hcl
hcloud_token = "your-hetzner-api-token"
proxmox_endpoint = "https://100.105.0.115:8006/"
proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
proxmox_api_token_secret = "your-proxmox-token-secret"
ssh_public_key = "~/.ssh/hetzner_k8s.pub"
ssh_private_key = "~/.ssh/hetzner_k8s"
ssh_public_key = "~/.ssh/infra.pub"
ssh_private_key = "~/.ssh/infra"
s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key"
@@ -84,12 +84,7 @@ s3_bucket = "k8s-terraform-state"
tailscale_auth_key = "tskey-auth-..."
tailscale_tailnet = "yourtailnet.ts.net"
restrict_api_ssh_to_tailnet = true
tailnet_cidr = "100.64.0.0/10"
enable_nodeport_public = false
allowed_ssh_ips = []
allowed_api_ips = []
kube_api_vip = "10.27.27.40"
```
### 3. Initialize Terraform
@@ -152,7 +147,9 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| Secret | Description |
|--------|-------------|
| `HCLOUD_TOKEN` | Hetzner Cloud API token |
| `PROXMOX_ENDPOINT` | Proxmox API endpoint (for example `https://100.105.0.115:8006/`) |
| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID |
| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret |
| `S3_ACCESS_KEY` | Backblaze B2 keyID |
| `S3_SECRET_KEY` | Backblaze B2 applicationKey |
| `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) |
@@ -163,7 +160,6 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
| `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content |
@@ -176,8 +172,8 @@ This repo uses Flux for continuous reconciliation after Terraform + Ansible boot
The current default target is the HA private baseline:
- `3` control plane nodes
- `3` worker nodes
- private Hetzner network only
- `5` worker nodes
- private Proxmox network only
- Tailscale for operator and service access
- Flux-managed platform addons with `apps` suspended by default
@@ -207,8 +203,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Reconciliation graph
- `infrastructure` (top-level)
- `addon-ccm`
- `addon-csi` depends on `addon-ccm`
- `addon-nfs-storage`
- `addon-tailscale-operator`
- `addon-observability`
- `addon-observability-content` depends on `addon-observability`
@@ -224,7 +219,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
- Active Flux addons for the current baseline: `addon-nfs-storage`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
- `apps` remains suspended until workload rollout is explicitly enabled.
- Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization.
- Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations.
@@ -232,14 +227,14 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Rancher access
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`.
- The public Hetzner load balancer path is not used for Rancher.
- Rancher and the Kubernetes API stay private; kube-vip provides the API VIP on the LAN.
- Rancher stores state in embedded etcd; no external database is used.
### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention:
- Terraform create succeeds for the default `3` control planes and `3` workers.
- Terraform create succeeds for the default `3` control planes and `5` workers.
- Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`.
- Flux core reconciliation is healthy.
@@ -323,9 +318,6 @@ It avoids full cluster provisioning and only applies Grafana content resources:
├── terraform/
│ ├── main.tf
│ ├── variables.tf
│ ├── network.tf
│ ├── firewall.tf
│ ├── ssh.tf
│ ├── servers.tf
│ ├── outputs.tf
│ └── backend.tf
@@ -353,17 +345,19 @@ It avoids full cluster provisioning and only applies Grafana content resources:
## Firewall Rules
This repo no longer manages cloud firewalls. Access control is expected to be handled on your LAN infrastructure and through Tailscale.
Important cluster-local ports still in use:
| Port | Source | Purpose |
|------|--------|---------|
| 22 | Tailnet CIDR | SSH |
| 6443 | Tailnet CIDR + internal | Kubernetes API |
| 41641/udp | Any | Tailscale WireGuard |
| 9345 | 10.0.0.0/16 | k3s Supervisor (HA join) |
| 2379 | 10.0.0.0/16 | etcd Client |
| 2380 | 10.0.0.0/16 | etcd Peer |
| 8472 | 10.0.0.0/16 | Flannel VXLAN |
| 10250 | 10.0.0.0/16 | Kubelet |
| 30000-32767 | Optional | NodePorts (disabled by default) |
| 22 | Admin hosts / CI | SSH |
| 6443 | 10.27.27.0/24 + VIP | Kubernetes API |
| 9345 | 10.27.27.0/24 | k3s Supervisor |
| 2379 | 10.27.27.0/24 | etcd Client |
| 2380 | 10.27.27.0/24 | etcd Peer |
| 8472/udp | 10.27.27.0/24 | Flannel VXLAN |
| 10250 | 10.27.27.0/24 | Kubelet |
## Operations
@@ -399,7 +393,7 @@ terraform destroy
### Check k3s Logs
```bash
ssh root@<control-plane-ip> journalctl -u k3s -f
ssh ubuntu@<control-plane-ip> sudo journalctl -u k3s -f
```
### Reset k3s
@@ -408,19 +402,10 @@ ssh root@<control-plane-ip> journalctl -u k3s -f
ansible-playbook site.yml -t reset
```
## Costs Breakdown
| Resource | Quantity | Unit Price | Monthly |
|----------|----------|------------|---------|
| CX23 (Control Plane) | 3 | €2.99 | €8.97 |
| CX33 (Workers) | 4 | €4.99 | €19.96 |
| Backblaze B2 | ~1 GB | Free (first 10GB) | €0.00 |
| **Total** | | | **€28.93/mo** |
## Security Notes
- Control plane has HA (3 nodes, can survive 1 failure)
- Consider adding Hetzner load balancer for API server
- Kubernetes API HA is provided by kube-vip on `10.27.27.40`
- Rotate API tokens regularly
- Use network policies in Kubernetes
- Enable audit logging for production
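For local access after a rebuild, a sketch of pulling a kubeconfig that talks to the API through the kube-vip VIP (using the first control-plane IP from `terraform.tfvars`; pointing the kubeconfig at the VIP works because `kube_api_endpoint` is included in the k3s `--tls-san` list):

```bash
# Fetch the k3s kubeconfig from cp-1 and rewrite it to use the API VIP.
ssh ubuntu@10.27.27.30 'sudo cat /etc/rancher/k3s/k3s.yaml' \
  | sed 's/127.0.0.1/10.27.27.40/g' > kubeconfig-prod
chmod 600 kubeconfig-prod
export KUBECONFIG=$PWD/kubeconfig-prod
kubectl get nodes -o wide
```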
+14 -7
View File
@@ -1,6 +1,6 @@
# Gitea Secrets Setup
This document describes the secrets required for the HetznerTerra deployment workflow.
This document describes the secrets required for the Proxmox-based deployment workflow.
## Required Secrets
@@ -9,10 +9,17 @@ Add these secrets in your Gitea repository settings:
### Infrastructure Secrets
#### `HCLOUD_TOKEN`
- Hetzner Cloud API token
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens
- Permissions: Read & Write
#### `PROXMOX_ENDPOINT`
- Proxmox VE API endpoint
- Example: `https://100.105.0.115:8006/`
#### `PROXMOX_API_TOKEN_ID`
- Proxmox API token ID
- Example: `terraform-prov@pve!k8s-cluster`
#### `PROXMOX_API_TOKEN_SECRET`
- Proxmox API token secret
- Create with `pveum user token add terraform-prov@pve k8s-cluster`
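The bullet above names the token command; a fuller, illustrative sequence on the Proxmox host might look like the following. The role name and privilege list are assumptions for a VM-cloning Terraform user, not values from this repo; trim them to what your setup actually needs.

```bash
# Create the Terraform user, an illustrative role, and the API token.
pveum user add terraform-prov@pve
pveum role add TerraformProv -privs "VM.Allocate VM.Clone VM.Audit VM.PowerMgmt \
  VM.Config.CPU VM.Config.Memory VM.Config.Disk VM.Config.Network \
  VM.Config.Cloudinit VM.Config.Options Datastore.Audit Datastore.AllocateSpace Sys.Audit"
pveum aclmod / -user terraform-prov@pve -role TerraformProv
# --privsep 0 makes the token inherit the user's permissions.
pveum user token add terraform-prov@pve k8s-cluster --privsep 0
```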
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage
@@ -31,7 +38,7 @@ Add these secrets in your Gitea repository settings:
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s`
- Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra`
- Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519)
@@ -90,4 +97,4 @@ Check the workflow logs to verify all secrets are being used correctly.
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment
- CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access
+12 -14
View File
@@ -5,9 +5,9 @@ This document defines the current engineering target for this repository.
## Topology
- 3 control planes (HA etcd cluster)
- 3 workers
- Hetzner Load Balancer for Kubernetes API
- private Hetzner network
- 5 workers
- kube-vip API VIP (`10.27.27.40`)
- private Proxmox/LAN network (`10.27.27.0/24`)
- Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
@@ -17,11 +17,10 @@ This document defines the current engineering target for this repository.
## In Scope
- Terraform infrastructure bootstrap
- Ansible k3s bootstrap with external cloud provider
- Ansible k3s bootstrap on Ubuntu cloud-init VMs
- **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API**
- **Hetzner CCM deployed via Ansible (before workers join)**
- **Hetzner CSI for persistent volumes (via Flux)**
- **kube-vip for Kubernetes API HA**
- **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`**
- Flux core reconciliation
- External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation
@@ -45,15 +44,14 @@ This document defines the current engineering target for this repository.
## Phase Gates
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB).
2. Load Balancer is healthy with all 3 control plane targets.
3. Primary control plane bootstraps with `--cluster-init`.
4. Secondary control planes join via Load Balancer endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue).
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP).
2. Primary control plane bootstraps with `--cluster-init`.
3. kube-vip advertises `10.27.27.40:6443` from the control-plane set.
4. Secondary control planes join via the kube-vip endpoint.
5. Workers join successfully via the kube-vip endpoint.
7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**.
9. **NFS provisioner deploys and creates `flash-nfs` StorageClass**.
10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets.
12. Tailscale private access works for Rancher, Grafana, and Prometheus.
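As an illustration of gate 10, a throwaway PVC against `flash-nfs` (names here are arbitrary) could be checked like this. Because the StorageClass uses `Immediate` volume binding, the claim should bind without a consuming pod.

```bash
# Create a test PVC on flash-nfs, wait for it to bind, then clean up.
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: nfs-smoke-test
  namespace: default
spec:
  accessModes: ["ReadWriteMany"]
  storageClassName: flash-nfs
  resources:
    requests:
      storage: 1Gi
EOF
kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc/nfs-smoke-test -n default --timeout=120s
kubectl delete pvc nfs-smoke-test -n default
```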
+1 -1
View File
@@ -13,7 +13,7 @@ control_plane
workers
[cluster:vars]
ansible_user=root
ansible_user=ubuntu
ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest
@@ -1,14 +1,4 @@
---
- name: Apply Hetzner cloud secret
shell: >-
kubectl -n kube-system create secret generic hcloud
--from-literal=token='{{ hcloud_token }}'
--from-literal=network='{{ cluster_name }}-network'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when: hcloud_token | default('') | length > 0
- name: Ensure Tailscale operator namespace exists
command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
-82
View File
@@ -1,82 +0,0 @@
---
- name: Check if hcloud secret exists
command: kubectl -n kube-system get secret hcloud
register: hcloud_secret_check
changed_when: false
failed_when: false
- name: Fail if hcloud secret is missing
fail:
msg: "hcloud secret not found in kube-system namespace. CCM requires it."
when: hcloud_secret_check.rc != 0
- name: Check if helm is installed
command: which helm
register: helm_check
changed_when: false
failed_when: false
- name: Install helm
when: helm_check.rc != 0
block:
- name: Download helm install script
get_url:
url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
dest: /tmp/get-helm-3.sh
mode: "0755"
- name: Run helm install script
command: /tmp/get-helm-3.sh
args:
creates: /usr/local/bin/helm
- name: Add Hetzner Helm repository
kubernetes.core.helm_repository:
name: hcloud
repo_url: https://charts.hetzner.cloud
kubeconfig: /etc/rancher/k3s/k3s.yaml
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Deploy Hetzner Cloud Controller Manager
kubernetes.core.helm:
name: hcloud-cloud-controller-manager
chart_ref: hcloud/hcloud-cloud-controller-manager
release_namespace: kube-system
create_namespace: true
values:
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: "{{ inventory_hostname }}"
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
kubeconfig: /etc/rancher/k3s/k3s.yaml
wait: true
wait_timeout: 300s
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Wait for CCM to be ready
command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
changed_when: false
register: ccm_rollout
until: ccm_rollout.rc == 0
retries: 3
delay: 10
- name: Pause to ensure CCM is fully ready to process new nodes
pause:
seconds: 10
- name: Verify CCM is removing uninitialized taints
command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
register: uninitialized_taints
changed_when: false
failed_when: false
- name: Display taint status
debug:
msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"
+1
View File
@@ -19,6 +19,7 @@
- lsb-release
- software-properties-common
- jq
- nfs-common
- htop
- vim
state: present
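`nfs-common` is what lets the kubelet mount the NFS-backed volumes. A quick manual check from any node could be the following sketch (server and export path come from the nfs-storage addon values; the mount target `/mnt` is arbitrary):

```bash
# List exports from the NFS server backing flash-nfs (showmount ships with nfs-common).
showmount -e 10.27.27.22
# Optional manual mount/unmount round-trip to confirm the client side works.
sudo mount -t nfs 10.27.27.22:/TheFlash/k8s-nfs /mnt && sudo umount /mnt
```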
+2 -1
View File
@@ -3,4 +3,5 @@ k3s_version: latest
k3s_server_url: ""
k3s_token: ""
k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: true
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: ens18
+1 -1
View File
@@ -22,7 +22,7 @@
command: >-
/tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }}
--flannel-iface=enp7s0
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args:
creates: /usr/local/bin/k3s-agent
+3 -2
View File
@@ -3,9 +3,10 @@ k3s_version: latest
k3s_token: ""
k3s_node_ip: ""
k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true
k3s_disable_embedded_ccm: false
k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: true
k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: ens18
# Kubernetes API join endpoint for HA cluster joins (kube-vip VIP; set in inventory)
kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access)
+2 -2
View File
@@ -61,7 +61,7 @@
--cluster-init
--advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
--flannel-iface={{ k3s_flannel_iface }}
--tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }}
@@ -87,7 +87,7 @@
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0
--flannel-iface={{ k3s_flannel_iface }}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
@@ -0,0 +1,4 @@
---
kube_vip_version: v1.1.2
kube_vip_interface: ens18
kube_vip_address: "{{ kube_api_endpoint }}"
@@ -0,0 +1,21 @@
---
- name: Render kube-vip control plane manifest
template:
src: kube-vip-control-plane.yaml.j2
dest: /tmp/kube-vip-control-plane.yaml
mode: "0644"
- name: Apply kube-vip control plane manifest
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
changed_when: true
- name: Wait for kube-vip DaemonSet rollout
command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=180s
changed_when: false
- name: Wait for API VIP on 6443
wait_for:
host: "{{ kube_vip_address }}"
port: 6443
state: started
timeout: 180
@@ -0,0 +1,110 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-vip
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: system:kube-vip-role
rules:
- apiGroups: [""]
resources: ["services/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["services", "endpoints"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list", "get", "watch", "update", "patch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["list", "get", "watch", "update", "create"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: system:kube-vip-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:kube-vip-role
subjects:
- kind: ServiceAccount
name: kube-vip
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kube-vip
namespace: kube-system
spec:
selector:
matchLabels:
app.kubernetes.io/name: kube-vip
template:
metadata:
labels:
app.kubernetes.io/name: kube-vip
spec:
serviceAccountName: kube-vip
hostNetwork: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: kube-vip
image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
imagePullPolicy: IfNotPresent
args:
- manager
env:
- name: vip_arp
value: "true"
- name: port
value: "6443"
- name: vip_interface
value: {{ kube_vip_interface | quote }}
- name: vip_subnet
value: "32"
- name: cp_enable
value: "true"
- name: cp_namespace
value: kube-system
- name: vip_ddns
value: "false"
- name: vip_leaderelection
value: "true"
- name: vip_leaseduration
value: "5"
- name: vip_renewdeadline
value: "3"
- name: vip_retryperiod
value: "1"
- name: address
value: {{ kube_vip_address | quote }}
securityContext:
capabilities:
add:
- NET_ADMIN
- NET_RAW
- SYS_TIME
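A hedged way to confirm the VIP is actually being advertised after this manifest rolls out. The lease name `plndr-cp-lock` is an assumption based on kube-vip's default control-plane leader election; verify it against the deployed version.

```bash
# Rough post-deploy checks for kube-vip.
kubectl -n kube-system rollout status daemonset/kube-vip --timeout=180s
kubectl -n kube-system get lease plndr-cp-lock
# The VIP is bound to ens18 only on the control plane currently holding the lease,
# so run this against each control plane (or the lease holder) as needed:
ssh ubuntu@10.27.27.30 'ip -4 addr show ens18 | grep -F 10.27.27.40 || true'
```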
+2 -2
View File
@@ -57,12 +57,12 @@
roles:
- addon-secrets-bootstrap
- name: Deploy Hetzner CCM (required for workers with external cloud provider)
- name: Deploy kube-vip for API HA
hosts: control_plane[0]
become: true
roles:
- ccm-deploy
- kube-vip-deploy
- name: Setup secondary control planes
hosts: control_plane[1:]
@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-cloud-controller-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-cloud-controller-manager
version: 1.30.1
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
selectorLabels:
app: hcloud-cloud-controller-manager
args:
secure-port: "0"
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud
@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-ccm.yaml
@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-csi
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-csi
version: 2.20.0
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
controller:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
hcloudVolumeDefaultLocation: nbg1
storageClasses:
- name: hcloud-volumes
defaultStorageClass: true
reclaimPolicy: Delete
@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-csi.yaml
@@ -1,17 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-csi
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/csi
dependsOn:
- name: addon-ccm
wait: true
timeout: 10m
suspend: false
@@ -1,7 +1,7 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-ccm
name: addon-nfs-storage
namespace: flux-system
spec:
interval: 10m
@@ -9,7 +9,7 @@ spec:
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/ccm
path: ./infrastructure/addons/nfs-storage
wait: true
timeout: 10m
suspend: false
+1 -2
View File
@@ -1,8 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- kustomization-ccm.yaml
- kustomization-csi.yaml
- kustomization-nfs-storage.yaml
- kustomization-external-secrets.yaml
- kustomization-cert-manager.yaml
- kustomization-tailscale-operator.yaml
@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: nfs-subdir-external-provisioner
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: nfs-subdir-external-provisioner
version: 4.0.18
sourceRef:
kind: HelmRepository
name: nfs-subdir-external-provisioner
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
nfs:
server: 10.27.27.22
path: /TheFlash/k8s-nfs
storageClass:
create: true
defaultClass: true
name: flash-nfs
provisionerName: flash-nfs
reclaimPolicy: Delete
archiveOnDelete: true
allowVolumeExpansion: true
volumeBindingMode: Immediate
@@ -1,8 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
name: nfs-subdir-external-provisioner
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud
url: https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-nfs-subdir-external-provisioner.yaml
- helmrelease-nfs-subdir-external-provisioner.yaml
+2 -1
View File
@@ -24,10 +24,11 @@ echo "Fetching kubeconfig from $CP1_PUBLIC_IP ..."
ssh -i "$SSH_KEY" \
-o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \
"root@$CP1_PUBLIC_IP" "cat /etc/rancher/k3s/k3s.yaml" \
"ubuntu@$CP1_PUBLIC_IP" "sudo cat /etc/rancher/k3s/k3s.yaml" \
| sed "s/127.0.0.1/$CP1_PUBLIC_IP/g" \
> "$KUBECONFIG_PATH"
chmod 600 "$KUBECONFIG_PATH"
echo "Kubeconfig saved to $KUBECONFIG_PATH"
echo "Run: export KUBECONFIG=$KUBECONFIG_PATH"
+19 -15
View File
@@ -1,29 +1,33 @@
hcloud_token = "your-hetzner-cloud-api-token-here"
proxmox_endpoint = "https://100.105.0.115:8006/"
proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
proxmox_api_token_secret = "your-proxmox-api-token-secret"
ssh_public_key = "~/.ssh/hetzner_k8s.pub"
ssh_private_key = "~/.ssh/hetzner_k8s"
ssh_public_key = "~/.ssh/infra.pub"
ssh_private_key = "~/.ssh/infra"
s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key"
s3_endpoint = "https://s3.eu-central-003.backblazeb2.com"
s3_bucket = "k8s-terraform-state"
cluster_name = "k8s-prod"
cluster_name = "k8s-cluster"
tailscale_tailnet = "yourtailnet.ts.net"
restrict_api_ssh_to_tailnet = true
tailnet_cidr = "100.64.0.0/10"
enable_nodeport_public = false
kube_api_vip = "10.27.27.40"
control_plane_count = 3
control_plane_type = "cx23"
control_plane_ips = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
control_plane_vm_ids = [200, 201, 202]
worker_count = 4
worker_type = "cx33"
worker_count = 5
worker_ips = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
worker_vm_ids = [210, 211, 212, 213, 214]
location = "nbg1"
allowed_ssh_ips = []
allowed_api_ips = []
proxmox_node_name = "flex"
proxmox_template_vm_id = 9000
proxmox_vm_storage_pool = "Flash"
proxmox_cloud_init_storage_pool = "Flash"
proxmox_bridge = "vmbr0"
proxmox_gateway = "10.27.27.1"
proxmox_dns_servers = ["1.1.1.1", "8.8.8.8"]
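A typical local flow against this tfvars file, assuming the same B2 backend the CI workflows configure (the `-backend-config` values mirror the flags used in the workflows; export the keys before running):

```bash
# Initialize against the Backblaze B2 state backend, then plan and apply.
terraform -chdir=terraform init \
  -backend-config="endpoint=https://s3.eu-central-003.backblazeb2.com" \
  -backend-config="bucket=k8s-terraform-state" \
  -backend-config="region=auto" \
  -backend-config="access_key=$S3_ACCESS_KEY" \
  -backend-config="secret_key=$S3_SECRET_KEY" \
  -backend-config="skip_requesting_account_id=true"
terraform -chdir=terraform plan  -var-file=../terraform.tfvars
terraform -chdir=terraform apply -var-file=../terraform.tfvars
```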
-118
View File
@@ -1,118 +0,0 @@
locals {
ssh_source_ips = var.restrict_api_ssh_to_tailnet ? concat([var.tailnet_cidr], var.allowed_ssh_ips) : var.allowed_ssh_ips
api_source_ips = var.restrict_api_ssh_to_tailnet ? concat([var.tailnet_cidr], var.allowed_api_ips) : var.allowed_api_ips
}
resource "hcloud_firewall" "cluster" {
name = "${var.cluster_name}-firewall"
rule {
description = "SSH"
direction = "in"
protocol = "tcp"
port = "22"
source_ips = local.ssh_source_ips
}
rule {
description = "Kubernetes API"
direction = "in"
protocol = "tcp"
port = "6443"
source_ips = local.api_source_ips
}
rule {
description = "Tailscale WireGuard"
direction = "in"
protocol = "udp"
port = "41641"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "Kubernetes API (internal)"
direction = "in"
protocol = "tcp"
port = "6443"
source_ips = [var.subnet_cidr]
}
rule {
description = "k3s Supervisor"
direction = "in"
protocol = "tcp"
port = "9345"
source_ips = [var.subnet_cidr]
}
rule {
description = "etcd Client"
direction = "in"
protocol = "tcp"
port = "2379"
source_ips = [var.subnet_cidr]
}
rule {
description = "etcd Peer"
direction = "in"
protocol = "tcp"
port = "2380"
source_ips = [var.subnet_cidr]
}
rule {
description = "Flannel VXLAN"
direction = "in"
protocol = "udp"
port = "8472"
source_ips = [var.subnet_cidr]
}
rule {
description = "Kubelet"
direction = "in"
protocol = "tcp"
port = "10250"
source_ips = [var.subnet_cidr]
}
dynamic "rule" {
for_each = var.enable_nodeport_public ? [1] : []
content {
description = "NodePorts"
direction = "in"
protocol = "tcp"
port = "30000-32767"
source_ips = ["0.0.0.0/0"]
}
}
rule {
description = "HTTP from Load Balancer"
direction = "in"
protocol = "tcp"
port = "80"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "HTTPS from Load Balancer"
direction = "in"
protocol = "tcp"
port = "443"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "ICMP"
direction = "in"
protocol = "icmp"
source_ips = ["0.0.0.0/0"]
}
apply_to {
label_selector = "cluster=${var.cluster_name}"
}
}
-50
View File
@@ -1,50 +0,0 @@
# Load Balancer for Kubernetes API High Availability
# Provides a single endpoint for all control planes
resource "hcloud_load_balancer" "kube_api" {
name = "${var.cluster_name}-api"
load_balancer_type = "lb11" # Cheapest tier: €5.39/month
location = var.location
labels = {
cluster = var.cluster_name
role = "kube-api"
}
}
# Attach Load Balancer to private network (required for use_private_ip)
resource "hcloud_load_balancer_network" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 5) # 10.0.1.5
}
# Attach all control plane servers as targets
resource "hcloud_load_balancer_target" "kube_api_targets" {
count = var.control_plane_count
type = "server"
load_balancer_id = hcloud_load_balancer.kube_api.id
server_id = hcloud_server.control_plane[count.index].id
use_private_ip = true
depends_on = [hcloud_load_balancer_network.kube_api, hcloud_server.control_plane]
}
# Kubernetes API service on port 6443
resource "hcloud_load_balancer_service" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
protocol = "tcp"
listen_port = 6443
destination_port = 6443
health_check {
protocol = "tcp"
port = 6443
interval = 15
timeout = 10
retries = 3
}
}
# Firewall rule to allow LB access to control planes on 6443
# This is added to the existing cluster firewall
+12 -5
View File
@@ -2,13 +2,20 @@ terraform {
required_version = ">= 1.0"
required_providers {
hcloud = {
source = "hetznercloud/hcloud"
version = "~> 1.45"
local = {
source = "hashicorp/local"
version = "~> 2.5"
}
proxmox = {
source = "bpg/proxmox"
version = ">= 0.60.0"
}
}
}
provider "hcloud" {
token = var.hcloud_token
provider "proxmox" {
endpoint = var.proxmox_endpoint
api_token = "${var.proxmox_api_token_id}=${var.proxmox_api_token_secret}"
insecure = var.proxmox_insecure
}
-11
View File
@@ -1,11 +0,0 @@
resource "hcloud_network" "cluster" {
name = "${var.cluster_name}-network"
ip_range = var.network_cidr
}
resource "hcloud_network_subnet" "servers" {
network_id = hcloud_network.cluster.id
type = "cloud"
network_zone = "eu-central"
ip_range = var.subnet_cidr
}
+9 -15
View File
@@ -1,42 +1,36 @@
output "control_plane_ips" {
description = "Public IPs of control plane nodes"
value = [for cp in hcloud_server.control_plane : cp.ipv4_address]
value = var.control_plane_ips
}
output "control_plane_names" {
description = "Control plane hostnames"
value = [for cp in hcloud_server.control_plane : cp.name]
value = [for idx in range(var.control_plane_count) : format("%s-cp-%d", var.cluster_name, idx + 1)]
}
output "control_plane_private_ips" {
description = "Private IPs of control plane nodes"
value = [
for idx, cp in hcloud_server.control_plane :
try(one(cp.network).ip, cidrhost(var.subnet_cidr, 10 + idx))
]
value = var.control_plane_ips
}
output "primary_control_plane_ip" {
description = "Public IP of the primary control plane (first node)"
value = hcloud_server.control_plane[0].ipv4_address
value = var.control_plane_ips[0]
}
output "worker_ips" {
description = "Public IPs of worker nodes"
value = [for worker in hcloud_server.workers : worker.ipv4_address]
value = var.worker_ips
}
output "worker_names" {
description = "Worker hostnames"
value = [for worker in hcloud_server.workers : worker.name]
value = [for idx in range(var.worker_count) : format("%s-worker-%d", var.cluster_name, idx + 1)]
}
output "worker_private_ips" {
description = "Private IPs of worker nodes"
value = [
for idx, worker in hcloud_server.workers :
try(one(worker.network).ip, cidrhost(var.subnet_cidr, 20 + idx))
]
value = var.worker_ips
}
output "ssh_private_key_path" {
@@ -61,10 +55,10 @@ output "network_cidr" {
output "kubeconfig_command" {
description = "Command to fetch kubeconfig"
value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig"
value = "ssh ubuntu@${var.control_plane_ips[0]} 'sudo cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${var.control_plane_ips[0]}/g' kubeconfig"
}
output "kube_api_lb_ip" {
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
value = hcloud_load_balancer_network.kube_api.ip
value = var.kube_api_vip
}
+111 -50
View File
@@ -1,60 +1,121 @@
data "hcloud_image" "ubuntu" {
name = "ubuntu-24.04"
with_status = ["available"]
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
resource "hcloud_server" "control_plane" {
count = var.control_plane_count
locals {
subnet_prefix = split("/", var.subnet_cidr)[1]
name = "${var.cluster_name}-cp-${count.index + 1}"
server_type = var.control_plane_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
control_planes = {
for idx in range(var.control_plane_count) :
format("%s-cp-%d", var.cluster_name, idx + 1) => {
role = "control-plane"
vm_id = var.control_plane_vm_ids[idx]
ip = var.control_plane_ips[idx]
cpu = var.control_plane_cores
memory_mb = var.control_plane_memory_mb
disk_gb = var.control_plane_disk_gb
startup = 1
}
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 10 + count.index)
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
}
firewall_ids = [hcloud_firewall.cluster.id]
}
resource "hcloud_server" "workers" {
count = var.worker_count
name = "${var.cluster_name}-worker-${count.index + 1}"
server_type = var.worker_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
workers = {
for idx in range(var.worker_count) :
format("%s-worker-%d", var.cluster_name, idx + 1) => {
role = "worker"
vm_id = var.worker_vm_ids[idx]
ip = var.worker_ips[idx]
cpu = var.worker_cores
memory_mb = var.worker_memory_mb
disk_gb = var.worker_disk_gb
startup = 2
}
}
network {
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 20 + count.index)
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
}
firewall_ids = [hcloud_firewall.cluster.id]
depends_on = [hcloud_server.control_plane]
nodes = merge(local.control_planes, local.workers)
}
resource "proxmox_virtual_environment_vm" "nodes" {
for_each = local.nodes
name = each.key
description = "Managed by Terraform for ${var.cluster_name}"
tags = ["terraform", var.cluster_name, each.value.role]
node_name = var.proxmox_node_name
vm_id = each.value.vm_id
on_boot = true
started = true
stop_on_destroy = true
reboot_after_update = true
timeout_clone = 1800
timeout_create = 1800
timeout_shutdown_vm = 300
timeout_start_vm = 300
scsi_hardware = "virtio-scsi-single"
clone {
vm_id = var.proxmox_template_vm_id
datastore_id = var.proxmox_vm_storage_pool
full = var.proxmox_clone_full
retries = 3
}
agent {
enabled = true
trim = true
}
cpu {
cores = each.value.cpu
type = "x86-64-v2-AES"
}
memory {
dedicated = each.value.memory_mb
floating = each.value.memory_mb
}
startup {
order = tostring(each.value.startup)
up_delay = "20"
down_delay = "20"
}
disk {
datastore_id = var.proxmox_vm_storage_pool
interface = "scsi0"
size = each.value.disk_gb
discard = "on"
iothread = true
ssd = true
}
initialization {
datastore_id = var.proxmox_cloud_init_storage_pool
dns {
servers = var.proxmox_dns_servers
}
ip_config {
ipv4 {
address = "${each.value.ip}/${local.subnet_prefix}"
gateway = var.proxmox_gateway
}
}
user_account {
username = var.proxmox_ssh_username
keys = [trimspace(data.local_file.ssh_public_key.content)]
}
}
network_device {
bridge = var.proxmox_bridge
model = "virtio"
}
operating_system {
type = "l26"
}
}
-7
View File
@@ -1,7 +0,0 @@
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
data "hcloud_ssh_key" "cluster" {
name = "infra"
}
+142 -22
View File
@@ -1,19 +1,13 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token"
type = string
sensitive = true
}
variable "ssh_public_key" {
description = "Path to SSH public key"
type = string
default = "~/.ssh/id_ed25519.pub"
default = "~/.ssh/infra.pub"
}
variable "ssh_private_key" {
description = "Path to SSH private key"
type = string
default = "~/.ssh/id_ed25519"
default = "~/.ssh/infra"
}
variable "cluster_name" {
@@ -28,28 +22,112 @@ variable "control_plane_count" {
default = 3
}
variable "control_plane_type" {
description = "Hetzner server type for control plane"
type = string
default = "cx23"
variable "control_plane_cores" {
description = "vCPU count for control plane VMs"
type = number
default = 2
}
variable "control_plane_memory_mb" {
description = "Dedicated memory for control plane VMs in MiB"
type = number
default = 4096
}
variable "control_plane_disk_gb" {
description = "Disk size for control plane VMs in GiB"
type = number
default = 32
}
variable "worker_count" {
description = "Number of worker nodes"
type = number
default = 3
default = 5
}
variable "worker_type" {
description = "Hetzner server type for workers"
type = string
default = "cx33"
variable "worker_cores" {
description = "vCPU count for worker VMs"
type = number
default = 4
}
variable "location" {
description = "Hetzner datacenter location"
variable "worker_memory_mb" {
description = "Dedicated memory for worker VMs in MiB"
type = number
default = 8192
}
variable "worker_disk_gb" {
description = "Disk size for worker VMs in GiB"
type = number
default = 64
}
variable "proxmox_endpoint" {
description = "Proxmox API endpoint without /api2/json suffix"
type = string
default = "nbg1"
default = "https://100.105.0.115:8006/"
}
variable "proxmox_api_token_id" {
description = "Proxmox API token ID"
type = string
sensitive = true
}
variable "proxmox_api_token_secret" {
description = "Proxmox API token secret"
type = string
sensitive = true
}
variable "proxmox_insecure" {
description = "Skip TLS verification for the Proxmox API"
type = bool
default = true
}
variable "proxmox_node_name" {
description = "Fixed Proxmox node name for all cluster VMs"
type = string
default = "flex"
}
variable "proxmox_template_vm_id" {
description = "Template VM ID used for linked clones"
type = number
default = 9000
}
variable "proxmox_clone_full" {
description = "Whether to use full clones instead of linked clones"
type = bool
default = false
}
variable "proxmox_vm_storage_pool" {
description = "Proxmox datastore for VM disks"
type = string
default = "Flash"
}
variable "proxmox_cloud_init_storage_pool" {
description = "Proxmox datastore for cloud-init disks"
type = string
default = "Flash"
}
variable "proxmox_bridge" {
description = "Proxmox bridge for cluster VM interfaces"
type = string
default = "vmbr0"
}
variable "proxmox_ssh_username" {
description = "Cloud-init user injected into cloned VMs"
type = string
default = "ubuntu"
}
variable "allowed_ssh_ips" {
@@ -90,13 +168,55 @@ variable "enable_nodeport_public" {
variable "network_cidr" {
description = "CIDR for private network"
type = string
default = "10.0.0.0/16"
default = "10.27.27.0/24"
}
variable "subnet_cidr" {
description = "CIDR for server subnet"
type = string
default = "10.0.1.0/24"
default = "10.27.27.0/24"
}
variable "proxmox_gateway" {
description = "Gateway for cluster VM networking"
type = string
default = "10.27.27.1"
}
variable "proxmox_dns_servers" {
description = "DNS servers configured through cloud-init"
type = list(string)
default = ["1.1.1.1", "8.8.8.8"]
}
variable "control_plane_ips" {
description = "Static IPv4 addresses for control plane VMs"
type = list(string)
default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
}
variable "worker_ips" {
description = "Static IPv4 addresses for worker VMs"
type = list(string)
default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
}
variable "control_plane_vm_ids" {
description = "Fixed VMIDs for control plane VMs"
type = list(number)
default = [200, 201, 202]
}
variable "worker_vm_ids" {
description = "Fixed VMIDs for worker VMs"
type = list(number)
default = [210, 211, 212, 213, 214]
}
variable "kube_api_vip" {
description = "Virtual IP advertised by kube-vip for the Kubernetes API"
type = string
default = "10.27.27.40"
}
variable "s3_access_key" {