feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s

Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox
VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap,
Flux addons, CI workflows, and docs to target the new private Proxmox
baseline while preserving the existing Tailscale, Doppler, Flux, Rancher,
and B2 backup flows.
This commit is contained in:
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
+4 -20
View File
@@ -12,12 +12,15 @@ on:
env: env:
TF_VERSION: "1.7.0" TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs: jobs:
dashboards: dashboards:
@@ -51,25 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true" -backend-config="skip_requesting_account_id=true"
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-refresh=false \
-target=hcloud_firewall.cluster \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies - name: Install Python Dependencies
run: | run: |
apt-get update && apt-get install -y python3-pip apt-get update && apt-get install -y python3-pip
+7 -62
View File
@@ -11,12 +11,15 @@ on:
env: env:
TF_VERSION: "1.7.0" TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }} TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }} TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
@@ -60,40 +63,6 @@ jobs:
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Import existing servers into state (if missing)
working-directory: terraform
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set -e
ensure_import() {
address="$1"
name="$2"
if terraform state show "$address" >/dev/null 2>&1; then
echo "$address already in state"
return
fi
id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty')
if [ -n "$id" ]; then
echo "Importing $address from server $name ($id)"
terraform import "$address" "$id"
else
echo "No existing server found for $name; skipping import"
fi
}
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
- name: Terraform Plan - name: Terraform Plan
id: plan id: plan
working-directory: terraform working-directory: terraform
@@ -187,32 +156,11 @@ jobs:
mkdir -p ../outputs mkdir -p ../outputs
terraform output -json > ../outputs/terraform_outputs.json terraform output -json > ../outputs/terraform_outputs.json
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-target=hcloud_firewall.cluster \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies - name: Install Python Dependencies
run: | run: |
apt-get update && apt-get install -y python3-pip apt-get update && apt-get install -y python3-pip
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
- name: Note runner connectivity mode
run: |
echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API"
- name: Install Ansible Collections - name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml run: ansible-galaxy collection install -r ansible/requirements.yml
@@ -224,7 +172,6 @@ jobs:
working-directory: ansible working-directory: ansible
run: | run: |
ansible-playbook site.yml \ ansible-playbook site.yml \
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \ -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \ -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \ -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
@@ -294,9 +241,8 @@ jobs:
key: dopplerToken key: dopplerToken
namespace: external-secrets namespace: external-secrets
EOF EOF
# Wait for CCM and CSI (Hetzner cloud integration) # Wait for the storage layer and private access components
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
- name: Wait for Rancher and backup operator - name: Wait for Rancher and backup operator
@@ -397,10 +343,9 @@ jobs:
working-directory: ansible working-directory: ansible
run: | run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env: env:
+13 -123
View File
@@ -10,107 +10,22 @@ on:
env: env:
TF_VERSION: "1.7.0" TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }} TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }} TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs: jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNP is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNP namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup job to complete..."
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
kubectl logs job/pgdump-manual -n cnpg-cluster || true
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy: destroy:
name: Destroy Cluster name: Destroy Cluster
runs-on: ubuntu-latest runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy' if: github.event.inputs.confirm == 'destroy'
environment: destroy environment: destroy
needs: pre-destroy-backup
steps: steps:
- name: Checkout - name: Checkout
uses: actions/checkout@v4 uses: actions/checkout@v4
@@ -120,6 +35,14 @@ jobs:
with: with:
terraform_version: ${{ env.TF_VERSION }} terraform_version: ${{ env.TF_VERSION }}
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init - name: Terraform Init
working-directory: terraform working-directory: terraform
run: | run: |
@@ -131,19 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true" -backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Terraform Destroy - name: Terraform Destroy
id: destroy id: destroy
working-directory: terraform working-directory: terraform
@@ -152,7 +62,6 @@ jobs:
for attempt in 1 2 3; do for attempt in 1 2 3; do
echo "Terraform destroy attempt ${attempt}/3" echo "Terraform destroy attempt ${attempt}/3"
terraform destroy \ terraform destroy \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \ -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-auto-approve -auto-approve
@@ -164,32 +73,13 @@ jobs:
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s" echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
sleep 30 sleep 30
terraform refresh \ terraform refresh \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
fi fi
done done
exit "$rc" exit "$rc"
- name: Hetzner destroy diagnostics - name: Terraform state diagnostics
if: failure() && steps.destroy.outcome == 'failure' if: failure() && steps.destroy.outcome == 'failure'
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: | run: |
set +e
echo "== Terraform state list =="
terraform -chdir=terraform state list || true terraform -chdir=terraform state list || true
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
if [ -z "$network_id" ]; then
network_id="11988935"
fi
echo "== Hetzner network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
echo "== Hetzner servers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
echo "== Hetzner load balancers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
+7 -3
View File
@@ -9,7 +9,9 @@ Repository guide for OpenCode sessions in this repo.
## Current Baseline ## Current Baseline
- HA private cluster: 3 control planes, 3 workers. - HA private cluster: 3 control planes, 5 workers on Proxmox.
- Proxmox clones come from template `9000` on node `flex`; API VIP is `10.27.27.40` via kube-vip.
- Storage is `nfs-subdir-external-provisioner` backed by `10.27.27.22:/TheFlash/k8s-nfs` with StorageClass `flash-nfs`.
- Tailscale is the private access path for Rancher and shared services. - Tailscale is the private access path for Rancher and shared services.
- Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed. - Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed.
- `apps/` is suspended by default. - `apps/` is suspended by default.
@@ -20,8 +22,8 @@ Repository guide for OpenCode sessions in this repo.
- Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars` - Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars`
- Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml` - Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml`
- Flux/Kustomize: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize clusters/prod/flux-system` - Flux/Kustomize: `kubectl kustomize infrastructure/addons/<addon>`, `kubectl kustomize clusters/prod/flux-system`
- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-public-ip>` - Kubeconfig refresh: `scripts/refresh-kubeconfig.sh <cp1-ip>`
- Tailnet smoke check: `ssh root@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh` - Tailnet smoke check: `ssh ubuntu@<cp1-ip> 'bash -s' < scripts/smoke-check-tailnet-services.sh`
## Workflow Rules ## Workflow Rules
@@ -31,12 +33,14 @@ Repository guide for OpenCode sessions in this repo.
- CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks. - CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks.
- One object per Kubernetes YAML file; keep filenames kebab-case. - One object per Kubernetes YAML file; keep filenames kebab-case.
- If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP. - If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP.
- Bootstrap assumptions that matter: SSH user is `ubuntu`, NIC is `ens18`, API join endpoint is the kube-vip address.
## Repo-Specific Gotchas ## Repo-Specific Gotchas
- `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR. - `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR.
- Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted. - Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted.
- Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it. - Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it.
- The repo no longer uses a cloud controller manager. If you see `providerID` or Hetzner-specific logic, it is stale.
- Current private URLs: - Current private URLs:
- Rancher: `https://rancher.silverside-gopher.ts.net/` - Rancher: `https://rancher.silverside-gopher.ts.net/`
- Grafana: `http://grafana.silverside-gopher.ts.net/` - Grafana: `http://grafana.silverside-gopher.ts.net/`
+41 -56
View File
@@ -1,30 +1,28 @@
# Hetzner Kubernetes Cluster # Proxmox Kubernetes Cluster
Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible. Production-ready private Kubernetes cluster on Proxmox using Terraform, Ansible, and Flux.
## Architecture ## Architecture
| Component | Details | | Component | Details |
|-----------|---------| |-----------|---------|
| **Control Plane** | 3x CX23 (HA) | | **Control Plane** | 3x Proxmox VMs (2 vCPU / 4 GiB / 32 GiB) |
| **Workers** | 3x CX33 | | **Workers** | 5x Proxmox VMs (4 vCPU / 8 GiB / 64 GiB) |
| **K8s** | k3s (latest, HA) | | **K8s** | k3s (latest, HA) |
| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki | | **Addons** | NFS provisioner + Prometheus + Grafana + Loki + Rancher |
| **Access** | SSH/API and private services restricted to Tailnet | | **Access** | SSH/API and private services restricted to Tailnet |
| **Bootstrap** | Terraform + Ansible + Flux | | **Bootstrap** | Terraform + Ansible + Flux |
## Prerequisites ## Prerequisites
### 1. Hetzner Cloud API Token ### 1. Proxmox API Token
1. Go to [Hetzner Cloud Console](https://console.hetzner.com/) Create an API token for the Proxmox VE user used by Terraform. The repo expects the `bpg/proxmox` provider with:
2. Select your project (or create a new one)
3. Navigate to **Security** → **API Tokens** - endpoint: `https://100.105.0.115:8006/`
4. Click **Generate API Token** - node: `flex`
5. Set description: `k8s-cluster-terraform` - clone source: template `9000` (`ubuntu-2404-k8s-template`)
6. Select permissions: **Read & Write** - auth: API token
7. Click **Generate API Token**
8. **Copy the token immediately** - it won't be shown again!
### 2. Backblaze B2 Bucket (for Terraform State) ### 2. Backblaze B2 Bucket (for Terraform State)
@@ -44,7 +42,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible
### 3. SSH Key Pair ### 3. SSH Key Pair
```bash ```bash
ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra
``` ```
### 4. Local Tools ### 4. Local Tools
@@ -71,10 +69,12 @@ cp terraform.tfvars.example terraform.tfvars
Edit `terraform.tfvars`: Edit `terraform.tfvars`:
```hcl ```hcl
hcloud_token = "your-hetzner-api-token" proxmox_endpoint = "https://100.105.0.115:8006/"
proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
proxmox_api_token_secret = "your-proxmox-token-secret"
ssh_public_key = "~/.ssh/hetzner_k8s.pub" ssh_public_key = "~/.ssh/infra.pub"
ssh_private_key = "~/.ssh/hetzner_k8s" ssh_private_key = "~/.ssh/infra"
s3_access_key = "your-backblaze-key-id" s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key" s3_secret_key = "your-backblaze-application-key"
@@ -84,12 +84,7 @@ s3_bucket = "k8s-terraform-state"
tailscale_auth_key = "tskey-auth-..." tailscale_auth_key = "tskey-auth-..."
tailscale_tailnet = "yourtailnet.ts.net" tailscale_tailnet = "yourtailnet.ts.net"
restrict_api_ssh_to_tailnet = true kube_api_vip = "10.27.27.40"
tailnet_cidr = "100.64.0.0/10"
enable_nodeport_public = false
allowed_ssh_ips = []
allowed_api_ips = []
``` ```
### 3. Initialize Terraform ### 3. Initialize Terraform
@@ -152,7 +147,9 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| Secret | Description | | Secret | Description |
|--------|-------------| |--------|-------------|
| `HCLOUD_TOKEN` | Hetzner Cloud API token | | `PROXMOX_ENDPOINT` | Proxmox API endpoint (for example `https://100.105.0.115:8006/`) |
| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID |
| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret |
| `S3_ACCESS_KEY` | Backblaze B2 keyID | | `S3_ACCESS_KEY` | Backblaze B2 keyID |
| `S3_SECRET_KEY` | Backblaze B2 applicationKey | | `S3_SECRET_KEY` | Backblaze B2 applicationKey |
| `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) | | `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) |
@@ -163,7 +160,6 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → **
| `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator | | `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator |
| `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets | | `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets |
| `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) | | `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) |
| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets |
| `SSH_PUBLIC_KEY` | SSH public key content | | `SSH_PUBLIC_KEY` | SSH public key content |
| `SSH_PRIVATE_KEY` | SSH private key content | | `SSH_PRIVATE_KEY` | SSH private key content |
@@ -176,8 +172,8 @@ This repo uses Flux for continuous reconciliation after Terraform + Ansible boot
The current default target is the HA private baseline: The current default target is the HA private baseline:
- `3` control plane nodes - `3` control plane nodes
- `3` worker nodes - `5` worker nodes
- private Hetzner network only - private Proxmox network only
- Tailscale for operator and service access - Tailscale for operator and service access
- Flux-managed platform addons with `apps` suspended by default - Flux-managed platform addons with `apps` suspended by default
@@ -207,8 +203,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Reconciliation graph ### Reconciliation graph
- `infrastructure` (top-level) - `infrastructure` (top-level)
- `addon-ccm` - `addon-nfs-storage`
- `addon-csi` depends on `addon-ccm`
- `addon-tailscale-operator` - `addon-tailscale-operator`
- `addon-observability` - `addon-observability`
- `addon-observability-content` depends on `addon-observability` - `addon-observability-content` depends on `addon-observability`
@@ -224,7 +219,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Current addon status ### Current addon status
- Core infrastructure addons are Flux-managed from `infrastructure/addons/`. - Core infrastructure addons are Flux-managed from `infrastructure/addons/`.
- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`. - Active Flux addons for the current baseline: `addon-nfs-storage`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`.
- `apps` remains suspended until workload rollout is explicitly enabled. - `apps` remains suspended until workload rollout is explicitly enabled.
- Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization. - Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization.
- Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations. - Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations.
@@ -232,14 +227,14 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed
### Rancher access ### Rancher access
- Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`. - Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`.
- The public Hetzner load balancer path is not used for Rancher. - Rancher and the Kubernetes API stay private; kube-vip provides the API VIP on the LAN.
- Rancher stores state in embedded etcd; no external database is used. - Rancher stores state in embedded etcd; no external database is used.
### Stable baseline acceptance ### Stable baseline acceptance
A rebuild is considered successful only when all of the following pass without manual intervention: A rebuild is considered successful only when all of the following pass without manual intervention:
- Terraform create succeeds for the default `3` control planes and `3` workers. - Terraform create succeeds for the default `3` control planes and `5` workers.
- Ansible bootstrap succeeds end-to-end. - Ansible bootstrap succeeds end-to-end.
- All nodes become `Ready`. - All nodes become `Ready`.
- Flux core reconciliation is healthy. - Flux core reconciliation is healthy.
@@ -323,9 +318,6 @@ It avoids full cluster provisioning and only applies Grafana content resources:
├── terraform/ ├── terraform/
│ ├── main.tf │ ├── main.tf
│ ├── variables.tf │ ├── variables.tf
│ ├── network.tf
│ ├── firewall.tf
│ ├── ssh.tf
│ ├── servers.tf │ ├── servers.tf
│ ├── outputs.tf │ ├── outputs.tf
│ └── backend.tf │ └── backend.tf
@@ -353,17 +345,19 @@ It avoids full cluster provisioning and only applies Grafana content resources:
## Firewall Rules ## Firewall Rules
This repo no longer manages cloud firewalls. Access control is expected to be handled on your LAN infrastructure and through Tailscale.
Important cluster-local ports still in use:
| Port | Source | Purpose | | Port | Source | Purpose |
|------|--------|---------| |------|--------|---------|
| 22 | Tailnet CIDR | SSH | | 22 | Admin hosts / CI | SSH |
| 6443 | Tailnet CIDR + internal | Kubernetes API | | 6443 | 10.27.27.0/24 + VIP | Kubernetes API |
| 41641/udp | Any | Tailscale WireGuard | | 9345 | 10.27.27.0/24 | k3s Supervisor |
| 9345 | 10.0.0.0/16 | k3s Supervisor (HA join) | | 2379 | 10.27.27.0/24 | etcd Client |
| 2379 | 10.0.0.0/16 | etcd Client | | 2380 | 10.27.27.0/24 | etcd Peer |
| 2380 | 10.0.0.0/16 | etcd Peer | | 8472/udp | 10.27.27.0/24 | Flannel VXLAN |
| 8472 | 10.0.0.0/16 | Flannel VXLAN | | 10250 | 10.27.27.0/24 | Kubelet |
| 10250 | 10.0.0.0/16 | Kubelet |
| 30000-32767 | Optional | NodePorts (disabled by default) |
## Operations ## Operations
@@ -399,7 +393,7 @@ terraform destroy
### Check k3s Logs ### Check k3s Logs
```bash ```bash
ssh root@<control-plane-ip> journalctl -u k3s -f ssh ubuntu@<control-plane-ip> sudo journalctl -u k3s -f
``` ```
### Reset k3s ### Reset k3s
@@ -408,19 +402,10 @@ ssh root@<control-plane-ip> journalctl -u k3s -f
ansible-playbook site.yml -t reset ansible-playbook site.yml -t reset
``` ```
## Costs Breakdown
| Resource | Quantity | Unit Price | Monthly |
|----------|----------|------------|---------|
| CX23 (Control Plane) | 3 | €2.99 | €8.97 |
| CX33 (Workers) | 4 | €4.99 | €19.96 |
| Backblaze B2 | ~1 GB | Free (first 10GB) | €0.00 |
| **Total** | | | **€28.93/mo** |
## Security Notes ## Security Notes
- Control plane has HA (3 nodes, can survive 1 failure) - Control plane has HA (3 nodes, can survive 1 failure)
- Consider adding Hetzner load balancer for API server - Kubernetes API HA is provided by kube-vip on `10.27.27.40`
- Rotate API tokens regularly - Rotate API tokens regularly
- Use network policies in Kubernetes - Use network policies in Kubernetes
- Enable audit logging for production - Enable audit logging for production
+14 -7
View File
@@ -1,6 +1,6 @@
# Gitea Secrets Setup # Gitea Secrets Setup
This document describes the secrets required for the HetznerTerra deployment workflow. This document describes the secrets required for the Proxmox-based deployment workflow.
## Required Secrets ## Required Secrets
@@ -9,10 +9,17 @@ Add these secrets in your Gitea repository settings:
### Infrastructure Secrets ### Infrastructure Secrets
#### `HCLOUD_TOKEN` #### `PROXMOX_ENDPOINT`
- Hetzner Cloud API token - Proxmox VE API endpoint
- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens - Example: `https://100.105.0.115:8006/`
- Permissions: Read & Write
#### `PROXMOX_API_TOKEN_ID`
- Proxmox API token ID
- Example: `terraform-prov@pve!k8s-cluster`
#### `PROXMOX_API_TOKEN_SECRET`
- Proxmox API token secret
- Create with `pveum user token add terraform-prov@pve k8s-cluster`
#### `S3_ACCESS_KEY` & `S3_SECRET_KEY` #### `S3_ACCESS_KEY` & `S3_SECRET_KEY`
- Backblaze B2 credentials for Terraform state storage - Backblaze B2 credentials for Terraform state storage
@@ -31,7 +38,7 @@ Add these secrets in your Gitea repository settings:
#### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY` #### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY`
- SSH key pair for cluster access - SSH key pair for cluster access
- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s` - Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra`
- Private key content (include BEGIN/END lines) - Private key content (include BEGIN/END lines)
- Public key content (full line starting with ssh-ed25519) - Public key content (full line starting with ssh-ed25519)
@@ -90,4 +97,4 @@ Check the workflow logs to verify all secrets are being used correctly.
- Prefer Doppler for runtime app/platform secrets after cluster bootstrap - Prefer Doppler for runtime app/platform secrets after cluster bootstrap
- Rotate Tailscale auth keys periodically - Rotate Tailscale auth keys periodically
- Review OAuth client permissions regularly - Review OAuth client permissions regularly
- The workflow automatically opens SSH/API access only for the runner's IP during deployment - CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access
+12 -14
View File
@@ -5,9 +5,9 @@ This document defines the current engineering target for this repository.
## Topology ## Topology
- 3 control planes (HA etcd cluster) - 3 control planes (HA etcd cluster)
- 3 workers - 5 workers
- Hetzner Load Balancer for Kubernetes API - kube-vip API VIP (`10.27.27.40`)
- private Hetzner network - private Proxmox/LAN network (`10.27.27.0/24`)
- Tailscale operator access and service exposure - Tailscale operator access and service exposure
- Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`) - Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`)
- Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`) - Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`)
@@ -17,11 +17,10 @@ This document defines the current engineering target for this repository.
## In Scope ## In Scope
- Terraform infrastructure bootstrap - Terraform infrastructure bootstrap
- Ansible k3s bootstrap with external cloud provider - Ansible k3s bootstrap on Ubuntu cloud-init VMs
- **HA control plane (3 nodes with etcd quorum)** - **HA control plane (3 nodes with etcd quorum)**
- **Hetzner Load Balancer for Kubernetes API** - **kube-vip for Kubernetes API HA**
- **Hetzner CCM deployed via Ansible (before workers join)** - **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`**
- **Hetzner CSI for persistent volumes (via Flux)**
- Flux core reconciliation - Flux core reconciliation
- External Secrets Operator with Doppler - External Secrets Operator with Doppler
- Tailscale private access and smoke-check validation - Tailscale private access and smoke-check validation
@@ -45,15 +44,14 @@ This document defines the current engineering target for this repository.
## Phase Gates ## Phase Gates
1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB). 1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP).
2. Load Balancer is healthy with all 3 control plane targets. 2. Primary control plane bootstraps with `--cluster-init`.
3. Primary control plane bootstraps with `--cluster-init`. 3. kube-vip advertises `10.27.27.40:6443` from the control-plane set.
4. Secondary control planes join via Load Balancer endpoint. 4. Secondary control planes join via the kube-vip endpoint.
5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue). 5. Workers join successfully via the kube-vip endpoint.
6. Workers join successfully via Load Balancer and all nodes show proper `providerID`.
7. etcd reports 3 healthy members. 7. etcd reports 3 healthy members.
8. Flux source and infrastructure reconciliation are healthy. 8. Flux source and infrastructure reconciliation are healthy.
9. **CSI deploys and creates `hcloud-volumes` StorageClass**. 9. **NFS provisioner deploys and creates `flash-nfs` StorageClass**.
10. **PVC provisioning tested and working**. 10. **PVC provisioning tested and working**.
11. External Secrets sync required secrets. 11. External Secrets sync required secrets.
12. Tailscale private access works for Rancher, Grafana, and Prometheus. 12. Tailscale private access works for Rancher, Grafana, and Prometheus.
+1 -1
View File
@@ -13,7 +13,7 @@ control_plane
workers workers
[cluster:vars] [cluster:vars]
ansible_user=root ansible_user=ubuntu
ansible_python_interpreter=/usr/bin/python3 ansible_python_interpreter=/usr/bin/python3
ansible_ssh_private_key_file={{ private_key_file }} ansible_ssh_private_key_file={{ private_key_file }}
k3s_version=latest k3s_version=latest
@@ -1,14 +1,4 @@
--- ---
- name: Apply Hetzner cloud secret
shell: >-
kubectl -n kube-system create secret generic hcloud
--from-literal=token='{{ hcloud_token }}'
--from-literal=network='{{ cluster_name }}-network'
--dry-run=client -o yaml | kubectl apply -f -
changed_when: true
no_log: true
when: hcloud_token | default('') | length > 0
- name: Ensure Tailscale operator namespace exists - name: Ensure Tailscale operator namespace exists
command: >- command: >-
kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }} kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }}
-82
View File
@@ -1,82 +0,0 @@
---
- name: Check if hcloud secret exists
command: kubectl -n kube-system get secret hcloud
register: hcloud_secret_check
changed_when: false
failed_when: false
- name: Fail if hcloud secret is missing
fail:
msg: "hcloud secret not found in kube-system namespace. CCM requires it."
when: hcloud_secret_check.rc != 0
- name: Check if helm is installed
command: which helm
register: helm_check
changed_when: false
failed_when: false
- name: Install helm
when: helm_check.rc != 0
block:
- name: Download helm install script
get_url:
url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
dest: /tmp/get-helm-3.sh
mode: "0755"
- name: Run helm install script
command: /tmp/get-helm-3.sh
args:
creates: /usr/local/bin/helm
- name: Add Hetzner Helm repository
kubernetes.core.helm_repository:
name: hcloud
repo_url: https://charts.hetzner.cloud
kubeconfig: /etc/rancher/k3s/k3s.yaml
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Deploy Hetzner Cloud Controller Manager
kubernetes.core.helm:
name: hcloud-cloud-controller-manager
chart_ref: hcloud/hcloud-cloud-controller-manager
release_namespace: kube-system
create_namespace: true
values:
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: "{{ inventory_hostname }}"
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
kubeconfig: /etc/rancher/k3s/k3s.yaml
wait: true
wait_timeout: 300s
environment:
KUBECONFIG: /etc/rancher/k3s/k3s.yaml
- name: Wait for CCM to be ready
command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s
changed_when: false
register: ccm_rollout
until: ccm_rollout.rc == 0
retries: 3
delay: 10
- name: Pause to ensure CCM is fully ready to process new nodes
pause:
seconds: 10
- name: Verify CCM is removing uninitialized taints
command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}'
register: uninitialized_taints
changed_when: false
failed_when: false
- name: Display taint status
debug:
msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}"
+1
View File
@@ -19,6 +19,7 @@
- lsb-release - lsb-release
- software-properties-common - software-properties-common
- jq - jq
- nfs-common
- htop - htop
- vim - vim
state: present state: present
+2 -1
View File
@@ -3,4 +3,5 @@ k3s_version: latest
k3s_server_url: "" k3s_server_url: ""
k3s_token: "" k3s_token: ""
k3s_node_ip: "" k3s_node_ip: ""
k3s_kubelet_cloud_provider_external: true k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: ens18
+1 -1
View File
@@ -22,7 +22,7 @@
command: >- command: >-
/tmp/install-k3s.sh agent /tmp/install-k3s.sh agent
--node-ip {{ k3s_node_ip }} --node-ip {{ k3s_node_ip }}
--flannel-iface=enp7s0 --flannel-iface={{ k3s_flannel_iface }}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
args: args:
creates: /usr/local/bin/k3s-agent creates: /usr/local/bin/k3s-agent
+3 -2
View File
@@ -3,9 +3,10 @@ k3s_version: latest
k3s_token: "" k3s_token: ""
k3s_node_ip: "" k3s_node_ip: ""
k3s_primary_public_ip: "" k3s_primary_public_ip: ""
k3s_disable_embedded_ccm: true k3s_disable_embedded_ccm: false
k3s_disable_servicelb: true k3s_disable_servicelb: true
k3s_kubelet_cloud_provider_external: true k3s_kubelet_cloud_provider_external: false
k3s_flannel_iface: ens18
# Load Balancer endpoint for HA cluster joins (set in inventory) # Load Balancer endpoint for HA cluster joins (set in inventory)
kube_api_endpoint: "" kube_api_endpoint: ""
# Tailscale DNS names for control planes (to enable tailnet access) # Tailscale DNS names for control planes (to enable tailnet access)
+2 -2
View File
@@ -61,7 +61,7 @@
--cluster-init --cluster-init
--advertise-address={{ k3s_primary_ip }} --advertise-address={{ k3s_primary_ip }}
--node-ip={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0 --flannel-iface={{ k3s_flannel_iface }}
--tls-san={{ k3s_primary_ip }} --tls-san={{ k3s_primary_ip }}
--tls-san={{ k3s_primary_public_ip }} --tls-san={{ k3s_primary_public_ip }}
--tls-san={{ kube_api_endpoint }} --tls-san={{ kube_api_endpoint }}
@@ -87,7 +87,7 @@
--server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443 --server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443
--advertise-address={{ k3s_node_ip }} --advertise-address={{ k3s_node_ip }}
--node-ip={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }}
--flannel-iface=enp7s0 --flannel-iface={{ k3s_flannel_iface }}
{% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %} {% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %}
{% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %} {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %}
{% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %}
@@ -0,0 +1,4 @@
---
kube_vip_version: v1.1.2
kube_vip_interface: ens18
kube_vip_address: "{{ kube_api_endpoint }}"
@@ -0,0 +1,21 @@
---
- name: Render kube-vip control plane manifest
template:
src: kube-vip-control-plane.yaml.j2
dest: /tmp/kube-vip-control-plane.yaml
mode: "0644"
- name: Apply kube-vip control plane manifest
command: kubectl apply -f /tmp/kube-vip-control-plane.yaml
changed_when: true
- name: Wait for kube-vip DaemonSet rollout
command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=180s
changed_when: false
- name: Wait for API VIP on 6443
wait_for:
host: "{{ kube_vip_address }}"
port: 6443
state: started
timeout: 180
@@ -0,0 +1,110 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-vip
namespace: kube-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: system:kube-vip-role
rules:
- apiGroups: [""]
resources: ["services/status"]
verbs: ["update"]
- apiGroups: [""]
resources: ["services", "endpoints"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list", "get", "watch", "update", "patch"]
- apiGroups: ["coordination.k8s.io"]
resources: ["leases"]
verbs: ["list", "get", "watch", "update", "create"]
- apiGroups: ["discovery.k8s.io"]
resources: ["endpointslices"]
verbs: ["list", "get", "watch", "update"]
- apiGroups: [""]
resources: ["pods"]
verbs: ["list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: system:kube-vip-binding
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: system:kube-vip-role
subjects:
- kind: ServiceAccount
name: kube-vip
namespace: kube-system
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
name: kube-vip
namespace: kube-system
spec:
selector:
matchLabels:
app.kubernetes.io/name: kube-vip
template:
metadata:
labels:
app.kubernetes.io/name: kube-vip
spec:
serviceAccountName: kube-vip
hostNetwork: true
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: node-role.kubernetes.io/control-plane
operator: Exists
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
- key: node-role.kubernetes.io/master
operator: Exists
effect: NoSchedule
containers:
- name: kube-vip
image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }}
imagePullPolicy: IfNotPresent
args:
- manager
env:
- name: vip_arp
value: "true"
- name: port
value: "6443"
- name: vip_interface
value: {{ kube_vip_interface | quote }}
- name: vip_subnet
value: "32"
- name: cp_enable
value: "true"
- name: cp_namespace
value: kube-system
- name: vip_ddns
value: "false"
- name: vip_leaderelection
value: "true"
- name: vip_leaseduration
value: "5"
- name: vip_renewdeadline
value: "3"
- name: vip_retryperiod
value: "1"
- name: address
value: {{ kube_vip_address | quote }}
securityContext:
capabilities:
add:
- NET_ADMIN
- NET_RAW
- SYS_TIME
+2 -2
View File
@@ -57,12 +57,12 @@
roles: roles:
- addon-secrets-bootstrap - addon-secrets-bootstrap
- name: Deploy Hetzner CCM (required for workers with external cloud provider) - name: Deploy kube-vip for API HA
hosts: control_plane[0] hosts: control_plane[0]
become: true become: true
roles: roles:
- ccm-deploy - kube-vip-deploy
- name: Setup secondary control planes - name: Setup secondary control planes
hosts: control_plane[1:] hosts: control_plane[1:]
@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-cloud-controller-manager
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-cloud-controller-manager
version: 1.30.1
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
selectorLabels:
app: hcloud-cloud-controller-manager
args:
secure-port: "0"
networking:
enabled: true
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
additionalTolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
@@ -1,8 +0,0 @@
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
name: hcloud
namespace: flux-system
spec:
interval: 1h
url: https://charts.hetzner.cloud
@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-ccm.yaml
@@ -1,36 +0,0 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: hcloud-csi
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: hcloud-csi
version: 2.20.0
sourceRef:
kind: HelmRepository
name: hcloud
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
controller:
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
hcloudVolumeDefaultLocation: nbg1
storageClasses:
- name: hcloud-volumes
defaultStorageClass: true
reclaimPolicy: Delete
@@ -1,5 +0,0 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-hcloud.yaml
- helmrelease-hcloud-csi.yaml
@@ -1,17 +0,0 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization
metadata:
name: addon-csi
namespace: flux-system
spec:
interval: 10m
prune: true
sourceRef:
kind: GitRepository
name: platform
path: ./infrastructure/addons/csi
dependsOn:
- name: addon-ccm
wait: true
timeout: 10m
suspend: false
@@ -1,7 +1,7 @@
apiVersion: kustomize.toolkit.fluxcd.io/v1 apiVersion: kustomize.toolkit.fluxcd.io/v1
kind: Kustomization kind: Kustomization
metadata: metadata:
name: addon-ccm name: addon-nfs-storage
namespace: flux-system namespace: flux-system
spec: spec:
interval: 10m interval: 10m
@@ -9,7 +9,7 @@ spec:
sourceRef: sourceRef:
kind: GitRepository kind: GitRepository
name: platform name: platform
path: ./infrastructure/addons/ccm path: ./infrastructure/addons/nfs-storage
wait: true wait: true
timeout: 10m timeout: 10m
suspend: false suspend: false
+1 -2
View File
@@ -1,8 +1,7 @@
apiVersion: kustomize.config.k8s.io/v1beta1 apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization kind: Kustomization
resources: resources:
- kustomization-ccm.yaml - kustomization-nfs-storage.yaml
- kustomization-csi.yaml
- kustomization-external-secrets.yaml - kustomization-external-secrets.yaml
- kustomization-cert-manager.yaml - kustomization-cert-manager.yaml
- kustomization-tailscale-operator.yaml - kustomization-tailscale-operator.yaml
@@ -0,0 +1,36 @@
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
name: nfs-subdir-external-provisioner
namespace: flux-system
spec:
interval: 10m
targetNamespace: kube-system
chart:
spec:
chart: nfs-subdir-external-provisioner
version: 4.0.18
sourceRef:
kind: HelmRepository
name: nfs-subdir-external-provisioner
namespace: flux-system
install:
createNamespace: true
remediation:
retries: 3
upgrade:
remediation:
retries: 3
values:
nfs:
server: 10.27.27.22
path: /TheFlash/k8s-nfs
storageClass:
create: true
defaultClass: true
name: flash-nfs
provisionerName: flash-nfs
reclaimPolicy: Delete
archiveOnDelete: true
allowVolumeExpansion: true
volumeBindingMode: Immediate
@@ -1,8 +1,8 @@
apiVersion: source.toolkit.fluxcd.io/v1 apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository kind: HelmRepository
metadata: metadata:
name: hcloud name: nfs-subdir-external-provisioner
namespace: flux-system namespace: flux-system
spec: spec:
interval: 1h interval: 1h
url: https://charts.hetzner.cloud url: https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner
@@ -0,0 +1,5 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
- helmrepository-nfs-subdir-external-provisioner.yaml
- helmrelease-nfs-subdir-external-provisioner.yaml
+2 -1
View File
@@ -24,10 +24,11 @@ echo "Fetching kubeconfig from $CP1_PUBLIC_IP ..."
ssh -i "$SSH_KEY" \ ssh -i "$SSH_KEY" \
-o StrictHostKeyChecking=no \ -o StrictHostKeyChecking=no \
-o UserKnownHostsFile=/dev/null \ -o UserKnownHostsFile=/dev/null \
"root@$CP1_PUBLIC_IP" "cat /etc/rancher/k3s/k3s.yaml" \ "ubuntu@$CP1_PUBLIC_IP" "sudo cat /etc/rancher/k3s/k3s.yaml" \
| sed "s/127.0.0.1/$CP1_PUBLIC_IP/g" \ | sed "s/127.0.0.1/$CP1_PUBLIC_IP/g" \
> "$KUBECONFIG_PATH" > "$KUBECONFIG_PATH"
chmod 600 "$KUBECONFIG_PATH" chmod 600 "$KUBECONFIG_PATH"
echo "Kubeconfig saved to $KUBECONFIG_PATH" echo "Kubeconfig saved to $KUBECONFIG_PATH"
echo "Run: export KUBECONFIG=$KUBECONFIG_PATH" echo "Run: export KUBECONFIG=$KUBECONFIG_PATH"
+19 -15
View File
@@ -1,29 +1,33 @@
hcloud_token = "your-hetzner-cloud-api-token-here" proxmox_endpoint = "https://100.105.0.115:8006/"
proxmox_api_token_id = "terraform-prov@pve!k8s-cluster"
proxmox_api_token_secret = "your-proxmox-api-token-secret"
ssh_public_key = "~/.ssh/hetzner_k8s.pub" ssh_public_key = "~/.ssh/infra.pub"
ssh_private_key = "~/.ssh/hetzner_k8s" ssh_private_key = "~/.ssh/infra"
s3_access_key = "your-backblaze-key-id" s3_access_key = "your-backblaze-key-id"
s3_secret_key = "your-backblaze-application-key" s3_secret_key = "your-backblaze-application-key"
s3_endpoint = "https://s3.eu-central-003.backblazeb2.com" s3_endpoint = "https://s3.eu-central-003.backblazeb2.com"
s3_bucket = "k8s-terraform-state" s3_bucket = "k8s-terraform-state"
cluster_name = "k8s-prod" cluster_name = "k8s-cluster"
tailscale_tailnet = "yourtailnet.ts.net" tailscale_tailnet = "yourtailnet.ts.net"
restrict_api_ssh_to_tailnet = true kube_api_vip = "10.27.27.40"
tailnet_cidr = "100.64.0.0/10"
enable_nodeport_public = false
control_plane_count = 3 control_plane_count = 3
control_plane_type = "cx23" control_plane_ips = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
control_plane_vm_ids = [200, 201, 202]
worker_count = 4 worker_count = 5
worker_type = "cx33" worker_ips = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
worker_vm_ids = [210, 211, 212, 213, 214]
location = "nbg1" proxmox_node_name = "flex"
proxmox_template_vm_id = 9000
allowed_ssh_ips = [] proxmox_vm_storage_pool = "Flash"
proxmox_cloud_init_storage_pool = "Flash"
allowed_api_ips = [] proxmox_bridge = "vmbr0"
proxmox_gateway = "10.27.27.1"
proxmox_dns_servers = ["1.1.1.1", "8.8.8.8"]
-118
View File
@@ -1,118 +0,0 @@
locals {
ssh_source_ips = var.restrict_api_ssh_to_tailnet ? concat([var.tailnet_cidr], var.allowed_ssh_ips) : var.allowed_ssh_ips
api_source_ips = var.restrict_api_ssh_to_tailnet ? concat([var.tailnet_cidr], var.allowed_api_ips) : var.allowed_api_ips
}
resource "hcloud_firewall" "cluster" {
name = "${var.cluster_name}-firewall"
rule {
description = "SSH"
direction = "in"
protocol = "tcp"
port = "22"
source_ips = local.ssh_source_ips
}
rule {
description = "Kubernetes API"
direction = "in"
protocol = "tcp"
port = "6443"
source_ips = local.api_source_ips
}
rule {
description = "Tailscale WireGuard"
direction = "in"
protocol = "udp"
port = "41641"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "Kubernetes API (internal)"
direction = "in"
protocol = "tcp"
port = "6443"
source_ips = [var.subnet_cidr]
}
rule {
description = "k3s Supervisor"
direction = "in"
protocol = "tcp"
port = "9345"
source_ips = [var.subnet_cidr]
}
rule {
description = "etcd Client"
direction = "in"
protocol = "tcp"
port = "2379"
source_ips = [var.subnet_cidr]
}
rule {
description = "etcd Peer"
direction = "in"
protocol = "tcp"
port = "2380"
source_ips = [var.subnet_cidr]
}
rule {
description = "Flannel VXLAN"
direction = "in"
protocol = "udp"
port = "8472"
source_ips = [var.subnet_cidr]
}
rule {
description = "Kubelet"
direction = "in"
protocol = "tcp"
port = "10250"
source_ips = [var.subnet_cidr]
}
dynamic "rule" {
for_each = var.enable_nodeport_public ? [1] : []
content {
description = "NodePorts"
direction = "in"
protocol = "tcp"
port = "30000-32767"
source_ips = ["0.0.0.0/0"]
}
}
rule {
description = "HTTP from Load Balancer"
direction = "in"
protocol = "tcp"
port = "80"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "HTTPS from Load Balancer"
direction = "in"
protocol = "tcp"
port = "443"
source_ips = ["0.0.0.0/0"]
}
rule {
description = "ICMP"
direction = "in"
protocol = "icmp"
source_ips = ["0.0.0.0/0"]
}
apply_to {
label_selector = "cluster=${var.cluster_name}"
}
}
-50
View File
@@ -1,50 +0,0 @@
# Load Balancer for Kubernetes API High Availability
# Provides a single endpoint for all control planes
resource "hcloud_load_balancer" "kube_api" {
name = "${var.cluster_name}-api"
load_balancer_type = "lb11" # Cheapest tier: €5.39/month
location = var.location
labels = {
cluster = var.cluster_name
role = "kube-api"
}
}
# Attach Load Balancer to private network (required for use_private_ip)
resource "hcloud_load_balancer_network" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 5) # 10.0.1.5
}
# Attach all control plane servers as targets
resource "hcloud_load_balancer_target" "kube_api_targets" {
count = var.control_plane_count
type = "server"
load_balancer_id = hcloud_load_balancer.kube_api.id
server_id = hcloud_server.control_plane[count.index].id
use_private_ip = true
depends_on = [hcloud_load_balancer_network.kube_api, hcloud_server.control_plane]
}
# Kubernetes API service on port 6443
resource "hcloud_load_balancer_service" "kube_api" {
load_balancer_id = hcloud_load_balancer.kube_api.id
protocol = "tcp"
listen_port = 6443
destination_port = 6443
health_check {
protocol = "tcp"
port = 6443
interval = 15
timeout = 10
retries = 3
}
}
# Firewall rule to allow LB access to control planes on 6443
# This is added to the existing cluster firewall
+12 -5
View File
@@ -2,13 +2,20 @@ terraform {
required_version = ">= 1.0" required_version = ">= 1.0"
required_providers { required_providers {
hcloud = { local = {
source = "hetznercloud/hcloud" source = "hashicorp/local"
version = "~> 1.45" version = "~> 2.5"
}
proxmox = {
source = "bpg/proxmox"
version = ">= 0.60.0"
} }
} }
} }
provider "hcloud" { provider "proxmox" {
token = var.hcloud_token endpoint = var.proxmox_endpoint
api_token = "${var.proxmox_api_token_id}=${var.proxmox_api_token_secret}"
insecure = var.proxmox_insecure
} }
-11
View File
@@ -1,11 +0,0 @@
resource "hcloud_network" "cluster" {
name = "${var.cluster_name}-network"
ip_range = var.network_cidr
}
resource "hcloud_network_subnet" "servers" {
network_id = hcloud_network.cluster.id
type = "cloud"
network_zone = "eu-central"
ip_range = var.subnet_cidr
}
+9 -15
View File
@@ -1,42 +1,36 @@
output "control_plane_ips" { output "control_plane_ips" {
description = "Public IPs of control plane nodes" description = "Public IPs of control plane nodes"
value = [for cp in hcloud_server.control_plane : cp.ipv4_address] value = var.control_plane_ips
} }
output "control_plane_names" { output "control_plane_names" {
description = "Control plane hostnames" description = "Control plane hostnames"
value = [for cp in hcloud_server.control_plane : cp.name] value = [for idx in range(var.control_plane_count) : format("%s-cp-%d", var.cluster_name, idx + 1)]
} }
output "control_plane_private_ips" { output "control_plane_private_ips" {
description = "Private IPs of control plane nodes" description = "Private IPs of control plane nodes"
value = [ value = var.control_plane_ips
for idx, cp in hcloud_server.control_plane :
try(one(cp.network).ip, cidrhost(var.subnet_cidr, 10 + idx))
]
} }
output "primary_control_plane_ip" { output "primary_control_plane_ip" {
description = "Public IP of the primary control plane (first node)" description = "Public IP of the primary control plane (first node)"
value = hcloud_server.control_plane[0].ipv4_address value = var.control_plane_ips[0]
} }
output "worker_ips" { output "worker_ips" {
description = "Public IPs of worker nodes" description = "Public IPs of worker nodes"
value = [for worker in hcloud_server.workers : worker.ipv4_address] value = var.worker_ips
} }
output "worker_names" { output "worker_names" {
description = "Worker hostnames" description = "Worker hostnames"
value = [for worker in hcloud_server.workers : worker.name] value = [for idx in range(var.worker_count) : format("%s-worker-%d", var.cluster_name, idx + 1)]
} }
output "worker_private_ips" { output "worker_private_ips" {
description = "Private IPs of worker nodes" description = "Private IPs of worker nodes"
value = [ value = var.worker_ips
for idx, worker in hcloud_server.workers :
try(one(worker.network).ip, cidrhost(var.subnet_cidr, 20 + idx))
]
} }
output "ssh_private_key_path" { output "ssh_private_key_path" {
@@ -61,10 +55,10 @@ output "network_cidr" {
output "kubeconfig_command" { output "kubeconfig_command" {
description = "Command to fetch kubeconfig" description = "Command to fetch kubeconfig"
value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig" value = "ssh ubuntu@${var.control_plane_ips[0]} 'sudo cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${var.control_plane_ips[0]}/g' kubeconfig"
} }
output "kube_api_lb_ip" { output "kube_api_lb_ip" {
description = "Load Balancer private IP for Kubernetes API (used for cluster joins)" description = "Load Balancer private IP for Kubernetes API (used for cluster joins)"
value = hcloud_load_balancer_network.kube_api.ip value = var.kube_api_vip
} }
+107 -46
View File
@@ -1,60 +1,121 @@
data "hcloud_image" "ubuntu" { data "local_file" "ssh_public_key" {
name = "ubuntu-24.04" filename = pathexpand(var.ssh_public_key)
with_status = ["available"]
} }
resource "hcloud_server" "control_plane" { locals {
count = var.control_plane_count subnet_prefix = split("/", var.subnet_cidr)[1]
name = "${var.cluster_name}-cp-${count.index + 1}" control_planes = {
server_type = var.control_plane_type for idx in range(var.control_plane_count) :
image = data.hcloud_image.ubuntu.id format("%s-cp-%d", var.cluster_name, idx + 1) => {
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
role = "control-plane" role = "control-plane"
vm_id = var.control_plane_vm_ids[idx]
ip = var.control_plane_ips[idx]
cpu = var.control_plane_cores
memory_mb = var.control_plane_memory_mb
disk_gb = var.control_plane_disk_gb
startup = 1
}
} }
network { workers = {
network_id = hcloud_network.cluster.id for idx in range(var.worker_count) :
ip = cidrhost(var.subnet_cidr, 10 + count.index) format("%s-worker-%d", var.cluster_name, idx + 1) => {
}
public_net {
ipv4_enabled = true
ipv6_enabled = true
}
firewall_ids = [hcloud_firewall.cluster.id]
}
resource "hcloud_server" "workers" {
count = var.worker_count
name = "${var.cluster_name}-worker-${count.index + 1}"
server_type = var.worker_type
image = data.hcloud_image.ubuntu.id
location = var.location
ssh_keys = [data.hcloud_ssh_key.cluster.id]
labels = {
cluster = var.cluster_name
role = "worker" role = "worker"
vm_id = var.worker_vm_ids[idx]
ip = var.worker_ips[idx]
cpu = var.worker_cores
memory_mb = var.worker_memory_mb
disk_gb = var.worker_disk_gb
startup = 2
}
} }
network { nodes = merge(local.control_planes, local.workers)
network_id = hcloud_network.cluster.id
ip = cidrhost(var.subnet_cidr, 20 + count.index)
} }
public_net { resource "proxmox_virtual_environment_vm" "nodes" {
ipv4_enabled = true for_each = local.nodes
ipv6_enabled = true
name = each.key
description = "Managed by Terraform for ${var.cluster_name}"
tags = ["terraform", var.cluster_name, each.value.role]
node_name = var.proxmox_node_name
vm_id = each.value.vm_id
on_boot = true
started = true
stop_on_destroy = true
reboot_after_update = true
timeout_clone = 1800
timeout_create = 1800
timeout_shutdown_vm = 300
timeout_start_vm = 300
scsi_hardware = "virtio-scsi-single"
clone {
vm_id = var.proxmox_template_vm_id
datastore_id = var.proxmox_vm_storage_pool
full = var.proxmox_clone_full
retries = 3
} }
firewall_ids = [hcloud_firewall.cluster.id] agent {
enabled = true
depends_on = [hcloud_server.control_plane] trim = true
}
cpu {
cores = each.value.cpu
type = "x86-64-v2-AES"
}
memory {
dedicated = each.value.memory_mb
floating = each.value.memory_mb
}
startup {
order = tostring(each.value.startup)
up_delay = "20"
down_delay = "20"
}
disk {
datastore_id = var.proxmox_vm_storage_pool
interface = "scsi0"
size = each.value.disk_gb
discard = "on"
iothread = true
ssd = true
}
initialization {
datastore_id = var.proxmox_cloud_init_storage_pool
dns {
servers = var.proxmox_dns_servers
}
ip_config {
ipv4 {
address = "${each.value.ip}/${local.subnet_prefix}"
gateway = var.proxmox_gateway
}
}
user_account {
username = var.proxmox_ssh_username
keys = [trimspace(data.local_file.ssh_public_key.content)]
}
}
network_device {
bridge = var.proxmox_bridge
model = "virtio"
}
operating_system {
type = "l26"
}
} }
-7
View File
@@ -1,7 +0,0 @@
data "local_file" "ssh_public_key" {
filename = pathexpand(var.ssh_public_key)
}
data "hcloud_ssh_key" "cluster" {
name = "infra"
}
+142 -22
View File
@@ -1,19 +1,13 @@
variable "hcloud_token" {
description = "Hetzner Cloud API token"
type = string
sensitive = true
}
variable "ssh_public_key" { variable "ssh_public_key" {
description = "Path to SSH public key" description = "Path to SSH public key"
type = string type = string
default = "~/.ssh/id_ed25519.pub" default = "~/.ssh/infra.pub"
} }
variable "ssh_private_key" { variable "ssh_private_key" {
description = "Path to SSH private key" description = "Path to SSH private key"
type = string type = string
default = "~/.ssh/id_ed25519" default = "~/.ssh/infra"
} }
variable "cluster_name" { variable "cluster_name" {
@@ -28,28 +22,112 @@ variable "control_plane_count" {
default = 3 default = 3
} }
variable "control_plane_type" { variable "control_plane_cores" {
description = "Hetzner server type for control plane" description = "vCPU count for control plane VMs"
type = string type = number
default = "cx23" default = 2
}
variable "control_plane_memory_mb" {
description = "Dedicated memory for control plane VMs in MiB"
type = number
default = 4096
}
variable "control_plane_disk_gb" {
description = "Disk size for control plane VMs in GiB"
type = number
default = 32
} }
variable "worker_count" { variable "worker_count" {
description = "Number of worker nodes" description = "Number of worker nodes"
type = number type = number
default = 3 default = 5
} }
variable "worker_type" { variable "worker_cores" {
description = "Hetzner server type for workers" description = "vCPU count for worker VMs"
type = string type = number
default = "cx33" default = 4
} }
variable "location" { variable "worker_memory_mb" {
description = "Hetzner datacenter location" description = "Dedicated memory for worker VMs in MiB"
type = number
default = 8192
}
variable "worker_disk_gb" {
description = "Disk size for worker VMs in GiB"
type = number
default = 64
}
variable "proxmox_endpoint" {
description = "Proxmox API endpoint without /api2/json suffix"
type = string type = string
default = "nbg1" default = "https://100.105.0.115:8006/"
}
variable "proxmox_api_token_id" {
description = "Proxmox API token ID"
type = string
sensitive = true
}
variable "proxmox_api_token_secret" {
description = "Proxmox API token secret"
type = string
sensitive = true
}
variable "proxmox_insecure" {
description = "Skip TLS verification for the Proxmox API"
type = bool
default = true
}
variable "proxmox_node_name" {
description = "Fixed Proxmox node name for all cluster VMs"
type = string
default = "flex"
}
variable "proxmox_template_vm_id" {
description = "Template VM ID used for linked clones"
type = number
default = 9000
}
variable "proxmox_clone_full" {
description = "Whether to use full clones instead of linked clones"
type = bool
default = false
}
variable "proxmox_vm_storage_pool" {
description = "Proxmox datastore for VM disks"
type = string
default = "Flash"
}
variable "proxmox_cloud_init_storage_pool" {
description = "Proxmox datastore for cloud-init disks"
type = string
default = "Flash"
}
variable "proxmox_bridge" {
description = "Proxmox bridge for cluster VM interfaces"
type = string
default = "vmbr0"
}
variable "proxmox_ssh_username" {
description = "Cloud-init user injected into cloned VMs"
type = string
default = "ubuntu"
} }
variable "allowed_ssh_ips" { variable "allowed_ssh_ips" {
@@ -90,13 +168,55 @@ variable "enable_nodeport_public" {
variable "network_cidr" { variable "network_cidr" {
description = "CIDR for private network" description = "CIDR for private network"
type = string type = string
default = "10.0.0.0/16" default = "10.27.27.0/24"
} }
variable "subnet_cidr" { variable "subnet_cidr" {
description = "CIDR for server subnet" description = "CIDR for server subnet"
type = string type = string
default = "10.0.1.0/24" default = "10.27.27.0/24"
}
variable "proxmox_gateway" {
description = "Gateway for cluster VM networking"
type = string
default = "10.27.27.1"
}
variable "proxmox_dns_servers" {
description = "DNS servers configured through cloud-init"
type = list(string)
default = ["1.1.1.1", "8.8.8.8"]
}
variable "control_plane_ips" {
description = "Static IPv4 addresses for control plane VMs"
type = list(string)
default = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
}
variable "worker_ips" {
description = "Static IPv4 addresses for worker VMs"
type = list(string)
default = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
}
variable "control_plane_vm_ids" {
description = "Fixed VMIDs for control plane VMs"
type = list(number)
default = [200, 201, 202]
}
variable "worker_vm_ids" {
description = "Fixed VMIDs for worker VMs"
type = list(number)
default = [210, 211, 212, 213, 214]
}
variable "kube_api_vip" {
description = "Virtual IP advertised by kube-vip for the Kubernetes API"
type = string
default = "10.27.27.40"
} }
variable "s3_access_key" { variable "s3_access_key" {