diff --git a/.gitea/workflows/dashboards.yml b/.gitea/workflows/dashboards.yml index 5c23ea7..6cb14c4 100644 --- a/.gitea/workflows/dashboards.yml +++ b/.gitea/workflows/dashboards.yml @@ -12,12 +12,15 @@ on: env: TF_VERSION: "1.7.0" - TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} + TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }} + TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }} + TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }} + TF_VAR_proxmox_insecure: "true" jobs: dashboards: @@ -51,25 +54,6 @@ jobs: -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ -backend-config="skip_requesting_account_id=true" - - name: Detect runner egress IP - run: | - RUNNER_IP=$(curl -fsSL https://api.ipify.org) - echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV" - echo "Runner egress IP: ${RUNNER_IP}" - - - name: Open SSH/API for current runner CIDR - working-directory: terraform - run: | - terraform apply \ - -refresh=false \ - -target=hcloud_firewall.cluster \ - -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \ - -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ - -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ - -var="allowed_ssh_ips=${RUNNER_CIDR}" \ - -var="allowed_api_ips=${RUNNER_CIDR}" \ - -auto-approve - - name: Install Python Dependencies run: | apt-get update && apt-get install -y python3-pip diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index c220492..fdf8cba 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -11,12 +11,15 @@ on: env: TF_VERSION: "1.7.0" - TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} + TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }} + TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }} + TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }} + TF_VAR_proxmox_insecure: "true" TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }} TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }} @@ -60,40 +63,6 @@ jobs: echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub chmod 644 ~/.ssh/id_ed25519.pub - - name: Install jq - run: | - apt-get update - apt-get install -y jq - - - name: Import existing servers into state (if missing) - working-directory: terraform - env: - HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} - run: | - set -e - ensure_import() { - address="$1" - name="$2" - if terraform state show "$address" >/dev/null 2>&1; then - echo "$address already in state" - return - fi - id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty') - if [ -n "$id" ]; then - echo "Importing $address from server $name ($id)" - terraform import "$address" "$id" - else - echo "No existing server found for $name; skipping import" - fi - } - - ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1' - ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2' - ensure_import 'hcloud_server.control_plane[2]' 
'k8s-cluster-cp-3' - ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1' - ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2' - ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3' - - name: Terraform Plan id: plan working-directory: terraform @@ -187,32 +156,11 @@ jobs: mkdir -p ../outputs terraform output -json > ../outputs/terraform_outputs.json - - name: Detect runner egress IP - run: | - RUNNER_IP=$(curl -fsSL https://api.ipify.org) - echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV" - echo "Runner egress IP: ${RUNNER_IP}" - - - name: Open SSH/API for current runner CIDR - working-directory: terraform - run: | - terraform apply \ - -target=hcloud_firewall.cluster \ - -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ - -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ - -var="allowed_ssh_ips=${RUNNER_CIDR}" \ - -var="allowed_api_ips=${RUNNER_CIDR}" \ - -auto-approve - - name: Install Python Dependencies run: | apt-get update && apt-get install -y python3-pip pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml - - name: Note runner connectivity mode - run: | - echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API" - - name: Install Ansible Collections run: ansible-galaxy collection install -r ansible/requirements.yml @@ -224,7 +172,6 @@ jobs: working-directory: ansible run: | ansible-playbook site.yml \ - -e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \ -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \ -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \ -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \ @@ -294,9 +241,8 @@ jobs: key: dopplerToken namespace: external-secrets EOF - # Wait for CCM and CSI (Hetzner cloud integration) - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s + # Wait for the storage layer and private access components + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s - name: Wait for Rancher and backup operator @@ -397,10 +343,9 @@ jobs: working-directory: ansible run: | ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide" - ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass" + ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide" ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods" env: diff --git a/.gitea/workflows/destroy.yml b/.gitea/workflows/destroy.yml index fd44564..281a318 100644 --- a/.gitea/workflows/destroy.yml +++ b/.gitea/workflows/destroy.yml @@ -10,107 +10,22 @@ on: env: TF_VERSION: "1.7.0" - TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }} TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }} TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }} TF_VAR_s3_endpoint: 
${{ secrets.S3_ENDPOINT }} TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }} TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }} - B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }} - B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }} + TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }} + TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }} + TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }} + TF_VAR_proxmox_insecure: "true" jobs: - pre-destroy-backup: - name: Pre-Destroy Backup - runs-on: ubuntu-latest - if: github.event.inputs.confirm == 'destroy' - environment: destroy - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Setup Terraform - uses: hashicorp/setup-terraform@v3 - with: - terraform_version: ${{ env.TF_VERSION }} - - - name: Terraform Init - working-directory: terraform - run: | - terraform init \ - -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \ - -backend-config="bucket=${{ secrets.S3_BUCKET }}" \ - -backend-config="region=auto" \ - -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \ - -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ - -backend-config="skip_requesting_account_id=true" - - - name: Setup SSH Keys - run: | - mkdir -p ~/.ssh - echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519 - chmod 600 ~/.ssh/id_ed25519 - echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub - chmod 644 ~/.ssh/id_ed25519.pub - - - name: Get Control Plane IP - id: cp_ip - working-directory: terraform - run: | - PRIMARY_IP=$(terraform output -raw primary_control_plane_ip) - echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV" - - - name: Pre-Destroy pg_dump to B2 - run: | - set +e - echo "Attempting pre-destroy backup to B2..." - ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF' - set -e - # Check if kubectl is available and cluster is up - if ! command -v kubectl &> /dev/null; then - echo "kubectl not found, skipping pre-destroy backup" - exit 0 - fi - - # Check if we can reach the cluster - if ! kubectl cluster-info &> /dev/null; then - echo "Cannot reach cluster, skipping pre-destroy backup" - exit 0 - fi - - # Check if CNP is deployed - if ! kubectl get namespace cnpg-cluster &> /dev/null; then - echo "CNP namespace not found, skipping pre-destroy backup" - exit 0 - fi - - # Run backup using the pgdump image directly - BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz" - B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')" - B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')" - - if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then - echo "B2 credentials not found in secret, skipping pre-destroy backup" - exit 0 - fi - - kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \ - -n cnpg-cluster --dry-run=client -o yaml | \ - kubectl apply -f - - - echo "Waiting for backup job to complete..." 
- kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true - kubectl logs job/pgdump-manual -n cnpg-cluster || true - kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true - EOF - echo "Pre-destroy backup step completed (failure is non-fatal)" - destroy: name: Destroy Cluster runs-on: ubuntu-latest if: github.event.inputs.confirm == 'destroy' environment: destroy - needs: pre-destroy-backup steps: - name: Checkout uses: actions/checkout@v4 @@ -120,6 +35,14 @@ jobs: with: terraform_version: ${{ env.TF_VERSION }} + - name: Setup SSH Keys + run: | + mkdir -p ~/.ssh + echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519 + chmod 600 ~/.ssh/id_ed25519 + echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub + chmod 644 ~/.ssh/id_ed25519.pub + - name: Terraform Init working-directory: terraform run: | @@ -131,19 +54,6 @@ jobs: -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \ -backend-config="skip_requesting_account_id=true" - - name: Setup SSH Keys - run: | - mkdir -p ~/.ssh - echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519 - chmod 600 ~/.ssh/id_ed25519 - echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub - chmod 644 ~/.ssh/id_ed25519.pub - - - name: Install jq - run: | - apt-get update - apt-get install -y jq - - name: Terraform Destroy id: destroy working-directory: terraform @@ -152,7 +62,6 @@ jobs: for attempt in 1 2 3; do echo "Terraform destroy attempt ${attempt}/3" terraform destroy \ - -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_private_key=$HOME/.ssh/id_ed25519" \ -auto-approve @@ -164,32 +73,13 @@ jobs: echo "Terraform destroy failed with exit code ${rc}; retrying in 30s" sleep 30 terraform refresh \ - -var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \ -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \ -var="ssh_private_key=$HOME/.ssh/id_ed25519" || true fi done exit "$rc" - - name: Hetzner destroy diagnostics + - name: Terraform state diagnostics if: failure() && steps.destroy.outcome == 'failure' - env: - HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }} run: | - set +e - echo "== Terraform state list ==" terraform -chdir=terraform state list || true - - network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}') - if [ -z "$network_id" ]; then - network_id="11988935" - fi - - echo "== Hetzner network ==" - curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true - - echo "== Hetzner servers attached to network ==" - curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true - - echo "== Hetzner load balancers attached to network ==" - curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true diff --git a/AGENTS.md b/AGENTS.md index 92e1329..bd2d1c7 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -9,7 +9,9 @@ Repository guide for OpenCode sessions in this repo. ## Current Baseline -- HA private cluster: 3 control planes, 3 workers. +- HA private cluster: 3 control planes, 5 workers on Proxmox. 
+- Proxmox clones come from template `9000` on node `flex`; API VIP is `10.27.27.40` via kube-vip. +- Storage is `nfs-subdir-external-provisioner` backed by `10.27.27.22:/TheFlash/k8s-nfs` with StorageClass `flash-nfs`. - Tailscale is the private access path for Rancher and shared services. - Rancher, Grafana, and Prometheus are exposed through Tailscale; Flux UI / Weave GitOps is removed. - `apps/` is suspended by default. @@ -20,8 +22,8 @@ Repository guide for OpenCode sessions in this repo. - Terraform: `terraform -chdir=terraform fmt -recursive`, `terraform -chdir=terraform validate`, `terraform -chdir=terraform plan -var-file=../terraform.tfvars`, `terraform -chdir=terraform apply -var-file=../terraform.tfvars` - Ansible: `ansible-galaxy collection install -r ansible/requirements.yml`, `cd ansible && python3 generate_inventory.py`, `ansible-playbook -i ansible/inventory.ini ansible/site.yml --syntax-check`, `ansible-playbook ansible/site.yml` - Flux/Kustomize: `kubectl kustomize infrastructure/addons/`, `kubectl kustomize clusters/prod/flux-system` -- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh ` -- Tailnet smoke check: `ssh root@ 'bash -s' < scripts/smoke-check-tailnet-services.sh` +- Kubeconfig refresh: `scripts/refresh-kubeconfig.sh ` +- Tailnet smoke check: `ssh ubuntu@ 'bash -s' < scripts/smoke-check-tailnet-services.sh` ## Workflow Rules @@ -31,12 +33,14 @@ Repository guide for OpenCode sessions in this repo. - CI deploy order is Terraform -> Ansible -> Flux bootstrap -> Rancher restore -> health checks. - One object per Kubernetes YAML file; keep filenames kebab-case. - If `kubectl` points at `localhost:8080` after a rebuild, refresh kubeconfig from the primary control-plane IP. +- Bootstrap assumptions that matter: SSH user is `ubuntu`, NIC is `ens18`, API join endpoint is the kube-vip address. ## Repo-Specific Gotchas - `rancher-backup` uses a postRenderer to swap the broken hook image to `rancher/kubectl:v1.34.0`; do not put S3 config in HelmRelease values. Put it in the Backup CR. - Tailscale cleanup only runs before service proxies exist; it removes stale offline `rancher`/`grafana`/`prometheus`/`flux` devices, then must stop so live proxies are not deleted. - Keep the Tailscale operator on the stable Helm repo `https://pkgs.tailscale.com/helmcharts` at `1.96.5` unless you have a reason to change it. +- The repo no longer uses a cloud controller manager. If you see `providerID` or Hetzner-specific logic, it is stale. - Current private URLs: - Rancher: `https://rancher.silverside-gopher.ts.net/` - Grafana: `http://grafana.silverside-gopher.ts.net/` diff --git a/README.md b/README.md index 6d2431e..4da6083 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,28 @@ -# Hetzner Kubernetes Cluster +# Proxmox Kubernetes Cluster -Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible. +Production-ready private Kubernetes cluster on Proxmox using Terraform, Ansible, and Flux. ## Architecture | Component | Details | |-----------|---------| -| **Control Plane** | 3x CX23 (HA) | -| **Workers** | 3x CX33 | +| **Control Plane** | 3x Proxmox VMs (2 vCPU / 4 GiB / 32 GiB) | +| **Workers** | 5x Proxmox VMs (4 vCPU / 8 GiB / 64 GiB) | | **K8s** | k3s (latest, HA) | -| **Addons** | Hetzner CCM + CSI + Prometheus + Grafana + Loki | +| **Addons** | NFS provisioner + Prometheus + Grafana + Loki + Rancher | | **Access** | SSH/API and private services restricted to Tailnet | | **Bootstrap** | Terraform + Ansible + Flux | ## Prerequisites -### 1. 
Hetzner Cloud API Token +### 1. Proxmox API Token -1. Go to [Hetzner Cloud Console](https://console.hetzner.com/) -2. Select your project (or create a new one) -3. Navigate to **Security** → **API Tokens** -4. Click **Generate API Token** -5. Set description: `k8s-cluster-terraform` -6. Select permissions: **Read & Write** -7. Click **Generate API Token** -8. **Copy the token immediately** - it won't be shown again! +Create an API token for the Proxmox VE user used by Terraform. The repo expects the `bpg/proxmox` provider with: + +- endpoint: `https://100.105.0.115:8006/` +- node: `flex` +- clone source: template `9000` (`ubuntu-2404-k8s-template`) +- auth: API token ### 2. Backblaze B2 Bucket (for Terraform State) @@ -44,7 +42,7 @@ Production-ready Kubernetes cluster on Hetzner Cloud using Terraform and Ansible ### 3. SSH Key Pair ```bash -ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s +ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra ``` ### 4. Local Tools @@ -71,10 +69,12 @@ cp terraform.tfvars.example terraform.tfvars Edit `terraform.tfvars`: ```hcl -hcloud_token = "your-hetzner-api-token" + proxmox_endpoint = "https://100.105.0.115:8006/" + proxmox_api_token_id = "terraform-prov@pve!k8s-cluster" + proxmox_api_token_secret = "your-proxmox-token-secret" -ssh_public_key = "~/.ssh/hetzner_k8s.pub" -ssh_private_key = "~/.ssh/hetzner_k8s" + ssh_public_key = "~/.ssh/infra.pub" + ssh_private_key = "~/.ssh/infra" s3_access_key = "your-backblaze-key-id" s3_secret_key = "your-backblaze-application-key" @@ -84,12 +84,7 @@ s3_bucket = "k8s-terraform-state" tailscale_auth_key = "tskey-auth-..." tailscale_tailnet = "yourtailnet.ts.net" -restrict_api_ssh_to_tailnet = true -tailnet_cidr = "100.64.0.0/10" -enable_nodeport_public = false - -allowed_ssh_ips = [] -allowed_api_ips = [] + kube_api_vip = "10.27.27.40" ``` ### 3. 
Initialize Terraform @@ -152,7 +147,9 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → ** | Secret | Description | |--------|-------------| -| `HCLOUD_TOKEN` | Hetzner Cloud API token | +| `PROXMOX_ENDPOINT` | Proxmox API endpoint (for example `https://100.105.0.115:8006/`) | +| `PROXMOX_API_TOKEN_ID` | Proxmox API token ID | +| `PROXMOX_API_TOKEN_SECRET` | Proxmox API token secret | | `S3_ACCESS_KEY` | Backblaze B2 keyID | | `S3_SECRET_KEY` | Backblaze B2 applicationKey | | `S3_ENDPOINT` | Backblaze S3 endpoint (e.g., `https://s3.eu-central-003.backblazeb2.com`) | @@ -163,7 +160,6 @@ Set these in your Gitea repository settings (**Settings** → **Secrets** → ** | `TAILSCALE_OAUTH_CLIENT_SECRET` | Tailscale OAuth client secret for Kubernetes Operator | | `DOPPLER_HETZNERTERRA_SERVICE_TOKEN` | Doppler service token for `hetznerterra` runtime secrets | | `GRAFANA_ADMIN_PASSWORD` | Optional admin password for Grafana (auto-generated if unset) | -| `RUNNER_ALLOWED_CIDRS` | Optional CIDR list for CI runner access if you choose to pass it via tfvars/secrets | | `SSH_PUBLIC_KEY` | SSH public key content | | `SSH_PRIVATE_KEY` | SSH private key content | @@ -176,8 +172,8 @@ This repo uses Flux for continuous reconciliation after Terraform + Ansible boot The current default target is the HA private baseline: - `3` control plane nodes -- `3` worker nodes -- private Hetzner network only +- `5` worker nodes +- private Proxmox network only - Tailscale for operator and service access - Flux-managed platform addons with `apps` suspended by default @@ -207,8 +203,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed ### Reconciliation graph - `infrastructure` (top-level) - - `addon-ccm` - - `addon-csi` depends on `addon-ccm` + - `addon-nfs-storage` - `addon-tailscale-operator` - `addon-observability` - `addon-observability-content` depends on `addon-observability` @@ -224,7 +219,7 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed ### Current addon status - Core infrastructure addons are Flux-managed from `infrastructure/addons/`. -- Active Flux addons for the current baseline: `addon-ccm`, `addon-csi`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`. +- Active Flux addons for the current baseline: `addon-nfs-storage`, `addon-cert-manager`, `addon-external-secrets`, `addon-tailscale-operator`, `addon-tailscale-proxyclass`, `addon-observability`, `addon-observability-content`, `addon-rancher`, `addon-rancher-config`, `addon-rancher-backup`, `addon-rancher-backup-config`. - `apps` remains suspended until workload rollout is explicitly enabled. - Ansible is limited to cluster bootstrap, prerequisite secret creation, pre-proxy Tailscale cleanup, and kubeconfig finalization. - Weave GitOps / Flux UI is no longer deployed; use Rancher or the `flux` CLI for Flux operations. @@ -232,14 +227,14 @@ Terraform/bootstrap secrets remain in Gitea Actions secrets and are not managed ### Rancher access - Rancher is private-only and exposed through Tailscale at `https://rancher.silverside-gopher.ts.net/`. -- The public Hetzner load balancer path is not used for Rancher. +- Rancher and the Kubernetes API stay private; kube-vip provides the API VIP on the LAN. - Rancher stores state in embedded etcd; no external database is used. 
### Stable baseline acceptance A rebuild is considered successful only when all of the following pass without manual intervention: -- Terraform create succeeds for the default `3` control planes and `3` workers. +- Terraform create succeeds for the default `3` control planes and `5` workers. - Ansible bootstrap succeeds end-to-end. - All nodes become `Ready`. - Flux core reconciliation is healthy. @@ -323,9 +318,6 @@ It avoids full cluster provisioning and only applies Grafana content resources: ├── terraform/ │ ├── main.tf │ ├── variables.tf -│ ├── network.tf -│ ├── firewall.tf -│ ├── ssh.tf │ ├── servers.tf │ ├── outputs.tf │ └── backend.tf @@ -353,17 +345,19 @@ It avoids full cluster provisioning and only applies Grafana content resources: ## Firewall Rules +This repo no longer manages cloud firewalls. Access control is expected to be handled on your LAN infrastructure and through Tailscale. + +Important cluster-local ports still in use: + | Port | Source | Purpose | |------|--------|---------| -| 22 | Tailnet CIDR | SSH | -| 6443 | Tailnet CIDR + internal | Kubernetes API | -| 41641/udp | Any | Tailscale WireGuard | -| 9345 | 10.0.0.0/16 | k3s Supervisor (HA join) | -| 2379 | 10.0.0.0/16 | etcd Client | -| 2380 | 10.0.0.0/16 | etcd Peer | -| 8472 | 10.0.0.0/16 | Flannel VXLAN | -| 10250 | 10.0.0.0/16 | Kubelet | -| 30000-32767 | Optional | NodePorts (disabled by default) | +| 22 | Admin hosts / CI | SSH | +| 6443 | 10.27.27.0/24 + VIP | Kubernetes API | +| 9345 | 10.27.27.0/24 | k3s Supervisor | +| 2379 | 10.27.27.0/24 | etcd Client | +| 2380 | 10.27.27.0/24 | etcd Peer | +| 8472/udp | 10.27.27.0/24 | Flannel VXLAN | +| 10250 | 10.27.27.0/24 | Kubelet | ## Operations @@ -399,7 +393,7 @@ terraform destroy ### Check k3s Logs ```bash -ssh root@ journalctl -u k3s -f +ssh ubuntu@ sudo journalctl -u k3s -f ``` ### Reset k3s @@ -408,19 +402,10 @@ ssh root@ journalctl -u k3s -f ansible-playbook site.yml -t reset ``` -## Costs Breakdown - -| Resource | Quantity | Unit Price | Monthly | -|----------|----------|------------|---------| -| CX23 (Control Plane) | 3 | €2.99 | €8.97 | -| CX33 (Workers) | 4 | €4.99 | €19.96 | -| Backblaze B2 | ~1 GB | Free (first 10GB) | €0.00 | -| **Total** | | | **€28.93/mo** | - ## Security Notes - Control plane has HA (3 nodes, can survive 1 failure) -- Consider adding Hetzner load balancer for API server +- Kubernetes API HA is provided by kube-vip on `10.27.27.40` - Rotate API tokens regularly - Use network policies in Kubernetes - Enable audit logging for production diff --git a/SECRETS_SETUP.md b/SECRETS_SETUP.md index f162f77..0347ebe 100644 --- a/SECRETS_SETUP.md +++ b/SECRETS_SETUP.md @@ -1,6 +1,6 @@ # Gitea Secrets Setup -This document describes the secrets required for the HetznerTerra deployment workflow. +This document describes the secrets required for the Proxmox-based deployment workflow. 
## Required Secrets @@ -9,10 +9,17 @@ Add these secrets in your Gitea repository settings: ### Infrastructure Secrets -#### `HCLOUD_TOKEN` -- Hetzner Cloud API token -- Get from: https://console.hetzner.com/projects/{project-id}/security/api-tokens -- Permissions: Read & Write +#### `PROXMOX_ENDPOINT` +- Proxmox VE API endpoint +- Example: `https://100.105.0.115:8006/` + +#### `PROXMOX_API_TOKEN_ID` +- Proxmox API token ID +- Example: `terraform-prov@pve!k8s-cluster` + +#### `PROXMOX_API_TOKEN_SECRET` +- Proxmox API token secret +- Create with `pveum user token add terraform-prov@pve k8s-cluster` #### `S3_ACCESS_KEY` & `S3_SECRET_KEY` - Backblaze B2 credentials for Terraform state storage @@ -31,7 +38,7 @@ Add these secrets in your Gitea repository settings: #### `SSH_PRIVATE_KEY` & `SSH_PUBLIC_KEY` - SSH key pair for cluster access -- Generate with: `ssh-keygen -t ed25519 -C "k8s@hetzner" -f ~/.ssh/hetzner_k8s` +- Generate with: `ssh-keygen -t ed25519 -C "k8s@proxmox" -f ~/.ssh/infra` - Private key content (include BEGIN/END lines) - Public key content (full line starting with ssh-ed25519) @@ -90,4 +97,4 @@ Check the workflow logs to verify all secrets are being used correctly. - Prefer Doppler for runtime app/platform secrets after cluster bootstrap - Rotate Tailscale auth keys periodically - Review OAuth client permissions regularly -- The workflow automatically opens SSH/API access only for the runner's IP during deployment +- CI expects direct SSH access to the Proxmox VMs and direct Proxmox API access diff --git a/STABLE_BASELINE.md b/STABLE_BASELINE.md index 081a128..d6889cf 100644 --- a/STABLE_BASELINE.md +++ b/STABLE_BASELINE.md @@ -5,9 +5,9 @@ This document defines the current engineering target for this repository. ## Topology - 3 control planes (HA etcd cluster) -- 3 workers -- Hetzner Load Balancer for Kubernetes API -- private Hetzner network +- 5 workers +- kube-vip API VIP (`10.27.27.40`) +- private Proxmox/LAN network (`10.27.27.0/24`) - Tailscale operator access and service exposure - Rancher exposed through Tailscale (`rancher.silverside-gopher.ts.net`) - Grafana exposed through Tailscale (`grafana.silverside-gopher.ts.net`) @@ -17,11 +17,10 @@ This document defines the current engineering target for this repository. ## In Scope - Terraform infrastructure bootstrap -- Ansible k3s bootstrap with external cloud provider +- Ansible k3s bootstrap on Ubuntu cloud-init VMs - **HA control plane (3 nodes with etcd quorum)** -- **Hetzner Load Balancer for Kubernetes API** -- **Hetzner CCM deployed via Ansible (before workers join)** -- **Hetzner CSI for persistent volumes (via Flux)** +- **kube-vip for Kubernetes API HA** +- **NFS-backed persistent volumes via `nfs-subdir-external-provisioner`** - Flux core reconciliation - External Secrets Operator with Doppler - Tailscale private access and smoke-check validation @@ -45,15 +44,14 @@ This document defines the current engineering target for this repository. ## Phase Gates -1. Terraform apply completes for HA topology (3 CP, 3 workers, 1 LB). -2. Load Balancer is healthy with all 3 control plane targets. -3. Primary control plane bootstraps with `--cluster-init`. -4. Secondary control planes join via Load Balancer endpoint. -5. **CCM deployed via Ansible before workers join** (fixes uninitialized taint issue). -6. Workers join successfully via Load Balancer and all nodes show proper `providerID`. +1. Terraform apply completes for HA topology (3 CP, 5 workers, 1 VIP). +2. Primary control plane bootstraps with `--cluster-init`. +3. 
kube-vip advertises `10.27.27.40:6443` from the control-plane set. +4. Secondary control planes join via the kube-vip endpoint. +5. Workers join successfully via the kube-vip endpoint. 7. etcd reports 3 healthy members. 8. Flux source and infrastructure reconciliation are healthy. -9. **CSI deploys and creates `hcloud-volumes` StorageClass**. +9. **NFS provisioner deploys and creates `flash-nfs` StorageClass**. 10. **PVC provisioning tested and working**. 11. External Secrets sync required secrets. 12. Tailscale private access works for Rancher, Grafana, and Prometheus. diff --git a/ansible/inventory.tmpl b/ansible/inventory.tmpl index b4818f0..eac23b9 100644 --- a/ansible/inventory.tmpl +++ b/ansible/inventory.tmpl @@ -13,7 +13,7 @@ control_plane workers [cluster:vars] -ansible_user=root +ansible_user=ubuntu ansible_python_interpreter=/usr/bin/python3 ansible_ssh_private_key_file={{ private_key_file }} k3s_version=latest diff --git a/ansible/roles/addon-secrets-bootstrap/tasks/main.yml b/ansible/roles/addon-secrets-bootstrap/tasks/main.yml index 369597f..2456b89 100644 --- a/ansible/roles/addon-secrets-bootstrap/tasks/main.yml +++ b/ansible/roles/addon-secrets-bootstrap/tasks/main.yml @@ -1,14 +1,4 @@ --- -- name: Apply Hetzner cloud secret - shell: >- - kubectl -n kube-system create secret generic hcloud - --from-literal=token='{{ hcloud_token }}' - --from-literal=network='{{ cluster_name }}-network' - --dry-run=client -o yaml | kubectl apply -f - - changed_when: true - no_log: true - when: hcloud_token | default('') | length > 0 - - name: Ensure Tailscale operator namespace exists command: >- kubectl create namespace {{ tailscale_operator_namespace | default('tailscale-system') }} diff --git a/ansible/roles/ccm-deploy/tasks/main.yml b/ansible/roles/ccm-deploy/tasks/main.yml deleted file mode 100644 index 85fabb1..0000000 --- a/ansible/roles/ccm-deploy/tasks/main.yml +++ /dev/null @@ -1,82 +0,0 @@ ---- -- name: Check if hcloud secret exists - command: kubectl -n kube-system get secret hcloud - register: hcloud_secret_check - changed_when: false - failed_when: false - -- name: Fail if hcloud secret is missing - fail: - msg: "hcloud secret not found in kube-system namespace. CCM requires it." 
- when: hcloud_secret_check.rc != 0 - -- name: Check if helm is installed - command: which helm - register: helm_check - changed_when: false - failed_when: false - -- name: Install helm - when: helm_check.rc != 0 - block: - - name: Download helm install script - get_url: - url: https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - dest: /tmp/get-helm-3.sh - mode: "0755" - - - name: Run helm install script - command: /tmp/get-helm-3.sh - args: - creates: /usr/local/bin/helm - -- name: Add Hetzner Helm repository - kubernetes.core.helm_repository: - name: hcloud - repo_url: https://charts.hetzner.cloud - kubeconfig: /etc/rancher/k3s/k3s.yaml - environment: - KUBECONFIG: /etc/rancher/k3s/k3s.yaml - -- name: Deploy Hetzner Cloud Controller Manager - kubernetes.core.helm: - name: hcloud-cloud-controller-manager - chart_ref: hcloud/hcloud-cloud-controller-manager - release_namespace: kube-system - create_namespace: true - values: - networking: - enabled: true - nodeSelector: - kubernetes.io/hostname: "{{ inventory_hostname }}" - additionalTolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - kubeconfig: /etc/rancher/k3s/k3s.yaml - wait: true - wait_timeout: 300s - environment: - KUBECONFIG: /etc/rancher/k3s/k3s.yaml - -- name: Wait for CCM to be ready - command: kubectl -n kube-system rollout status deployment/hcloud-cloud-controller-manager --timeout=120s - changed_when: false - register: ccm_rollout - until: ccm_rollout.rc == 0 - retries: 3 - delay: 10 - -- name: Pause to ensure CCM is fully ready to process new nodes - pause: - seconds: 10 - -- name: Verify CCM is removing uninitialized taints - command: kubectl get nodes -o jsonpath='{.items[*].spec.taints[?(@.key=="node.cloudprovider.kubernetes.io/uninitialized")].key}' - register: uninitialized_taints - changed_when: false - failed_when: false - -- name: Display taint status - debug: - msg: "Nodes with uninitialized taint: {{ uninitialized_taints.stdout }}" diff --git a/ansible/roles/common/tasks/main.yml b/ansible/roles/common/tasks/main.yml index d37dbca..0d122ac 100644 --- a/ansible/roles/common/tasks/main.yml +++ b/ansible/roles/common/tasks/main.yml @@ -19,6 +19,7 @@ - lsb-release - software-properties-common - jq + - nfs-common - htop - vim state: present diff --git a/ansible/roles/k3s-agent/defaults/main.yml b/ansible/roles/k3s-agent/defaults/main.yml index 8cc646a..ca68a29 100644 --- a/ansible/roles/k3s-agent/defaults/main.yml +++ b/ansible/roles/k3s-agent/defaults/main.yml @@ -3,4 +3,5 @@ k3s_version: latest k3s_server_url: "" k3s_token: "" k3s_node_ip: "" -k3s_kubelet_cloud_provider_external: true +k3s_kubelet_cloud_provider_external: false +k3s_flannel_iface: ens18 diff --git a/ansible/roles/k3s-agent/tasks/main.yml b/ansible/roles/k3s-agent/tasks/main.yml index 7150f6c..021e171 100644 --- a/ansible/roles/k3s-agent/tasks/main.yml +++ b/ansible/roles/k3s-agent/tasks/main.yml @@ -22,7 +22,7 @@ command: >- /tmp/install-k3s.sh agent --node-ip {{ k3s_node_ip }} - --flannel-iface=enp7s0 + --flannel-iface={{ k3s_flannel_iface }} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} args: creates: /usr/local/bin/k3s-agent diff --git a/ansible/roles/k3s-server/defaults/main.yml b/ansible/roles/k3s-server/defaults/main.yml index a156ceb..cd17ead 100644 --- a/ansible/roles/k3s-server/defaults/main.yml +++ b/ansible/roles/k3s-server/defaults/main.yml @@ -3,9 +3,10 @@ k3s_version: latest k3s_token: "" k3s_node_ip: "" 
k3s_primary_public_ip: "" -k3s_disable_embedded_ccm: true +k3s_disable_embedded_ccm: false k3s_disable_servicelb: true -k3s_kubelet_cloud_provider_external: true +k3s_kubelet_cloud_provider_external: false +k3s_flannel_iface: ens18 # Load Balancer endpoint for HA cluster joins (set in inventory) kube_api_endpoint: "" # Tailscale DNS names for control planes (to enable tailnet access) diff --git a/ansible/roles/k3s-server/tasks/main.yml b/ansible/roles/k3s-server/tasks/main.yml index 20e23da..0963fd3 100644 --- a/ansible/roles/k3s-server/tasks/main.yml +++ b/ansible/roles/k3s-server/tasks/main.yml @@ -61,7 +61,7 @@ --cluster-init --advertise-address={{ k3s_primary_ip }} --node-ip={{ k3s_node_ip }} - --flannel-iface=enp7s0 + --flannel-iface={{ k3s_flannel_iface }} --tls-san={{ k3s_primary_ip }} --tls-san={{ k3s_primary_public_ip }} --tls-san={{ kube_api_endpoint }} @@ -87,7 +87,7 @@ --server https://{{ k3s_join_endpoint | default(k3s_primary_ip) }}:6443 --advertise-address={{ k3s_node_ip }} --node-ip={{ k3s_node_ip }} - --flannel-iface=enp7s0 + --flannel-iface={{ k3s_flannel_iface }} {% if k3s_disable_embedded_ccm | bool %}--disable-cloud-controller{% endif %} {% if k3s_disable_servicelb | bool %}--disable=servicelb{% endif %} {% if k3s_kubelet_cloud_provider_external | bool %}--kubelet-arg=cloud-provider=external{% endif %} diff --git a/ansible/roles/kube-vip-deploy/defaults/main.yml b/ansible/roles/kube-vip-deploy/defaults/main.yml new file mode 100644 index 0000000..6094644 --- /dev/null +++ b/ansible/roles/kube-vip-deploy/defaults/main.yml @@ -0,0 +1,4 @@ +--- +kube_vip_version: v1.1.2 +kube_vip_interface: ens18 +kube_vip_address: "{{ kube_api_endpoint }}" diff --git a/ansible/roles/kube-vip-deploy/tasks/main.yml b/ansible/roles/kube-vip-deploy/tasks/main.yml new file mode 100644 index 0000000..8b10436 --- /dev/null +++ b/ansible/roles/kube-vip-deploy/tasks/main.yml @@ -0,0 +1,21 @@ +--- +- name: Render kube-vip control plane manifest + template: + src: kube-vip-control-plane.yaml.j2 + dest: /tmp/kube-vip-control-plane.yaml + mode: "0644" + +- name: Apply kube-vip control plane manifest + command: kubectl apply -f /tmp/kube-vip-control-plane.yaml + changed_when: true + +- name: Wait for kube-vip DaemonSet rollout + command: kubectl -n kube-system rollout status daemonset/kube-vip --timeout=180s + changed_when: false + +- name: Wait for API VIP on 6443 + wait_for: + host: "{{ kube_vip_address }}" + port: 6443 + state: started + timeout: 180 diff --git a/ansible/roles/kube-vip-deploy/templates/kube-vip-control-plane.yaml.j2 b/ansible/roles/kube-vip-deploy/templates/kube-vip-control-plane.yaml.j2 new file mode 100644 index 0000000..51159e7 --- /dev/null +++ b/ansible/roles/kube-vip-deploy/templates/kube-vip-control-plane.yaml.j2 @@ -0,0 +1,110 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-vip + namespace: kube-system +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: system:kube-vip-role +rules: + - apiGroups: [""] + resources: ["services/status"] + verbs: ["update"] + - apiGroups: [""] + resources: ["services", "endpoints"] + verbs: ["list", "get", "watch", "update"] + - apiGroups: [""] + resources: ["nodes"] + verbs: ["list", "get", "watch", "update", "patch"] + - apiGroups: ["coordination.k8s.io"] + resources: ["leases"] + verbs: ["list", "get", "watch", "update", "create"] + - apiGroups: ["discovery.k8s.io"] + resources: ["endpointslices"] + verbs: ["list", "get", "watch", "update"] + - apiGroups: [""] + resources: ["pods"] + verbs: 
["list"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: system:kube-vip-binding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:kube-vip-role +subjects: + - kind: ServiceAccount + name: kube-vip + namespace: kube-system +--- +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: kube-vip + namespace: kube-system +spec: + selector: + matchLabels: + app.kubernetes.io/name: kube-vip + template: + metadata: + labels: + app.kubernetes.io/name: kube-vip + spec: + serviceAccountName: kube-vip + hostNetwork: true + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: node-role.kubernetes.io/control-plane + operator: Exists + tolerations: + - key: node-role.kubernetes.io/control-plane + operator: Exists + effect: NoSchedule + - key: node-role.kubernetes.io/master + operator: Exists + effect: NoSchedule + containers: + - name: kube-vip + image: ghcr.io/kube-vip/kube-vip:{{ kube_vip_version }} + imagePullPolicy: IfNotPresent + args: + - manager + env: + - name: vip_arp + value: "true" + - name: port + value: "6443" + - name: vip_interface + value: {{ kube_vip_interface | quote }} + - name: vip_subnet + value: "32" + - name: cp_enable + value: "true" + - name: cp_namespace + value: kube-system + - name: vip_ddns + value: "false" + - name: vip_leaderelection + value: "true" + - name: vip_leaseduration + value: "5" + - name: vip_renewdeadline + value: "3" + - name: vip_retryperiod + value: "1" + - name: address + value: {{ kube_vip_address | quote }} + securityContext: + capabilities: + add: + - NET_ADMIN + - NET_RAW + - SYS_TIME diff --git a/ansible/site.yml b/ansible/site.yml index 0a985ee..7aab76f 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -57,12 +57,12 @@ roles: - addon-secrets-bootstrap -- name: Deploy Hetzner CCM (required for workers with external cloud provider) +- name: Deploy kube-vip for API HA hosts: control_plane[0] become: true roles: - - ccm-deploy + - kube-vip-deploy - name: Setup secondary control planes hosts: control_plane[1:] diff --git a/infrastructure/addons/ccm/helmrelease-hcloud-ccm.yaml b/infrastructure/addons/ccm/helmrelease-hcloud-ccm.yaml deleted file mode 100644 index b4dd442..0000000 --- a/infrastructure/addons/ccm/helmrelease-hcloud-ccm.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: hcloud-cloud-controller-manager - namespace: flux-system -spec: - interval: 10m - targetNamespace: kube-system - chart: - spec: - chart: hcloud-cloud-controller-manager - version: 1.30.1 - sourceRef: - kind: HelmRepository - name: hcloud - namespace: flux-system - install: - createNamespace: true - remediation: - retries: 3 - upgrade: - remediation: - retries: 3 - values: - selectorLabels: - app: hcloud-cloud-controller-manager - args: - secure-port: "0" - networking: - enabled: true - nodeSelector: - kubernetes.io/hostname: k8s-cluster-cp-1 - additionalTolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule diff --git a/infrastructure/addons/ccm/helmrepository-hcloud.yaml b/infrastructure/addons/ccm/helmrepository-hcloud.yaml deleted file mode 100644 index 2774043..0000000 --- a/infrastructure/addons/ccm/helmrepository-hcloud.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: source.toolkit.fluxcd.io/v1 -kind: HelmRepository -metadata: - name: hcloud - namespace: flux-system -spec: - interval: 1h - url: 
https://charts.hetzner.cloud diff --git a/infrastructure/addons/ccm/kustomization.yaml b/infrastructure/addons/ccm/kustomization.yaml deleted file mode 100644 index 52b8d52..0000000 --- a/infrastructure/addons/ccm/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - helmrepository-hcloud.yaml - - helmrelease-hcloud-ccm.yaml diff --git a/infrastructure/addons/csi/helmrelease-hcloud-csi.yaml b/infrastructure/addons/csi/helmrelease-hcloud-csi.yaml deleted file mode 100644 index 3bd4ab8..0000000 --- a/infrastructure/addons/csi/helmrelease-hcloud-csi.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: helm.toolkit.fluxcd.io/v2 -kind: HelmRelease -metadata: - name: hcloud-csi - namespace: flux-system -spec: - interval: 10m - targetNamespace: kube-system - chart: - spec: - chart: hcloud-csi - version: 2.20.0 - sourceRef: - kind: HelmRepository - name: hcloud - namespace: flux-system - install: - createNamespace: true - remediation: - retries: 3 - upgrade: - remediation: - retries: 3 - values: - controller: - nodeSelector: - kubernetes.io/hostname: k8s-cluster-cp-1 - tolerations: - - key: node-role.kubernetes.io/control-plane - operator: Exists - effect: NoSchedule - hcloudVolumeDefaultLocation: nbg1 - storageClasses: - - name: hcloud-volumes - defaultStorageClass: true - reclaimPolicy: Delete diff --git a/infrastructure/addons/csi/kustomization.yaml b/infrastructure/addons/csi/kustomization.yaml deleted file mode 100644 index 8585a8c..0000000 --- a/infrastructure/addons/csi/kustomization.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: kustomize.config.k8s.io/v1beta1 -kind: Kustomization -resources: - - helmrepository-hcloud.yaml - - helmrelease-hcloud-csi.yaml diff --git a/infrastructure/addons/kustomization-csi.yaml b/infrastructure/addons/kustomization-csi.yaml deleted file mode 100644 index da13e7e..0000000 --- a/infrastructure/addons/kustomization-csi.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: kustomize.toolkit.fluxcd.io/v1 -kind: Kustomization -metadata: - name: addon-csi - namespace: flux-system -spec: - interval: 10m - prune: true - sourceRef: - kind: GitRepository - name: platform - path: ./infrastructure/addons/csi - dependsOn: - - name: addon-ccm - wait: true - timeout: 10m - suspend: false diff --git a/infrastructure/addons/kustomization-ccm.yaml b/infrastructure/addons/kustomization-nfs-storage.yaml similarity index 77% rename from infrastructure/addons/kustomization-ccm.yaml rename to infrastructure/addons/kustomization-nfs-storage.yaml index 6041dee..9f450bb 100644 --- a/infrastructure/addons/kustomization-ccm.yaml +++ b/infrastructure/addons/kustomization-nfs-storage.yaml @@ -1,7 +1,7 @@ apiVersion: kustomize.toolkit.fluxcd.io/v1 kind: Kustomization metadata: - name: addon-ccm + name: addon-nfs-storage namespace: flux-system spec: interval: 10m @@ -9,7 +9,7 @@ spec: sourceRef: kind: GitRepository name: platform - path: ./infrastructure/addons/ccm + path: ./infrastructure/addons/nfs-storage wait: true timeout: 10m suspend: false diff --git a/infrastructure/addons/kustomization.yaml b/infrastructure/addons/kustomization.yaml index ca27350..15e547d 100644 --- a/infrastructure/addons/kustomization.yaml +++ b/infrastructure/addons/kustomization.yaml @@ -1,8 +1,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - kustomization-ccm.yaml - - kustomization-csi.yaml + - kustomization-nfs-storage.yaml - kustomization-external-secrets.yaml - kustomization-cert-manager.yaml - 
kustomization-tailscale-operator.yaml diff --git a/infrastructure/addons/nfs-storage/helmrelease-nfs-subdir-external-provisioner.yaml b/infrastructure/addons/nfs-storage/helmrelease-nfs-subdir-external-provisioner.yaml new file mode 100644 index 0000000..506a223 --- /dev/null +++ b/infrastructure/addons/nfs-storage/helmrelease-nfs-subdir-external-provisioner.yaml @@ -0,0 +1,36 @@ +apiVersion: helm.toolkit.fluxcd.io/v2 +kind: HelmRelease +metadata: + name: nfs-subdir-external-provisioner + namespace: flux-system +spec: + interval: 10m + targetNamespace: kube-system + chart: + spec: + chart: nfs-subdir-external-provisioner + version: 4.0.18 + sourceRef: + kind: HelmRepository + name: nfs-subdir-external-provisioner + namespace: flux-system + install: + createNamespace: true + remediation: + retries: 3 + upgrade: + remediation: + retries: 3 + values: + nfs: + server: 10.27.27.22 + path: /TheFlash/k8s-nfs + storageClass: + create: true + defaultClass: true + name: flash-nfs + provisionerName: flash-nfs + reclaimPolicy: Delete + archiveOnDelete: true + allowVolumeExpansion: true + volumeBindingMode: Immediate diff --git a/infrastructure/addons/csi/helmrepository-hcloud.yaml b/infrastructure/addons/nfs-storage/helmrepository-nfs-subdir-external-provisioner.yaml similarity index 50% rename from infrastructure/addons/csi/helmrepository-hcloud.yaml rename to infrastructure/addons/nfs-storage/helmrepository-nfs-subdir-external-provisioner.yaml index 2774043..8f234ed 100644 --- a/infrastructure/addons/csi/helmrepository-hcloud.yaml +++ b/infrastructure/addons/nfs-storage/helmrepository-nfs-subdir-external-provisioner.yaml @@ -1,8 +1,8 @@ apiVersion: source.toolkit.fluxcd.io/v1 kind: HelmRepository metadata: - name: hcloud + name: nfs-subdir-external-provisioner namespace: flux-system spec: interval: 1h - url: https://charts.hetzner.cloud + url: https://kubernetes-sigs.github.io/nfs-subdir-external-provisioner diff --git a/infrastructure/addons/nfs-storage/kustomization.yaml b/infrastructure/addons/nfs-storage/kustomization.yaml new file mode 100644 index 0000000..38eb6a5 --- /dev/null +++ b/infrastructure/addons/nfs-storage/kustomization.yaml @@ -0,0 +1,5 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - helmrepository-nfs-subdir-external-provisioner.yaml + - helmrelease-nfs-subdir-external-provisioner.yaml diff --git a/scripts/refresh-kubeconfig.sh b/scripts/refresh-kubeconfig.sh index 1798e81..fa30421 100755 --- a/scripts/refresh-kubeconfig.sh +++ b/scripts/refresh-kubeconfig.sh @@ -24,10 +24,11 @@ echo "Fetching kubeconfig from $CP1_PUBLIC_IP ..." 
ssh -i "$SSH_KEY" \ -o StrictHostKeyChecking=no \ -o UserKnownHostsFile=/dev/null \ - "root@$CP1_PUBLIC_IP" "cat /etc/rancher/k3s/k3s.yaml" \ + "ubuntu@$CP1_PUBLIC_IP" "sudo cat /etc/rancher/k3s/k3s.yaml" \ | sed "s/127.0.0.1/$CP1_PUBLIC_IP/g" \ > "$KUBECONFIG_PATH" + chmod 600 "$KUBECONFIG_PATH" echo "Kubeconfig saved to $KUBECONFIG_PATH" echo "Run: export KUBECONFIG=$KUBECONFIG_PATH" diff --git a/terraform.tfvars.example b/terraform.tfvars.example index 719ba02..bb44907 100644 --- a/terraform.tfvars.example +++ b/terraform.tfvars.example @@ -1,29 +1,33 @@ -hcloud_token = "your-hetzner-cloud-api-token-here" +proxmox_endpoint = "https://100.105.0.115:8006/" +proxmox_api_token_id = "terraform-prov@pve!k8s-cluster" +proxmox_api_token_secret = "your-proxmox-api-token-secret" -ssh_public_key = "~/.ssh/hetzner_k8s.pub" -ssh_private_key = "~/.ssh/hetzner_k8s" +ssh_public_key = "~/.ssh/infra.pub" +ssh_private_key = "~/.ssh/infra" s3_access_key = "your-backblaze-key-id" s3_secret_key = "your-backblaze-application-key" s3_endpoint = "https://s3.eu-central-003.backblazeb2.com" s3_bucket = "k8s-terraform-state" -cluster_name = "k8s-prod" +cluster_name = "k8s-cluster" tailscale_tailnet = "yourtailnet.ts.net" -restrict_api_ssh_to_tailnet = true -tailnet_cidr = "100.64.0.0/10" -enable_nodeport_public = false +kube_api_vip = "10.27.27.40" control_plane_count = 3 -control_plane_type = "cx23" +control_plane_ips = ["10.27.27.30", "10.27.27.31", "10.27.27.32"] +control_plane_vm_ids = [200, 201, 202] -worker_count = 4 -worker_type = "cx33" +worker_count = 5 +worker_ips = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"] +worker_vm_ids = [210, 211, 212, 213, 214] -location = "nbg1" - -allowed_ssh_ips = [] - -allowed_api_ips = [] +proxmox_node_name = "flex" +proxmox_template_vm_id = 9000 +proxmox_vm_storage_pool = "Flash" +proxmox_cloud_init_storage_pool = "Flash" +proxmox_bridge = "vmbr0" +proxmox_gateway = "10.27.27.1" +proxmox_dns_servers = ["1.1.1.1", "8.8.8.8"] diff --git a/terraform/firewall.tf b/terraform/firewall.tf deleted file mode 100644 index 3b92570..0000000 --- a/terraform/firewall.tf +++ /dev/null @@ -1,118 +0,0 @@ -locals { - ssh_source_ips = var.restrict_api_ssh_to_tailnet ? concat([var.tailnet_cidr], var.allowed_ssh_ips) : var.allowed_ssh_ips - api_source_ips = var.restrict_api_ssh_to_tailnet ? 
concat([var.tailnet_cidr], var.allowed_api_ips) : var.allowed_api_ips -} - -resource "hcloud_firewall" "cluster" { - name = "${var.cluster_name}-firewall" - - rule { - description = "SSH" - direction = "in" - protocol = "tcp" - port = "22" - source_ips = local.ssh_source_ips - } - - rule { - description = "Kubernetes API" - direction = "in" - protocol = "tcp" - port = "6443" - source_ips = local.api_source_ips - } - - rule { - description = "Tailscale WireGuard" - direction = "in" - protocol = "udp" - port = "41641" - source_ips = ["0.0.0.0/0"] - } - - rule { - description = "Kubernetes API (internal)" - direction = "in" - protocol = "tcp" - port = "6443" - source_ips = [var.subnet_cidr] - } - - rule { - description = "k3s Supervisor" - direction = "in" - protocol = "tcp" - port = "9345" - source_ips = [var.subnet_cidr] - } - - rule { - description = "etcd Client" - direction = "in" - protocol = "tcp" - port = "2379" - source_ips = [var.subnet_cidr] - } - - rule { - description = "etcd Peer" - direction = "in" - protocol = "tcp" - port = "2380" - source_ips = [var.subnet_cidr] - } - - rule { - description = "Flannel VXLAN" - direction = "in" - protocol = "udp" - port = "8472" - source_ips = [var.subnet_cidr] - } - - rule { - description = "Kubelet" - direction = "in" - protocol = "tcp" - port = "10250" - source_ips = [var.subnet_cidr] - } - - dynamic "rule" { - for_each = var.enable_nodeport_public ? [1] : [] - content { - description = "NodePorts" - direction = "in" - protocol = "tcp" - port = "30000-32767" - source_ips = ["0.0.0.0/0"] - } - } - - rule { - description = "HTTP from Load Balancer" - direction = "in" - protocol = "tcp" - port = "80" - source_ips = ["0.0.0.0/0"] - } - - rule { - description = "HTTPS from Load Balancer" - direction = "in" - protocol = "tcp" - port = "443" - source_ips = ["0.0.0.0/0"] - } - - rule { - description = "ICMP" - direction = "in" - protocol = "icmp" - source_ips = ["0.0.0.0/0"] - } - - apply_to { - label_selector = "cluster=${var.cluster_name}" - } -} diff --git a/terraform/loadbalancer.tf b/terraform/loadbalancer.tf deleted file mode 100644 index e762f7e..0000000 --- a/terraform/loadbalancer.tf +++ /dev/null @@ -1,50 +0,0 @@ -# Load Balancer for Kubernetes API High Availability -# Provides a single endpoint for all control planes - -resource "hcloud_load_balancer" "kube_api" { - name = "${var.cluster_name}-api" - load_balancer_type = "lb11" # Cheapest tier: €5.39/month - location = var.location - - labels = { - cluster = var.cluster_name - role = "kube-api" - } -} - -# Attach Load Balancer to private network (required for use_private_ip) -resource "hcloud_load_balancer_network" "kube_api" { - load_balancer_id = hcloud_load_balancer.kube_api.id - network_id = hcloud_network.cluster.id - ip = cidrhost(var.subnet_cidr, 5) # 10.0.1.5 -} - -# Attach all control plane servers as targets -resource "hcloud_load_balancer_target" "kube_api_targets" { - count = var.control_plane_count - type = "server" - load_balancer_id = hcloud_load_balancer.kube_api.id - server_id = hcloud_server.control_plane[count.index].id - use_private_ip = true - - depends_on = [hcloud_load_balancer_network.kube_api, hcloud_server.control_plane] -} - -# Kubernetes API service on port 6443 -resource "hcloud_load_balancer_service" "kube_api" { - load_balancer_id = hcloud_load_balancer.kube_api.id - protocol = "tcp" - listen_port = 6443 - destination_port = 6443 - - health_check { - protocol = "tcp" - port = 6443 - interval = 15 - timeout = 10 - retries = 3 - } -} - -# Firewall rule to 
allow LB access to control planes on 6443 -# This is added to the existing cluster firewall diff --git a/terraform/main.tf b/terraform/main.tf index f3c7352..0cee7df 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -2,13 +2,20 @@ terraform { required_version = ">= 1.0" required_providers { - hcloud = { - source = "hetznercloud/hcloud" - version = "~> 1.45" + local = { + source = "hashicorp/local" + version = "~> 2.5" + } + + proxmox = { + source = "bpg/proxmox" + version = ">= 0.60.0" } } } -provider "hcloud" { - token = var.hcloud_token +provider "proxmox" { + endpoint = var.proxmox_endpoint + api_token = "${var.proxmox_api_token_id}=${var.proxmox_api_token_secret}" + insecure = var.proxmox_insecure } diff --git a/terraform/network.tf b/terraform/network.tf deleted file mode 100644 index 1f49d09..0000000 --- a/terraform/network.tf +++ /dev/null @@ -1,11 +0,0 @@ -resource "hcloud_network" "cluster" { - name = "${var.cluster_name}-network" - ip_range = var.network_cidr -} - -resource "hcloud_network_subnet" "servers" { - network_id = hcloud_network.cluster.id - type = "cloud" - network_zone = "eu-central" - ip_range = var.subnet_cidr -} diff --git a/terraform/outputs.tf b/terraform/outputs.tf index a8f987f..e7e9ef8 100644 --- a/terraform/outputs.tf +++ b/terraform/outputs.tf @@ -1,42 +1,36 @@ output "control_plane_ips" { description = "Public IPs of control plane nodes" - value = [for cp in hcloud_server.control_plane : cp.ipv4_address] + value = var.control_plane_ips } output "control_plane_names" { description = "Control plane hostnames" - value = [for cp in hcloud_server.control_plane : cp.name] + value = [for idx in range(var.control_plane_count) : format("%s-cp-%d", var.cluster_name, idx + 1)] } output "control_plane_private_ips" { description = "Private IPs of control plane nodes" - value = [ - for idx, cp in hcloud_server.control_plane : - try(one(cp.network).ip, cidrhost(var.subnet_cidr, 10 + idx)) - ] + value = var.control_plane_ips } output "primary_control_plane_ip" { description = "Public IP of the primary control plane (first node)" - value = hcloud_server.control_plane[0].ipv4_address + value = var.control_plane_ips[0] } output "worker_ips" { description = "Public IPs of worker nodes" - value = [for worker in hcloud_server.workers : worker.ipv4_address] + value = var.worker_ips } output "worker_names" { description = "Worker hostnames" - value = [for worker in hcloud_server.workers : worker.name] + value = [for idx in range(var.worker_count) : format("%s-worker-%d", var.cluster_name, idx + 1)] } output "worker_private_ips" { description = "Private IPs of worker nodes" - value = [ - for idx, worker in hcloud_server.workers : - try(one(worker.network).ip, cidrhost(var.subnet_cidr, 20 + idx)) - ] + value = var.worker_ips } output "ssh_private_key_path" { @@ -61,10 +55,10 @@ output "network_cidr" { output "kubeconfig_command" { description = "Command to fetch kubeconfig" - value = "ssh root@${hcloud_server.control_plane[0].ipv4_address} 'cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${hcloud_server.control_plane[0].ipv4_address}/g' kubeconfig" + value = "ssh ubuntu@${var.control_plane_ips[0]} 'sudo cat /etc/rancher/k3s/k3s.yaml' > kubeconfig && sed -i 's/127.0.0.1/${var.control_plane_ips[0]}/g' kubeconfig" } output "kube_api_lb_ip" { description = "Load Balancer private IP for Kubernetes API (used for cluster joins)" - value = hcloud_load_balancer_network.kube_api.ip + value = var.kube_api_vip } diff --git a/terraform/servers.tf b/terraform/servers.tf 
diff --git a/terraform/servers.tf b/terraform/servers.tf
index 5b72730..84bb3a6 100644
--- a/terraform/servers.tf
+++ b/terraform/servers.tf
@@ -1,60 +1,121 @@
-data "hcloud_image" "ubuntu" {
-  name        = "ubuntu-24.04"
-  with_status = ["available"]
+data "local_file" "ssh_public_key" {
+  filename = pathexpand(var.ssh_public_key)
 }

-resource "hcloud_server" "control_plane" {
-  count = var.control_plane_count
+locals {
+  subnet_prefix = split("/", var.subnet_cidr)[1]

-  name        = "${var.cluster_name}-cp-${count.index + 1}"
-  server_type = var.control_plane_type
-  image       = data.hcloud_image.ubuntu.id
-  location    = var.location
-  ssh_keys    = [data.hcloud_ssh_key.cluster.id]
-
-  labels = {
-    cluster = var.cluster_name
-    role    = "control-plane"
+  control_planes = {
+    for idx in range(var.control_plane_count) :
+    format("%s-cp-%d", var.cluster_name, idx + 1) => {
+      role      = "control-plane"
+      vm_id     = var.control_plane_vm_ids[idx]
+      ip        = var.control_plane_ips[idx]
+      cpu       = var.control_plane_cores
+      memory_mb = var.control_plane_memory_mb
+      disk_gb   = var.control_plane_disk_gb
+      startup   = 1
+    }
   }

-  network {
-    network_id = hcloud_network.cluster.id
-    ip         = cidrhost(var.subnet_cidr, 10 + count.index)
+  workers = {
+    for idx in range(var.worker_count) :
+    format("%s-worker-%d", var.cluster_name, idx + 1) => {
+      role      = "worker"
+      vm_id     = var.worker_vm_ids[idx]
+      ip        = var.worker_ips[idx]
+      cpu       = var.worker_cores
+      memory_mb = var.worker_memory_mb
+      disk_gb   = var.worker_disk_gb
+      startup   = 2
+    }
   }

-  public_net {
-    ipv4_enabled = true
-    ipv6_enabled = true
-  }
-
-  firewall_ids = [hcloud_firewall.cluster.id]
+  nodes = merge(local.control_planes, local.workers)
 }

-resource "hcloud_server" "workers" {
-  count = var.worker_count
+resource "proxmox_virtual_environment_vm" "nodes" {
+  for_each = local.nodes

-  name        = "${var.cluster_name}-worker-${count.index + 1}"
-  server_type = var.worker_type
-  image       = data.hcloud_image.ubuntu.id
-  location    = var.location
-  ssh_keys    = [data.hcloud_ssh_key.cluster.id]
+  name        = each.key
+  description = "Managed by Terraform for ${var.cluster_name}"
+  tags        = ["terraform", var.cluster_name, each.value.role]
+  node_name   = var.proxmox_node_name
+  vm_id       = each.value.vm_id

-  labels = {
-    cluster = var.cluster_name
-    role    = "worker"
+  on_boot             = true
+  started             = true
+  stop_on_destroy     = true
+  reboot_after_update = true
+  timeout_clone       = 1800
+  timeout_create      = 1800
+  timeout_shutdown_vm = 300
+  timeout_start_vm    = 300
+  scsi_hardware       = "virtio-scsi-single"
+
+  clone {
+    vm_id        = var.proxmox_template_vm_id
+    datastore_id = var.proxmox_vm_storage_pool
+    full         = var.proxmox_clone_full
+    retries      = 3
   }

-  network {
-    network_id = hcloud_network.cluster.id
-    ip         = cidrhost(var.subnet_cidr, 20 + count.index)
+  agent {
+    enabled = true
+    trim    = true
   }

-  public_net {
-    ipv4_enabled = true
-    ipv6_enabled = true
+  cpu {
+    cores = each.value.cpu
+    type  = "x86-64-v2-AES"
   }

-  firewall_ids = [hcloud_firewall.cluster.id]
+  memory {
+    dedicated = each.value.memory_mb
+    floating  = each.value.memory_mb
+  }

-  depends_on = [hcloud_server.control_plane]
+  startup {
+    order      = tostring(each.value.startup)
+    up_delay   = "20"
+    down_delay = "20"
+  }
+
+  disk {
+    datastore_id = var.proxmox_vm_storage_pool
+    interface    = "scsi0"
+    size         = each.value.disk_gb
+    discard      = "on"
+    iothread     = true
+    ssd          = true
+  }
+
+  initialization {
+    datastore_id = var.proxmox_cloud_init_storage_pool
+
+    dns {
+      servers = var.proxmox_dns_servers
+    }
+
+    ip_config {
+      ipv4 {
+        address = "${each.value.ip}/${local.subnet_prefix}"
+        gateway = var.proxmox_gateway
+      }
+    }
+
+    user_account {
+      username = var.proxmox_ssh_username
+      keys     = [trimspace(data.local_file.ssh_public_key.content)]
+    }
+  }
+
+  network_device {
+    bridge = var.proxmox_bridge
+    model  = "virtio"
+  }
+
+  operating_system {
+    type = "l26"
+  }
 }
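Note on servers.tf above: each entry in local.nodes is keyed by the VM name, and that key drives both for_each and the Proxmox VM name. With the defaults declared in variables.tf below, the first control-plane entry expands roughly as follows (illustrative shape only; the actual hostname prefix comes from var.cluster_name):

"<cluster_name>-cp-1" = {
  role      = "control-plane"
  vm_id     = 200             # var.control_plane_vm_ids[0]
  ip        = "10.27.27.30"   # var.control_plane_ips[0]
  cpu       = 2               # var.control_plane_cores
  memory_mb = 4096            # var.control_plane_memory_mb
  disk_gb   = 32              # var.control_plane_disk_gb
  startup   = 1               # control planes boot before workers (startup = 2)
}

Also worth flagging: with agent { enabled = true } the provider waits for qemu-guest-agent to report from inside the guest, so the clone template referenced by proxmox_template_vm_id needs the agent preinstalled or VM creation will likely sit until timeout_create expires.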
diff --git a/terraform/ssh.tf b/terraform/ssh.tf
deleted file mode 100644
index 94f2d51..0000000
--- a/terraform/ssh.tf
+++ /dev/null
@@ -1,7 +0,0 @@
-data "local_file" "ssh_public_key" {
-  filename = pathexpand(var.ssh_public_key)
-}
-
-data "hcloud_ssh_key" "cluster" {
-  name = "infra"
-}
diff --git a/terraform/variables.tf b/terraform/variables.tf
index 24d37ae..2965a62 100644
--- a/terraform/variables.tf
+++ b/terraform/variables.tf
@@ -1,19 +1,13 @@
-variable "hcloud_token" {
-  description = "Hetzner Cloud API token"
-  type        = string
-  sensitive   = true
-}
-
 variable "ssh_public_key" {
   description = "Path to SSH public key"
   type        = string
-  default     = "~/.ssh/id_ed25519.pub"
+  default     = "~/.ssh/infra.pub"
 }

 variable "ssh_private_key" {
   description = "Path to SSH private key"
   type        = string
-  default     = "~/.ssh/id_ed25519"
+  default     = "~/.ssh/infra"
 }

 variable "cluster_name" {
@@ -28,28 +22,112 @@ variable "control_plane_count" {
   description = "Number of control plane nodes"
   type        = number
   default     = 3
 }

-variable "control_plane_type" {
-  description = "Hetzner server type for control plane"
-  type        = string
-  default     = "cx23"
+variable "control_plane_cores" {
+  description = "vCPU count for control plane VMs"
+  type        = number
+  default     = 2
+}
+
+variable "control_plane_memory_mb" {
+  description = "Dedicated memory for control plane VMs in MiB"
+  type        = number
+  default     = 4096
+}
+
+variable "control_plane_disk_gb" {
+  description = "Disk size for control plane VMs in GiB"
+  type        = number
+  default     = 32
 }

 variable "worker_count" {
   description = "Number of worker nodes"
   type        = number
-  default     = 3
+  default     = 5
 }

-variable "worker_type" {
-  description = "Hetzner server type for workers"
-  type        = string
-  default     = "cx33"
+variable "worker_cores" {
+  description = "vCPU count for worker VMs"
+  type        = number
+  default     = 4
 }

-variable "location" {
-  description = "Hetzner datacenter location"
+variable "worker_memory_mb" {
+  description = "Dedicated memory for worker VMs in MiB"
+  type        = number
+  default     = 8192
+}
+
+variable "worker_disk_gb" {
+  description = "Disk size for worker VMs in GiB"
+  type        = number
+  default     = 64
+}
+
+variable "proxmox_endpoint" {
+  description = "Proxmox API endpoint without /api2/json suffix"
   type        = string
-  default     = "nbg1"
+  default     = "https://100.105.0.115:8006/"
+}
+
+variable "proxmox_api_token_id" {
+  description = "Proxmox API token ID"
+  type        = string
+  sensitive   = true
+}
+
+variable "proxmox_api_token_secret" {
+  description = "Proxmox API token secret"
+  type        = string
+  sensitive   = true
+}
+
+variable "proxmox_insecure" {
+  description = "Skip TLS verification for the Proxmox API"
+  type        = bool
+  default     = true
+}
+
+variable "proxmox_node_name" {
+  description = "Fixed Proxmox node name for all cluster VMs"
+  type        = string
+  default     = "flex"
+}
+
+variable "proxmox_template_vm_id" {
+  description = "Template VM ID used for linked clones"
+  type        = number
+  default     = 9000
+}
+
+variable "proxmox_clone_full" {
+  description = "Whether to use full clones instead of linked clones"
+  type        = bool
+  default     = false
+}
+
+variable "proxmox_vm_storage_pool" {
+  description = "Proxmox datastore for VM disks"
+  type        = string
+  default     = "Flash"
+}
+
+variable "proxmox_cloud_init_storage_pool" {
+  description = "Proxmox datastore for cloud-init disks"
+  type        = string
+  default     = "Flash"
+}
+
+variable "proxmox_bridge" {
+  description = "Proxmox bridge for cluster VM interfaces"
+  type        = string
+  default     = "vmbr0"
+}
+
+variable "proxmox_ssh_username" {
+  description = "Cloud-init user injected into cloned VMs"
+  type        = string
+  default     = "ubuntu"
 }

 variable "allowed_ssh_ips" {
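Note on the node sizing variables above and the list variables added in the next hunk: control_plane_count and worker_count are plain numbers while the per-node IPs and VMIDs are separate lists, so if a count exceeds its list length the index lookups in servers.tf fail at plan time. A minimal terraform.tfvars sketch for running only two workers (example values, not the repo defaults):

worker_count  = 2
worker_ips    = ["10.27.27.41", "10.27.27.42"]
worker_vm_ids = [210, 211]

A validation block on the list variables could enforce the length match, but that is beyond what this change touches.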
@@ -90,13 +168,55 @@ variable "enable_nodeport_public" {
 variable "network_cidr" {
   description = "CIDR for private network"
   type        = string
-  default     = "10.0.0.0/16"
+  default     = "10.27.27.0/24"
 }

 variable "subnet_cidr" {
   description = "CIDR for server subnet"
   type        = string
-  default     = "10.0.1.0/24"
+  default     = "10.27.27.0/24"
+}
+
+variable "proxmox_gateway" {
+  description = "Gateway for cluster VM networking"
+  type        = string
+  default     = "10.27.27.1"
+}
+
+variable "proxmox_dns_servers" {
+  description = "DNS servers configured through cloud-init"
+  type        = list(string)
+  default     = ["1.1.1.1", "8.8.8.8"]
+}
+
+variable "control_plane_ips" {
+  description = "Static IPv4 addresses for control plane VMs"
+  type        = list(string)
+  default     = ["10.27.27.30", "10.27.27.31", "10.27.27.32"]
+}
+
+variable "worker_ips" {
+  description = "Static IPv4 addresses for worker VMs"
+  type        = list(string)
+  default     = ["10.27.27.41", "10.27.27.42", "10.27.27.43", "10.27.27.44", "10.27.27.45"]
+}
+
+variable "control_plane_vm_ids" {
+  description = "Fixed VMIDs for control plane VMs"
+  type        = list(number)
+  default     = [200, 201, 202]
+}
+
+variable "worker_vm_ids" {
+  description = "Fixed VMIDs for worker VMs"
+  type        = list(number)
+  default     = [210, 211, 212, 213, 214]
+}
+
+variable "kube_api_vip" {
+  description = "Virtual IP advertised by kube-vip for the Kubernetes API"
+  type        = string
+  default     = "10.27.27.40"
 }

 variable "s3_access_key" {