feat: migrate cluster baseline from Hetzner to Proxmox
Deploy Cluster / Terraform (push) Failing after 52s
Deploy Cluster / Ansible (push) Has been skipped
Deploy Grafana Content / Grafana Content (push) Failing after 1m37s

Replace the Hetzner infrastructure and its cloud-provider assumptions with
Proxmox VM clones, kube-vip for control-plane API HA, and NFS-backed
storage. Update bootstrap, Flux addons, CI workflows, and docs to target
the new private Proxmox baseline while preserving the existing Tailscale,
Doppler, Flux, Rancher, and B2 backup flows.
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
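The commit message above swaps Hetzner's managed load balancer in front of the API server for kube-vip. The actual manifest ships with the repo's bootstrap rather than this diff; below is a minimal static-pod sketch in the upstream kube-vip style, with a hypothetical VIP (10.0.0.10), NIC name (eth0), and version pin — none of these values come from the repo.

```yaml
# Hypothetical kube-vip static pod for control-plane API HA.
# VIP address, interface, and image tag are placeholders, not repo values.
apiVersion: v1
kind: Pod
metadata:
  name: kube-vip
  namespace: kube-system
spec:
  hostNetwork: true
  containers:
    - name: kube-vip
      image: ghcr.io/kube-vip/kube-vip:v0.8.0
      args: ["manager"]
      env:
        - name: vip_arp            # announce the VIP via ARP on the LAN
          value: "true"
        - name: port               # API server port fronted by the VIP
          value: "6443"
        - name: vip_interface      # NIC carrying the VIP (placeholder)
          value: "eth0"
        - name: cp_enable          # enable control-plane load balancing
          value: "true"
        - name: vip_leaderelection # one leader holds the VIP at a time
          value: "true"
        - name: address            # the virtual IP itself (placeholder)
          value: "10.0.0.10"
      securityContext:
        capabilities:
          add: ["NET_ADMIN", "NET_RAW"]
      volumeMounts:
        - name: kubeconfig
          mountPath: /etc/kubernetes/admin.conf
  volumes:
    - name: kubeconfig
      hostPath:
        path: /etc/kubernetes/admin.conf
```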
+4 -20
@@ -12,12 +12,15 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
dashboards:
@@ -51,25 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-refresh=false \
-target=hcloud_firewall.cluster \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
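Both workflows drop the "detect runner egress IP, then open the Hetzner firewall for that CIDR" dance. With a private Proxmox baseline and the Tailscale flow preserved, the runner would instead join the tailnet before touching SSH or the API; the TS_OAUTH_* secrets visible in the deploy workflow below point the same way. A minimal sketch of such a step, assuming the upstream tailscale/github-action and a tag:ci ACL tag (the action and tag are assumptions; only the OAuth secrets appear in this diff):

```yaml
# Hypothetical runner-connectivity step; not part of this commit.
- name: Connect runner to tailnet
  uses: tailscale/github-action@v2
  with:
    oauth-client-id: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
    oauth-secret: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
    tags: tag:ci
```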
+7 -62
@@ -11,12 +11,15 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
@@ -60,40 +63,6 @@ jobs:
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Import existing servers into state (if missing)
working-directory: terraform
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set -e
ensure_import() {
address="$1"
name="$2"
if terraform state show "$address" >/dev/null 2>&1; then
echo "$address already in state"
return
fi
id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty')
if [ -n "$id" ]; then
echo "Importing $address from server $name ($id)"
terraform import "$address" "$id"
else
echo "No existing server found for $name; skipping import"
fi
}
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
- name: Terraform Plan
id: plan
working-directory: terraform
@@ -187,32 +156,11 @@ jobs:
mkdir -p ../outputs
terraform output -json > ../outputs/terraform_outputs.json
- name: Detect runner egress IP
run: |
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
echo "Runner egress IP: ${RUNNER_IP}"
- name: Open SSH/API for current runner CIDR
working-directory: terraform
run: |
terraform apply \
-target=hcloud_firewall.cluster \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
-var="allowed_api_ips=${RUNNER_CIDR}" \
-auto-approve
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
- name: Note runner connectivity mode
run: |
echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API"
- name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml
@@ -224,7 +172,6 @@ jobs:
working-directory: ansible
run: |
ansible-playbook site.yml \
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
@@ -294,9 +241,8 @@ jobs:
key: dopplerToken
namespace: external-secrets
EOF
# Wait for CCM and CSI (Hetzner cloud integration)
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
# Wait for the storage layer and private access components
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
- name: Wait for Rancher and backup operator
@@ -397,10 +343,9 @@ jobs:
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
env:
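The readiness waits and verification in this workflow now target addon-nfs-storage and a StorageClass named flash-nfs in place of the Hetzner CCM/CSI. The addon manifest itself is outside this diff; a minimal sketch of such a class, assuming the upstream csi-driver-nfs provisioner and a hypothetical NFS export (only the class name comes from the diff):

```yaml
# Hypothetical flash-nfs StorageClass; only the name appears in this commit.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: flash-nfs
provisioner: nfs.csi.k8s.io   # upstream csi-driver-nfs (assumed)
parameters:
  server: nfs.internal        # placeholder NFS server
  share: /export/flash        # placeholder export path
reclaimPolicy: Delete
volumeBindingMode: Immediate
mountOptions:
  - nfsvers=4.1
```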
+13 -123
@@ -10,107 +10,22 @@ on:
env:
TF_VERSION: "1.7.0"
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
TF_VAR_proxmox_insecure: "true"
jobs:
pre-destroy-backup:
name: Pre-Destroy Backup
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Get Control Plane IP
id: cp_ip
working-directory: terraform
run: |
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
- name: Pre-Destroy pg_dump to B2
run: |
set +e
echo "Attempting pre-destroy backup to B2..."
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
set -e
# Check if kubectl is available and cluster is up
if ! command -v kubectl &> /dev/null; then
echo "kubectl not found, skipping pre-destroy backup"
exit 0
fi
# Check if we can reach the cluster
if ! kubectl cluster-info &> /dev/null; then
echo "Cannot reach cluster, skipping pre-destroy backup"
exit 0
fi
# Check if CNP is deployed
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
echo "CNP namespace not found, skipping pre-destroy backup"
exit 0
fi
# Run backup using the pgdump image directly
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
echo "B2 credentials not found in secret, skipping pre-destroy backup"
exit 0
fi
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
-n cnpg-cluster --dry-run=client -o yaml | \
kubectl apply -f -
echo "Waiting for backup job to complete..."
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
kubectl logs job/pgdump-manual -n cnpg-cluster || true
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
EOF
echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4
@@ -120,6 +35,14 @@ jobs:
with:
terraform_version: ${{ env.TF_VERSION }}
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
@@ -131,19 +54,6 @@ jobs:
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Install jq
run: |
apt-get update
apt-get install -y jq
- name: Terraform Destroy
id: destroy
working-directory: terraform
@@ -152,7 +62,6 @@ jobs:
for attempt in 1 2 3; do
echo "Terraform destroy attempt ${attempt}/3"
terraform destroy \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-auto-approve
@@ -164,32 +73,13 @@ jobs:
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
sleep 30
terraform refresh \
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
fi
done
exit "$rc"
- name: Hetzner destroy diagnostics
- name: Terraform state diagnostics
if: failure() && steps.destroy.outcome == 'failure'
env:
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
run: |
set +e
echo "== Terraform state list =="
terraform -chdir=terraform state list || true
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
if [ -z "$network_id" ]; then
network_id="11988935"
fi
echo "== Hetzner network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
echo "== Hetzner servers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
echo "== Hetzner load balancers attached to network =="
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
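The failure-path diagnostics lose their Hetzner API lookups and keep only the terraform state listing. The Proxmox API offers an equivalent provider-side view; a sketch reusing the removed step's curl-plus-jq pattern, assuming PROXMOX_ENDPOINT includes scheme and port (e.g. https://pve.internal:8006) and the token ID is in user@realm!name form — this step is an illustration, not part of the commit:

```yaml
# Hypothetical Proxmox-side diagnostics; not part of this commit.
- name: Proxmox destroy diagnostics
  if: failure() && steps.destroy.outcome == 'failure'
  run: |
    set +e
    echo "== VMs still visible to the Proxmox API =="
    # API-token auth header: "PVEAPIToken=<user@realm!tokenid>=<secret>";
    # -k mirrors TF_VAR_proxmox_insecure: "true" above.
    curl -fsSk \
      -H "Authorization: PVEAPIToken=${{ secrets.PROXMOX_API_TOKEN_ID }}=${{ secrets.PROXMOX_API_TOKEN_SECRET }}" \
      "${{ secrets.PROXMOX_ENDPOINT }}/api2/json/cluster/resources?type=vm" | jq . || true
```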