feat: migrate cluster baseline from Hetzner to Proxmox
Replace Hetzner infrastructure and cloud-provider assumptions with Proxmox VM clones, kube-vip API HA, and NFS-backed storage. Update bootstrap, Flux addons, CI workflows, and docs to target the new private Proxmox baseline while preserving the existing Tailscale, Doppler, Flux, Rancher, and B2 backup flows.
This commit is contained in:
@@ -12,12 +12,15 @@ on:
|
||||
|
||||
env:
|
||||
TF_VERSION: "1.7.0"
|
||||
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
|
||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
|
||||
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
|
||||
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
|
||||
TF_VAR_proxmox_insecure: "true"
|
||||
|
||||
jobs:
|
||||
dashboards:
|
||||
@@ -51,25 +54,6 @@ jobs:
|
||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||
-backend-config="skip_requesting_account_id=true"
|
||||
|
||||
- name: Detect runner egress IP
|
||||
run: |
|
||||
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
|
||||
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
|
||||
echo "Runner egress IP: ${RUNNER_IP}"
|
||||
|
||||
- name: Open SSH/API for current runner CIDR
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform apply \
|
||||
-refresh=false \
|
||||
-target=hcloud_firewall.cluster \
|
||||
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
|
||||
-var="allowed_api_ips=${RUNNER_CIDR}" \
|
||||
-auto-approve
|
||||
|
||||
- name: Install Python Dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install -y python3-pip
|
||||
|
||||
@@ -11,12 +11,15 @@ on:
|
||||
|
||||
env:
|
||||
TF_VERSION: "1.7.0"
|
||||
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
|
||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
|
||||
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
|
||||
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
|
||||
TF_VAR_proxmox_insecure: "true"
|
||||
TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
|
||||
TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
|
||||
|
||||
@@ -60,40 +63,6 @@ jobs:
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Install jq
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y jq
|
||||
|
||||
- name: Import existing servers into state (if missing)
|
||||
working-directory: terraform
|
||||
env:
|
||||
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
|
||||
run: |
|
||||
set -e
|
||||
ensure_import() {
|
||||
address="$1"
|
||||
name="$2"
|
||||
if terraform state show "$address" >/dev/null 2>&1; then
|
||||
echo "$address already in state"
|
||||
return
|
||||
fi
|
||||
id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty')
|
||||
if [ -n "$id" ]; then
|
||||
echo "Importing $address from server $name ($id)"
|
||||
terraform import "$address" "$id"
|
||||
else
|
||||
echo "No existing server found for $name; skipping import"
|
||||
fi
|
||||
}
|
||||
|
||||
ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
|
||||
ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
|
||||
ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
|
||||
ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
|
||||
ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
|
||||
ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
|
||||
|
||||
- name: Terraform Plan
|
||||
id: plan
|
||||
working-directory: terraform
|
||||
@@ -187,32 +156,11 @@ jobs:
|
||||
mkdir -p ../outputs
|
||||
terraform output -json > ../outputs/terraform_outputs.json
|
||||
|
||||
- name: Detect runner egress IP
|
||||
run: |
|
||||
RUNNER_IP=$(curl -fsSL https://api.ipify.org)
|
||||
echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
|
||||
echo "Runner egress IP: ${RUNNER_IP}"
|
||||
|
||||
- name: Open SSH/API for current runner CIDR
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform apply \
|
||||
-target=hcloud_firewall.cluster \
|
||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||
-var="allowed_ssh_ips=${RUNNER_CIDR}" \
|
||||
-var="allowed_api_ips=${RUNNER_CIDR}" \
|
||||
-auto-approve
|
||||
|
||||
- name: Install Python Dependencies
|
||||
run: |
|
||||
apt-get update && apt-get install -y python3-pip
|
||||
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
|
||||
|
||||
- name: Note runner connectivity mode
|
||||
run: |
|
||||
echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API"
|
||||
|
||||
- name: Install Ansible Collections
|
||||
run: ansible-galaxy collection install -r ansible/requirements.yml
|
||||
|
||||
@@ -224,7 +172,6 @@ jobs:
|
||||
working-directory: ansible
|
||||
run: |
|
||||
ansible-playbook site.yml \
|
||||
-e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
|
||||
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
|
||||
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
|
||||
@@ -294,9 +241,8 @@ jobs:
|
||||
key: dopplerToken
|
||||
namespace: external-secrets
|
||||
EOF
|
||||
# Wait for CCM and CSI (Hetzner cloud integration)
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
|
||||
# Wait for the storage layer and private access components
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
||||
|
||||
- name: Wait for Rancher and backup operator
|
||||
@@ -397,10 +343,9 @@ jobs:
|
||||
working-directory: ansible
|
||||
run: |
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
|
||||
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
|
||||
env:
|
||||
|
||||
+13
-123
@@ -10,107 +10,22 @@ on:
|
||||
|
||||
env:
|
||||
TF_VERSION: "1.7.0"
|
||||
TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
|
||||
TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
|
||||
TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
|
||||
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
|
||||
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
|
||||
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
|
||||
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
|
||||
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
||||
TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
|
||||
TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
|
||||
TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
|
||||
TF_VAR_proxmox_insecure: "true"
|
||||
|
||||
jobs:
|
||||
pre-destroy-backup:
|
||||
name: Pre-Destroy Backup
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.inputs.confirm == 'destroy'
|
||||
environment: destroy
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Terraform
|
||||
uses: hashicorp/setup-terraform@v3
|
||||
with:
|
||||
terraform_version: ${{ env.TF_VERSION }}
|
||||
|
||||
- name: Terraform Init
|
||||
working-directory: terraform
|
||||
run: |
|
||||
terraform init \
|
||||
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
||||
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
||||
-backend-config="region=auto" \
|
||||
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
|
||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||
-backend-config="skip_requesting_account_id=true"
|
||||
|
||||
- name: Setup SSH Keys
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
||||
chmod 600 ~/.ssh/id_ed25519
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Get Control Plane IP
|
||||
id: cp_ip
|
||||
working-directory: terraform
|
||||
run: |
|
||||
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
|
||||
echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
|
||||
|
||||
- name: Pre-Destroy pg_dump to B2
|
||||
run: |
|
||||
set +e
|
||||
echo "Attempting pre-destroy backup to B2..."
|
||||
ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null root@${PRIMARY_IP} << 'EOF'
|
||||
set -e
|
||||
# Check if kubectl is available and cluster is up
|
||||
if ! command -v kubectl &> /dev/null; then
|
||||
echo "kubectl not found, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Check if we can reach the cluster
|
||||
if ! kubectl cluster-info &> /dev/null; then
|
||||
echo "Cannot reach cluster, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Check if CNP is deployed
|
||||
if ! kubectl get namespace cnpg-cluster &> /dev/null; then
|
||||
echo "CNP namespace not found, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Run backup using the pgdump image directly
|
||||
BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
|
||||
B2_ACCOUNT_ID="$(cat /etc/kubernetes/secret/b2_account_id 2>/dev/null || echo '')"
|
||||
B2_APPLICATION_KEY="$(cat /etc/kubernetes/secret/b2_application_key 2>/dev/null || echo '')"
|
||||
|
||||
if [ -z "$B2_ACCOUNT_ID" ] || [ -z "$B2_APPLICATION_KEY" ]; then
|
||||
echo "B2 credentials not found in secret, skipping pre-destroy backup"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
kubectl run pgdump-manual --image=ghcr.io/cloudnative-pg/pgbackrest:latest --restart=Never \
|
||||
-n cnpg-cluster --dry-run=client -o yaml | \
|
||||
kubectl apply -f -
|
||||
|
||||
echo "Waiting for backup job to complete..."
|
||||
kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
|
||||
kubectl logs job/pgdump-manual -n cnpg-cluster || true
|
||||
kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
|
||||
EOF
|
||||
echo "Pre-destroy backup step completed (failure is non-fatal)"
|
||||
|
||||
destroy:
|
||||
name: Destroy Cluster
|
||||
runs-on: ubuntu-latest
|
||||
if: github.event.inputs.confirm == 'destroy'
|
||||
environment: destroy
|
||||
needs: pre-destroy-backup
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
@@ -120,6 +35,14 @@ jobs:
|
||||
with:
|
||||
terraform_version: ${{ env.TF_VERSION }}
|
||||
|
||||
- name: Setup SSH Keys
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
||||
chmod 600 ~/.ssh/id_ed25519
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Terraform Init
|
||||
working-directory: terraform
|
||||
run: |
|
||||
@@ -131,19 +54,6 @@ jobs:
|
||||
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
||||
-backend-config="skip_requesting_account_id=true"
|
||||
|
||||
- name: Setup SSH Keys
|
||||
run: |
|
||||
mkdir -p ~/.ssh
|
||||
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
||||
chmod 600 ~/.ssh/id_ed25519
|
||||
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
||||
chmod 644 ~/.ssh/id_ed25519.pub
|
||||
|
||||
- name: Install jq
|
||||
run: |
|
||||
apt-get update
|
||||
apt-get install -y jq
|
||||
|
||||
- name: Terraform Destroy
|
||||
id: destroy
|
||||
working-directory: terraform
|
||||
@@ -152,7 +62,6 @@ jobs:
|
||||
for attempt in 1 2 3; do
|
||||
echo "Terraform destroy attempt ${attempt}/3"
|
||||
terraform destroy \
|
||||
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" \
|
||||
-auto-approve
|
||||
@@ -164,32 +73,13 @@ jobs:
|
||||
echo "Terraform destroy failed with exit code ${rc}; retrying in 30s"
|
||||
sleep 30
|
||||
terraform refresh \
|
||||
-var="hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
|
||||
-var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
|
||||
-var="ssh_private_key=$HOME/.ssh/id_ed25519" || true
|
||||
fi
|
||||
done
|
||||
exit "$rc"
|
||||
|
||||
- name: Hetzner destroy diagnostics
|
||||
- name: Terraform state diagnostics
|
||||
if: failure() && steps.destroy.outcome == 'failure'
|
||||
env:
|
||||
HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
|
||||
run: |
|
||||
set +e
|
||||
echo "== Terraform state list =="
|
||||
terraform -chdir=terraform state list || true
|
||||
|
||||
network_id=$(terraform -chdir=terraform state show hcloud_network.cluster 2>/dev/null | awk '/^id *=/ {gsub(/"/, "", $3); print $3; exit}')
|
||||
if [ -z "$network_id" ]; then
|
||||
network_id="11988935"
|
||||
fi
|
||||
|
||||
echo "== Hetzner network =="
|
||||
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/networks/${network_id}" | jq . || true
|
||||
|
||||
echo "== Hetzner servers attached to network =="
|
||||
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers" | jq --argjson id "$network_id" '.servers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
|
||||
|
||||
echo "== Hetzner load balancers attached to network =="
|
||||
curl -fsSL -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/load_balancers" | jq --argjson id "$network_id" '.load_balancers[] | select(any(.private_net[]?; .network == $id)) | {id, name, private_net}' || true
|
||||
|
||||
Reference in New Issue
Block a user