feat: migrate cluster baseline from Hetzner to Proxmox

Replace the Hetzner infrastructure and cloud-provider assumptions with
Proxmox VM clones, kube-vip for control-plane API HA, and NFS-backed
storage. Update the bootstrap, Flux addons, CI workflows, and docs to
target the new private Proxmox baseline while preserving the existing
Tailscale, Doppler, Flux, Rancher, and B2 backup flows.
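
For reference, a minimal sketch of the kind of kube-vip static pod this
baseline implies on each control-plane node (typically placed under
/etc/kubernetes/manifests/); the VIP address, interface name, and image tag
are illustrative assumptions, not values from this commit:

```yaml
# Hypothetical kube-vip static pod for control-plane API HA; the VIP
# (10.0.0.10), interface (eth0), and image tag are assumptions.
apiVersion: v1
kind: Pod
metadata:
  name: kube-vip
  namespace: kube-system
spec:
  hostNetwork: true
  containers:
    - name: kube-vip
      image: ghcr.io/kube-vip/kube-vip:v0.8.0
      args: ["manager"]
      env:
        - name: vip_interface
          value: eth0
        - name: address
          value: 10.0.0.10          # shared control-plane VIP (assumed)
        - name: port
          value: "6443"
        - name: vip_arp             # layer-2 ARP mode (assumed)
          value: "true"
        - name: cp_enable           # advertise the VIP for the API server
          value: "true"
        - name: vip_leaderelection  # only one node holds the VIP at a time
          value: "true"
      securityContext:
        capabilities:
          add: ["NET_ADMIN", "NET_RAW"]
```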
2026-04-22 03:02:13 +00:00
parent 6c6b9d20ca
commit b1dae28aa5
40 changed files with 577 additions and 784 deletions
+7 -62
@@ -11,12 +11,15 @@ on:
 env:
   TF_VERSION: "1.7.0"
-  TF_VAR_hcloud_token: ${{ secrets.HCLOUD_TOKEN }}
   TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
   TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
+  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
+  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
+  TF_VAR_proxmox_insecure: "true"
   TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
   TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
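
Terraform maps any TF_VAR_<name> environment variable onto a root variable
<name> automatically, so the plan/apply steps need no extra -var flags for
the Proxmox credentials; a sketch, assuming matching variable declarations
exist in terraform/:

```yaml
# Sketch: no -var flags are needed for the Proxmox credentials, because
# Terraform maps TF_VAR_proxmox_endpoint etc. onto root variables, assuming
# terraform/ declares e.g. `variable "proxmox_endpoint" {}`.
- name: Terraform Plan
  working-directory: terraform
  run: terraform plan -input=false -out=tfplan
```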
@@ -60,40 +63,6 @@ jobs:
           echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
           chmod 644 ~/.ssh/id_ed25519.pub
-      - name: Install jq
-        run: |
-          apt-get update
-          apt-get install -y jq
-      - name: Import existing servers into state (if missing)
-        working-directory: terraform
-        env:
-          HCLOUD_TOKEN: ${{ secrets.HCLOUD_TOKEN }}
-        run: |
-          set -e
-          ensure_import() {
-            address="$1"
-            name="$2"
-            if terraform state show "$address" >/dev/null 2>&1; then
-              echo "$address already in state"
-              return
-            fi
-            id=$(curl -sS -H "Authorization: Bearer ${HCLOUD_TOKEN}" "https://api.hetzner.cloud/v1/servers?name=${name}" | jq -r '.servers[0].id // empty')
-            if [ -n "$id" ]; then
-              echo "Importing $address from server $name ($id)"
-              terraform import "$address" "$id"
-            else
-              echo "No existing server found for $name; skipping import"
-            fi
-          }
-          ensure_import 'hcloud_server.control_plane[0]' 'k8s-cluster-cp-1'
-          ensure_import 'hcloud_server.control_plane[1]' 'k8s-cluster-cp-2'
-          ensure_import 'hcloud_server.control_plane[2]' 'k8s-cluster-cp-3'
-          ensure_import 'hcloud_server.workers[0]' 'k8s-cluster-worker-1'
-          ensure_import 'hcloud_server.workers[1]' 'k8s-cluster-worker-2'
-          ensure_import 'hcloud_server.workers[2]' 'k8s-cluster-worker-3'
       - name: Terraform Plan
         id: plan
         working-directory: terraform
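
The deleted guard is worth keeping as a pattern: it makes terraform import
idempotent across re-runs. A hedged sketch of the same guard retargeted at
the bpg/proxmox provider, whose VM import IDs take the form <node>/<vmid>;
the resource addresses and IDs below are assumptions about the new layout,
not part of this commit:

```yaml
# Hedged sketch: the deleted ensure_import guard, retargeted at the
# bpg/proxmox provider. The resource address and the <node>/<vmid>
# import ID are assumptions about the new terraform/ layout.
- name: Import existing VMs into state (if missing)
  working-directory: terraform
  run: |
    set -e
    ensure_import() {
      address="$1"
      import_id="$2"  # bpg/proxmox VM imports use "<node>/<vmid>"
      if terraform state show "$address" >/dev/null 2>&1; then
        echo "$address already in state"
        return
      fi
      terraform import "$address" "$import_id"
    }
    ensure_import 'proxmox_virtual_environment_vm.control_plane[0]' 'pve/101'
```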
@@ -187,32 +156,11 @@ jobs:
           mkdir -p ../outputs
           terraform output -json > ../outputs/terraform_outputs.json
-      - name: Detect runner egress IP
-        run: |
-          RUNNER_IP=$(curl -fsSL https://api.ipify.org)
-          echo "RUNNER_CIDR=[\"${RUNNER_IP}/32\"]" >> "$GITHUB_ENV"
-          echo "Runner egress IP: ${RUNNER_IP}"
-      - name: Open SSH/API for current runner CIDR
-        working-directory: terraform
-        run: |
-          terraform apply \
-            -target=hcloud_firewall.cluster \
-            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
-            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
-            -var="allowed_ssh_ips=${RUNNER_CIDR}" \
-            -var="allowed_api_ips=${RUNNER_CIDR}" \
-            -auto-approve
       - name: Install Python Dependencies
         run: |
           apt-get update && apt-get install -y python3-pip
           pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
-      - name: Note runner connectivity mode
-        run: |
-          echo "Using runner public network access with RUNNER_ALLOWED_CIDRS for SSH/API"
       - name: Install Ansible Collections
         run: ansible-galaxy collection install -r ansible/requirements.yml
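
The collections file itself is not part of this diff; a plausible minimal
ansible/requirements.yml for a playbook driving kubectl and the Kubernetes
modules might look like this (contents assumed, not confirmed by the
commit):

```yaml
# Hypothetical ansible/requirements.yml; the real collection list is not
# shown in this diff.
collections:
  - name: kubernetes.core   # pairs with the `kubernetes` pip package above
  - name: community.general
```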
@@ -224,7 +172,6 @@ jobs:
         working-directory: ansible
         run: |
           ansible-playbook site.yml \
-            -e "hcloud_token=${{ secrets.HCLOUD_TOKEN }}" \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
@@ -294,9 +241,8 @@ jobs:
                 key: dopplerToken
                 namespace: external-secrets
           EOF
-          # Wait for CCM and CSI (Hetzner cloud integration)
-          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-ccm --timeout=600s
-          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-csi --timeout=600s
+          # Wait for the storage layer and private access components
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
       - name: Wait for Rancher and backup operator
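
The addon-nfs-storage kustomization is what replaces the Hetzner CSI
driver; a hedged sketch of the StorageClass it plausibly installs, matching
the flash-nfs name that the verification step checks further down (the
provisioner string and parameters are assumptions):

```yaml
# Hedged sketch of what addon-nfs-storage plausibly ships: a StorageClass
# named flash-nfs. Provisioner string and parameters are assumptions,
# not commit contents.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: flash-nfs
provisioner: cluster.local/nfs-subdir-external-provisioner
parameters:
  archiveOnDelete: "false"
reclaimPolicy: Delete
volumeBindingMode: Immediate
```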
@@ -397,10 +343,9 @@ jobs:
         working-directory: ansible
         run: |
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
-          ansible -i inventory.ini 'control_plane[0]' -m shell -a "kubectl describe nodes | grep -E 'Name:|providerID:'"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
-          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass"
+          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
           ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
       env: