feat: Add CloudNativePG with B2 backups for persistent Rancher database
Some checks failed
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s

- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
This commit is contained in:
2026-03-25 23:06:45 +00:00
parent f36445d99a
commit 9d601dc77c
17 changed files with 402 additions and 1 deletion

View File

@@ -16,13 +16,101 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
jobs:
  # Best-effort pg_dump of the Rancher database to B2 before the cluster is
  # destroyed. Every failure path inside the SSH script exits 0 so a dead or
  # half-provisioned cluster never blocks the destroy job that `needs:` this.
  pre-destroy-backup:
    name: Pre-Destroy Backup
    runs-on: ubuntu-latest
    if: github.event.inputs.confirm == 'destroy'
    environment: destroy
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      # Init against the S3-compatible remote state so we can read outputs.
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      # Export the primary control plane IP for the backup step below.
      - name: Get Control Plane IP
        id: cp_ip
        working-directory: terraform
        run: |
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"

      - name: Pre-Destroy pg_dump to B2
        run: |
          set +e  # whole step is best-effort; never fail the workflow here
          echo "Attempting pre-destroy backup to B2..."
          ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
              -o ConnectTimeout=30 root@${PRIMARY_IP} << 'EOF'
          set -e
          # Check if kubectl is available and cluster is up
          if ! command -v kubectl &> /dev/null; then
            echo "kubectl not found, skipping pre-destroy backup"
            exit 0
          fi
          # Check if we can reach the cluster
          if ! kubectl cluster-info &> /dev/null; then
            echo "Cannot reach cluster, skipping pre-destroy backup"
            exit 0
          fi
          # Check if CNP is deployed
          if ! kubectl get namespace cnpg-cluster &> /dev/null; then
            echo "CNP namespace not found, skipping pre-destroy backup"
            exit 0
          fi
          # BUGFIX: the previous version ran `kubectl run ... --restart=Never`,
          # which creates a bare Pod executing the image's default entrypoint
          # (no pg_dump, no B2 upload — BACKUP_FILE and the B2 credentials it
          # read from host paths were never used), and then waited on a Job
          # named pgdump-manual that was never created, so wait/logs/delete all
          # failed and no backup was ever taken. Instead, trigger the existing
          # weekly pg_dump CronJob so its real dump+upload logic (including its
          # in-cluster B2 credential Secret) runs as a one-off Job with the
          # name the commands below reference.
          # NOTE(review): CronJob name assumed from the repo's weekly backup
          # manifest — confirm `rancher-db-backup` matches.
          if ! kubectl get cronjob rancher-db-backup -n cnpg-cluster &> /dev/null; then
            echo "Backup CronJob not found, skipping pre-destroy backup"
            exit 0
          fi
          # Clear any leftover job from a previous aborted run.
          kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true
          kubectl create job pgdump-manual --from=cronjob/rancher-db-backup -n cnpg-cluster
          echo "Waiting for backup job to complete..."
          kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
          kubectl logs job/pgdump-manual -n cnpg-cluster --all-containers=true || true
          kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
          EOF
          echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4