feat: Add CloudNativePG with B2 backups for persistent Rancher database
Some checks failed
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 12m27s

- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 backups
This commit is contained in:
2026-03-25 23:06:45 +00:00
parent f36445d99a
commit 9d601dc77c
17 changed files with 402 additions and 1 deletions

View File

@@ -16,13 +16,101 @@ env:
TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
jobs:
pre-destroy-backup:
  # Best-effort job: trigger one last pg_dump of the database to B2 before the
  # `destroy` job (which lists this job in `needs`) tears the cluster down.
  # Every failure path in the backup itself is deliberately non-fatal so that a
  # missing, unreachable or half-built cluster can still be destroyed.
  name: Pre-Destroy Backup
  runs-on: ubuntu-latest
  if: github.event.inputs.confirm == 'destroy'
  environment: destroy
  steps:
    - name: Checkout
      uses: actions/checkout@v4
    - name: Setup Terraform
      uses: hashicorp/setup-terraform@v3
      with:
        terraform_version: ${{ env.TF_VERSION }}
    - name: Terraform Init
      working-directory: terraform
      run: |
        terraform init \
          -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
          -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
          -backend-config="region=auto" \
          -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
          -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
          -backend-config="skip_requesting_account_id=true"
    - name: Setup SSH Keys
      run: |
        mkdir -p ~/.ssh
        echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
        chmod 600 ~/.ssh/id_ed25519
        echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
        chmod 644 ~/.ssh/id_ed25519.pub
    - name: Get Control Plane IP
      id: cp_ip
      working-directory: terraform
      run: |
        # A missing output (e.g. nothing provisioned yet) must not fail this
        # job, otherwise the dependent destroy job would never run.
        if ! PRIMARY_IP="$(terraform output -raw primary_control_plane_ip 2>/dev/null)"; then
          PRIMARY_IP=""
        fi
        echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
    - name: Pre-Destroy pg_dump to B2
      run: |
        # Backup is best-effort: never let this step fail the job.
        set +e
        echo "Attempting pre-destroy backup to B2..."
        if [ -z "${PRIMARY_IP}" ]; then
          echo "No control plane IP available, skipping pre-destroy backup"
          exit 0
        fi
        # The quoted heredoc is executed verbatim by the remote shell; it uses
        # POSIX redirections only so it does not depend on the login shell
        # being bash.
        ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null \
          -o ConnectTimeout=15 "root@${PRIMARY_IP}" << 'EOF'
        set -e
        # Check if kubectl is available and cluster is up
        if ! command -v kubectl > /dev/null 2>&1; then
          echo "kubectl not found, skipping pre-destroy backup"
          exit 0
        fi
        # Check if we can reach the cluster
        if ! kubectl cluster-info > /dev/null 2>&1; then
          echo "Cannot reach cluster, skipping pre-destroy backup"
          exit 0
        fi
        # Check if CNP is deployed
        if ! kubectl get namespace cnpg-cluster > /dev/null 2>&1; then
          echo "CNP namespace not found, skipping pre-destroy backup"
          exit 0
        fi
        # The weekly CronJob already carries the image, script, DB password and
        # B2 credentials; a one-off Job is instantiated from it instead of
        # re-implementing the dump here.
        if ! kubectl get cronjob pgdump-rancher -n cnpg-cluster > /dev/null 2>&1; then
          echo "pgdump-rancher CronJob not found, skipping pre-destroy backup"
          exit 0
        fi
        # Remove a leftover Job from an earlier aborted run so create succeeds.
        kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true
        kubectl create job pgdump-manual --from=cronjob/pgdump-rancher -n cnpg-cluster
        echo "Waiting for backup job to complete..."
        kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s \
          || echo "WARNING: backup job did not complete within 300s"
        kubectl logs job/pgdump-manual -n cnpg-cluster || true
        kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
        EOF
        echo "Pre-destroy backup step completed (failure is non-fatal)"
destroy:
name: Destroy Cluster
runs-on: ubuntu-latest
if: github.event.inputs.confirm == 'destroy'
environment: destroy
needs: pre-destroy-backup
steps:
- name: Checkout
uses: actions/checkout@v4

View File

@@ -0,0 +1,25 @@
# ExternalSecret that materialises the Backblaze B2 key pair as the Secret
# `b2-credentials` in the cnpg-cluster namespace.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  namespace: cnpg-cluster
  name: b2-credentials
spec:
  # Re-read the remote values once per hour.
  refreshInterval: 1h
  secretStoreRef:
    kind: ClusterSecretStore
    name: doppler-hetznerterra
  # Remote keys to fetch; each becomes a template variable of the same name.
  data:
    - secretKey: B2_ACCOUNT_ID
      remoteRef:
        key: B2_ACCOUNT_ID
    - secretKey: B2_APPLICATION_KEY
      remoteRef:
        key: B2_APPLICATION_KEY
  target:
    name: b2-credentials
    creationPolicy: Owner
    template:
      type: Opaque
      # Keys written into the generated Secret.
      data:
        B2_ACCOUNT_ID: "{{ .B2_ACCOUNT_ID }}"
        B2_APPLICATION_KEY: "{{ .B2_APPLICATION_KEY }}"

View File

@@ -0,0 +1,19 @@
# Headless Service giving clients a stable name
# (cnpg-cluster-rw.cnpg-cluster.svc) for the primary instance of the
# `rancher-db` CloudNativePG cluster.
# NOTE(review): the operator is documented to create its own `<cluster>-rw`
# Service (here that would be `rancher-db-rw`); confirm this extra alias is
# still wanted.
apiVersion: v1
kind: Service
metadata:
  name: cnpg-cluster-rw
  namespace: cnpg-cluster
  labels:
    app.kubernetes.io/name: rancher-db
    cnpg.io/cluster: rancher-db
spec:
  type: ClusterIP
  # Headless: DNS resolves straight to the selected pod IP.
  clusterIP: None
  ports:
    - port: 5432
      targetPort: 5432
      protocol: TCP
  # Selector terms are ANDed, so every label listed must be present on the
  # pod. Only operator-managed labels are used: `cnpg.io/cluster` identifies
  # the cluster and `cnpg.io/instanceRole` replaces the deprecated bare `role`
  # label. The former `app.kubernetes.io/name: postgresql` term is dropped
  # because nothing in this repository guarantees instance pods carry it.
  selector:
    cnpg.io/cluster: rancher-db
    cnpg.io/instanceRole: primary

View File

@@ -0,0 +1,27 @@
# Flux HelmRelease installing the CloudNativePG operator into cnpg-system.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: cnpg
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: cnpg-system
  chart:
    spec:
      chart: cloudnative-pg
      # Chart version, not operator version: the cloudnative-pg chart is
      # versioned 0.x independently of the operator it ships.
      # NOTE(review): 0.26.0 is believed to bundle operator 1.27.0 — confirm
      # against the repository index before merging.
      version: "0.26.0"
      sourceRef:
        kind: HelmRepository
        name: cnpg
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  # No `values` override: the chart's `image.*` values select the *operator*
  # image, so pointing `image.repository` at ghcr.io/cloudnative-pg/postgresql
  # would run the database image as the operator. The PostgreSQL image for the
  # database itself is chosen per cluster via `spec.imageName` on the Cluster
  # resource.

View File

@@ -0,0 +1,8 @@
# Helm chart source named `cnpg` in flux-system; the index is refreshed hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: flux-system
  name: cnpg
spec:
  url: https://cloudnative-pg.github.io/charts
  interval: 1h

View File

@@ -0,0 +1,11 @@
# Kustomize manifest list for the CloudNativePG add-on.
# NOTE(review): postgres-cluster.yaml presumably holds a postgresql.cnpg.io
# custom resource whose CRD is installed by the chart in helmrelease-cnpg.yaml;
# confirm both can be applied in a single pass on a fresh cluster.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  # Operator installation
  - helmrepository-cnpg.yaml
  - helmrelease-cnpg.yaml
  # Namespace and secrets
  - namespace.yaml
  - b2-credentials-externalsecret.yaml
  - rancher-db-password-externalsecret.yaml
  # Database, its service alias and the scheduled dump
  - postgres-cluster.yaml
  - cnpg-cluster-rw-svc.yaml
  - pgdump-cronjob.yaml

View File

@@ -0,0 +1,4 @@
# Namespace holding the database-related resources (secrets, cluster, service,
# backup CronJob) that set `namespace: cnpg-cluster`.
apiVersion: v1
kind: Namespace
metadata:
  name: cnpg-cluster

View File

@@ -0,0 +1,61 @@
# Weekly logical backup: pg_dump of the `postgres` database, gzip-compressed
# and uploaded to the HetznerTerra bucket under rancher-backups/.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: pgdump-rancher
  namespace: cnpg-cluster
spec:
  # Sundays at 02:00.
  schedule: "0 2 * * 0"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 4
  failedJobsHistoryLimit: 4
  jobTemplate:
    spec:
      backoffLimit: 3
      template:
        spec:
          restartPolicy: OnFailure
          containers:
            - name: pgdump
              # NOTE(review): this image must provide both `pg_dump` (at least
              # the server's major version) and the `aws` CLI — confirm it
              # does, and pin a tag or digest instead of `latest`.
              image: ghcr.io/cloudnative-pg/pgbackrest:latest
              command:
                - /bin/sh
                - -c
                - |
                  set -e
                  export AWS_ACCESS_KEY_ID=$(cat /etc/b2/credentials/B2_ACCOUNT_ID)
                  export AWS_SECRET_ACCESS_KEY=$(cat /etc/b2/credentials/B2_APPLICATION_KEY)
                  # Region used for request signing; matches the endpoint host.
                  export AWS_DEFAULT_REGION=us-east-005
                  # The AWS CLI does not read AWS_ENDPOINT, so the endpoint is
                  # passed explicitly with --endpoint-url below.
                  B2_ENDPOINT=https://s3.us-east-005.backblazeb2.com
                  BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
                  # Dump to a file first instead of piping into the upload:
                  # with plain `set -e` only the last command of a pipeline is
                  # checked, so a failed pg_dump would still upload a truncated
                  # archive. --compress on plain-format output yields gzip.
                  pg_dump -h cnpg-cluster-rw.cnpg-cluster.svc -U postgres -d postgres \
                    --no-owner --clean --compress=6 --file="/backup/${BACKUP_FILE}"
                  aws s3 cp "/backup/${BACKUP_FILE}" \
                    "s3://HetznerTerra/rancher-backups/${BACKUP_FILE}" \
                    --endpoint-url "${B2_ENDPOINT}"
                  echo "Backup completed: $BACKUP_FILE"
              env:
                # Read by pg_dump for password authentication.
                - name: PGPASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: rancher-db-password
                      key: password
              volumeMounts:
                - name: b2-credentials
                  mountPath: /etc/b2/credentials
                  readOnly: true
                # Scratch space for the dump file before upload.
                - name: scratch
                  mountPath: /backup
              resources:
                requests:
                  cpu: 100m
                  memory: 128Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          volumes:
            - name: b2-credentials
              secret:
                secretName: b2-credentials
            - name: scratch
              emptyDir: {}
          nodeSelector:
            kubernetes.io/hostname: k8s-cluster-cp-1
          tolerations:
            - key: node-role.kubernetes.io/control-plane
              operator: Exists
              effect: NoSchedule

View File

@@ -0,0 +1,70 @@
# Single-instance CloudNativePG cluster, restored from and backed up to the
# Backblaze B2 bucket HetznerTerra (prefix rancher-backups/).
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: rancher-db
  namespace: cnpg-cluster
spec:
  description: "Rancher external database cluster"
  imageName: ghcr.io/cloudnative-pg/postgresql:17.4
  imagePullPolicy: IfNotPresent
  instances: 1
  primaryUpdateStrategy: unsupervised
  # Clients in this repository connect as `postgres`, so superuser network
  # access must be enabled for the password in `superuserSecret` to apply.
  # NOTE(review): the referenced Secret is expected to carry both `username`
  # and `password` keys — confirm against the ExternalSecret that produces it.
  enableSuperuserAccess: true
  superuserSecret:
    name: rancher-db-password
  storage:
    storageClass: local-path
    size: 50Gi
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 2Gi
  # Bootstrap by restoring the most recent base backup found in the object
  # store described by the `b2-backup` external cluster.
  # NOTE(review): recovery bootstrap fails when no backup exists yet, so the
  # very first deployment presumably needs `bootstrap.initdb` instead — confirm.
  bootstrap:
    recovery:
      source: b2-backup
  externalClusters:
    - name: b2-backup
      barmanObjectStore:
        # Folder of the cluster whose backups are restored; without this the
        # external cluster's own name (`b2-backup`) would be used.
        serverName: rancher-db
        destinationPath: s3://HetznerTerra/rancher-backups/
        endpointURL: https://s3.us-east-005.backblazeb2.com
        s3Credentials:
          accessKeyId:
            name: b2-credentials
            key: B2_ACCOUNT_ID
          secretAccessKey:
            name: b2-credentials
            key: B2_APPLICATION_KEY
  backup:
    # NOTE(review): this writes to the same path and (default) server name the
    # recovery source reads from; CloudNativePG's recovery docs advise against
    # reusing a non-empty archive — consider a distinct `serverName` per
    # rebuild and verify.
    barmanObjectStore:
      destinationPath: s3://HetznerTerra/rancher-backups/
      endpointURL: https://s3.us-east-005.backblazeb2.com
      s3Credentials:
        accessKeyId:
          name: b2-credentials
          key: B2_ACCOUNT_ID
        secretAccessKey:
          name: b2-credentials
          key: B2_APPLICATION_KEY
    # Retention is a time window (<n>d, <n>w or <n>m), not a backup count.
    retentionPolicy: "14d"
  serviceAccountTemplate:
    metadata:
      labels:
        app.kubernetes.io/name: rancher-db
  monitoring:
    enablePodMonitor: true
  # Pin the instance to the first control-plane node and tolerate its taint.
  affinity:
    nodeSelector:
      kubernetes.io/hostname: k8s-cluster-cp-1
    tolerations:
      - key: node-role.kubernetes.io/control-plane
        operator: Exists
        effect: NoSchedule

View File

@@ -0,0 +1,21 @@
# ExternalSecret that materialises the database superuser credentials as the
# Secret `rancher-db-password` in the cnpg-cluster namespace.
apiVersion: external-secrets.io/v1
kind: ExternalSecret
metadata:
  name: rancher-db-password
  namespace: cnpg-cluster
spec:
  refreshInterval: 1h
  secretStoreRef:
    name: doppler-hetznerterra
    kind: ClusterSecretStore
  target:
    name: rancher-db-password
    creationPolicy: Owner
    template:
      type: Opaque
      data:
        # This Secret is referenced as the CloudNativePG `superuserSecret`,
        # which is read as a username/password pair; the `password` key is kept
        # unchanged for consumers that reference it directly.
        username: postgres
        password: "{{ .RANCHER_DB_PASSWORD }}"
  data:
    - secretKey: RANCHER_DB_PASSWORD
      remoteRef:
        key: RANCHER_DB_PASSWORD

View File

@@ -0,0 +1,4 @@
# Kustomize entry point whose only resource is `cnpg`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - cnpg

View File

@@ -0,0 +1,4 @@
# Kustomize entry point whose only resource is `lpp`.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - lpp

View File

@@ -10,4 +10,6 @@ resources:
- kustomization-flux-ui.yaml
- kustomization-observability.yaml
- kustomization-observability-content.yaml
- kustomization-rancher.yaml
- kustomization-lpp.yaml
- kustomization-cnpg.yaml
- kustomization-rancher.yaml

View File

@@ -0,0 +1,31 @@
# Flux HelmRelease installing the Local Path Provisioner into kube-system and
# registering `local-path` as the default StorageClass.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: local-path-provisioner
  namespace: flux-system
spec:
  interval: 10m
  targetNamespace: kube-system
  chart:
    spec:
      chart: local-path-provisioner
      # NOTE(review): confirm the `local-path` HelmRepository actually
      # publishes this chart at this version.
      version: 1.12.1
      sourceRef:
        kind: HelmRepository
        name: local-path
        namespace: flux-system
  install:
    createNamespace: true
    remediation:
      retries: 3
  upgrade:
    remediation:
      retries: 3
  values:
    nodePathMap:
      # `node` is matched against node names, not paths. The sentinel below is
      # the provisioner's catch-all entry, so every node stores volumes under
      # the listed path.
      - node: DEFAULT_PATH_FOR_NON_LISTED_NODES
        paths:
          - /var/lib/rancher/k3s/storage
    storageClass:
      defaultClass: true
      name: local-path

View File

@@ -0,0 +1,8 @@
# Helm chart source named `local-path` in flux-system; the index is refreshed
# hourly.
apiVersion: source.toolkit.fluxcd.io/v1
kind: HelmRepository
metadata:
  namespace: flux-system
  name: local-path
spec:
  url: https://charts.rancher.io
  interval: 1h

View File

@@ -0,0 +1,5 @@
# Kustomize manifest list for the Local Path Provisioner add-on: the chart
# source followed by the release that consumes it.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - helmrepository-local-path.yaml
  - helmrelease-local-path-provisioner.yaml

View File

@@ -26,6 +26,19 @@ spec:
tls: external
replicas: 1
extraEnv:
- name: CATTLE_DB_CATTLE_HOST
value: cnpg-cluster-rw.cnpg-cluster.svc
- name: CATTLE_DB_CATTLE_PORT
value: "5432"
- name: CATTLE_DB_CATTLE_DATABASE
value: postgres
- name: CATTLE_DB_CATTLE_USERNAME
value: postgres
- name: CATTLE_DB_CATTLE_PASSWORD
valueFrom:
secretKeyRef:
name: rancher-db-password
key: password
- name: CATTLE_PROMETHEUS_METRICS
value: "true"
resources: