From 9d601dc77cd1243d2b25cde6fe661dfc2736f6c8 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Wed, 25 Mar 2026 23:06:45 +0000
Subject: [PATCH] feat: Add CloudNativePG with B2 backups for persistent Rancher database

- Add Local Path Provisioner for storage
- Add CloudNativePG operator (v1.27.0) via Flux
- Create PostgreSQL cluster with B2 (Backblaze) auto-backup/restore
- Update Rancher to use external PostgreSQL via CATTLE_DB_CATTLE_* env vars
- Add weekly pg_dump CronJob to B2 (Sundays 2AM)
- Add pre-destroy backup hook to destroy workflow
- Add B2 credentials to Doppler (B2_ACCOUNT_ID, B2_APPLICATION_KEY)
- Generate RANCHER_DB_PASSWORD in Doppler

Backup location: HetznerTerra/rancher-backups/
Retention: 14 days
---
 .gitea/workflows/destroy.yml                  | 84 +++++++++++++++
 .../cnpg/b2-credentials-externalsecret.yaml   | 25 +++++
 .../addons/cnpg/cnpg-cluster-rw-svc.yaml      | 20 ++++
 .../addons/cnpg/helmrelease-cnpg.yaml         | 25 +++++
 .../addons/cnpg/helmrepository-cnpg.yaml      |  8 ++
 infrastructure/addons/cnpg/kustomization.yaml | 11 +++
 infrastructure/addons/cnpg/namespace.yaml     |  4 +
 .../addons/cnpg/pgdump-cronjob.yaml           | 96 +++++++++++++++++++
 .../addons/cnpg/postgres-cluster.yaml         | 82 ++++++++++++++++
 .../rancher-db-password-externalsecret.yaml   | 24 +++++
 infrastructure/addons/kustomization-cnpg.yaml |  4 +
 infrastructure/addons/kustomization-lpp.yaml  |  4 +
 infrastructure/addons/kustomization.yaml      |  2 +
 .../helmrelease-local-path-provisioner.yaml   | 33 +++++++
 .../addons/lpp/helmrepository-local-path.yaml |  8 ++
 infrastructure/addons/lpp/kustomization.yaml  |  5 +
 .../addons/rancher/helmrelease-rancher.yaml   | 15 +++
 17 files changed, 450 insertions(+)
 create mode 100644 infrastructure/addons/cnpg/b2-credentials-externalsecret.yaml
 create mode 100644 infrastructure/addons/cnpg/cnpg-cluster-rw-svc.yaml
 create mode 100644 infrastructure/addons/cnpg/helmrelease-cnpg.yaml
 create mode 100644 infrastructure/addons/cnpg/helmrepository-cnpg.yaml
 create mode 100644 infrastructure/addons/cnpg/kustomization.yaml
 create mode 100644 infrastructure/addons/cnpg/namespace.yaml
 create mode 100644 infrastructure/addons/cnpg/pgdump-cronjob.yaml
 create mode 100644 infrastructure/addons/cnpg/postgres-cluster.yaml
 create mode 100644 infrastructure/addons/cnpg/rancher-db-password-externalsecret.yaml
 create mode 100644 infrastructure/addons/kustomization-cnpg.yaml
 create mode 100644 infrastructure/addons/kustomization-lpp.yaml
 create mode 100644 infrastructure/addons/lpp/helmrelease-local-path-provisioner.yaml
 create mode 100644 infrastructure/addons/lpp/helmrepository-local-path.yaml
 create mode 100644 infrastructure/addons/lpp/kustomization.yaml

diff --git a/.gitea/workflows/destroy.yml b/.gitea/workflows/destroy.yml
index e9312bf..fd44564 100644
--- a/.gitea/workflows/destroy.yml
+++ b/.gitea/workflows/destroy.yml
@@ -16,13 +16,97 @@ env:
   TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
   TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
   TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
+  B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
+  B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
 
 jobs:
+  pre-destroy-backup:
+    name: Pre-Destroy Backup
+    runs-on: ubuntu-latest
+    if: github.event.inputs.confirm == 'destroy'
+    environment: destroy
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Terraform
+        uses: hashicorp/setup-terraform@v3
+        with:
+          terraform_version: ${{ env.TF_VERSION }}
+
+      - name: Terraform Init
+        working-directory: terraform
+        run: |
+          terraform init \
+            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
+            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
+            -backend-config="region=auto" \
+            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
+            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
+            -backend-config="skip_requesting_account_id=true"
+
+      - name: Setup SSH Keys
+        run: |
+          mkdir -p ~/.ssh
+          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
+          chmod 600 ~/.ssh/id_ed25519
+          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
+          chmod 644 ~/.ssh/id_ed25519.pub
+
+      - name: Get Control Plane IP
+        id: cp_ip
+        working-directory: terraform
+        run: |
+          # Best effort: an unreadable output must not block the destroy job.
+          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip 2>/dev/null || true)
+          echo "PRIMARY_IP=${PRIMARY_IP}" >> "$GITHUB_ENV"
+
+      - name: Pre-Destroy pg_dump to B2
+        run: |
+          set +e
+          if [ -z "${PRIMARY_IP}" ]; then
+            echo "No control plane IP available, skipping pre-destroy backup"
+            exit 0
+          fi
+          echo "Attempting pre-destroy backup to B2..."
+          ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null "root@${PRIMARY_IP}" << 'EOF'
+          set -e
+          # Check if kubectl is available and cluster is up
+          if ! command -v kubectl &> /dev/null; then
+            echo "kubectl not found, skipping pre-destroy backup"
+            exit 0
+          fi
+
+          # Check if we can reach the cluster
+          if ! kubectl cluster-info &> /dev/null; then
+            echo "Cannot reach cluster, skipping pre-destroy backup"
+            exit 0
+          fi
+
+          # Check if the backup CronJob is deployed
+          if ! kubectl get cronjob pgdump-rancher -n cnpg-cluster &> /dev/null; then
+            echo "pgdump-rancher CronJob not found, skipping pre-destroy backup"
+            exit 0
+          fi
+
+          # Run the job template of the CronJob once. A real Job is required:
+          # the wait, logs and delete calls below all address job/pgdump-manual.
+          kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true
+          kubectl create job pgdump-manual --from=cronjob/pgdump-rancher -n cnpg-cluster
+
+          echo "Waiting for backup job to complete..."
+          kubectl wait --for=condition=complete job/pgdump-manual -n cnpg-cluster --timeout=300s || true
+          kubectl logs job/pgdump-manual -n cnpg-cluster --all-containers=true || true
+          kubectl delete job pgdump-manual -n cnpg-cluster --ignore-not-found=true || true
+          EOF
+          echo "Pre-destroy backup step completed (failure is non-fatal)"
+
   destroy:
     name: Destroy Cluster
     runs-on: ubuntu-latest
     if: github.event.inputs.confirm == 'destroy'
     environment: destroy
+    needs: pre-destroy-backup
     steps:
       - name: Checkout
         uses: actions/checkout@v4
diff --git a/infrastructure/addons/cnpg/b2-credentials-externalsecret.yaml b/infrastructure/addons/cnpg/b2-credentials-externalsecret.yaml
new file mode 100644
index 0000000..93b1928
--- /dev/null
+++ b/infrastructure/addons/cnpg/b2-credentials-externalsecret.yaml
@@ -0,0 +1,25 @@
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: b2-credentials
+  namespace: cnpg-cluster
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: doppler-hetznerterra
+    kind: ClusterSecretStore
+  target:
+    name: b2-credentials
+    creationPolicy: Owner
+    template:
+      type: Opaque
+      data:
+        B2_ACCOUNT_ID: "{{ .B2_ACCOUNT_ID }}"
+        B2_APPLICATION_KEY: "{{ .B2_APPLICATION_KEY }}"
+  data:
+    - secretKey: B2_ACCOUNT_ID
+      remoteRef:
+        key: B2_ACCOUNT_ID
+    - secretKey: B2_APPLICATION_KEY
+      remoteRef:
+        key: B2_APPLICATION_KEY
diff --git a/infrastructure/addons/cnpg/cnpg-cluster-rw-svc.yaml b/infrastructure/addons/cnpg/cnpg-cluster-rw-svc.yaml
new file mode 100644
index 0000000..12c011c
--- /dev/null
+++ b/infrastructure/addons/cnpg/cnpg-cluster-rw-svc.yaml
@@ -0,0 +1,20 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: cnpg-cluster-rw
+  namespace: cnpg-cluster
+  labels:
+    app.kubernetes.io/name: rancher-db
+    cnpg.io/cluster: rancher-db
+spec:
+  type: ClusterIP
+  clusterIP: None
+  ports:
+    - port: 5432
+      targetPort: 5432
+      protocol: TCP
+  # Select the current primary through the labels CloudNativePG documents
+  # for instance pods; the bare `role` label is deprecated.
+  selector:
+    cnpg.io/cluster: rancher-db
+    cnpg.io/instanceRole: primary
diff --git a/infrastructure/addons/cnpg/helmrelease-cnpg.yaml b/infrastructure/addons/cnpg/helmrelease-cnpg.yaml
new file mode 100644
index 0000000..5775079
--- /dev/null
+++ b/infrastructure/addons/cnpg/helmrelease-cnpg.yaml
@@ -0,0 +1,25 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: cnpg
+  namespace: flux-system
+spec:
+  interval: 10m
+  targetNamespace: cnpg-system
+  chart:
+    spec:
+      chart: cloudnative-pg
+      # The chart is versioned independently of the operator (0.x series);
+      # 0.26.0 is the release that ships operator v1.27.0.
+      version: "0.26.0"
+      sourceRef:
+        kind: HelmRepository
+        name: cnpg
+        namespace: flux-system
+  install:
+    createNamespace: true
+    remediation:
+      retries: 3
+  upgrade:
+    remediation:
+      retries: 3
diff --git a/infrastructure/addons/cnpg/helmrepository-cnpg.yaml b/infrastructure/addons/cnpg/helmrepository-cnpg.yaml
new file mode 100644
index 0000000..88705dc
--- /dev/null
+++ b/infrastructure/addons/cnpg/helmrepository-cnpg.yaml
@@ -0,0 +1,8 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: cnpg
+  namespace: flux-system
+spec:
+  interval: 1h
+  url: https://cloudnative-pg.github.io/charts
diff --git a/infrastructure/addons/cnpg/kustomization.yaml b/infrastructure/addons/cnpg/kustomization.yaml
new file mode 100644
index 0000000..370703d
--- /dev/null
+++ b/infrastructure/addons/cnpg/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - helmrepository-cnpg.yaml
+  - helmrelease-cnpg.yaml
+  - namespace.yaml
+  - b2-credentials-externalsecret.yaml
+  - rancher-db-password-externalsecret.yaml
+  - postgres-cluster.yaml
+  - cnpg-cluster-rw-svc.yaml
+  - pgdump-cronjob.yaml
diff --git a/infrastructure/addons/cnpg/namespace.yaml b/infrastructure/addons/cnpg/namespace.yaml
new file mode 100644
index 0000000..385a070
--- /dev/null
+++ b/infrastructure/addons/cnpg/namespace.yaml
@@ -0,0 +1,4 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: cnpg-cluster
diff --git a/infrastructure/addons/cnpg/pgdump-cronjob.yaml b/infrastructure/addons/cnpg/pgdump-cronjob.yaml
new file mode 100644
index 0000000..6231a5f
--- /dev/null
+++ b/infrastructure/addons/cnpg/pgdump-cronjob.yaml
@@ -0,0 +1,96 @@
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: pgdump-rancher
+  namespace: cnpg-cluster
+spec:
+  schedule: "0 2 * * 0"
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 4
+  failedJobsHistoryLimit: 4
+  jobTemplate:
+    spec:
+      backoffLimit: 3
+      template:
+        spec:
+          restartPolicy: OnFailure
+          # pg_dump and the S3 client ship in different images, so the dump is
+          # written to a shared scratch volume first and uploaded afterwards.
+          # A failed dump fails the init container and nothing is uploaded.
+          initContainers:
+            - name: pgdump
+              image: ghcr.io/cloudnative-pg/postgresql:17.4
+              command:
+                - /bin/sh
+                - -c
+                - |
+                  set -e
+                  # Drop partial dumps left behind by a failed attempt.
+                  rm -f /backup/*.sql.gz
+                  BACKUP_FILE="rancher-backup-$(date +%Y%m%d-%H%M%S).sql.gz"
+                  pg_dump -h cnpg-cluster-rw.cnpg-cluster.svc -U postgres -d postgres \
+                    --no-owner --clean --compress=9 --file="/backup/$BACKUP_FILE"
+                  echo "Dump written: $BACKUP_FILE"
+              env:
+                - name: PGPASSWORD
+                  valueFrom:
+                    secretKeyRef:
+                      name: rancher-db-password
+                      key: password
+              volumeMounts:
+                - name: backup
+                  mountPath: /backup
+              resources:
+                requests:
+                  cpu: 100m
+                  memory: 128Mi
+                limits:
+                  cpu: 500m
+                  memory: 512Mi
+          containers:
+            - name: upload
+              image: amazon/aws-cli:2.17.0
+              # The AWS CLI does not read AWS_ENDPOINT; the B2 endpoint has to
+              # be passed explicitly or the upload is sent to AWS itself.
+              command:
+                - aws
+                - s3
+                - cp
+                - /backup/
+                - s3://HetznerTerra/rancher-backups/
+                - --recursive
+                - --endpoint-url
+                - https://s3.us-east-005.backblazeb2.com
+              env:
+                - name: AWS_ACCESS_KEY_ID
+                  valueFrom:
+                    secretKeyRef:
+                      name: b2-credentials
+                      key: B2_ACCOUNT_ID
+                - name: AWS_SECRET_ACCESS_KEY
+                  valueFrom:
+                    secretKeyRef:
+                      name: b2-credentials
+                      key: B2_APPLICATION_KEY
+                - name: AWS_DEFAULT_REGION
+                  value: us-east-005
+              volumeMounts:
+                - name: backup
+                  mountPath: /backup
+                  readOnly: true
+              resources:
+                requests:
+                  cpu: 100m
+                  memory: 128Mi
+                limits:
+                  cpu: 500m
+                  memory: 512Mi
+          volumes:
+            - name: backup
+              emptyDir: {}
+          nodeSelector:
+            kubernetes.io/hostname: k8s-cluster-cp-1
+          tolerations:
+            - key: node-role.kubernetes.io/control-plane
+              operator: Exists
+              effect: NoSchedule
diff --git a/infrastructure/addons/cnpg/postgres-cluster.yaml b/infrastructure/addons/cnpg/postgres-cluster.yaml
new file mode 100644
index 0000000..1e4cc95
--- /dev/null
+++ b/infrastructure/addons/cnpg/postgres-cluster.yaml
@@ -0,0 +1,82 @@
+apiVersion: postgresql.cnpg.io/v1
+kind: Cluster
+metadata:
+  name: rancher-db
+  namespace: cnpg-cluster
+spec:
+  description: "Rancher external database cluster"
+  imageName: ghcr.io/cloudnative-pg/postgresql:17.4
+  imagePullPolicy: IfNotPresent
+
+  instances: 1
+  primaryUpdateStrategy: unsupervised
+
+  storage:
+    storageClass: local-path
+    size: 50Gi
+
+  resources:
+    requests:
+      cpu: 250m
+      memory: 512Mi
+    limits:
+      cpu: 1000m
+      memory: 2Gi
+
+  # NOTE(review): recovery needs an existing base backup, so the very first
+  # install has to bootstrap with initdb instead - confirm the rollout plan.
+  bootstrap:
+    recovery:
+      source: b2-backup
+
+  # NOTE(review): recovery reads from the folder named after the external
+  # cluster unless barmanObjectStore.serverName is set, and WAL archiving
+  # expects an empty target folder - confirm the intended bucket layout.
+  externalClusters:
+    - name: b2-backup
+      barmanObjectStore:
+        destinationPath: s3://HetznerTerra/rancher-backups/
+        endpointURL: https://s3.us-east-005.backblazeb2.com
+        s3Credentials:
+          accessKeyId:
+            name: b2-credentials
+            key: B2_ACCOUNT_ID
+          secretAccessKey:
+            name: b2-credentials
+            key: B2_APPLICATION_KEY
+
+  backup:
+    barmanObjectStore:
+      destinationPath: s3://HetznerTerra/rancher-backups/
+      endpointURL: https://s3.us-east-005.backblazeb2.com
+      s3Credentials:
+        accessKeyId:
+          name: b2-credentials
+          key: B2_ACCOUNT_ID
+        secretAccessKey:
+          name: b2-credentials
+          key: B2_APPLICATION_KEY
+    # Retention is a time window (<n>d, <n>w or <n>m), not a backup count.
+    retentionPolicy: "14d"
+
+  serviceAccountTemplate:
+    metadata:
+      labels:
+        app.kubernetes.io/name: rancher-db
+
+  # Superuser access is off by default; it has to be enabled for the secret
+  # below to be applied to the postgres user.
+  enableSuperuserAccess: true
+  superuserSecret:
+    name: rancher-db-password
+
+  monitoring:
+    enablePodMonitor: true
+
+  affinity:
+    nodeSelector:
+      kubernetes.io/hostname: k8s-cluster-cp-1
+    tolerations:
+      - key: node-role.kubernetes.io/control-plane
+        operator: Exists
+        effect: NoSchedule
diff --git a/infrastructure/addons/cnpg/rancher-db-password-externalsecret.yaml b/infrastructure/addons/cnpg/rancher-db-password-externalsecret.yaml
new file mode 100644
index 0000000..eaf2b24
--- /dev/null
+++ b/infrastructure/addons/cnpg/rancher-db-password-externalsecret.yaml
@@ -0,0 +1,24 @@
+apiVersion: external-secrets.io/v1
+kind: ExternalSecret
+metadata:
+  name: rancher-db-password
+  namespace: cnpg-cluster
+spec:
+  refreshInterval: 1h
+  secretStoreRef:
+    name: doppler-hetznerterra
+    kind: ClusterSecretStore
+  target:
+    name: rancher-db-password
+    creationPolicy: Owner
+    template:
+      # CloudNativePG expects both `username` and `password` in the secret
+      # referenced by superuserSecret.
+      type: kubernetes.io/basic-auth
+      data:
+        username: postgres
+        password: "{{ .RANCHER_DB_PASSWORD }}"
+  data:
+    - secretKey: RANCHER_DB_PASSWORD
+      remoteRef:
+        key: RANCHER_DB_PASSWORD
diff --git a/infrastructure/addons/kustomization-cnpg.yaml b/infrastructure/addons/kustomization-cnpg.yaml
new file mode 100644
index 0000000..9cdcf35
--- /dev/null
+++ b/infrastructure/addons/kustomization-cnpg.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - cnpg
diff --git a/infrastructure/addons/kustomization-lpp.yaml b/infrastructure/addons/kustomization-lpp.yaml
new file mode 100644
index 0000000..99d4dc0
--- /dev/null
+++ b/infrastructure/addons/kustomization-lpp.yaml
@@ -0,0 +1,4 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - lpp
diff --git a/infrastructure/addons/kustomization.yaml b/infrastructure/addons/kustomization.yaml
index 8c47371..4a88589 100644
--- a/infrastructure/addons/kustomization.yaml
+++ b/infrastructure/addons/kustomization.yaml
@@ -10,4 +10,6 @@ resources:
   - kustomization-flux-ui.yaml
   - kustomization-observability.yaml
   - kustomization-observability-content.yaml
+  - kustomization-lpp.yaml
+  - kustomization-cnpg.yaml
   - kustomization-rancher.yaml
diff --git a/infrastructure/addons/lpp/helmrelease-local-path-provisioner.yaml b/infrastructure/addons/lpp/helmrelease-local-path-provisioner.yaml
new file mode 100644
index 0000000..10728d2
--- /dev/null
+++ b/infrastructure/addons/lpp/helmrelease-local-path-provisioner.yaml
@@ -0,0 +1,33 @@
+apiVersion: helm.toolkit.fluxcd.io/v2
+kind: HelmRelease
+metadata:
+  name: local-path-provisioner
+  namespace: flux-system
+spec:
+  interval: 10m
+  targetNamespace: kube-system
+  chart:
+    spec:
+      chart: local-path-provisioner
+      version: 1.12.1
+      sourceRef:
+        kind: HelmRepository
+        name: local-path
+        namespace: flux-system
+  install:
+    createNamespace: true
+    remediation:
+      retries: 3
+  upgrade:
+    remediation:
+      retries: 3
+  values:
+    nodePathMap:
+      # `node` takes a node name; this sentinel applies the paths to every
+      # node that has no entry of its own.
+      - node: DEFAULT_PATH_FOR_NON_LISTED_NODES
+        paths:
+          - /var/lib/rancher/k3s/storage
+    storageClass:
+      defaultClass: true
+      name: local-path
diff --git a/infrastructure/addons/lpp/helmrepository-local-path.yaml b/infrastructure/addons/lpp/helmrepository-local-path.yaml
new file mode 100644
index 0000000..9e18ae6
--- /dev/null
+++ b/infrastructure/addons/lpp/helmrepository-local-path.yaml
@@ -0,0 +1,8 @@
+apiVersion: source.toolkit.fluxcd.io/v1
+kind: HelmRepository
+metadata:
+  name: local-path
+  namespace: flux-system
+spec:
+  interval: 1h
+  url: https://charts.rancher.io
diff --git a/infrastructure/addons/lpp/kustomization.yaml b/infrastructure/addons/lpp/kustomization.yaml
new file mode 100644
index 0000000..c5a73bd
--- /dev/null
+++ b/infrastructure/addons/lpp/kustomization.yaml
@@ -0,0 +1,5 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+resources:
+  - helmrepository-local-path.yaml
+  - helmrelease-local-path-provisioner.yaml
diff --git a/infrastructure/addons/rancher/helmrelease-rancher.yaml b/infrastructure/addons/rancher/helmrelease-rancher.yaml
index a42fcbb..383c2a9 100644
--- a/infrastructure/addons/rancher/helmrelease-rancher.yaml
+++ b/infrastructure/addons/rancher/helmrelease-rancher.yaml
@@ -26,6 +26,21 @@ spec:
     tls: external
     replicas: 1
     extraEnv:
+      - name: CATTLE_DB_CATTLE_HOST
+        value: cnpg-cluster-rw.cnpg-cluster.svc
+      - name: CATTLE_DB_CATTLE_PORT
+        value: "5432"
+      - name: CATTLE_DB_CATTLE_DATABASE
+        value: postgres
+      - name: CATTLE_DB_CATTLE_USERNAME
+        value: postgres
+      # NOTE(review): secretKeyRef resolves only in the pod's own namespace, and
+      # rancher-db-password is created in cnpg-cluster - confirm Rancher sees it.
+      - name: CATTLE_DB_CATTLE_PASSWORD
+        valueFrom:
+          secretKeyRef:
+            name: rancher-db-password
+            key: password
       - name: CATTLE_PROMETHEUS_METRICS
         value: "true"
       resources: