From 8c7b62c024a558db5fde8d90266b04306a7d5cba Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Mon, 30 Mar 2026 01:56:29 +0000
Subject: [PATCH] feat: Automate Rancher backup restore in CI pipeline

- Install the flux CLI, pinned to v2.5.1, for the forced reconcile below
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
---
 .gitea/workflows/deploy.yml | 102 ++++++++++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 5c8b70d..d652d8f 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -241,6 +241,17 @@ jobs:
           curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
           chmod +x /usr/local/bin/kubectl
 
+      - name: Install flux CLI
+        run: |
+          set -euo pipefail
+          # The release tag in the path and the version in the asset name must
+          # match; ".../releases/latest/download/" would 404 as soon as a newer
+          # flux release is published.
+          FLUX_VERSION=2.5.1
+          curl -fsSL "https://github.com/fluxcd/flux2/releases/download/v${FLUX_VERSION}/flux_${FLUX_VERSION}_linux_amd64.tar.gz" | tar xz -C /tmp
+          mv /tmp/flux /usr/local/bin/flux
+          chmod +x /usr/local/bin/flux
+
       - name: Rewrite kubeconfig for runner-reachable API
         working-directory: terraform
         run: |
@@ -302,6 +313,97 @@ jobs:
           # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
           # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
 
+      - name: Wait for Rancher and fix backup operator
+        env:
+          KUBECONFIG: outputs/kubeconfig
+        run: |
+          set -euo pipefail
+          echo "Waiting for Rancher..."
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
+          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=300s
+
+          # Everything below is best-effort: each failure is logged to stderr and
+          # the step carries on instead of aborting under `set -e`.
+          echo "Waiting for rancher-backup operator..."
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || echo "WARNING: addon-rancher-backup is not Ready yet; continuing" >&2
+
+          echo "Patching default SA in cattle-resources-system..."
+          kubectl patch serviceaccount default -n cattle-resources-system -p '{"automountServiceAccountToken": false}' || echo "WARNING: could not patch default ServiceAccount; continuing" >&2
+
+          echo "Cleaning up failed patch-sa jobs..."
+          kubectl delete job -n cattle-resources-system rancher-backup-patch-sa --ignore-not-found=true || echo "WARNING: could not delete rancher-backup-patch-sa job; continuing" >&2
+
+          echo "Force reconciling rancher-backup HelmRelease..."
+          flux reconcile helmrelease rancher-backup -n flux-system --timeout=5m || echo "WARNING: rancher-backup HelmRelease did not reconcile; continuing" >&2
+
+      - name: Restore Rancher from latest B2 backup
+        env:
+          KUBECONFIG: outputs/kubeconfig
+          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
+          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
+        run: |
+          set -euo pipefail
+          echo "Finding latest backup in B2..."
+ + CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64) + AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account) + API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])") + AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])") + BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['allowed']['bucketId'])") + + LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \ + "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \ + | python3 -c " + import json,sys + files = json.load(sys.stdin)['files'] + tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')] + if not tars: + print('NONE') + else: + tars.sort() + print(tars[-1]) + ") + + if [ "$LATEST" = "NONE" ]; then + echo "No backups found in B2. Skipping restore." + exit 0 + fi + + BACKUP_FILE=$(basename "$LATEST") + echo "Latest backup: ${BACKUP_FILE}" + + echo "Creating Restore CR..." + kubectl apply -f - </dev/null || echo "Unknown") + MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "") + echo " Restore status: ${STATUS} - ${MESSAGE}" + if [ "$STATUS" = "True" ]; then + echo "Restore completed successfully!" + exit 0 + fi + sleep 10 + done + echo "Restore did not complete within timeout. Continuing anyway." + - name: Post-deploy cluster health checks working-directory: ansible run: |