feat: Automate Rancher backup restore in CI pipeline
Some checks failed
Deploy Cluster / Terraform (push) Successful in 2m18s
Deploy Cluster / Ansible (push) Failing after 6m28s

- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
This commit is contained in:
2026-03-30 01:56:29 +00:00
parent a1f07f863a
commit 8c7b62c024

View File

@@ -241,6 +241,12 @@ jobs:
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl chmod +x /usr/local/bin/kubectl
- name: Install flux CLI
  run: |
    # Fail the step if the download fails, even though curl feeds a pipe.
    set -euo pipefail
    # Pin the release tag so it always matches the versioned asset name.
    # The "releases/latest/download/" path only resolves while the newest
    # release is exactly this version; after the next release it is a 404.
    FLUX_VERSION="2.5.1"
    curl -fsSL "https://github.com/fluxcd/flux2/releases/download/v${FLUX_VERSION}/flux_${FLUX_VERSION}_linux_amd64.tar.gz" | tar xz -C /tmp
    mv /tmp/flux /usr/local/bin/flux
    chmod +x /usr/local/bin/flux
- name: Rewrite kubeconfig for runner-reachable API - name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform working-directory: terraform
run: | run: |
@@ -302,6 +308,95 @@ jobs:
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s # kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
- name: Wait for Rancher and fix backup operator
  env:
    KUBECONFIG: outputs/kubeconfig
  run: |
    set -euo pipefail

    # Log a failed best-effort command without failing the step.
    warn() {
      echo "WARNING: $*" >&2
    }

    # Rancher itself is required: a failed wait here fails the step.
    echo "Waiting for Rancher..."
    kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
    kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=300s

    # Everything below is best-effort: each command may fail without
    # aborting the step, but the failure is reported instead of being
    # swallowed silently.
    echo "Waiting for rancher-backup operator..."
    kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s \
      || warn "kustomization/addon-rancher-backup is not Ready; continuing"

    echo "Patching default SA in cattle-resources-system..."
    kubectl patch serviceaccount default -n cattle-resources-system -p '{"automountServiceAccountToken": false}' \
      || warn "could not patch default ServiceAccount in cattle-resources-system; continuing"

    echo "Cleaning up failed patch-sa jobs..."
    kubectl delete job -n cattle-resources-system rancher-backup-patch-sa --ignore-not-found=true \
      || warn "could not delete job rancher-backup-patch-sa; continuing"

    # NOTE(review): this triggers a normal reconcile; if a forced Helm
    # upgrade is intended, confirm whether `--force` should be passed.
    echo "Force reconciling rancher-backup HelmRelease..."
    flux reconcile helmrelease rancher-backup -n flux-system --timeout=5m \
      || warn "reconcile of helmrelease/rancher-backup failed; continuing"
- name: Restore Rancher from latest B2 backup
  env:
    KUBECONFIG: outputs/kubeconfig
    B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
    B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
  run: |
    set -euo pipefail

    # json_get KEY [KEY...]
    # Reads a JSON document on stdin, follows the given keys and prints the
    # value. Prints an empty line when a key is missing or the value is null.
    json_get() {
      python3 -c '
    import json, sys
    value = json.load(sys.stdin)
    for key in sys.argv[1:]:
        value = value.get(key) if isinstance(value, dict) else None
    print("" if value is None else value)
    ' "$@"
    }

    echo "Finding latest backup in B2..."
    # curl -u builds the Basic auth header itself, which avoids line-wrapped
    # base64 output for long credentials. -f makes an HTTP error fail the
    # step here instead of surfacing later as a JSON KeyError.
    AUTH_RESP=$(curl -fsS -u "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
    API_URL=$(printf '%s' "$AUTH_RESP" | json_get apiUrl)
    AUTH_TOKEN=$(printf '%s' "$AUTH_RESP" | json_get authorizationToken)
    BUCKET_ID=$(printf '%s' "$AUTH_RESP" | json_get allowed bucketId)
    if [ -z "$API_URL" ] || [ -z "$AUTH_TOKEN" ]; then
      echo "B2 authorization response is missing apiUrl/authorizationToken." >&2
      exit 1
    fi
    if [ -z "$BUCKET_ID" ]; then
      # The bucket id is only present when the application key is
      # restricted to one bucket.
      echo "B2 authorization response has no allowed.bucketId; use an application key restricted to the backup bucket." >&2
      exit 1
    fi

    # Walk every page of the listing and keep the archive with the highest
    # uploadTimestamp. A single truncated page sorted by file name can miss
    # the newest backup.
    LATEST="NONE"
    LATEST_TS=-1
    START_FILE=""
    while :; do
      LIST_ARGS=(
        --data-urlencode "bucketId=${BUCKET_ID}"
        --data-urlencode "prefix=rancher-backups/"
        --data-urlencode "maxFileCount=1000"
      )
      if [ -n "$START_FILE" ]; then
        LIST_ARGS+=(--data-urlencode "startFileName=${START_FILE}")
      fi
      # The B2 native API takes the raw token as the Authorization value.
      PAGE=$(curl -fsS -G -H "Authorization: ${AUTH_TOKEN}" "${LIST_ARGS[@]}" "${API_URL}/b2api/v2/b2_list_file_names")
      # Prints "<uploadTimestamp> <fileName>" for the newest .tar.gz on this
      # page, or nothing when the page holds no archive.
      CANDIDATE=$(printf '%s' "$PAGE" | python3 -c '
    import json, sys
    files = json.load(sys.stdin).get("files") or []
    tars = [f for f in files if f["fileName"].endswith(".tar.gz")]
    if tars:
        newest = max(tars, key=lambda f: f["uploadTimestamp"])
        print(newest["uploadTimestamp"], newest["fileName"])
    ')
      if [ -n "$CANDIDATE" ] && [ "${CANDIDATE%% *}" -gt "$LATEST_TS" ]; then
        LATEST_TS="${CANDIDATE%% *}"
        LATEST="${CANDIDATE#* }"
      fi
      # nextFileName is null on the last page.
      START_FILE=$(printf '%s' "$PAGE" | json_get nextFileName)
      if [ -z "$START_FILE" ]; then
        break
      fi
    done

    if [ "$LATEST" = "NONE" ]; then
      echo "No backups found in B2. Skipping restore."
      exit 0
    fi

    # The Restore CR takes the file name relative to the configured folder.
    BACKUP_FILE=$(basename "$LATEST")
    echo "Latest backup: ${BACKUP_FILE}"

    # NOTE(review): the CR name is fixed, so a Restore left over from an
    # earlier run may already report Ready=True; confirm whether it should
    # be deleted before applying.
    echo "Creating Restore CR..."
    kubectl apply -f - <<EOF
    apiVersion: resources.cattle.io/v1
    kind: Restore
    metadata:
      name: restore-from-b2
      namespace: cattle-resources-system
    spec:
      backupFilename: "${BACKUP_FILE}"
      storageLocation:
        s3:
          credentialSecretName: rancher-b2-creds
          credentialSecretNamespace: cattle-resources-system
          bucketName: HetznerTerra
          folder: rancher-backups
          endpoint: s3.us-east-005.backblazeb2.com
          region: us-east-005
    EOF

    # Poll the Ready condition for up to 60 x 10s. A timeout is reported
    # but deliberately does not fail the step.
    echo "Waiting for restore to complete..."
    for i in $(seq 1 60); do
      STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
      MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
      echo "  Restore status: ${STATUS} - ${MESSAGE}"
      if [ "$STATUS" = "True" ]; then
        echo "Restore completed successfully!"
        exit 0
      fi
      sleep 10
    done
    echo "Restore did not complete within timeout. Continuing anyway."
- name: Post-deploy cluster health checks - name: Post-deploy cluster health checks
working-directory: ansible working-directory: ansible
run: | run: |