feat: Automate Rancher backup restore in CI pipeline
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
This commit is contained in:
@@ -241,6 +241,12 @@ jobs:
|
||||
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
||||
chmod +x /usr/local/bin/kubectl
|
||||
|
||||
- name: Install flux CLI
|
||||
run: |
|
||||
# Pin the release tag in the URL: "releases/latest/download/" only serves assets of the newest release, so the version-named 2.5.1 tarball would 404 once a newer flux is published.
curl -fsSL https://github.com/fluxcd/flux2/releases/download/v2.5.1/flux_2.5.1_linux_amd64.tar.gz | tar xz -C /tmp
|
||||
mv /tmp/flux /usr/local/bin/flux
|
||||
chmod +x /usr/local/bin/flux
|
||||
|
||||
- name: Rewrite kubeconfig for runner-reachable API
|
||||
working-directory: terraform
|
||||
run: |
|
||||
@@ -302,6 +308,95 @@ jobs:
|
||||
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||
|
||||
- name: Wait for Rancher and fix backup operator
|
||||
env:
|
||||
KUBECONFIG: outputs/kubeconfig
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "Waiting for Rancher..."
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
|
||||
# Block until the rancher HelmRelease reports Ready (namespace flag given once; the original repeated it).
kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=300s
|
||||
|
||||
echo "Waiting for rancher-backup operator..."
|
||||
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
|
||||
|
||||
echo "Patching default SA in cattle-resources-system..."
|
||||
kubectl patch serviceaccount default -n cattle-resources-system -p '{"automountServiceAccountToken": false}' || true
|
||||
|
||||
echo "Cleaning up failed patch-sa jobs..."
|
||||
kubectl delete job -n cattle-resources-system rancher-backup-patch-sa --ignore-not-found=true || true
|
||||
|
||||
echo "Force reconciling rancher-backup HelmRelease..."
|
||||
flux reconcile helmrelease rancher-backup -n flux-system --timeout=5m || true
|
||||
|
||||
- name: Restore Rancher from latest B2 backup
|
||||
env:
|
||||
KUBECONFIG: outputs/kubeconfig
|
||||
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
|
||||
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
echo "Finding latest backup in B2..."
|
||||
|
||||
# Build the HTTP Basic credential. base64 wraps long output, and a newline inside the value would corrupt the Authorization header, so strip newlines; printf avoids echo's non-portable -n handling.
CREDS=$(printf '%s' "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64 | tr -d '\n')
|
||||
# Authorize against B2. -f makes curl exit non-zero on an HTTP error (e.g. rejected credentials) so the step stops here instead of failing later while parsing an error body.
AUTH_RESP=$(curl -fsS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
|
||||
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
|
||||
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
|
||||
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['allowed']['bucketId'])")
|
||||
|
||||
# Find the newest backup object under rancher-backups/ and store its name in LATEST
# ("NONE" when no .tar.gz object exists).
# The listing is paginated: B2 returns names in ascending order, so reading only the
# first page would hide the newest backups once the prefix holds more objects than
# one page. Follow nextFileName until the listing is exhausted.
# Inputs are passed through the environment so nothing is interpolated into the Python source.
LATEST=$(API_URL="${API_URL}" AUTH_TOKEN="${AUTH_TOKEN}" BUCKET_ID="${BUCKET_ID}" python3 -c "
import json, os, urllib.parse, urllib.request

endpoint = os.environ['API_URL'] + '/b2api/v2/b2_list_file_names'
headers = {'Authorization': 'Bearer ' + os.environ['AUTH_TOKEN']}
params = {
    'bucketId': os.environ['BUCKET_ID'],
    'prefix': 'rancher-backups/',
    'maxFileCount': '1000',
}

tars = []
while True:
    req = urllib.request.Request(endpoint + '?' + urllib.parse.urlencode(params), headers=headers)
    # urlopen raises on HTTP errors, which fails the step under set -e.
    with urllib.request.urlopen(req, timeout=60) as resp:
        page = json.load(resp)
    tars += [f['fileName'] for f in page['files'] if f['fileName'].endswith('.tar.gz')]
    # nextFileName is null on the last page.
    if not page.get('nextFileName'):
        break
    params['startFileName'] = page['nextFileName']

# Same selection rule as before: the lexicographically last name wins.
print(max(tars) if tars else 'NONE')
")
|
||||
|
||||
if [ "$LATEST" = "NONE" ]; then
|
||||
echo "No backups found in B2. Skipping restore."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
BACKUP_FILE=$(basename "$LATEST")
|
||||
echo "Latest backup: ${BACKUP_FILE}"
|
||||
|
||||
echo "Creating Restore CR..."
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: resources.cattle.io/v1
|
||||
kind: Restore
|
||||
metadata:
|
||||
name: restore-from-b2
|
||||
namespace: cattle-resources-system
|
||||
spec:
|
||||
backupFilename: ${BACKUP_FILE}
|
||||
storageLocation:
|
||||
s3:
|
||||
credentialSecretName: rancher-b2-creds
|
||||
credentialSecretNamespace: cattle-resources-system
|
||||
bucketName: HetznerTerra
|
||||
folder: rancher-backups
|
||||
endpoint: s3.us-east-005.backblazeb2.com
|
||||
region: us-east-005
|
||||
EOF
|
||||
|
||||
echo "Waiting for restore to complete..."
|
||||
for i in $(seq 1 60); do
|
||||
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
|
||||
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
|
||||
echo " Restore status: ${STATUS} - ${MESSAGE}"
|
||||
if [ "$STATUS" = "True" ]; then
|
||||
echo "Restore completed successfully!"
|
||||
exit 0
|
||||
fi
|
||||
sleep 10
|
||||
done
|
||||
echo "Restore did not complete within timeout. Continuing anyway."
|
||||
|
||||
- name: Post-deploy cluster health checks
|
||||
working-directory: ansible
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user