feat: Automate Rancher backup restore in CI pipeline
- Wait for Rancher and rancher-backup operator to be ready
- Patch default SA in cattle-resources-system (fixes post-install hook failure)
- Clean up failed patch-sa jobs
- Force reconcile rancher-backup HelmRelease
- Find latest backup from B2 using Backblaze API
- Create Restore CR to restore Rancher state from latest backup
- Wait for restore to complete before continuing
This commit is contained in:
@@ -241,6 +241,12 @@ jobs:
|
|||||||
# Install the current stable kubectl for linux/amd64 and make it executable.
# stable.txt holds the latest stable version tag (e.g. "v1.30.2").
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" && chmod +x /usr/local/bin/kubectl
|
||||||
|
|
||||||
|
- name: Install flux CLI
  run: |
    # Pin the flux release explicitly. The previous URL mixed
    # 'releases/latest/download/' with the versioned asset name
    # 'flux_2.5.1_linux_amd64.tar.gz', which starts returning 404 the
    # moment any release newer than 2.5.1 is published.
    curl -fsSL "https://github.com/fluxcd/flux2/releases/download/v2.5.1/flux_2.5.1_linux_amd64.tar.gz" | tar xz -C /tmp
    mv /tmp/flux /usr/local/bin/flux
    chmod +x /usr/local/bin/flux
- name: Rewrite kubeconfig for runner-reachable API
|
- name: Rewrite kubeconfig for runner-reachable API
|
||||||
working-directory: terraform
|
working-directory: terraform
|
||||||
run: |
|
run: |
|
||||||
@@ -302,6 +308,95 @@ jobs:
|
|||||||
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
|
||||||
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
# kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
|
||||||
|
|
||||||
|
- name: Wait for Rancher and fix backup operator
  env:
    KUBECONFIG: outputs/kubeconfig
  run: |
    set -euo pipefail

    echo "Waiting for Rancher..."
    kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
    # Fixed: the namespace flag was previously passed twice on this call
    # (-n flux-system both before and after 'wait'); once is sufficient.
    kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=300s

    # Best-effort: this kustomization may legitimately be un-Ready at this
    # point because of the post-install hook failure we patch around below.
    echo "Waiting for rancher-backup operator..."
    kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true

    # The rancher-backup chart's post-install hook fails when the default
    # ServiceAccount automounts a token; disabling automount lets the hook
    # succeed on the next reconcile. '|| true' because the namespace/SA may
    # not exist yet on a fresh cluster.
    echo "Patching default SA in cattle-resources-system..."
    kubectl patch serviceaccount default -n cattle-resources-system -p '{"automountServiceAccountToken": false}' || true

    # Remove any job left behind by a previously failed hook run so the
    # re-reconcile below can recreate it cleanly.
    echo "Cleaning up failed patch-sa jobs..."
    kubectl delete job -n cattle-resources-system rancher-backup-patch-sa --ignore-not-found=true || true

    # Best-effort reconcile; the restore step that follows tolerates the
    # operator still settling.
    echo "Force reconciling rancher-backup HelmRelease..."
    flux reconcile helmrelease rancher-backup -n flux-system --timeout=5m || true
- name: Restore Rancher from latest B2 backup
  env:
    KUBECONFIG: outputs/kubeconfig
    B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
    B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
  run: |
    set -euo pipefail
    echo "Finding latest backup in B2..."

    # printf avoids echo's non-portable -n; base64 -w0 disables GNU
    # base64's 76-column wrapping, which would otherwise corrupt the
    # Basic-auth header for long id:key pairs.
    CREDS=$(printf '%s' "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64 -w0)
    # -f makes curl exit non-zero on HTTP errors (e.g. bad credentials)
    # instead of handing an error body to the JSON parsers below.
    AUTH_RESP=$(curl -fsS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
    API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
    AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
    BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['allowed']['bucketId'])")

    # maxFileCount raised from 100 so a growing backup history cannot push
    # the newest archive past the first page of results. Filenames embed a
    # timestamp, so a lexicographic sort yields the most recent one last.
    LATEST=$(curl -fsS -H "Authorization: Bearer ${AUTH_TOKEN}" \
      "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=1000" \
      | python3 -c "
    import json, sys
    files = json.load(sys.stdin)['files']
    tars = sorted(f['fileName'] for f in files if f['fileName'].endswith('.tar.gz'))
    print(tars[-1] if tars else 'NONE')
    ")

    if [ "$LATEST" = "NONE" ]; then
      echo "No backups found in B2. Skipping restore."
      exit 0
    fi

    # The Restore CR expects a bare filename; strip the 'rancher-backups/' prefix.
    BACKUP_FILE=$(basename "$LATEST")
    echo "Latest backup: ${BACKUP_FILE}"

    echo "Creating Restore CR..."
    kubectl apply -f - <<EOF
    apiVersion: resources.cattle.io/v1
    kind: Restore
    metadata:
      name: restore-from-b2
      namespace: cattle-resources-system
    spec:
      backupFilename: ${BACKUP_FILE}
      storageLocation:
        s3:
          credentialSecretName: rancher-b2-creds
          credentialSecretNamespace: cattle-resources-system
          bucketName: HetznerTerra
          folder: rancher-backups
          endpoint: s3.us-east-005.backblazeb2.com
          region: us-east-005
    EOF

    # Poll the Restore's Ready condition for up to ~10 minutes (60 x 10s).
    # kubectl stderr is suppressed because the CR's status stanza may not
    # exist for the first few iterations.
    echo "Waiting for restore to complete..."
    for i in $(seq 1 60); do
      STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
      MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
      echo "  Restore status: ${STATUS} - ${MESSAGE}"
      if [ "$STATUS" = "True" ]; then
        echo "Restore completed successfully!"
        exit 0
      fi
      sleep 10
    done
    # Deliberately non-fatal: a slow restore should not fail the pipeline.
    echo "Restore did not complete within timeout. Continuing anyway."
||||||
- name: Post-deploy cluster health checks
|
- name: Post-deploy cluster health checks
|
||||||
working-directory: ansible
|
working-directory: ansible
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
Reference in New Issue
Block a user