Files
HetznerTerra/.gitea/workflows/deploy.yml
T
micqdf abb7578328
Deploy Cluster / Terraform (push) Successful in 28s
Deploy Cluster / Ansible (push) Failing after 12m17s
fix: run post-deploy checks with bash
2026-04-25 02:42:54 +00:00

643 lines
29 KiB
YAML

# Gitea Actions workflow: provisions infrastructure with Terraform, then
# configures the Kubernetes cluster with Ansible (see `jobs:` below).
name: Deploy Cluster
# NOTE: generic YAML 1.1 parsers read the `on` key as boolean true; the
# Actions loader handles it correctly, so it is left unquoted here.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  # Allow manual runs from the web UI.
  workflow_dispatch:
env:
  # Terraform CLI version installed by hashicorp/setup-terraform in both jobs.
  TF_VERSION: "1.7.0"
  # S3-compatible state backend credentials, injected as TF_VAR_* so the
  # Terraform configuration can also read them as input variables.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  # Proxmox provider credentials.
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  # Quoted so the provider receives the string "true", not a YAML boolean.
  TF_VAR_proxmox_insecure: "true"
  # Tailscale OAuth client used outside Terraform (e.g. by tooling steps).
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
jobs:
  # Plans on PRs; plans + applies on pushes to main.
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive
      - name: Terraform Init
        working-directory: terraform
        # Backend settings are injected at init time so no credentials live
        # in the committed backend block.
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate
      - name: Setup SSH Keys
        # The key pair is consumed by Terraform (provisioners) via -var below.
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Plan
        id: plan
        working-directory: terraform
        # continue-on-error lets the plan output reach the PR comment step
        # below; a dedicated step afterwards fails the job if the plan failed.
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true
      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });
      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        # Apply the saved plan file so exactly what was planned (and gated
        # above) is applied — re-planning here could act on state that drifted
        # after the plan step. Variable values are already embedded in the
        # plan file, so no -var flags are allowed or needed.
        run: terraform apply -auto-approve tfplan
      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p outputs
          terraform output -json > outputs/terraform_outputs.json
      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: actions/upload-artifact@v3
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json
  # Runs only after a successful apply on main; re-initialises the Terraform
  # backend read-only to fetch outputs for inventory generation.
  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Setup SSH Keys
        # Same key pair as the terraform job; used by Ansible over SSH and
        # later as the Flux git deploy identity.
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Get Terraform Outputs
        working-directory: terraform
        # Writes to ../outputs, the directory later steps read via
        # KUBECONFIG=outputs/kubeconfig and generate_inventory.py.
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json
      - name: Install Python Dependencies
        # assumes the runner container executes as root (no sudo) — TODO
        # confirm against the act_runner image in use.
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml
      - name: Generate Ansible Inventory
        working-directory: ansible
        # presumably emits inventory.ini consumed by the steps below —
        # verify against generate_inventory.py.
        run: python3 generate_inventory.py
- name: Run Ansible Playbook
working-directory: ansible
run: |
ansible-playbook site.yml \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
-e "cluster_name=k8s-cluster"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Install kubectl
run: |
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform
run: |
set -euo pipefail
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
- name: Bootstrap Flux source and reconciliation graph
env:
KUBECONFIG: outputs/kubeconfig
FLUX_GIT_HOST: 64.176.189.59
FLUX_GIT_PORT: "2222"
run: |
set -euo pipefail
flux_rollout_status() {
local deployment="$1"
if ! kubectl -n flux-system rollout status "deployment/${deployment}" --timeout=900s; then
kubectl -n flux-system get pods -o wide
kubectl -n flux-system describe deployment "${deployment}"
kubectl -n flux-system describe pods -l "app=${deployment}"
exit 1
fi
}
wait_for_resource() {
local namespace="$1"
local resource="$2"
local timeout_seconds="$3"
local elapsed=0
until {
if [ -n "${namespace}" ]; then
kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
else
kubectl get "${resource}" >/dev/null 2>&1
fi
}; do
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
echo "Timed out waiting for ${resource} to exist" >&2
kubectl -n flux-system get kustomizations,helmreleases || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
}
eso_diagnostics() {
kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true
kubectl -n flux-system describe kustomization addon-external-secrets || true
kubectl -n flux-system describe ocirepository external-secrets || true
kubectl -n flux-system describe helmrelease external-secrets || true
kubectl -n external-secrets get pods -o wide || true
}
wait_for_helmrelease_ready() {
local release_name="$1"
local target_namespace="$2"
local timeout_seconds="$3"
local elapsed=0
local ready
local stalled
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
if [ "${ready}" = "True" ]; then
return 0
fi
if [ "${stalled}" = "True" ]; then
echo "HelmRelease ${release_name} is stalled" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
}
wait_for_flux_oci_helm_release() {
local oci_name="$1"
local release_name="$2"
local target_namespace="$3"
local oci_timeout="$4"
local release_timeout="$5"
local reconcile_at
wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
reconcile_at="$(date +%s)"
kubectl -n flux-system annotate "ocirepository/${oci_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
kubectl -n flux-system annotate "helmrelease/${release_name}" \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
eso_diagnostics
exit 1
fi
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
}
flux_helm_diagnostics() {
local repo_name="$1"
local chart_name="$2"
local release_name="$3"
local target_namespace="$4"
kubectl -n flux-system get helmrepositories,helmcharts,helmreleases || true
kubectl -n flux-system describe helmrepository "${repo_name}" || true
kubectl -n flux-system describe helmchart.source.toolkit.fluxcd.io "${chart_name}" || true
kubectl -n flux-system describe helmrelease "${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
}
wait_for_flux_helm_release() {
local repo_name="$1"
local chart_name="$2"
local release_name="$3"
local target_namespace="$4"
local repo_timeout="$5"
local chart_timeout="$6"
local release_timeout="$7"
local reconcile_at
wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
fi
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
reconcile_at="$(date +%s)"
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
kubectl -n flux-system annotate "helmrelease/${release_name}" \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
if ! kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
fi
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
}
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
kubectl -n flux-system create secret generic flux-system \
--from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f -
# Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established
kubectl wait --for=condition=Established crd --all --timeout=120s
# Then apply custom resources
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
# Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
flux_rollout_status source-controller
flux_rollout_status kustomize-controller
flux_rollout_status helm-controller
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
wait_for_resource "" crd/externalsecrets.external-secrets.io 900
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets --timeout=600s
kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets-webhook --timeout=600s
wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
# Create Doppler ClusterSecretStore now that ESO CRDs are available
kubectl apply -f - <<'EOF'
apiVersion: external-secrets.io/v1
kind: ClusterSecretStore
metadata:
name: doppler-hetznerterra
spec:
provider:
doppler:
auth:
secretRef:
dopplerToken:
name: doppler-hetznerterra-service-token
key: dopplerToken
namespace: external-secrets
EOF
# Wait for the storage layer and private access components
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass flash-nfs
      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          # NOTE: helpers below duplicate those in the previous step because
          # each run block is an independent shell process.
          # Poll until a named resource exists; empty namespace means
          # cluster-scoped. Fails after timeout_seconds.
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0
            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmrepositories,helmcharts,helmreleases || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
          }
          # Request an immediate reconcile of a HelmRelease via Flux's
          # requestedAt/resetAt/forceAt annotations.
          reconcile_helmrelease() {
            local release_name="$1"
            local reconcile_at
            reconcile_at="$(date +%s)"
            kubectl -n flux-system annotate "helmrelease/${release_name}" \
              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
              --overwrite
          }
          # Poll a HelmRelease's Ready/Stalled conditions; fail fast on
          # Stalled, fail with diagnostics on timeout.
          wait_for_helmrelease_ready() {
            local release_name="$1"
            local target_namespace="$2"
            local timeout_seconds="$3"
            local elapsed=0
            local ready
            local stalled
            while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
              ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
              stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
              if [ "${ready}" = "True" ]; then
                return 0
              fi
              if [ "${stalled}" = "True" ]; then
                echo "HelmRelease ${release_name} is stalled" >&2
                kubectl -n flux-system describe "helmrelease/${release_name}" || true
                kubectl -n "${target_namespace}" get pods -o wide || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
            echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
            kubectl -n flux-system describe "helmrelease/${release_name}" || true
            kubectl -n "${target_namespace}" get pods -o wide || true
            exit 1
          }
          echo "Waiting for Rancher..."
          # Kustomization -> HelmRelease -> deployments -> TLS issuer/cert, in
          # dependency order.
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
          kubectl -n flux-system annotate kustomization/addon-rancher reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
          reconcile_helmrelease rancher
          wait_for_helmrelease_ready rancher cattle-system 900
          wait_for_resource "" namespace/cattle-system 600
          wait_for_resource cattle-system deployment/cattle-system-rancher 600
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          wait_for_resource cattle-system deployment/rancher-webhook 900
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          wait_for_resource cattle-system issuer/cattle-system-rancher 900
          wait_for_resource cattle-system certificate/tls-rancher-ingress 900
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
          echo "Waiting for rancher-backup operator..."
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-backup 600
          kubectl -n flux-system annotate kustomization/addon-rancher-backup reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup-crd 600
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher-backup 600
          reconcile_helmrelease rancher-backup-crd
          reconcile_helmrelease rancher-backup
          wait_for_helmrelease_ready rancher-backup-crd cattle-resources-system 600
          wait_for_helmrelease_ready rancher-backup cattle-resources-system 600
          wait_for_resource "" namespace/cattle-resources-system 600
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
- name: Restore Rancher from latest B2 backup
env:
KUBECONFIG: outputs/kubeconfig
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
run: |
echo "Finding latest backup in B2..."
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
import json,sys
resp = json.load(sys.stdin)
bid = resp.get('allowed', {}).get('bucketId')
if bid:
print(bid)
else:
print('')
")
if [ -z "$BUCKET_ID" ]; then
echo "Restricted B2 key - resolving bucket ID by name..."
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
fi
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
| python3 -c "
import json,sys
files = json.load(sys.stdin).get('files', [])
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
if not tars:
print('NONE')
else:
tars.sort()
print(tars[-1])
")
if [ "$LATEST" = "NONE" ]; then
echo "No backups found in B2. Skipping restore."
exit 0
fi
BACKUP_FILE=$(basename "$LATEST")
echo "Latest backup: ${BACKUP_FILE}"
echo "Creating Restore CR..."
kubectl apply -f - <<EOF
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
name: restore-from-b2
namespace: cattle-resources-system
spec:
backupFilename: ${BACKUP_FILE}
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
EOF
echo "Waiting for restore to complete..."
for i in $(seq 1 60); do
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
echo " Restore status: ${STATUS} - ${MESSAGE}"
if [ "$STATUS" = "True" ]; then
echo "Restore completed successfully!"
exit 0
fi
sleep 10
done
echo "Restore did not complete within timeout. Continuing anyway."
      - name: Post-deploy cluster health checks
        working-directory: ansible
        # Runs one remote shell on the first control-plane node. Everything
        # between the single quotes below is the remote script (a runtime
        # string) — keep it byte-for-byte intact. ansible_shell_executable
        # forces bash on the remote because the script uses `set -o pipefail`,
        # which plain sh does not support.
        run: |
          set -euo pipefail
          ansible -i inventory.ini 'control_plane[0]' -m shell -a '
          set -euo pipefail
          kubectl get nodes -o wide
          kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=60s
          kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=60s
          kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
          kubectl get pods -A --no-headers \
          | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
          | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
          | tee /tmp/unhealthy-pods || true
          test ! -s /tmp/unhealthy-pods
          kubectl -n kube-system get pods -o wide
          kubectl -n tailscale-system get pods -o wide
          kubectl -n external-secrets get pods -o wide
          ' -e ansible_shell_executable=/bin/bash
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        # The script module copies the local script to the remote host and
        # executes it there.
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Upload Kubeconfig
        # NOTE(review): runs only when all previous steps succeed; consider
        # `if: always()` so the kubeconfig is available for debugging failures.
        uses: actions/upload-artifact@v3
        with:
          name: kubeconfig
          path: outputs/kubeconfig