HetznerTerra/.gitea/workflows/deploy.yml
fix: wait on ESO deployment directly instead of Flux Kustomization status
The addon-external-secrets Flux Kustomization was timing out during bootstrap
because image pulls on fresh Proxmox VMs are slow. The critical dependency is
the ESO deployment being available for the Doppler ClusterSecretStore. Replace
the Kustomization readiness check with direct checks for ESO CRD establishment
and deployment rollout, which are the actual prerequisites for the next step.
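
The change, in kubectl terms (the removed wait is reconstructed here from the
pattern used for the other addon Kustomizations in this file, so treat it as a
sketch rather than the exact old line):

    # before (presumed): gate on the Flux Kustomization reporting Ready
    kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=600s

    # after: gate directly on the ESO CRDs and controller rollout
    kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
    kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
    kubectl -n kube-system rollout status deployment/external-secrets --timeout=600s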

name: Deploy Cluster
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:
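# Terraform reads TF_VAR_-prefixed environment variables as input variables,
# so the plan/apply steps below need no extra -var flags for these values.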
env:
  TF_VERSION: "1.7.0"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive
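      # Remote state lives in an S3-compatible bucket; "region=auto" and
      # skip_requesting_account_id are typical settings for non-AWS stores.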
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate
      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true
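      # continue-on-error lets the plan output reach the PR comment below even
      # when the plan fails; the "Fail if plan failed" step then fails the job.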
      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });
      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1
      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          terraform apply \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -auto-approve
      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json
      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: actions/upload-artifact@v3
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json
  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub
      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"
      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json
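      # Ubuntu's system Python is externally managed (PEP 668);
      # --break-system-packages allows pip to install into it.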
      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml
      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py
      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Install kubectl
        run: |
          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x /usr/local/bin/kubectl
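      # The fetched kubeconfig points at cp-1's cluster-internal hostname;
      # swap in the node IP from Terraform output so the runner can reach
      # the API server directly.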
      - name: Rewrite kubeconfig for runner-reachable API
        working-directory: terraform
        run: |
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: 64.176.189.59
          FLUX_GIT_PORT: "2222"
        run: |
          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
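          # Flux's SSH secret layout: 'identity' is the private deploy key and
          # 'known_hosts' pins the Git server's host key; the GitRepository
          # below consumes this flux-system secret for cloning over SSH.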
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -
          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s
          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system rollout status deployment/source-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/helm-controller --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
          # Wait for ESO CRDs and deployment directly instead of Flux Kustomization status
          kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
          kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
          kubectl -n kube-system rollout status deployment/external-secrets --timeout=600s
          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF
          # Wait for the storage layer and private access components
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
          kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
          kubectl get storageclass flash-nfs
      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          echo "Waiting for Rancher..."
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
          echo "Waiting for rancher-backup operator..."
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
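          # B2 native API: b2_authorize_account returns the account's apiUrl
          # and an auth token; application keys restricted to a single bucket
          # also carry that bucket's ID in the 'allowed' field of the response.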
echo "Finding latest backup in B2..."
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
import json,sys
resp = json.load(sys.stdin)
bid = resp.get('allowed', {}).get('bucketId')
if bid:
print(bid)
else:
print('')
")
if [ -z "$BUCKET_ID" ]; then
echo "Restricted B2 key - resolving bucket ID by name..."
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
fi
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
| python3 -c "
import json,sys
files = json.load(sys.stdin).get('files', [])
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
if not tars:
print('NONE')
else:
tars.sort()
print(tars[-1])
")
if [ "$LATEST" = "NONE" ]; then
echo "No backups found in B2. Skipping restore."
exit 0
fi
BACKUP_FILE=$(basename "$LATEST")
echo "Latest backup: ${BACKUP_FILE}"
echo "Creating Restore CR..."
kubectl apply -f - <<EOF
apiVersion: resources.cattle.io/v1
kind: Restore
metadata:
name: restore-from-b2
namespace: cattle-resources-system
spec:
backupFilename: ${BACKUP_FILE}
storageLocation:
s3:
credentialSecretName: rancher-b2-creds
credentialSecretNamespace: cattle-resources-system
bucketName: HetznerTerra
folder: rancher-backups
endpoint: s3.us-east-005.backblazeb2.com
region: us-east-005
EOF
echo "Waiting for restore to complete..."
          for i in $(seq 1 60); do
            STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo " Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done
          echo "Restore did not complete within timeout. Continuing anyway."
      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"
      - name: Upload Kubeconfig
        uses: actions/upload-artifact@v3
        with:
          name: kubeconfig
          path: outputs/kubeconfig