# Deploy Cluster
#
# Two jobs:
#   terraform - format check, init, validate and plan on every trigger; apply
#               and export outputs only for pushes to main.
#   ansible   - pushes to main only: runs the Ansible playbook, bootstraps
#               Flux, waits for the platform add-ons, restores Rancher from
#               the latest B2 backup and runs post-deploy checks.
name: Deploy Cluster

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}

jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      # continue-on-error lets the plan be posted to the PR even when it
      # fails; the "Fail if plan failed" step below re-raises the failure.
      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true

      # The plan text is handed to the script through the environment rather
      # than interpolated into the script source, so backticks or `${...}`
      # sequences in the plan cannot break or alter the JavaScript.
      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        env:
          PLAN: ${{ steps.plan.outputs.stdout }}
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${process.env.PLAN}
            \`\`\``;
            await github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          terraform apply \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -auto-approve

      # Write to the workspace-root outputs/ directory (one level above
      # terraform/): the upload step below has no working-directory, so its
      # path is resolved from the workspace root.
      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: actions/upload-artifact@v3
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json

  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    # Only configure the cluster for pushes to main.
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Install Python Dependencies
        run: |
          apt-get update && apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Install kubectl
        run: |
          curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          chmod +x /usr/local/bin/kubectl

      # Replaces the k8s-cluster-cp-1.<domain> API server URL in the
      # kubeconfig with the primary control plane IP from Terraform.
      - name: Rewrite kubeconfig for runner-reachable API
        working-directory: terraform
        run: |
          set -euo pipefail
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig

      # Pulls each Flux controller image over SSH on the primary control
      # plane, retrying up to 12 times per image with a 20 s pause.
      - name: Pre-pull Flux controller images on primary control plane
        working-directory: terraform
        run: |
          set -euo pipefail
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          ssh -o StrictHostKeyChecking=no "ubuntu@${PRIMARY_IP}" 'bash -s' <<'EOF'
          set -euo pipefail
          images=(
            ghcr.io/fluxcd/source-controller:v1.8.0
            ghcr.io/fluxcd/kustomize-controller:v1.8.1
            ghcr.io/fluxcd/helm-controller:v1.5.1
            ghcr.io/fluxcd/notification-controller:v1.8.1
          )
          for image in "${images[@]}"; do
            for attempt in $(seq 1 12); do
              if timeout 180s sudo /usr/local/bin/ctr -n k8s.io images pull "${image}"; then
                break
              fi
              if [ "${attempt}" -eq 12 ]; then
                echo "Failed to pre-pull ${image} after ${attempt} attempts" >&2
                exit 1
              fi
              sleep 20
            done
          done
          EOF

      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: "64.176.189.59"
          FLUX_GIT_PORT: "2222"
        run: |
          set -euo pipefail

          # Wait for a flux-system deployment rollout; on failure dump pod and
          # deployment details before aborting the step.
          flux_rollout_status() {
            local deployment="$1"
            if ! kubectl -n flux-system rollout status "deployment/${deployment}" --timeout=900s; then
              kubectl -n flux-system get pods -o wide
              kubectl -n flux-system describe deployment "${deployment}"
              kubectl -n flux-system describe pods -l "app=${deployment}"
              exit 1
            fi
          }

          # Poll every 10 s until `kubectl get` finds the resource.
          # $1 = namespace ("" for cluster-scoped), $2 = resource,
          # $3 = timeout in seconds.
          wait_for_resource() {
            local namespace="$1"
            local resource="$2"
            local timeout_seconds="$3"
            local elapsed=0

            until {
              if [ -n "${namespace}" ]; then
                kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
              else
                kubectl get "${resource}" >/dev/null 2>&1
              fi
            }; do
              if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
                echo "Timed out waiting for ${resource} to exist" >&2
                kubectl -n flux-system get kustomizations,helmreleases || true
                exit 1
              fi
              sleep 10
              elapsed=$((elapsed + 10))
            done
          }

          # Best-effort dump of the External Secrets Operator (ESO) objects;
          # every command tolerates failure.
          eso_diagnostics() {
            kubectl -n flux-system get kustomizations,helmrepositories,helmcharts,helmreleases || true
            kubectl -n flux-system describe kustomization addon-external-secrets || true
            kubectl -n flux-system describe helmrepository external-secrets || true
            kubectl -n flux-system describe helmrelease external-secrets || true
            kubectl -n external-secrets get pods -o wide || true
          }

          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -

          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s
          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml

          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"

          flux_rollout_status source-controller
          flux_rollout_status kustomize-controller
          flux_rollout_status helm-controller

          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s

          # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
          wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
          kubectl -n flux-system annotate kustomization/addon-external-secrets reconcile.fluxcd.io/requestedAt="$(date +%s)" --overwrite
          wait_for_resource flux-system helmrepository.source.toolkit.fluxcd.io/external-secrets 600
          if ! kubectl -n flux-system wait --for=condition=Ready helmrepository/external-secrets --timeout=900s; then
            eso_diagnostics
            exit 1
          fi
          wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/external-secrets 600
          if ! kubectl -n flux-system wait --for=condition=Ready helmrelease/external-secrets --timeout=1800s; then
            eso_diagnostics
            exit 1
          fi
          wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
          wait_for_resource "" crd/externalsecrets.external-secrets.io 900
          kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
          kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
          kubectl -n external-secrets rollout status deployment/external-secrets --timeout=600s

          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF

          # Wait for the storage layer and private access components
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
          kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
          kubectl get storageclass flash-nfs

      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          echo "Waiting for Rancher..."
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
          echo "Waiting for rancher-backup operator..."
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s

      # Finds the lexicographically last rancher-backups/*.tar.gz object in
      # B2 and restores from it; exits successfully when no backup exists.
      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
          echo "Finding latest backup in B2..."

          CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
          AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
          API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
          AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
          # The inline Python below must start at the script's left margin:
          # indenting it would raise an IndentationError.
          BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
          import json,sys
          resp = json.load(sys.stdin)
          bid = resp.get('allowed', {}).get('bucketId')
          if bid:
              print(bid)
          else:
              print('')
          ")

          if [ -z "$BUCKET_ID" ]; then
            echo "Restricted B2 key - resolving bucket ID by name..."
            BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
              "${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
              | python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
          fi

          LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
            "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
            | python3 -c "
          import json,sys
          files = json.load(sys.stdin).get('files', [])
          tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
          if not tars:
              print('NONE')
          else:
              tars.sort()
              print(tars[-1])
          ")

          if [ "$LATEST" = "NONE" ]; then
            echo "No backups found in B2. Skipping restore."
            exit 0
          fi

          BACKUP_FILE=$(basename "$LATEST")
          echo "Latest backup: ${BACKUP_FILE}"

          echo "Creating Restore CR..."
          # NOTE(review): this copy of the workflow is truncated between
          # `kubectl apply -f -` and `</dev/null` on the next line. The
          # heredoc holding the Restore manifest, the opening of the polling
          # loop and the start of the STATUS assignment appear to be missing,
          # so the script below is not valid bash as it stands. Recover the
          # original text from version control before relying on this step.
          kubectl apply -f - </dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo " Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done

          echo "Restore did not complete within timeout. Continuing anyway."

      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Upload Kubeconfig
        uses: actions/upload-artifact@v3
        with:
          name: kubeconfig
          path: outputs/kubeconfig