# Commit 8e081ddfda — Deploy Cluster workflow
# The addon-external-secrets Flux Kustomization was timing out during bootstrap
# because image pulls on fresh Proxmox VMs are slow. The critical dependency is
# the ESO deployment being available for the Doppler ClusterSecretStore. This
# change replaces the Kustomization readiness check with direct checks for ESO
# CRD establishment and deployment rollout, which are the actual prerequisites
# for the next step.
---
# GitHub Actions workflow: provisions infrastructure with Terraform, then
# configures the cluster with Ansible and bootstraps Flux.
name: Deploy Cluster

# NOTE: `on` reads as a YAML 1.1 boolean key to generic parsers; GitHub's
# loader handles it (suppress yamllint `truthy` here if linting).
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  # Pinned Terraform release used by both jobs (quoted so 1.7.0 stays a string).
  TF_VERSION: "1.7.0"
  # S3-compatible state backend and provider credentials from repo secrets.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  # Proxmox API access for the VM provider.
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  # Quoted so the provider receives the string "true", not a YAML boolean.
  TF_VAR_proxmox_insecure: "true"
  # Tailscale OAuth client used by tooling in both jobs.
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
jobs:
  # Plans/applies the Proxmox + S3-backed Terraform configuration. Apply and
  # output-export only run on pushes to main.
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      # The wrapper stays enabled in this job: "Post Plan to PR" depends on
      # the wrapper-provided `steps.plan.outputs.stdout`.
      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      # The SSH keypair is consumed by Terraform as input variables below.
      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        # Allow a failed plan to continue so its output can still be posted
        # to the PR; "Fail if plan failed" below propagates the failure.
        continue-on-error: true

      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        # Apply the saved plan file so exactly what was planned (and posted)
        # is applied; -var flags are not allowed with a saved plan and the
        # inputs were already baked into tfplan above.
        run: terraform apply tfplan

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        # Write to the workspace root (../outputs), not terraform/outputs:
        # the upload step below resolves its path from the workspace root.
        # NOTE(review): the setup-terraform wrapper can interleave extra text
        # into redirected stdout on some versions; if the JSON is corrupted,
        # invoke `terraform-bin output -json` here instead.
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        # upload-artifact v3 is deprecated and rejected by GitHub; use v4.
        uses: actions/upload-artifact@v4
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json
ansible:
|
|
name: Ansible
|
|
runs-on: ubuntu-latest
|
|
needs: terraform
|
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
|
steps:
|
|
- name: Checkout
|
|
uses: actions/checkout@v4
|
|
|
|
- name: Setup Terraform
|
|
uses: hashicorp/setup-terraform@v3
|
|
with:
|
|
terraform_version: ${{ env.TF_VERSION }}
|
|
|
|
- name: Setup SSH Keys
|
|
run: |
|
|
mkdir -p ~/.ssh
|
|
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
|
|
chmod 600 ~/.ssh/id_ed25519
|
|
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
|
|
chmod 644 ~/.ssh/id_ed25519.pub
|
|
|
|
- name: Terraform Init
|
|
working-directory: terraform
|
|
run: |
|
|
terraform init \
|
|
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
|
|
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
|
|
-backend-config="region=auto" \
|
|
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
|
|
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
|
|
-backend-config="skip_requesting_account_id=true"
|
|
|
|
- name: Get Terraform Outputs
|
|
working-directory: terraform
|
|
run: |
|
|
mkdir -p ../outputs
|
|
terraform output -json > ../outputs/terraform_outputs.json
|
|
|
|
- name: Install Python Dependencies
|
|
run: |
|
|
apt-get update && apt-get install -y python3-pip
|
|
pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml
|
|
|
|
- name: Install Ansible Collections
|
|
run: ansible-galaxy collection install -r ansible/requirements.yml
|
|
|
|
- name: Generate Ansible Inventory
|
|
working-directory: ansible
|
|
run: python3 generate_inventory.py
|
|
|
|
- name: Run Ansible Playbook
|
|
working-directory: ansible
|
|
run: |
|
|
ansible-playbook site.yml \
|
|
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
|
|
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
|
|
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
|
|
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
|
|
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
|
|
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
|
|
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
|
|
-e "cluster_name=k8s-cluster"
|
|
env:
|
|
ANSIBLE_HOST_KEY_CHECKING: "False"
|
|
|
|
- name: Install kubectl
|
|
run: |
|
|
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
|
|
chmod +x /usr/local/bin/kubectl
|
|
|
|
- name: Rewrite kubeconfig for runner-reachable API
|
|
working-directory: terraform
|
|
run: |
|
|
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
|
|
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
|
|
|
|
- name: Bootstrap Flux source and reconciliation graph
|
|
env:
|
|
KUBECONFIG: outputs/kubeconfig
|
|
FLUX_GIT_HOST: 64.176.189.59
|
|
FLUX_GIT_PORT: "2222"
|
|
run: |
|
|
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
|
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
|
|
kubectl -n flux-system create secret generic flux-system \
|
|
--from-file=identity="$HOME/.ssh/id_ed25519" \
|
|
--from-file=known_hosts=/tmp/flux_known_hosts \
|
|
--dry-run=client -o yaml | kubectl apply -f -
|
|
# Apply CRDs and controllers first
|
|
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
|
|
# Wait for CRDs to be established
|
|
kubectl wait --for=condition=Established crd --all --timeout=120s
|
|
# Then apply custom resources
|
|
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
|
|
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
|
|
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
|
|
# Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
|
|
PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
|
|
kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
|
|
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
|
|
kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
|
|
kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
|
|
kubectl -n flux-system rollout status deployment/source-controller --timeout=600s
|
|
kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=600s
|
|
kubectl -n flux-system rollout status deployment/helm-controller --timeout=600s
|
|
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
|
|
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
|
|
# Wait for ESO CRDs and deployment directly instead of Flux Kustomization status
|
|
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
|
|
kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
|
|
kubectl -n kube-system rollout status deployment/external-secrets --timeout=600s
|
|
# Create Doppler ClusterSecretStore now that ESO CRDs are available
|
|
kubectl apply -f - <<'EOF'
|
|
apiVersion: external-secrets.io/v1
|
|
kind: ClusterSecretStore
|
|
metadata:
|
|
name: doppler-hetznerterra
|
|
spec:
|
|
provider:
|
|
doppler:
|
|
auth:
|
|
secretRef:
|
|
dopplerToken:
|
|
name: doppler-hetznerterra-service-token
|
|
key: dopplerToken
|
|
namespace: external-secrets
|
|
EOF
|
|
# Wait for the storage layer and private access components
|
|
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
|
|
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
|
kubectl get storageclass flash-nfs
|
|
|
|
- name: Wait for Rancher and backup operator
|
|
env:
|
|
KUBECONFIG: outputs/kubeconfig
|
|
run: |
|
|
set -euo pipefail
|
|
echo "Waiting for Rancher..."
|
|
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
|
|
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
|
|
kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
|
|
kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
|
|
|
|
echo "Waiting for rancher-backup operator..."
|
|
kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s
|
|
|
|
- name: Restore Rancher from latest B2 backup
|
|
env:
|
|
KUBECONFIG: outputs/kubeconfig
|
|
B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
|
|
B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
|
|
run: |
|
|
echo "Finding latest backup in B2..."
|
|
|
|
CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
|
|
AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
|
|
API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
|
|
AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
|
|
BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
|
|
import json,sys
|
|
resp = json.load(sys.stdin)
|
|
bid = resp.get('allowed', {}).get('bucketId')
|
|
if bid:
|
|
print(bid)
|
|
else:
|
|
print('')
|
|
")
|
|
|
|
if [ -z "$BUCKET_ID" ]; then
|
|
echo "Restricted B2 key - resolving bucket ID by name..."
|
|
BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
|
|
"${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
|
|
| python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
|
|
fi
|
|
|
|
LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
|
|
"${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
|
|
| python3 -c "
|
|
import json,sys
|
|
files = json.load(sys.stdin).get('files', [])
|
|
tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
|
|
if not tars:
|
|
print('NONE')
|
|
else:
|
|
tars.sort()
|
|
print(tars[-1])
|
|
")
|
|
|
|
if [ "$LATEST" = "NONE" ]; then
|
|
echo "No backups found in B2. Skipping restore."
|
|
exit 0
|
|
fi
|
|
|
|
BACKUP_FILE=$(basename "$LATEST")
|
|
echo "Latest backup: ${BACKUP_FILE}"
|
|
|
|
echo "Creating Restore CR..."
|
|
kubectl apply -f - <<EOF
|
|
apiVersion: resources.cattle.io/v1
|
|
kind: Restore
|
|
metadata:
|
|
name: restore-from-b2
|
|
namespace: cattle-resources-system
|
|
spec:
|
|
backupFilename: ${BACKUP_FILE}
|
|
storageLocation:
|
|
s3:
|
|
credentialSecretName: rancher-b2-creds
|
|
credentialSecretNamespace: cattle-resources-system
|
|
bucketName: HetznerTerra
|
|
folder: rancher-backups
|
|
endpoint: s3.us-east-005.backblazeb2.com
|
|
region: us-east-005
|
|
EOF
|
|
|
|
echo "Waiting for restore to complete..."
|
|
for i in $(seq 1 60); do
|
|
STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
|
|
MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
|
|
echo " Restore status: ${STATUS} - ${MESSAGE}"
|
|
if [ "$STATUS" = "True" ]; then
|
|
echo "Restore completed successfully!"
|
|
exit 0
|
|
fi
|
|
sleep 10
|
|
done
|
|
echo "Restore did not complete within timeout. Continuing anyway."
|
|
|
|
- name: Post-deploy cluster health checks
|
|
working-directory: ansible
|
|
run: |
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
|
|
ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
|
|
env:
|
|
ANSIBLE_HOST_KEY_CHECKING: "False"
|
|
|
|
- name: Post-deploy tailnet smoke checks
|
|
working-directory: ansible
|
|
run: |
|
|
ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
|
|
env:
|
|
ANSIBLE_HOST_KEY_CHECKING: "False"
|
|
|
|
- name: Upload Kubeconfig
|
|
uses: actions/upload-artifact@v3
|
|
with:
|
|
name: kubeconfig
|
|
path: outputs/kubeconfig
|