71bdc6a709
Fresh Proxmox clusters need longer for the Flux controller rollouts and first GitRepository/Kustomization reconciliations, especially while images are still being pulled onto the control plane. Increase the bootstrap wait windows so CI does not fail while the controllers are still converging.
367 lines
15 KiB
YAML
367 lines
15 KiB
YAML
---
# CI/CD pipeline: provisions Proxmox VMs with Terraform, configures the
# Kubernetes cluster with Ansible, bootstraps Flux and its reconciliation
# graph, restores Rancher from the latest Backblaze B2 backup, then runs
# post-deploy health and tailnet smoke checks.
#
# NOTE(review): reconstructed from a whitespace-mangled paste; indentation
# restored from the visible step structure. All commands, secret wiring and
# timeout values are preserved verbatim unless flagged by a FIX comment.
name: Deploy Cluster

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"
  # S3-compatible state backend and provider credentials, passed to
  # Terraform via the TF_VAR_* convention.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}

jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        # Wrapper stays enabled here: "Post Plan to PR" reads
        # steps.plan.outputs.stdout, which only the wrapper exposes.
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        # Key pair is fed to Terraform (VM provisioning) via -var below.
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        # Allow the PR-comment step to run even when the plan fails;
        # "Fail if plan failed" re-raises the failure afterwards.
        continue-on-error: true

      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          terraform apply \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -auto-approve

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        # FIX: write to ../outputs (workspace root) so the path matches the
        # upload step below; the original wrote terraform/outputs/ while
        # uploading outputs/ from the workspace root. Mirrors the ansible
        # job's "Get Terraform Outputs" step.
        # NOTE(review): with the wrapper enabled the captured JSON may carry
        # wrapper noise - confirm the artifact parses as JSON downstream.
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        # FIX: upload-artifact@v3 is deprecated and rejected by GitHub.
        uses: actions/upload-artifact@v4
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json

  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
          # FIX: disable the stdout wrapper - this job redirects
          # `terraform output -json` to a file, and the wrapper pollutes
          # captured stdout, producing invalid JSON.
          terraform_wrapper: false

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Install Python Dependencies
        # FIX: apt-get needs sudo on GitHub-hosted runners (non-root user).
        run: |
          sudo apt-get update && sudo apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        # Builds inventory.ini from ../outputs/terraform_outputs.json
        # (presumably - script contents not visible here; verify).
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Install kubectl
        # FIX: /usr/local/bin is root-owned on hosted runners - install
        # via sudo instead of writing to it directly.
        run: |
          curl -fsSL -o kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo install -m 0755 kubectl /usr/local/bin/kubectl

      - name: Rewrite kubeconfig for runner-reachable API
        # Replaces the cp-1 hostname in the server URL with its IP so the
        # runner can reach the API directly. Assumes the Ansible playbook
        # wrote outputs/kubeconfig - TODO confirm.
        working-directory: terraform
        run: |
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig

      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: "64.176.189.59"
          FLUX_GIT_PORT: "2222"
        run: |
          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -
          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s
          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
          # Generous rollout/reconcile windows: fresh clusters are still
          # pulling controller images onto the control plane.
          kubectl -n flux-system rollout status deployment/source-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/helm-controller --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=600s
          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF
          # Wait for the storage layer and private access components
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s

      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          echo "Waiting for Rancher..."
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
          # FIX: dropped the duplicate -n flag (was given twice).
          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=300s

          echo "Waiting for rancher-backup operator..."
          # Best effort: the restore step below tolerates a missing operator.
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true

      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
          echo "Finding latest backup in B2..."

          CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
          AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
          API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
          AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
          # Restricted keys embed their bucket in 'allowed'; empty otherwise.
          BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
          import json,sys
          resp = json.load(sys.stdin)
          bid = resp.get('allowed', {}).get('bucketId')
          if bid:
              print(bid)
          else:
              print('')
          ")

          if [ -z "$BUCKET_ID" ]; then
            echo "Restricted B2 key - resolving bucket ID by name..."
            BUCKET_ID=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
              "${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
              | python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
          fi

          # Lexicographic sort picks the newest tarball (assumes sortable
          # timestamped filenames - TODO confirm naming scheme).
          LATEST=$(curl -sS -H "Authorization: Bearer ${AUTH_TOKEN}" \
            "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
            | python3 -c "
          import json,sys
          files = json.load(sys.stdin).get('files', [])
          tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
          if not tars:
              print('NONE')
          else:
              tars.sort()
              print(tars[-1])
          ")

          if [ "$LATEST" = "NONE" ]; then
            echo "No backups found in B2. Skipping restore."
            exit 0
          fi

          BACKUP_FILE=$(basename "$LATEST")
          echo "Latest backup: ${BACKUP_FILE}"

          echo "Creating Restore CR..."
          kubectl apply -f - <<EOF
          apiVersion: resources.cattle.io/v1
          kind: Restore
          metadata:
            name: restore-from-b2
            namespace: cattle-resources-system
          spec:
            backupFilename: ${BACKUP_FILE}
            storageLocation:
              s3:
                credentialSecretName: rancher-b2-creds
                credentialSecretNamespace: cattle-resources-system
                bucketName: HetznerTerra
                folder: rancher-backups
                endpoint: s3.us-east-005.backblazeb2.com
                region: us-east-005
          EOF

          echo "Waiting for restore to complete..."
          # Poll up to 10 minutes (60 x 10s); a timeout is non-fatal.
          for i in $(seq 1 60); do
            STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo " Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done
          echo "Restore did not complete within timeout. Continuing anyway."

      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Upload Kubeconfig
        # FIX: upload-artifact@v3 is deprecated and rejected by GitHub.
        uses: actions/upload-artifact@v4
        with:
          name: kubeconfig
          path: outputs/kubeconfig