098bd98876
Flux can leave HelmRelease and Kustomization conditions stale after transient chart fetch or image pull failures even when the underlying workloads recover. Switch the deploy workflow to wait on the concrete runtime resources we care about: the NFS provisioner deployment and StorageClass, Rancher deployment, webhook, cert-manager issuer/certificate, and the rancher-backup deployment.
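The pattern, as used in the workflow below: rather than waiting on Flux object
conditions, gate each phase on the workloads themselves, e.g.

    kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
    kubectl get storageclass flash-nfs
    kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
    kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s

kubectl rollout status blocks until the Deployment actually converges,
regardless of what Flux last recorded on the HelmRelease.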
name: Deploy Cluster

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

env:
  TF_VERSION: "1.7.0"
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}

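# The TF_VAR_* values above are picked up by Terraform automatically as the
# matching input variables, so the plan/apply steps only pass the SSH key paths.
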
jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

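      # Partial backend configuration: the S3 backend settings are injected at
      # init time, so no credentials need to live in the committed backend block.
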
      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color
        continue-on-error: true

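      # continue-on-error lets the plan output reach the PR comment below even
      # when the plan fails; the "Fail if plan failed" step restores the failure.
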
      - name: Post Plan to PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const output = `#### Terraform Plan
            \`\`\`
            ${{ steps.plan.outputs.stdout }}
            \`\`\``;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: output
            });

      - name: Fail if plan failed
        if: steps.plan.outcome == 'failure'
        run: exit 1

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          terraform apply \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -auto-approve

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          # Write to the workspace-level outputs/ dir so the upload path below matches.
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Upload Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        uses: actions/upload-artifact@v4
        with:
          name: terraform-outputs
          path: outputs/terraform_outputs.json

  ansible:
    name: Ansible
    runs-on: ubuntu-latest
    needs: terraform
    if: github.ref == 'refs/heads/main' && github.event_name == 'push'
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

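      # Re-init against the same backend so this job can read the state outputs
      # directly; the terraform-outputs artifact is not downloaded here.
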
      - name: Get Terraform Outputs
        working-directory: terraform
        run: |
          mkdir -p ../outputs
          terraform output -json > ../outputs/terraform_outputs.json

      - name: Install Python Dependencies
        run: |
          sudo apt-get update && sudo apt-get install -y python3-pip
          pip3 install --break-system-packages ansible kubernetes jinja2 pyyaml

      - name: Install Ansible Collections
        run: ansible-galaxy collection install -r ansible/requirements.yml

      - name: Generate Ansible Inventory
        working-directory: ansible
        run: python3 generate_inventory.py

      - name: Run Ansible Playbook
        working-directory: ansible
        run: |
          ansible-playbook site.yml \
            -e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
            -e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
            -e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
            -e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
            -e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
            -e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
            -e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
            -e "cluster_name=k8s-cluster"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Install kubectl
        run: |
          sudo curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          sudo chmod +x /usr/local/bin/kubectl

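      # outputs/kubeconfig is assumed to have been fetched from the control plane
      # by the playbook above; the sed below rewrites its server URL to an address
      # the runner can reach directly.
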
      - name: Rewrite kubeconfig for runner-reachable API
        working-directory: terraform
        run: |
          PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
          sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig

      - name: Bootstrap Flux source and reconciliation graph
        env:
          KUBECONFIG: outputs/kubeconfig
          FLUX_GIT_HOST: 64.176.189.59
          FLUX_GIT_PORT: "2222"
        run: |
          kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
          ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
          kubectl -n flux-system create secret generic flux-system \
            --from-file=identity="$HOME/.ssh/id_ed25519" \
            --from-file=known_hosts=/tmp/flux_known_hosts \
            --dry-run=client -o yaml | kubectl apply -f -
          # Apply CRDs and controllers first
          kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
          # Wait for CRDs to be established
          kubectl wait --for=condition=Established crd --all --timeout=120s
          # Then apply custom resources
          kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
          kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
          # Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
          PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
          kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
          kubectl -n flux-system rollout status deployment/source-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/kustomize-controller --timeout=600s
          kubectl -n flux-system rollout status deployment/helm-controller --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
          kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=600s
          # Create Doppler ClusterSecretStore now that ESO CRDs are available
          kubectl apply -f - <<'EOF'
          apiVersion: external-secrets.io/v1
          kind: ClusterSecretStore
          metadata:
            name: doppler-hetznerterra
          spec:
            provider:
              doppler:
                auth:
                  secretRef:
                    dopplerToken:
                      name: doppler-hetznerterra-service-token
                      key: dopplerToken
                      namespace: external-secrets
          EOF
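          # The reconcile.fluxcd.io/* annotations below are Flux's nudge mechanism:
          # requestedAt triggers an immediate reconcile, and (on Flux >= 2.3)
          # resetAt/forceAt set to the same token reset the HelmRelease retry
          # counter and force a one-off upgrade, clearing stale Ready=False
          # conditions left by transient chart-fetch or image-pull failures.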
          # Wait for the storage layer and private access components
          TS=$(date --iso-8601=seconds)
          kubectl -n flux-system annotate helmrelease/nfs-subdir-external-provisioner \
            reconcile.fluxcd.io/requestedAt="$TS" \
            reconcile.fluxcd.io/resetAt="$TS" \
            reconcile.fluxcd.io/forceAt="$TS" \
            --overwrite || true
          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
          kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
          kubectl get storageclass flash-nfs

      - name: Wait for Rancher and backup operator
        env:
          KUBECONFIG: outputs/kubeconfig
        run: |
          set -euo pipefail
          TS=$(date --iso-8601=seconds)
          kubectl -n flux-system annotate helmrelease/rancher \
            reconcile.fluxcd.io/requestedAt="$TS" \
            reconcile.fluxcd.io/resetAt="$TS" \
            reconcile.fluxcd.io/forceAt="$TS" \
            --overwrite || true
          kubectl -n flux-system annotate helmrelease/rancher-backup \
            reconcile.fluxcd.io/requestedAt="$TS" \
            reconcile.fluxcd.io/resetAt="$TS" \
            reconcile.fluxcd.io/forceAt="$TS" \
            --overwrite || true

          echo "Waiting for Rancher..."
          kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
          kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s

          echo "Waiting for rancher-backup operator..."
          kubectl -n cattle-resources-system rollout status deployment/rancher-backup --timeout=900s

      - name: Restore Rancher from latest B2 backup
        env:
          KUBECONFIG: outputs/kubeconfig
          B2_ACCOUNT_ID: ${{ secrets.B2_ACCOUNT_ID }}
          B2_APPLICATION_KEY: ${{ secrets.B2_APPLICATION_KEY }}
        run: |
          echo "Finding latest backup in B2..."

          # B2's native API expects the raw account authorization token in the
          # Authorization header (no "Bearer" prefix).
          CREDS=$(echo -n "${B2_ACCOUNT_ID}:${B2_APPLICATION_KEY}" | base64)
          AUTH_RESP=$(curl -sS -H "Authorization: Basic ${CREDS}" https://api.backblazeb2.com/b2api/v2/b2_authorize_account)
          API_URL=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['apiUrl'])")
          AUTH_TOKEN=$(echo "$AUTH_RESP" | python3 -c "import json,sys; print(json.load(sys.stdin)['authorizationToken'])")
          BUCKET_ID=$(echo "$AUTH_RESP" | python3 -c "
          import json,sys
          resp = json.load(sys.stdin)
          bid = resp.get('allowed', {}).get('bucketId')
          if bid:
              print(bid)
          else:
              print('')
          ")

          if [ -z "$BUCKET_ID" ]; then
            echo "Restricted B2 key - resolving bucket ID by name..."
            BUCKET_ID=$(curl -sS -H "Authorization: ${AUTH_TOKEN}" \
              "${API_URL}/b2api/v2/b2_list_buckets?accountId=${B2_ACCOUNT_ID}&bucketName=HetznerTerra" \
              | python3 -c "import json,sys; buckets=json.load(sys.stdin).get('buckets',[]); print(buckets[0]['bucketId'] if buckets else '')")
          fi

          # Pick the newest archive; assumes backup filenames sort chronologically.
          LATEST=$(curl -sS -H "Authorization: ${AUTH_TOKEN}" \
            "${API_URL}/b2api/v2/b2_list_file_names?bucketId=${BUCKET_ID}&prefix=rancher-backups/&maxFileCount=100" \
            | python3 -c "
          import json,sys
          files = json.load(sys.stdin).get('files', [])
          tars = [f['fileName'] for f in files if f['fileName'].endswith('.tar.gz')]
          if not tars:
              print('NONE')
          else:
              tars.sort()
              print(tars[-1])
          ")

          if [ "$LATEST" = "NONE" ]; then
            echo "No backups found in B2. Skipping restore."
            exit 0
          fi

          BACKUP_FILE=$(basename "$LATEST")
          echo "Latest backup: ${BACKUP_FILE}"

          echo "Creating Restore CR..."
          kubectl apply -f - <<EOF
          apiVersion: resources.cattle.io/v1
          kind: Restore
          metadata:
            name: restore-from-b2
            namespace: cattle-resources-system
          spec:
            backupFilename: ${BACKUP_FILE}
            storageLocation:
              s3:
                credentialSecretName: rancher-b2-creds
                credentialSecretNamespace: cattle-resources-system
                bucketName: HetznerTerra
                folder: rancher-backups
                endpoint: s3.us-east-005.backblazeb2.com
                region: us-east-005
          EOF
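          # The rancher-b2-creds Secret is assumed to already exist in
          # cattle-resources-system (e.g. synced by external-secrets from
          # Doppler); the Restore CR above only references it.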

          echo "Waiting for restore to complete..."
          for i in $(seq 1 60); do
            STATUS=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown")
            MESSAGE=$(kubectl get restore restore-from-b2 -n cattle-resources-system -o jsonpath='{.status.conditions[?(@.type=="Ready")].message}' 2>/dev/null || echo "")
            echo "  Restore status: ${STATUS} - ${MESSAGE}"
            if [ "$STATUS" = "True" ]; then
              echo "Restore completed successfully!"
              exit 0
            fi
            sleep 10
          done
          echo "Restore did not complete within timeout. Continuing anyway."

      - name: Post-deploy cluster health checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide"
          ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Post-deploy tailnet smoke checks
        working-directory: ansible
        run: |
          ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
        env:
          ANSIBLE_HOST_KEY_CHECKING: "False"

      - name: Upload Kubeconfig
        uses: actions/upload-artifact@v4
        with:
          name: kubeconfig
          path: outputs/kubeconfig