Files
HetznerTerra/.gitea/workflows/deploy.yml
T
micqdf e1c836aacd
Deploy Cluster / Terraform (push) Successful in 35s
Deploy Cluster / Ansible (push) Successful in 18m9s
fix: tolerate converged flux reconcile waits
2026-05-03 02:57:05 +00:00

1085 lines
51 KiB
YAML

---
# Gitea Actions workflow: provision the cluster with Terraform, then
# configure it with Ansible and bootstrap the Flux reconciliation graph.
name: Deploy Cluster

# NOTE: generic YAML 1.1 parsers read the bare `on` key as boolean true;
# the Actions loader handles it, so yamllint's `truthy` rule is moot here.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:

# Serialise all runs against the single production cluster and never cancel
# a half-finished deploy.
concurrency:
  group: prod-cluster
  cancel-in-progress: false

env:
  TF_VERSION: "1.14.9"
  KUBECTL_VERSION: "v1.34.6"
  # S3-compatible state backend + provider credentials, exported as TF_VAR_*
  # so Terraform picks them up without extra -var flags.
  TF_VAR_s3_access_key: ${{ secrets.S3_ACCESS_KEY }}
  TF_VAR_s3_secret_key: ${{ secrets.S3_SECRET_KEY }}
  TF_VAR_s3_endpoint: ${{ secrets.S3_ENDPOINT }}
  TF_VAR_s3_bucket: ${{ secrets.S3_BUCKET }}
  TF_VAR_tailscale_tailnet: ${{ secrets.TAILSCALE_TAILNET }}
  TF_VAR_proxmox_endpoint: ${{ secrets.PROXMOX_ENDPOINT }}
  TF_VAR_proxmox_api_token_id: ${{ secrets.PROXMOX_API_TOKEN_ID }}
  TF_VAR_proxmox_api_token_secret: ${{ secrets.PROXMOX_API_TOKEN_SECRET }}
  TF_VAR_proxmox_insecure: "true"
  TS_OAUTH_CLIENT_ID: ${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}
  TS_OAUTH_CLIENT_SECRET: ${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}
jobs:
  terraform:
    name: Terraform
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: ${{ env.TF_VERSION }}
          # Disable the stdout wrapper so raw terraform output can be
          # redirected (e.g. `terraform output -json > file`).
          terraform_wrapper: false

      - name: Terraform Format Check
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Init
        working-directory: terraform
        run: |
          terraform init \
            -lockfile=readonly \
            -backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
            -backend-config="bucket=${{ secrets.S3_BUCKET }}" \
            -backend-config="region=auto" \
            -backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
            -backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
            -backend-config="skip_requesting_account_id=true"

      - name: Terraform Validate
        working-directory: terraform
        run: terraform validate

      - name: Setup SSH Keys
        run: |
          mkdir -p ~/.ssh
          echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
          chmod 600 ~/.ssh/id_ed25519
          echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
          chmod 644 ~/.ssh/id_ed25519.pub

      - name: Terraform Plan
        working-directory: terraform
        run: |
          terraform plan \
            -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
            -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
            -out=tfplan \
            -no-color

      - name: Cleanup orphan Proxmox cloud-init volumes
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        run: python3 scripts/proxmox-rebuild-cleanup.py --mode orphan-cloudinit --terraform-dir terraform --plan tfplan

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          set -euo pipefail
          # Apply with low parallelism and tee each attempt to its own log;
          # PIPESTATUS[0] preserves terraform's exit code through the tee pipe.
          run_apply() {
            local log_file="$1"
            terraform apply \
              -parallelism=2 \
              -var="ssh_public_key=$HOME/.ssh/id_ed25519.pub" \
              -var="ssh_private_key=$HOME/.ssh/id_ed25519" \
              -auto-approve 2>&1 | tee "${log_file}"
            return "${PIPESTATUS[0]}"
          }
          # Remove partially-created VMs that Terraform state does not track,
          # so a retried apply can recreate them cleanly.
          cleanup_untracked_target_vms() {
            python3 ../scripts/proxmox-rebuild-cleanup.py --mode untracked-vms --terraform-dir . --plan tfplan
          }
          cleanup_untracked_target_vms
          # Up to three attempts; clean untracked VMs between failures.
          for attempt in 1 2 3; do
            log_file="/tmp/terraform-apply-${attempt}.log"
            if run_apply "${log_file}"; then
              exit 0
            fi
            if [ "${attempt}" = "3" ]; then
              exit 1
            fi
            echo "Terraform apply failed; cleaning Terraform-untracked partial VM creates before retry ${attempt}/2"
            cleanup_untracked_target_vms
            sleep 20
          done

      - name: Save Terraform Outputs
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: |
          mkdir -p outputs
          terraform output -json > outputs/terraform_outputs.json
ansible:
name: Ansible
runs-on: ubuntu-22.04
needs: terraform
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
steps:
- name: Checkout
uses: actions/checkout@v4
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: ${{ env.TF_VERSION }}
terraform_wrapper: false
- name: Setup SSH Keys
run: |
mkdir -p ~/.ssh
echo "${{ secrets.SSH_PRIVATE_KEY }}" > ~/.ssh/id_ed25519
chmod 600 ~/.ssh/id_ed25519
echo "${{ secrets.SSH_PUBLIC_KEY }}" > ~/.ssh/id_ed25519.pub
chmod 644 ~/.ssh/id_ed25519.pub
- name: Terraform Init
working-directory: terraform
run: |
terraform init \
-lockfile=readonly \
-backend-config="endpoint=${{ secrets.S3_ENDPOINT }}" \
-backend-config="bucket=${{ secrets.S3_BUCKET }}" \
-backend-config="region=auto" \
-backend-config="access_key=${{ secrets.S3_ACCESS_KEY }}" \
-backend-config="secret_key=${{ secrets.S3_SECRET_KEY }}" \
-backend-config="skip_requesting_account_id=true"
- name: Get Terraform Outputs
working-directory: terraform
run: |
mkdir -p ../outputs
terraform output -json > ../outputs/terraform_outputs.json
- name: Install Python Dependencies
run: |
apt-get update && apt-get install -y python3-pip
pip3 install ansible==8.7.0 kubernetes==26.1.0 jinja2==3.1.5 pyyaml==6.0.2
- name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml
- name: Generate Ansible Inventory
working-directory: ansible
run: python3 generate_inventory.py
- name: Run Ansible Playbook
working-directory: ansible
run: |
ansible-playbook site.yml \
-e "tailscale_auth_key=${{ secrets.TAILSCALE_AUTH_KEY }}" \
-e "tailscale_tailnet=${{ secrets.TAILSCALE_TAILNET }}" \
-e "tailscale_oauth_client_id=${{ secrets.TAILSCALE_OAUTH_CLIENT_ID }}" \
-e "tailscale_oauth_client_secret=${{ secrets.TAILSCALE_OAUTH_CLIENT_SECRET }}" \
-e "doppler_hetznerterra_service_token=${{ secrets.DOPPLER_HETZNERTERRA_SERVICE_TOKEN }}" \
-e "tailscale_api_key=${{ secrets.TAILSCALE_API_KEY }}" \
-e "grafana_admin_password=${{ secrets.GRAFANA_ADMIN_PASSWORD }}" \
-e "cluster_name=k8s-cluster"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
- name: Install kubectl
run: |
curl -fsSL -o /usr/local/bin/kubectl "https://dl.k8s.io/release/${KUBECTL_VERSION}/bin/linux/amd64/kubectl"
chmod +x /usr/local/bin/kubectl
- name: Rewrite kubeconfig for runner-reachable API
working-directory: terraform
run: |
set -euo pipefail
PRIMARY_IP=$(terraform output -raw primary_control_plane_ip)
sed -i "s#https://k8s-cluster-cp-1\.[^:]*:6443#https://${PRIMARY_IP}:6443#g" ../outputs/kubeconfig
- name: Bootstrap Flux source and reconciliation graph
env:
KUBECONFIG: outputs/kubeconfig
FLUX_GIT_HOST: 64.176.189.59
FLUX_GIT_PORT: "2222"
FLUX_KNOWN_HOSTS: ${{ secrets.FLUX_KNOWN_HOSTS }}
run: |
set -euo pipefail
flux_rollout_status() {
local deployment="$1"
if ! kubectl -n flux-system rollout status "deployment/${deployment}" --timeout=900s; then
kubectl -n flux-system get pods -o wide
kubectl -n flux-system describe deployment "${deployment}"
kubectl -n flux-system describe pods -l "app=${deployment}"
exit 1
fi
}
wait_for_resource() {
local namespace="$1"
local resource="$2"
local timeout_seconds="$3"
local elapsed=0
until {
if [ -n "${namespace}" ]; then
kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
else
kubectl get "${resource}" >/dev/null 2>&1
fi
}; do
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
echo "Timed out waiting for ${resource} to exist" >&2
kubectl -n flux-system get kustomizations,helmreleases || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
}
wait_for_reconcile_handled() {
local namespace="$1"
local resource="$2"
local reconcile_at="$3"
local timeout_seconds="$4"
local elapsed=0
local handled
local ready
local healthy
local generation
local observed_generation
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
if [ "${handled}" = "${reconcile_at}" ]; then
return 0
fi
sleep 5
elapsed=$((elapsed + 5))
done
ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)"
generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then
echo "${resource} did not report reconcile ${reconcile_at}, but it is already Ready; continuing"
return 0
fi
fi
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
kubectl -n "${namespace}" describe "${resource}" || true
exit 1
}
reconcile_flux_resource() {
local namespace="$1"
local resource="$2"
local timeout_seconds="$3"
local reconcile_at
reconcile_at="$(date +%s%N)"
kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
}
reconcile_helmrelease() {
local release_name="$1"
local timeout_seconds="$2"
local reconcile_at
reconcile_at="$(date +%s%N)"
kubectl -n flux-system annotate "helmrelease/${release_name}" \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
}
pull_required_image() {
local image="$1"
local host_ip="$2"
local attempts="$3"
local sleep_seconds="$4"
local failure_message="$5"
local pulled=false
local last_output=""
for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true
break
fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}"
done
if [ "${pulled}" != "true" ]; then
echo "${failure_message} ${image} on ${host_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
exit 1
fi
}
eso_diagnostics() {
kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true
kubectl -n flux-system describe kustomization addon-external-secrets || true
kubectl -n flux-system describe ocirepository external-secrets || true
kubectl -n flux-system describe helmrelease external-secrets || true
kubectl -n external-secrets get pods -o wide || true
}
wait_for_helmrelease_ready() {
local release_name="$1"
local target_namespace="$2"
local timeout_seconds="$3"
local elapsed=0
local ready
local stalled
local generation
local observed_generation
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
return 0
fi
if [ "${stalled}" = "True" ]; then
echo "HelmRelease ${release_name} is stalled" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
}
wait_for_flux_oci_helm_release() {
local oci_name="$1"
local release_name="$2"
local target_namespace="$3"
local oci_timeout="$4"
local release_timeout="$5"
local artifact_storage
wait_for_resource flux-system "ocirepository.source.toolkit.fluxcd.io/${oci_name}" 600
reconcile_helmrelease "${release_name}" 300
if ! kubectl -n flux-system wait --for=condition=Ready "ocirepository/${oci_name}" --timeout="${oci_timeout}"; then
artifact_storage="$(kubectl -n flux-system get "ocirepository/${oci_name}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
if [ "${artifact_storage}" = "True" ]; then
echo "OCIRepository ${oci_name} is not currently Ready; continuing with cached artifact" >&2
else
eso_diagnostics
exit 1
fi
fi
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
}
flux_helm_diagnostics() {
local repo_name="$1"
local chart_name="$2"
local release_name="$3"
local target_namespace="$4"
kubectl -n flux-system get helmrepositories,helmcharts,helmreleases || true
kubectl -n flux-system describe helmrepository "${repo_name}" || true
kubectl -n flux-system describe helmchart.source.toolkit.fluxcd.io "${chart_name}" || true
kubectl -n flux-system describe helmrelease "${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
}
wait_for_flux_helm_release() {
local repo_name="$1"
local chart_name="$2"
local release_name="$3"
local target_namespace="$4"
local repo_timeout="$5"
local chart_timeout="$6"
local release_timeout="$7"
wait_for_resource flux-system "helmrepository.source.toolkit.fluxcd.io/${repo_name}" 600
if ! kubectl -n flux-system wait --for=condition=Ready "helmrepository/${repo_name}" --timeout="${repo_timeout}"; then
echo "HelmRepository ${repo_name} is not currently Ready; continuing because a cached artifact may still satisfy HelmChart ${chart_name}" >&2
kubectl -n flux-system describe helmrepository "${repo_name}" || true
fi
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
reconcile_helmrelease "${release_name}" 300
for attempt in $(seq 1 6); do
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
return 0
fi
echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
reconcile_helmrelease "${release_name}" 300
done
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
}
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
if [ -n "${FLUX_KNOWN_HOSTS}" ]; then
printf '%s\n' "${FLUX_KNOWN_HOSTS}" > /tmp/flux_known_hosts
else
ssh-keyscan -p "${FLUX_GIT_PORT}" "${FLUX_GIT_HOST}" > /tmp/flux_known_hosts
fi
kubectl -n flux-system create secret generic flux-system \
--from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f -
# Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established
kubectl wait --for=condition=Established crd --all --timeout=120s
# Then apply custom resources
kubectl apply -f clusters/prod/flux-system/gitrepository-platform.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-infrastructure.yaml
kubectl apply -f clusters/prod/flux-system/kustomization-apps.yaml
# Patch Flux controllers to run on cp-1 and tolerate the control-plane taint
PATCH='{"spec":{"template":{"spec":{"nodeSelector":{"kubernetes.io/hostname":"k8s-cluster-cp-1"},"tolerations":[{"key":"node-role.kubernetes.io/control-plane","operator":"Exists","effect":"NoSchedule"}]}}}}'
kubectl -n flux-system patch deployment source-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment kustomize-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment helm-controller --type='merge' -p="$PATCH"
kubectl -n flux-system patch deployment notification-controller --type='merge' -p="$PATCH"
flux_rollout_status source-controller
flux_rollout_status kustomize-controller
flux_rollout_status helm-controller
kubectl -n flux-system wait --for=condition=Ready gitrepository/platform --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=600s
reconcile_flux_resource flux-system kustomization/addon-cert-manager 1500
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=1200s
kubectl -n flux-system wait --for=condition=Ready helmrelease/cert-manager --timeout=1200s
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
reconcile_flux_resource flux-system kustomization/addon-external-secrets 900
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
wait_for_resource "" crd/externalsecrets.external-secrets.io 900
kubectl wait --for=condition=established --timeout=600s crd/clustersecretstores.external-secrets.io
kubectl wait --for=condition=established --timeout=600s crd/externalsecrets.external-secrets.io
kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets --timeout=600s
kubectl -n external-secrets rollout status deployment/external-secrets-external-secrets-webhook --timeout=600s
wait_for_resource external-secrets service/external-secrets-external-secrets-webhook 600
wait_for_resource external-secrets endpoints/external-secrets-external-secrets-webhook 600
kubectl -n external-secrets wait --for=jsonpath='{.subsets[0].addresses[0].ip}' endpoints/external-secrets-external-secrets-webhook --timeout=600s
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets-store 900
reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 900
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=900s
# Wait for the storage layer and private access components
reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 900
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s; then
kubectl -n flux-system describe kustomization/addon-tailscale-operator || true
exit 1
fi
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/tailscale-operator 600
reconcile_helmrelease tailscale-operator 900
wait_for_helmrelease_ready tailscale-operator tailscale-system 900
kubectl wait --for=condition=Established crd/proxyclasses.tailscale.com --timeout=600s
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
reconcile_flux_resource flux-system kustomization/addon-nfs-storage 600
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass flash-nfs
kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true
kubectl apply -f - <<'EOF'
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: nfs-smoke
namespace: kube-system
spec:
accessModes:
- ReadWriteOnce
storageClassName: flash-nfs
resources:
requests:
storage: 1Mi
---
apiVersion: v1
kind: Pod
metadata:
name: nfs-smoke
namespace: kube-system
spec:
restartPolicy: Never
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
- key: node-role.kubernetes.io/control-plane
operator: Exists
effect: NoSchedule
containers:
- name: smoke
image: docker.io/rancher/mirrored-library-busybox:1.37.0
command:
- sh
- -c
- echo ok >/data/smoke && test -s /data/smoke && sleep 30
volumeMounts:
- name: data
mountPath: /data
volumes:
- name: data
persistentVolumeClaim:
claimName: nfs-smoke
EOF
kubectl -n kube-system wait --for=condition=Ready pod/nfs-smoke --timeout=180s
kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true --wait=false
- name: Wait for Rancher
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
wait_for_resource() {
local namespace="$1"
local resource="$2"
local timeout_seconds="$3"
local elapsed=0
until {
if [ -n "${namespace}" ]; then
kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1
else
kubectl get "${resource}" >/dev/null 2>&1
fi
}; do
if [ "${elapsed}" -ge "${timeout_seconds}" ]; then
echo "Timed out waiting for ${resource} to exist" >&2
kubectl -n flux-system get kustomizations,helmrepositories,helmcharts,helmreleases || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
}
wait_for_reconcile_handled() {
local namespace="$1"
local resource="$2"
local reconcile_at="$3"
local timeout_seconds="$4"
local elapsed=0
local handled
local ready
local healthy
local generation
local observed_generation
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
handled="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
if [ "${handled}" = "${reconcile_at}" ]; then
return 0
fi
sleep 5
elapsed=$((elapsed + 5))
done
ready="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
healthy="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.conditions[?(@.type=="Healthy")].status}' 2>/dev/null || true)"
generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n "${namespace}" get "${resource}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
if [ -z "${healthy}" ] || [ "${healthy}" = "True" ]; then
echo "${resource} did not report reconcile ${reconcile_at}, but it is already Ready; continuing"
return 0
fi
fi
echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
kubectl -n "${namespace}" describe "${resource}" || true
exit 1
}
reconcile_flux_resource() {
local namespace="$1"
local resource="$2"
local timeout_seconds="$3"
local reconcile_at
reconcile_at="$(date +%s%N)"
kubectl -n "${namespace}" annotate "${resource}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
wait_for_reconcile_handled "${namespace}" "${resource}" "${reconcile_at}" "${timeout_seconds}"
}
reconcile_helmrelease() {
local release_name="$1"
local timeout_seconds="${2:-300}"
local reconcile_at
reconcile_at="$(date +%s%N)"
kubectl -n flux-system annotate "helmrelease/${release_name}" \
reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
reconcile.fluxcd.io/resetAt="${reconcile_at}" \
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
wait_for_reconcile_handled flux-system "helmrelease/${release_name}" "${reconcile_at}" "${timeout_seconds}"
}
wait_for_helmchart_ready() {
local chart_name="$1"
local release_name="$2"
local timeout="$3"
local attempts="$4"
wait_for_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 600
for attempt in $(seq 1 "${attempts}"); do
reconcile_flux_resource flux-system "helmchart.source.toolkit.fluxcd.io/${chart_name}" 300
reconcile_helmrelease "${release_name}" 300
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${timeout}"; then
return 0
fi
echo "HelmChart ${chart_name} did not become Ready after ${timeout}; forcing retry (${attempt}/${attempts})" >&2
done
kubectl -n flux-system describe "helmchart.source.toolkit.fluxcd.io/${chart_name}" || true
kubectl -n flux-system describe "helmrelease/${release_name}" || true
exit 1
}
wait_for_helmrelease_ready() {
local release_name="$1"
local target_namespace="$2"
local timeout_seconds="$3"
local elapsed=0
local ready
local stalled
local generation
local observed_generation
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
ready="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
stalled="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
observed_generation="$(kubectl -n flux-system get "helmrelease/${release_name}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
if [ "${ready}" = "True" ] && [ "${observed_generation}" = "${generation}" ]; then
return 0
fi
if [ "${stalled}" = "True" ]; then
echo "HelmRelease ${release_name} is stalled" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
fi
sleep 10
elapsed=$((elapsed + 10))
done
echo "Timed out waiting for HelmRelease ${release_name} to become Ready" >&2
kubectl -n flux-system describe "helmrelease/${release_name}" || true
kubectl -n "${target_namespace}" get pods -o wide || true
exit 1
}
pull_image_on_matching_pod_nodes() {
local namespace="$1"
local selector="$2"
local image="$3"
local attempts="$4"
local sleep_seconds="$5"
local nodes
nodes="$(kubectl -n "${namespace}" get pods -l "${selector}" -o jsonpath='{range .items[*]}{.spec.nodeName}{"\n"}{end}' 2>/dev/null | sort -u)"
if [ -z "${nodes}" ]; then
echo "No pods found for ${namespace}/${selector}; skipping targeted image pull for ${image}" >&2
return 0
fi
for node in ${nodes}; do
local node_ip
local pulled=false
local last_output=""
node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')"
for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})"
if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true
break
fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}"
done
if [ "${pulled}" != "true" ]; then
echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
fi
done
}
wait_for_rancher_bootstrap_secrets() {
local timeout_seconds="$1"
local deadline
local force_sync
deadline=$(($(date +%s) + timeout_seconds))
while [ "$(date +%s)" -lt "${deadline}" ]; do
force_sync="$(date +%s)"
kubectl -n flux-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
kubectl -n cattle-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
if kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \
&& kubectl -n cattle-system get secret/rancher-bootstrap-password >/dev/null 2>&1; then
return 0
fi
sleep 30
done
echo "Timed out waiting for Rancher bootstrap ExternalSecrets to sync" >&2
kubectl get clustersecretstore/doppler-hetznerterra -o yaml || true
kubectl -n flux-system get externalsecret/rancher-bootstrap-password -o yaml || true
kubectl -n cattle-system get externalsecret/rancher-bootstrap-password -o yaml || true
kubectl -n external-secrets logs deploy/external-secrets-external-secrets --tail=120 || true
exit 1
}
echo "Waiting for Rancher..."
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher-secrets 900
reconcile_flux_resource flux-system kustomization/addon-rancher-secrets 900
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=900s
wait_for_rancher_bootstrap_secrets 900
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-rancher 600
reconcile_flux_resource flux-system kustomization/addon-rancher 1800
wait_for_resource flux-system helmrelease.helm.toolkit.fluxcd.io/rancher 600
reconcile_helmrelease rancher 300
wait_for_helmchart_ready flux-system-rancher rancher 180s 5
wait_for_helmrelease_ready rancher cattle-system 900
wait_for_resource "" namespace/cattle-system 600
wait_for_resource cattle-system deployment/cattle-system-rancher 600
kubectl -n cattle-system rollout status deployment/cattle-system-rancher --timeout=900s
wait_for_resource cattle-system deployment/rancher-webhook 900
pull_image_on_matching_pod_nodes cattle-system app=rancher-webhook registry.rancher.com/rancher/rancher-webhook:v0.9.3 12 10
kubectl -n cattle-system rollout restart deployment/rancher-webhook
kubectl -n cattle-system rollout status deployment/rancher-webhook --timeout=900s
wait_for_resource cattle-system issuer/cattle-system-rancher 900
wait_for_resource cattle-system certificate/tls-rancher-ingress 900
kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s
kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s
- name: Reconcile observability stack
env:
KUBECONFIG: outputs/kubeconfig
run: |
set -euo pipefail
observability_diagnostics() {
  # Best-effort dump of Flux and observability state for post-mortem debugging.
  # Every command is `|| true` so diagnostics never mask the original failure.
  local target
  kubectl -n flux-system get gitrepositories,kustomizations,ocirepositories,helmreleases || true
  for target in \
    kustomization/addon-observability-secrets \
    kustomization/addon-observability \
    kustomization/addon-observability-content; do
    kubectl -n flux-system describe "${target}" || true
  done
  kubectl describe clustersecretstore/doppler-hetznerterra || true
  kubectl -n observability describe externalsecret/grafana-admin || true
  kubectl -n observability get secret/grafana-admin-credentials || true
  for target in \
    ocirepository/loki \
    ocirepository/promtail \
    helmrelease/kube-prometheus-stack \
    helmrelease/loki \
    helmrelease/promtail; do
    kubectl -n flux-system describe "${target}" || true
  done
  kubectl -n observability get pods,pvc,svc -o wide || true
  kubectl -n observability get events --sort-by=.lastTimestamp || true
}
wait_for_resource() {
  # Poll every 10s until <resource> exists in <namespace>; on timeout, dump
  # diagnostics and abort the job.
  local namespace="$1"
  local resource="$2"
  local timeout_seconds="$3"
  local waited=0
  while ! kubectl -n "${namespace}" get "${resource}" >/dev/null 2>&1; do
    if [ "${waited}" -ge "${timeout_seconds}" ]; then
      echo "Timed out waiting for ${resource} to exist" >&2
      observability_diagnostics
      exit 1
    fi
    sleep 10
    waited=$((waited + 10))
  done
}
wait_for_reconcile_handled() {
  # Block until the Flux object's status.lastHandledReconcileAt equals the
  # requested token, polling every 5s; diagnostics + hard fail on timeout.
  local resource="$1"
  local reconcile_at="$2"
  local timeout_seconds="$3"
  local waited=0
  local last_handled
  until [ "${waited}" -ge "${timeout_seconds}" ]; do
    last_handled="$(kubectl -n flux-system get "${resource}" -o jsonpath='{.status.lastHandledReconcileAt}' 2>/dev/null || true)"
    if [ "${last_handled}" = "${reconcile_at}" ]; then
      return 0
    fi
    sleep 5
    waited=$((waited + 5))
  done
  echo "Timed out waiting for ${resource} to handle reconcile ${reconcile_at}" >&2
  observability_diagnostics
  exit 1
}
reconcile_flux_resource() {
  # Request an immediate reconcile by stamping the Flux annotation with a
  # unique nanosecond token, then wait for that token to be acknowledged.
  local resource="$1"
  local timeout_seconds="${2:-300}"
  local token
  token="$(date +%s%N)"
  kubectl -n flux-system annotate "${resource}" reconcile.fluxcd.io/requestedAt="${token}" --overwrite
  wait_for_reconcile_handled "${resource}" "${token}" "${timeout_seconds}"
}
request_helmrelease_reconcile() {
  # Fire-and-forget reconcile request for a HelmRelease. The reset/force
  # annotations are stamped with the same token as requestedAt.
  local release="$1"
  local token
  token="$(date +%s%N)"
  kubectl -n flux-system annotate "helmrelease/${release}" \
    reconcile.fluxcd.io/requestedAt="${token}" \
    reconcile.fluxcd.io/resetAt="${token}" \
    reconcile.fluxcd.io/forceAt="${token}" \
    --overwrite
}
wait_for_flux_ready() {
  # Thin wrapper around `kubectl wait --for=condition=Ready` that dumps
  # diagnostics and aborts the job when the condition is not met in time.
  local resource="$1"
  local timeout="$2"
  if kubectl -n flux-system wait --for=condition=Ready "${resource}" --timeout="${timeout}"; then
    return 0
  fi
  observability_diagnostics
  exit 1
}
wait_for_grafana_secret() {
  # Wait for the ExternalSecret-managed Grafana admin secret to appear in the
  # observability namespace, polling every 15s up to timeout_seconds; on
  # timeout, dump diagnostics and fail the job.
  local timeout_seconds="$1"
  local elapsed=0
  while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
    if kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
      return 0
    fi
    sleep 15
    # Advance by the actual poll interval. Previously this added 75 per 15s
    # sleep, which silently cut the effective timeout to a fifth of the
    # requested value (900 -> ~180s of real waiting).
    elapsed=$((elapsed + 15))
  done
  echo "Timed out waiting for Grafana admin ExternalSecret to sync" >&2
  observability_diagnostics
  exit 1
}
wait_for_ocirepository_ready_or_cached() {
  # Prefer a Ready OCIRepository, but tolerate a transiently-unready source as
  # long as a previously fetched artifact is still in storage
  # (ArtifactInStorage=True), so registry hiccups don't fail the deploy.
  local repository="$1"
  local timeout="$2"
  local cached
  if kubectl -n flux-system wait --for=condition=Ready "ocirepository/${repository}" --timeout="${timeout}"; then
    return 0
  fi
  cached="$(kubectl -n flux-system get "ocirepository/${repository}" -o jsonpath='{.status.conditions[?(@.type=="ArtifactInStorage")].status}' 2>/dev/null || true)"
  if [ "${cached}" = "True" ]; then
    echo "OCIRepository ${repository} is not currently Ready; continuing with cached artifact" >&2
    return 0
  fi
  observability_diagnostics
  exit 1
}
wait_for_helmrelease_ready() {
  # Poll a HelmRelease every 10s until Ready=True for the current generation
  # (observedGeneration == metadata.generation); abort immediately if Flux
  # marks the release Stalled, and fail with diagnostics on timeout.
  local release="$1"
  local timeout_seconds="$2"
  local hr="helmrelease/${release}"
  local waited=0
  local ready stalled gen seen_gen
  while [ "${waited}" -lt "${timeout_seconds}" ]; do
    ready="$(kubectl -n flux-system get "${hr}" -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || true)"
    stalled="$(kubectl -n flux-system get "${hr}" -o jsonpath='{.status.conditions[?(@.type=="Stalled")].status}' 2>/dev/null || true)"
    gen="$(kubectl -n flux-system get "${hr}" -o jsonpath='{.metadata.generation}' 2>/dev/null || true)"
    seen_gen="$(kubectl -n flux-system get "${hr}" -o jsonpath='{.status.observedGeneration}' 2>/dev/null || true)"
    if [ "${ready}" = "True" ] && [ "${seen_gen}" = "${gen}" ]; then
      return 0
    fi
    if [ "${stalled}" = "True" ]; then
      echo "HelmRelease ${release} is stalled" >&2
      observability_diagnostics
      exit 1
    fi
    sleep 10
    waited=$((waited + 10))
  done
  echo "Timed out waiting for HelmRelease ${release} to become Ready" >&2
  observability_diagnostics
  exit 1
}
# --- Orchestration ---------------------------------------------------------
# Phase 1: sync the secrets layer first, and make sure the Grafana admin
# credentials secret has actually landed before charts that consume it.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600
reconcile_flux_resource kustomization/addon-observability-secrets 300
wait_for_flux_ready kustomization/addon-observability-secrets 300s
wait_for_grafana_secret 900
# Phase 2: reconcile the main observability kustomization (sources + releases).
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600
reconcile_flux_resource kustomization/addon-observability 600
wait_for_flux_ready kustomization/addon-observability 300s
# Phase 3: chart sources — tolerate a stale-but-cached OCI artifact.
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/loki 300
wait_for_resource flux-system ocirepository.source.toolkit.fluxcd.io/promtail 300
wait_for_ocirepository_ready_or_cached loki 300s
wait_for_ocirepository_ready_or_cached promtail 300s
# Phase 4: force-reconcile each HelmRelease and wait for it to converge.
for release in kube-prometheus-stack loki promtail; do
wait_for_resource flux-system "helmrelease.helm.toolkit.fluxcd.io/${release}" 300
request_helmrelease_reconcile "${release}"
wait_for_helmrelease_ready "${release}" 600
done
# Phase 5: dashboards/content layer, then restart Grafana (best-effort) so it
# picks up newly provisioned content.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-content 300
reconcile_flux_resource kustomization/addon-observability-content 300
wait_for_flux_ready kustomization/addon-observability-content 300s
kubectl -n observability rollout restart deployment/observability-kube-prometheus-stack-grafana || true
# Build a health-check script locally, ship it to the first control-plane node
# via Ansible's script module, and fail the job on any failed assertion.
- name: Post-deploy cluster health checks
working-directory: ansible
run: |
set -euo pipefail
health_script="$(mktemp)"
# Clean up the generated temp script on the runner when this step exits
# (it was previously leaked).
trap 'rm -f "${health_script}"' EXIT
cat >"${health_script}" <<'EOF'
#!/usr/bin/env bash
set -euo pipefail
kubectl get nodes -o wide
kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories
# Every Flux layer must be fully reconciled before the deploy is called healthy.
kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=300s
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=600s
# flash-nfs must be the one and only default StorageClass.
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"
! kubectl get storageclass | grep -E "^local-path.*\\(default\\)"
# Any pod not Running/Completed fails the check, except known short-lived
# operation/install pods which are expected to churn.
unhealthy_pods=$(mktemp)
trap 'rm -f "${unhealthy_pods}"' EXIT
kubectl get pods -A --no-headers \
| grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
| grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
| grep -Ev "^cattle-capi-system[[:space:]]+capi-controller-manager-" \
| grep -Ev "^cattle-turtles-system[[:space:]]+cluster-api-operator-resources-cleanup-" \
| grep -Ev "^kube-system[[:space:]]+helm-install-" \
| tee "${unhealthy_pods}" || true
test ! -s "${unhealthy_pods}"
kubectl -n kube-system get pods -o wide
kubectl -n tailscale-system get pods -o wide
kubectl -n external-secrets get pods -o wide
EOF
chmod +x "${health_script}"
ansible -i inventory.ini 'control_plane[0]' -m script -a "${health_script}"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
# Run the repo's tailnet smoke-check script from the first control-plane node.
- name: Post-deploy tailnet smoke checks
working-directory: ansible
run: |
ansible -i inventory.ini 'control_plane[0]' -m script -a "../scripts/smoke-check-tailnet-services.sh"
env:
ANSIBLE_HOST_KEY_CHECKING: "False"
# Publish the generated kubeconfig as a workflow artifact.
# NOTE(review): upload-artifact@v3 is deprecated on github.com; confirm the
# Gitea runner's action mirror still serves v3 before bumping.
- name: Upload Kubeconfig
uses: actions/upload-artifact@v3
with:
name: kubeconfig
path: outputs/kubeconfig