From bdba2b7af28308f87b998b5d10b31751a9be79b8 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sun, 26 Apr 2026 11:13:22 +0000 Subject: [PATCH] fix: defer observability image seeding --- .gitea/workflows/deploy.yml | 109 +++++++++++++++++++++++++++++++----- 1 file changed, 94 insertions(+), 15 deletions(-) diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index f2ee3b6..7347a91 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -539,10 +539,15 @@ jobs: exit 1 fi + if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ + "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then + return 0 + fi + echo "Importing ${image} archive on ${host_ip}" - scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 \ + timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \ "${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}" - ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ + timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \ "set -euo pipefail; \ if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \ for attempt in 1 2 3 4 5; do \ @@ -699,19 +704,6 @@ jobs: ghcr.io/fluxcd/notification-controller:v1.8.1; do import_required_image "${image}" "${PRIMARY_CP_IP}" done - for image in \ - docker.io/grafana/loki:3.5.7 \ - docker.io/kiwigrid/k8s-sidecar:1.30.10 \ - docker.io/grafana/promtail:3.0.0 \ - docker.io/rancher/mirrored-library-traefik:3.6.10 \ - docker.io/grafana/grafana:11.4.0 \ - quay.io/prometheus-operator/prometheus-operator:v0.79.2 \ - quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \ - quay.io/prometheus/prometheus:v3.1.0 \ - registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \ - quay.io/prometheus/node-exporter:v1.8.2; do - import_required_image_on_all_nodes "${image}" - done # Apply CRDs and controllers first kubectl apply -f clusters/prod/flux-system/gotk-components.yaml # Wait for CRDs to be established @@ -1024,6 +1016,93 @@ jobs: done echo "Restore did not complete within timeout. Continuing anyway." + - name: Seed observability runtime images + run: | + set -euo pipefail + + archive_name() { + printf '%s' "$1" | tr '/:' '__' + } + + import_required_image() { + local image="$1" + local host_ip="$2" + local archive_name + local archive_path + archive_name="$(archive_name "${image}").tar" + archive_path="outputs/bootstrap-image-archives/${archive_name}" + + if [ ! -s "${archive_path}" ]; then + echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2 + return 1 + fi + + if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ + "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then + return 0 + fi + + echo "Importing ${image} archive on ${host_ip}" + timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \ + "${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}" + timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \ + "set -euo pipefail; \ + if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \ + for attempt in 1 2 3; do \ + echo 'Importing ${image} archive with ctr'; \ + if sudo k3s ctr -n k8s.io images import '/tmp/${archive_name}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \ + sleep 10; \ + done; \ + sudo systemctl status k3s --no-pager -l || true; \ + sudo journalctl -u k3s -n 80 --no-pager || true; \ + exit 1" + } + + import_required_image_on_all_nodes() { + local image="$1" + local status_dir + local host_ip + local pid + local failed=false + status_dir="$(mktemp -d)" + + for host_ip in ${ALL_NODE_IPS}; do + ( + import_required_image "${image}" "${host_ip}" + ) >"${status_dir}/${host_ip}.log" 2>&1 & + done + + for pid in $(jobs -p); do + if ! wait "${pid}"; then + failed=true + fi + done + + for host_ip in ${ALL_NODE_IPS}; do + sed "s/^/[${host_ip}] /" "${status_dir}/${host_ip}.log" + done + + if [ "${failed}" = "true" ]; then + echo "Failed to import ${image} on one or more nodes" >&2 + return 1 + fi + } + + ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))') + for image in \ + docker.io/grafana/loki:3.5.7 \ + docker.io/kiwigrid/k8s-sidecar:1.30.10 \ + docker.io/grafana/promtail:3.0.0 \ + docker.io/rancher/mirrored-library-traefik:3.6.10 \ + docker.io/grafana/grafana:11.4.0 \ + quay.io/prometheus-operator/prometheus-operator:v0.79.2 \ + quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \ + quay.io/prometheus/prometheus:v3.1.0 \ + registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \ + quay.io/prometheus/node-exporter:v1.8.2; do + import_required_image_on_all_nodes "${image}" + done + - name: Post-deploy cluster health checks working-directory: ansible run: |