fix: defer observability image seeding
This commit is contained in:
+94
-15
@@ -539,10 +539,15 @@ jobs:
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
|
||||
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then
|
||||
return 0
|
||||
fi
|
||||
|
||||
echo "Importing ${image} archive on ${host_ip}"
|
||||
scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 \
|
||||
timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \
|
||||
"${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}"
|
||||
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
|
||||
timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \
|
||||
"set -euo pipefail; \
|
||||
if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \
|
||||
for attempt in 1 2 3 4 5; do \
|
||||
@@ -699,19 +704,6 @@ jobs:
|
||||
ghcr.io/fluxcd/notification-controller:v1.8.1; do
|
||||
import_required_image "${image}" "${PRIMARY_CP_IP}"
|
||||
done
|
||||
for image in \
|
||||
docker.io/grafana/loki:3.5.7 \
|
||||
docker.io/kiwigrid/k8s-sidecar:1.30.10 \
|
||||
docker.io/grafana/promtail:3.0.0 \
|
||||
docker.io/rancher/mirrored-library-traefik:3.6.10 \
|
||||
docker.io/grafana/grafana:11.4.0 \
|
||||
quay.io/prometheus-operator/prometheus-operator:v0.79.2 \
|
||||
quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \
|
||||
quay.io/prometheus/prometheus:v3.1.0 \
|
||||
registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \
|
||||
quay.io/prometheus/node-exporter:v1.8.2; do
|
||||
import_required_image_on_all_nodes "${image}"
|
||||
done
|
||||
# Apply CRDs and controllers first
|
||||
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
|
||||
# Wait for CRDs to be established
|
||||
@@ -1024,6 +1016,93 @@ jobs:
|
||||
done
|
||||
echo "Restore did not complete within timeout. Continuing anyway."
|
||||
|
||||
- name: Seed observability runtime images
|
||||
run: |
|
||||
# Strict mode for this step: exit on an unhandled command failure (-e), treat
# expansion of an unset variable as an error (-u), and make a pipeline fail
# when any of its stages fails (pipefail).
set -euo pipefail
|
||||
|
||||
# Map an image reference to a filesystem-safe archive base name.
#
# Arguments: $1 - image reference, e.g. docker.io/grafana/loki:3.5.7
# Outputs:   the reference with every '/' and ':' replaced by '_', written to
#            stdout without a trailing newline.
archive_name() {
  local ref="$1"

  # Two separate substitutions keep the patterns unambiguous; together they
  # are equivalent to `tr '/:' '__'` without spawning a pipeline.
  ref="${ref//\//_}"
  ref="${ref//:/_}"
  printf '%s' "${ref}"
}
|
||||
|
||||
# Ensure a container image is present in the k3s image store of one node,
# importing it from a local archive when it is missing.
#
# Arguments:
#   $1 - image reference (e.g. docker.io/grafana/loki:3.5.7)
#   $2 - IP address of the target node, reached over SSH as ubuntu@<ip>
# Reads:
#   outputs/bootstrap-image-archives/<archive_name $1>.tar (must be non-empty)
# Outputs:
#   progress on stdout; the missing-archive error on stderr
# Returns:
#   0 if the image was already present or was imported successfully,
#   non-zero if the archive is missing, the upload fails, or the import fails.
import_required_image() {
  local image="$1"
  local host_ip="$2"
  local archive_file
  local archive_path
  local remote_archive
  local -a ssh_opts

  archive_file="$(archive_name "${image}").tar"
  archive_path="outputs/bootstrap-image-archives/${archive_file}"
  remote_archive="/tmp/${archive_file}"

  # One option set shared by every ssh/scp call so that the pre-check gets the
  # same keep-alive protection as the upload and the import.
  ssh_opts=(
    -i "$HOME/.ssh/id_ed25519"
    -o StrictHostKeyChecking=no
    -o ConnectTimeout=10
    -o ServerAliveInterval=15
    -o ServerAliveCountMax=4
  )

  if [ ! -s "${archive_path}" ]; then
    echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2
    return 1
  fi

  # Fast path: nothing to do when the node already has the image. Any failure
  # here (including a timeout) is treated as "not present" and falls through
  # to the upload, which reports connectivity problems itself.
  if timeout 60s ssh "${ssh_opts[@]}" "ubuntu@${host_ip}" \
    "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then
    return 0
  fi

  echo "Importing ${image} archive on ${host_ip}"
  # Explicit status check: `set -e` is not honoured when this function is
  # called from a conditional context.
  timeout 180s scp "${ssh_opts[@]}" \
    "${archive_path}" "ubuntu@${host_ip}:${remote_archive}" || return

  # Remote script: re-check, then retry the import up to three times, dumping
  # k3s diagnostics if every attempt fails. The EXIT trap removes the uploaded
  # archive on every exit path so tarballs do not accumulate in /tmp.
  timeout 300s ssh "${ssh_opts[@]}" "ubuntu@${host_ip}" \
    "set -euo pipefail; \
    trap \"rm -f -- '${remote_archive}'\" EXIT; \
    if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \
    for attempt in 1 2 3; do \
      echo 'Importing ${image} archive with ctr'; \
      if sudo k3s ctr -n k8s.io images import '${remote_archive}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \
      sleep 10; \
    done; \
    sudo systemctl status k3s --no-pager -l || true; \
    sudo journalctl -u k3s -n 80 --no-pager || true; \
    exit 1"
}
|
||||
|
||||
# Import one image on every node in parallel and report per-node output.
#
# Globals:   ALL_NODE_IPS (read) - whitespace-separated list of node IPs
# Arguments: $1 - image reference passed through to import_required_image
# Outputs:   each node's captured output on stdout, every line prefixed with
#            "[<ip>] "; a summary error on stderr when any node failed
# Returns:   0 when every node succeeded, 1 otherwise
import_required_image_on_all_nodes() {
  local image="$1"
  local status_dir
  local host_ip
  local pid
  local failed=false
  local -a pids=()

  status_dir="$(mktemp -d)" || return 1

  # ALL_NODE_IPS is intentionally unquoted: it is a space-separated list.
  for host_ip in ${ALL_NODE_IPS}; do
    (
      import_required_image "${image}" "${host_ip}"
    ) >"${status_dir}/${host_ip}.log" 2>&1 &
    # Record the PID explicitly so that only the workers started here are
    # waited on, including any that exit before the wait loop starts.
    pids+=("$!")
  done

  # The ${arr[@]+...} form keeps an empty list safe under `set -u` on older bash.
  for pid in ${pids[@]+"${pids[@]}"}; do
    if ! wait "${pid}"; then
      failed=true
    fi
  done

  for host_ip in ${ALL_NODE_IPS}; do
    sed "s/^/[${host_ip}] /" "${status_dir}/${host_ip}.log"
  done

  # The logs have been replayed; drop the scratch directory on every path.
  rm -rf -- "${status_dir}"

  if [ "${failed}" = "true" ]; then
    echo "Failed to import ${image} on one or more nodes" >&2
    return 1
  fi
}
|
||||
|
||||
# Every control-plane and worker IP from the Terraform outputs, as one
# space-separated string (the format import_required_image_on_all_nodes reads).
ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))')

# With no nodes the loop below would succeed without seeding anything; fail
# loudly instead so the problem surfaces in this step.
if [ -z "${ALL_NODE_IPS}" ]; then
  echo "No node IPs found in outputs/terraform_outputs.json" >&2
  exit 1
fi

# Images seeded onto every node by this step.
observability_images=(
  docker.io/grafana/loki:3.5.7
  docker.io/kiwigrid/k8s-sidecar:1.30.10
  docker.io/grafana/promtail:3.0.0
  docker.io/rancher/mirrored-library-traefik:3.6.10
  docker.io/grafana/grafana:11.4.0
  quay.io/prometheus-operator/prometheus-operator:v0.79.2
  quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2
  quay.io/prometheus/prometheus:v3.1.0
  registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0
  quay.io/prometheus/node-exporter:v1.8.2
)

# Images are processed one at a time; each image fans out to all nodes.
for image in "${observability_images[@]}"; do
  import_required_image_on_all_nodes "${image}"
done
|
||||
|
||||
- name: Post-deploy cluster health checks
|
||||
working-directory: ansible
|
||||
run: |
|
||||
|
||||
Reference in New Issue
Block a user