fix: add local registry cache for rebuilds
Deploy Cluster / Terraform (push) Successful in 4m7s
Deploy Cluster / Ansible (push) Failing after 16m31s

This commit is contained in:
2026-05-03 00:02:33 +00:00
parent 8375333ac5
commit 1896108cbb
9 changed files with 334 additions and 11 deletions
+48 -11
View File
@@ -302,19 +302,40 @@ jobs:
local sleep_seconds="$4"
local failure_message="$5"
local pulled=false
local last_output=""
for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true
break
fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}"
done
if [ "${pulled}" != "true" ]; then
echo "${failure_message} ${image} on ${host_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
exit 1
fi
}
@@ -698,20 +719,41 @@ jobs:
for node in ${nodes}; do
local node_ip
local pulled=false
local last_output=""
node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')"
for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})"
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true
break
fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}"
done
if [ "${pulled}" != "true" ]; then
echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
fi
done
}
@@ -727,10 +769,7 @@ jobs:
kubectl -n flux-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
kubectl -n cattle-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \
&& kubectl -n flux-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \
&& kubectl -n cattle-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \
&& kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \
if kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \
&& kubectl -n cattle-system get secret/rancher-bootstrap-password >/dev/null 2>&1; then
return 0
fi
@@ -866,9 +905,7 @@ jobs:
local elapsed=0
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \
&& kubectl -n observability wait --for=condition=Ready externalsecret/grafana-admin --timeout=30s \
&& kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
if kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
return 0
fi