fix: add local registry cache for rebuilds
Deploy Cluster / Terraform (push) Successful in 4m7s
Deploy Cluster / Ansible (push) Failing after 16m31s

This commit is contained in:
2026-05-03 00:02:33 +00:00
parent 8375333ac5
commit 1896108cbb
9 changed files with 334 additions and 11 deletions
+48 -11
View File
@@ -302,19 +302,40 @@ jobs:
local sleep_seconds="$4" local sleep_seconds="$4"
local failure_message="$5" local failure_message="$5"
local pulled=false local pulled=false
local last_output=""
for attempt in $(seq 1 "${attempts}"); do for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})" echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true pulled=true
break break
fi fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}" sleep "${sleep_seconds}"
done done
if [ "${pulled}" != "true" ]; then if [ "${pulled}" != "true" ]; then
echo "${failure_message} ${image} on ${host_ip}" >&2 echo "${failure_message} ${image} on ${host_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
exit 1 exit 1
fi fi
} }
@@ -698,20 +719,41 @@ jobs:
for node in ${nodes}; do for node in ${nodes}; do
local node_ip local node_ip
local pulled=false local pulled=false
local last_output=""
node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')" node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')"
for attempt in $(seq 1 "${attempts}"); do for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})" echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})"
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \ if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then
pulled=true pulled=true
break break
fi fi
printf '%s\n' "${last_output}" >&2
sleep "${sleep_seconds}" sleep "${sleep_seconds}"
done done
if [ "${pulled}" != "true" ]; then if [ "${pulled}" != "true" ]; then
echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2 echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2
echo "Last pull output:" >&2
printf '%s\n' "${last_output}" >&2
ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" 'bash -s' <<'EOF' || true
set -u
echo "== node network diagnostics =="
iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')"
echo "primary_iface=${iface:-unknown}"
if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true
for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do
echo "-- ${endpoint} --"
curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true
done
EOF
fi fi
done done
} }
@@ -727,10 +769,7 @@ jobs:
kubectl -n flux-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true kubectl -n flux-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
kubectl -n cattle-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true kubectl -n cattle-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true
if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \ if kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \
&& kubectl -n flux-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \
&& kubectl -n cattle-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \
&& kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \
&& kubectl -n cattle-system get secret/rancher-bootstrap-password >/dev/null 2>&1; then && kubectl -n cattle-system get secret/rancher-bootstrap-password >/dev/null 2>&1; then
return 0 return 0
fi fi
@@ -866,9 +905,7 @@ jobs:
local elapsed=0 local elapsed=0
while [ "${elapsed}" -lt "${timeout_seconds}" ]; do while [ "${elapsed}" -lt "${timeout_seconds}" ]; do
if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \ if kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
&& kubectl -n observability wait --for=condition=Ready externalsecret/grafana-admin --timeout=30s \
&& kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then
return 0 return 0
fi fi
+36
View File
@@ -273,6 +273,42 @@ kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxy
If local `kubectl` falls back to `localhost:8080`, refresh `outputs/kubeconfig` with `scripts/refresh-kubeconfig.sh 10.27.27.30`.
## Network Stabilization Probes
Run the same probe from the Proxmox host, `cp1`, and one worker when registry pulls or Doppler calls flap:
```bash
scripts/network-stabilization-probe.sh
```
From the generated Ansible inventory:
```bash
cd ansible
ansible -i inventory.ini 'control_plane[0]' -m script -a '../scripts/network-stabilization-probe.sh'
ansible -i inventory.ini 'workers[0]' -m script -a '../scripts/network-stabilization-probe.sh'
```
Use `NETWORK_PROBE_REPEAT_COUNT`, `NETWORK_PROBE_CURL_TIMEOUT`, and `NETWORK_PROBE_PULL_TIMEOUT` to tune probe duration.
## Registry Cache
K3s nodes are configured by Ansible to use the Proxmox host as a local pull-through cache for common upstream registries. The cache listens on `10.27.27.239`:
```text
docker.io -> http://10.27.27.239:5000
ghcr.io -> http://10.27.27.239:5001
quay.io -> http://10.27.27.239:5002
registry.k8s.io -> http://10.27.27.239:5003
oci.external-secrets.io -> http://10.27.27.239:5004
```
Bootstrap or repair the cache on Proxmox with:
```bash
ssh -i ~/.ssh/infra root@10.27.27.239 'bash -s' < scripts/setup-proxmox-registry-cache.sh
```
## Security Notes
- Never commit `terraform.tfvars`, kubeconfigs, private keys, `outputs/`, or real secret values.
+4
View File
@@ -27,6 +27,10 @@
or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default(''))) or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default('')))
}} }}
- name: Configure k3s registry mirrors
import_role:
name: k3s-registry-mirror
- name: Download k3s install script - name: Download k3s install script
get_url: get_url:
url: https://get.k3s.io url: https://get.k3s.io
@@ -0,0 +1,14 @@
---
# Defaults for the k3s-registry-mirror role.
# Set to false to remove /etc/rancher/k3s/registries.yaml instead of writing it.
k3s_registry_mirror_enabled: true
# Proxmox host running the docker-registry pull-through caches
# (must match scripts/setup-proxmox-registry-cache.sh).
k3s_registry_mirror_host: 10.27.27.239
# Upstream registry -> local cache port; ports must stay in sync with the
# per-registry units created by setup-proxmox-registry-cache.sh.
k3s_registry_mirrors:
  docker.io:
    port: 5000
  ghcr.io:
    port: 5001
  quay.io:
    port: 5002
  registry.k8s.io:
    port: 5003
  oci.external-secrets.io:
    port: 5004
@@ -0,0 +1,20 @@
---
# Write (or remove) /etc/rancher/k3s/registries.yaml so containerd routes
# pulls for common registries through the local pull-through cache.
- name: Ensure k3s config directory exists
  file:
    path: /etc/rancher/k3s
    state: directory
    mode: "0755"
  when: k3s_registry_mirror_enabled | bool
# NOTE(review): k3s reads registries.yaml only at startup — presumably the
# play (re)starts k3s afterwards; confirm changes actually take effect.
- name: Configure k3s registry mirrors
  template:
    src: registries.yaml.j2
    dest: /etc/rancher/k3s/registries.yaml
    mode: "0644"
  when: k3s_registry_mirror_enabled | bool
# Opt-out path: drop the mirror config entirely when the role is disabled.
- name: Remove k3s registry mirror config when disabled
  file:
    path: /etc/rancher/k3s/registries.yaml
    state: absent
  when: not (k3s_registry_mirror_enabled | bool)
@@ -0,0 +1,6 @@
{# Rendered to /etc/rancher/k3s/registries.yaml: one mirror entry per upstream
   registry, pointing at the plain-HTTP pull-through cache on the Proxmox host.
   Variables come from the k3s-registry-mirror role defaults. #}
mirrors:
{% for registry, mirror in k3s_registry_mirrors.items() %}
  "{{ registry }}":
    endpoint:
      - "http://{{ k3s_registry_mirror_host }}:{{ mirror.port }}"
{% endfor %}
+4
View File
@@ -57,6 +57,10 @@
- /var/lib/rancher/k3s - /var/lib/rancher/k3s
when: k3s_install_needed when: k3s_install_needed
- name: Configure k3s registry mirrors
import_role:
name: k3s-registry-mirror
- name: Download k3s install script - name: Download k3s install script
get_url: get_url:
url: https://get.k3s.io url: https://get.k3s.io
+118
View File
@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# Network stabilization probe: dumps interface/MTU/DNS state, times HTTPS
# handshakes to the registries and Doppler, and attempts repeated image pulls.
# Intentionally `set -u` only (no -e/pipefail): this is a best-effort
# diagnostic that should keep going past individual failures.
set -u
# Tunables (environment): how many times each curl/pull is repeated, and the
# per-attempt timeouts in seconds.
repeat_count="${NETWORK_PROBE_REPEAT_COUNT:-5}"
curl_timeout="${NETWORK_PROBE_CURL_TIMEOUT:-20}"
pull_timeout="${NETWORK_PROBE_PULL_TIMEOUT:-240}"
# Endpoints that deploys depend on; each is probed for DNS, path MTU and
# HTTP timing.
endpoints=(
  "https://ghcr.io/v2/"
  "https://auth.docker.io/token"
  "https://registry-1.docker.io/v2/"
  "https://quay.io/v2/"
  "https://registry.k8s.io/v2/"
  "https://api.doppler.com/v3/projects"
)
# Representative images (one per upstream registry) used for pull attempts.
images=(
  "ghcr.io/fluxcd/helm-controller:v1.5.1"
  "oci.external-secrets.io/external-secrets/external-secrets:v2.1.0"
  "docker.io/rancher/mirrored-library-busybox:1.37.0"
  "ghcr.io/tailscale/tailscale:v1.96.5"
  "quay.io/prometheus/node-exporter:v1.8.2"
)
# Succeed iff the given command name resolves (builtin, function, or PATH).
have() {
  command -v "$1" >/dev/null 2>&1 && return 0
  return 1
}
# Print a blank-line-separated "== title ==" banner for report readability.
section() {
  local title="$1"
  printf '\n== %s ==\n' "${title}"
}
# Name of the interface the default route to 1.1.1.1 uses; empty if `ip`
# fails or no "dev" token is present in its output.
primary_iface() {
  ip route get 1.1.1.1 2>/dev/null \
    | awk '{ for (f = 1; f <= NF; f++) if ($f == "dev") { print $(f + 1); exit } }'
}
# Extract the bare host name from an http(s) URL ("https://ghcr.io/v2/" ->
# "ghcr.io"); non-matching input is echoed back unchanged.
endpoint_host() {
  sed -E 's#^https?://([^/:]+).*#\1#' <<<"$1"
}
section "host"
# Identify which machine produced this report (FQDN when available) and when.
hostname -f 2>/dev/null || hostname || true
date -Is 2>/dev/null || date || true
section "network"
# Interface + MTU snapshot: MTU mismatches are a common cause of flaky pulls.
iface="$(primary_iface)"
printf 'primary_iface=%s\n' "${iface:-unknown}"
if [ -n "${iface}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then
  printf 'primary_mtu=%s\n' "$(cat "/sys/class/net/${iface}/mtu")"
fi
ip -brief addr || true
ip route || true
ip route get 1.1.1.1 || true
section "dns"
# Only the resolver-relevant lines of resolv.conf.
if [ -r /etc/resolv.conf ]; then
  sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf
fi
section "remote addresses"
# Current address set for each probe endpoint, comma-joined on one line;
# getent also exercises the node's own resolver path.
for endpoint in "${endpoints[@]}"; do
  host="$(endpoint_host "${endpoint}")"
  printf '%s ' "${host}"
  if have getent; then
    getent ahosts "${host}" | awk '{print $1}' | sort -u | paste -sd ',' -
  else
    printf 'getent unavailable\n'
  fi
done
# Path-MTU discovery per endpoint (only when tracepath is installed).
if have tracepath; then
  section "path mtu"
  for endpoint in "${endpoints[@]}"; do
    host="$(endpoint_host "${endpoint}")"
    printf '\n-- tracepath %s --\n' "${host}"
    tracepath -n "${host}" || true
  done
fi
section "curl timings"
# Repeated HEAD requests per endpoint; -w breaks total time into connect /
# TLS / first-byte so slow phases can be attributed.
for endpoint in "${endpoints[@]}"; do
  printf '\n-- %s --\n' "${endpoint}"
  for attempt in $(seq 1 "${repeat_count}"); do
    printf 'attempt=%s ' "${attempt}"
    curl -fsSIL --connect-timeout "${curl_timeout}" --max-time "${curl_timeout}" \
      -o /dev/null \
      -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_starttransfer=%{time_starttransfer} time_total=%{time_total}\n' \
      "${endpoint}" || printf 'curl_failed rc=%s\n' "$?"
    sleep 1
  done
done
section "image pulls"
# Resolve the container runtime CLI, preferring k3s' embedded crictl.
if have k3s; then
  pull_tool=(k3s crictl pull)
elif have crictl; then
  pull_tool=(crictl pull)
elif have ctr; then
  pull_tool=(ctr -n k8s.io images pull)
else
  printf 'No k3s, crictl, or ctr found; skipping image pulls.\n'
  exit 0
fi
# Build the command incrementally instead of splicing in a possibly-empty
# sudo array: expanding an empty array errors under `set -u` on bash < 4.4.
# Prefix sudo only when passwordless sudo actually works.
pull_cmd=(timeout "${pull_timeout}s")
if have sudo && sudo -n true 2>/dev/null; then
  pull_cmd+=(sudo)
fi
pull_cmd+=("${pull_tool[@]}")
for image in "${images[@]}"; do
  printf '\n-- %s --\n' "${image}"
  for attempt in $(seq 1 "${repeat_count}"); do
    printf 'attempt=%s\n' "${attempt}"
    # Explicit if/else: `pull && printf ok || printf fail` would also report
    # failure if the success printf itself failed.
    if "${pull_cmd[@]}" "${image}"; then
      printf 'pull_ok\n'
    else
      printf 'pull_failed rc=%s\n' "$?"
    fi
    sleep 2
  done
done
+84
View File
@@ -0,0 +1,84 @@
#!/usr/bin/env bash
# Bootstrap per-registry pull-through caches on the Proxmox host using the
# Debian docker-registry package and a systemd template unit.
set -euo pipefail
# Where the caches listen, and where cached blobs live (one subdir per cache).
listen_ip="${REGISTRY_CACHE_LISTEN_IP:-10.27.27.239}"
storage_root="${REGISTRY_CACHE_STORAGE_ROOT:-/var/lib/docker-registry-cache}"
if [ "$(id -u)" -ne 0 ]; then
  echo "Run as root on the Proxmox host." >&2
  exit 1
fi
apt-get update
apt-get install -y docker-registry
# The stock single-instance service is replaced by per-registry template
# units below; `|| true` keeps reruns idempotent if it is already disabled.
systemctl disable --now docker-registry.service || true
mkdir -p /etc/docker/registry "${storage_root}"
# docker-registry user/group are created by the package install above.
chown docker-registry:docker-registry "${storage_root}"
# Template unit: one cache instance per upstream registry (%i = cache name,
# matching the cache-%i.yml configs written by write_config).
cat >/etc/systemd/system/docker-registry-cache@.service <<'UNIT'
[Unit]
Description=Docker registry pull-through cache for %i
After=network.target
[Service]
User=docker-registry
Group=docker-registry
ExecStart=/usr/bin/docker-registry serve /etc/docker/registry/cache-%i.yml
Restart=on-failure
RestartSec=5s
[Install]
WantedBy=multi-user.target
UNIT
# Render one pull-through-cache registry config and prepare its storage
# directory. Arguments: $1 cache name, $2 listen port, $3 upstream registry
# URL. Reads globals: storage_root, listen_ip.
write_config() {
  local cache_name="$1"
  local cache_port="$2"
  local upstream_url="$3"
  local cache_dir="${storage_root}/${cache_name}"
  mkdir -p "${cache_dir}"
  chown docker-registry:docker-registry "${cache_dir}"
  # Unquoted delimiter on purpose: the heredoc expands the locals above.
  cat >"/etc/docker/registry/cache-${cache_name}.yml" <<EOF
version: 0.1
log:
  fields:
    service: registry-cache-${cache_name}
storage:
  cache:
    blobdescriptor: inmemory
  filesystem:
    rootdirectory: ${cache_dir}
http:
  addr: ${listen_ip}:${cache_port}
  headers:
    X-Content-Type-Options: [nosniff]
proxy:
  remoteurl: ${upstream_url}
health:
  storagedriver:
    enabled: true
    interval: 10s
    threshold: 3
EOF
}
# name / port / upstream triplets must stay in sync with the
# k3s-registry-mirror Ansible role defaults (ports 5000-5004).
write_config dockerhub 5000 https://registry-1.docker.io
write_config ghcr 5001 https://ghcr.io
write_config quay 5002 https://quay.io
write_config k8s 5003 https://registry.k8s.io
write_config external-secrets 5004 https://oci.external-secrets.io
systemctl daemon-reload
for name in dockerhub ghcr quay k8s external-secrets; do
  systemctl enable --now "docker-registry-cache@${name}.service"
done
# Final status dump; under `set -e` a failed/inactive unit makes the script
# exit non-zero, which is the desired health signal for callers.
systemctl --no-pager --full status \
  docker-registry-cache@dockerhub.service \
  docker-registry-cache@ghcr.service \
  docker-registry-cache@quay.service \
  docker-registry-cache@k8s.service \
  docker-registry-cache@external-secrets.service