From 1896108cbb36a2a05bd8378537d469c90db71406 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sun, 3 May 2026 00:02:33 +0000 Subject: [PATCH] fix: add local registry cache for rebuilds --- .gitea/workflows/deploy.yml | 59 +++++++-- README.md | 36 ++++++ ansible/roles/k3s-agent/tasks/main.yml | 4 + .../k3s-registry-mirror/defaults/main.yml | 14 +++ .../roles/k3s-registry-mirror/tasks/main.yml | 20 +++ .../templates/registries.yaml.j2 | 6 + ansible/roles/k3s-server/tasks/main.yml | 4 + scripts/network-stabilization-probe.sh | 118 ++++++++++++++++++ scripts/setup-proxmox-registry-cache.sh | 84 +++++++++++++ 9 files changed, 334 insertions(+), 11 deletions(-) create mode 100644 ansible/roles/k3s-registry-mirror/defaults/main.yml create mode 100644 ansible/roles/k3s-registry-mirror/tasks/main.yml create mode 100644 ansible/roles/k3s-registry-mirror/templates/registries.yaml.j2 create mode 100755 scripts/network-stabilization-probe.sh create mode 100755 scripts/setup-proxmox-registry-cache.sh diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 276897d..8f66a2a 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -302,19 +302,40 @@ jobs: local sleep_seconds="$4" local failure_message="$5" local pulled=false + local last_output="" for attempt in $(seq 1 "${attempts}"); do echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})" - if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ - "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then + if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ + "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then pulled=true break fi + printf '%s\n' 
"${last_output}" >&2 sleep "${sleep_seconds}" done if [ "${pulled}" != "true" ]; then echo "${failure_message} ${image} on ${host_ip}" >&2 + echo "Last pull output:" >&2 + printf '%s\n' "${last_output}" >&2 + ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" 'bash -s' <<'EOF' || true + set -u + echo "== node network diagnostics ==" + iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')" + echo "primary_iface=${iface:-unknown}" + if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then + echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")" + fi + ip -brief addr || true + ip route || true + ip route get 1.1.1.1 || true + sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true + for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do + echo "-- ${endpoint} --" + curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true + done + EOF exit 1 fi } @@ -698,20 +719,41 @@ jobs: for node in ${nodes}; do local node_ip local pulled=false + local last_output="" node_ip="$(kubectl get node "${node}" -o jsonpath='{.status.addresses[?(@.type=="InternalIP")].address}')" for attempt in $(seq 1 "${attempts}"); do echo "Pre-pulling ${image} on ${node}/${node_ip} (${attempt}/${attempts})" - if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" \ - "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then + if last_output="$(ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o 
ConnectTimeout=10 "ubuntu@${node_ip}" \ + "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)" 2>&1)"; then pulled=true break fi + printf '%s\n' "${last_output}" >&2 sleep "${sleep_seconds}" done if [ "${pulled}" != "true" ]; then echo "Best-effort targeted image pre-pull did not complete for ${image} on ${node}/${node_ip}" >&2 + echo "Last pull output:" >&2 + printf '%s\n' "${last_output}" >&2 + ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${node_ip}" 'bash -s' <<'EOF' || true + set -u + echo "== node network diagnostics ==" + iface="$(ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}')" + echo "primary_iface=${iface:-unknown}" + if [ -n "${iface:-}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then + echo "primary_mtu=$(cat "/sys/class/net/${iface}/mtu")" + fi + ip -brief addr || true + ip route || true + ip route get 1.1.1.1 || true + sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf 2>/dev/null || true + for endpoint in https://ghcr.io/v2/ https://auth.docker.io/token https://registry-1.docker.io/v2/ https://quay.io/v2/ https://registry.k8s.io/v2/ https://api.doppler.com/v3/projects; do + echo "-- ${endpoint} --" + curl -fsSIL --connect-timeout 15 --max-time 20 -o /dev/null -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_total=%{time_total}\n' "${endpoint}" || true + done + EOF fi done } @@ -727,10 +769,7 @@ jobs: kubectl -n flux-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true kubectl -n cattle-system annotate externalsecret/rancher-bootstrap-password external-secrets.io/force-sync="${force_sync}" --overwrite || true - if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \ - 
&& kubectl -n flux-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \ - && kubectl -n cattle-system wait --for=condition=Ready externalsecret/rancher-bootstrap-password --timeout=30s \ - && kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \ + if kubectl -n flux-system get secret/rancher-bootstrap-password >/dev/null 2>&1 \ && kubectl -n cattle-system get secret/rancher-bootstrap-password >/dev/null 2>&1; then return 0 fi @@ -866,9 +905,7 @@ jobs: local elapsed=0 while [ "${elapsed}" -lt "${timeout_seconds}" ]; do - if kubectl wait --for=condition=Ready clustersecretstore/doppler-hetznerterra --timeout=30s \ - && kubectl -n observability wait --for=condition=Ready externalsecret/grafana-admin --timeout=30s \ - && kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then + if kubectl -n observability get secret/grafana-admin-credentials >/dev/null 2>&1; then return 0 fi diff --git a/README.md b/README.md index 99daf2f..88e3cf8 100644 --- a/README.md +++ b/README.md @@ -273,6 +273,42 @@ kubectl -n observability describe svc prometheus-tailscale | grep TailscaleProxy If local `kubectl` falls back to `localhost:8080`, refresh `outputs/kubeconfig` with `scripts/refresh-kubeconfig.sh 10.27.27.30`. +## Network Stabilization Probes + +Run the same probe from the Proxmox host, `cp1`, and one worker when registry pulls or Doppler calls flap: + +```bash +scripts/network-stabilization-probe.sh +``` + +From the generated Ansible inventory: + +```bash +cd ansible +ansible -i inventory.ini 'control_plane[0]' -m script -a '../scripts/network-stabilization-probe.sh' +ansible -i inventory.ini 'workers[0]' -m script -a '../scripts/network-stabilization-probe.sh' +``` + +Use `NETWORK_PROBE_REPEAT_COUNT`, `NETWORK_PROBE_CURL_TIMEOUT`, and `NETWORK_PROBE_PULL_TIMEOUT` to tune probe duration. 
+ +## Registry Cache + +K3s nodes are configured by Ansible to use the Proxmox host as a local pull-through cache for common upstream registries. The cache listens on `10.27.27.239`: + +```text +docker.io -> http://10.27.27.239:5000 +ghcr.io -> http://10.27.27.239:5001 +quay.io -> http://10.27.27.239:5002 +registry.k8s.io -> http://10.27.27.239:5003 +oci.external-secrets.io -> http://10.27.27.239:5004 +``` + +Bootstrap or repair the cache on Proxmox with: + +```bash +ssh -i ~/.ssh/infra root@10.27.27.239 'bash -s' < scripts/setup-proxmox-registry-cache.sh +``` + ## Security Notes - Never commit `terraform.tfvars`, kubeconfigs, private keys, `outputs/`, or real secret values. diff --git a/ansible/roles/k3s-agent/tasks/main.yml b/ansible/roles/k3s-agent/tasks/main.yml index 8ce510e..21246e6 100644 --- a/ansible/roles/k3s-agent/tasks/main.yml +++ b/ansible/roles/k3s-agent/tasks/main.yml @@ -27,6 +27,10 @@ or (k3s_version != 'latest' and k3s_version not in (installed_k3s_version.stdout | default(''))) }} +- name: Configure k3s registry mirrors + import_role: + name: k3s-registry-mirror + - name: Download k3s install script get_url: url: https://get.k3s.io diff --git a/ansible/roles/k3s-registry-mirror/defaults/main.yml b/ansible/roles/k3s-registry-mirror/defaults/main.yml new file mode 100644 index 0000000..3b6e516 --- /dev/null +++ b/ansible/roles/k3s-registry-mirror/defaults/main.yml @@ -0,0 +1,14 @@ +--- +k3s_registry_mirror_enabled: true +k3s_registry_mirror_host: 10.27.27.239 +k3s_registry_mirrors: + docker.io: + port: 5000 + ghcr.io: + port: 5001 + quay.io: + port: 5002 + registry.k8s.io: + port: 5003 + oci.external-secrets.io: + port: 5004 diff --git a/ansible/roles/k3s-registry-mirror/tasks/main.yml b/ansible/roles/k3s-registry-mirror/tasks/main.yml new file mode 100644 index 0000000..ae67ae4 --- /dev/null +++ b/ansible/roles/k3s-registry-mirror/tasks/main.yml @@ -0,0 +1,20 @@ +--- +- name: Ensure k3s config directory exists + file: + path: /etc/rancher/k3s + 
state: directory + mode: "0755" + when: k3s_registry_mirror_enabled | bool + +- name: Configure k3s registry mirrors + template: + src: registries.yaml.j2 + dest: /etc/rancher/k3s/registries.yaml + mode: "0644" + when: k3s_registry_mirror_enabled | bool + +- name: Remove k3s registry mirror config when disabled + file: + path: /etc/rancher/k3s/registries.yaml + state: absent + when: not (k3s_registry_mirror_enabled | bool) diff --git a/ansible/roles/k3s-registry-mirror/templates/registries.yaml.j2 b/ansible/roles/k3s-registry-mirror/templates/registries.yaml.j2 new file mode 100644 index 0000000..940c5d4 --- /dev/null +++ b/ansible/roles/k3s-registry-mirror/templates/registries.yaml.j2 @@ -0,0 +1,6 @@ +mirrors: +{% for registry, mirror in k3s_registry_mirrors.items() %} + "{{ registry }}": + endpoint: + - "http://{{ k3s_registry_mirror_host }}:{{ mirror.port }}" +{% endfor %} diff --git a/ansible/roles/k3s-server/tasks/main.yml b/ansible/roles/k3s-server/tasks/main.yml index 89a36c3..fe837e0 100644 --- a/ansible/roles/k3s-server/tasks/main.yml +++ b/ansible/roles/k3s-server/tasks/main.yml @@ -57,6 +57,10 @@ - /var/lib/rancher/k3s when: k3s_install_needed +- name: Configure k3s registry mirrors + import_role: + name: k3s-registry-mirror + - name: Download k3s install script get_url: url: https://get.k3s.io diff --git a/scripts/network-stabilization-probe.sh b/scripts/network-stabilization-probe.sh new file mode 100755 index 0000000..9a8b740 --- /dev/null +++ b/scripts/network-stabilization-probe.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +set -u + +repeat_count="${NETWORK_PROBE_REPEAT_COUNT:-5}" +curl_timeout="${NETWORK_PROBE_CURL_TIMEOUT:-20}" +pull_timeout="${NETWORK_PROBE_PULL_TIMEOUT:-240}" + +endpoints=( + "https://ghcr.io/v2/" + "https://auth.docker.io/token" + "https://registry-1.docker.io/v2/" + "https://quay.io/v2/" + "https://registry.k8s.io/v2/" + "https://api.doppler.com/v3/projects" +) + +images=( + "ghcr.io/fluxcd/helm-controller:v1.5.1" + 
"oci.external-secrets.io/external-secrets/external-secrets:v2.1.0" + "docker.io/rancher/mirrored-library-busybox:1.37.0" + "ghcr.io/tailscale/tailscale:v1.96.5" + "quay.io/prometheus/node-exporter:v1.8.2" +) + +have() { + command -v "$1" >/dev/null 2>&1 +} + +section() { + printf '\n== %s ==\n' "$1" +} + +primary_iface() { + ip route get 1.1.1.1 2>/dev/null | awk '{for (i = 1; i <= NF; i++) if ($i == "dev") {print $(i + 1); exit}}' +} + +endpoint_host() { + printf '%s\n' "$1" | sed -E 's#^https?://([^/:]+).*#\1#' +} + +section "host" +hostname -f 2>/dev/null || hostname || true +date -Is 2>/dev/null || date || true + +section "network" +iface="$(primary_iface)" +printf 'primary_iface=%s\n' "${iface:-unknown}" +if [ -n "${iface}" ] && [ -r "/sys/class/net/${iface}/mtu" ]; then + printf 'primary_mtu=%s\n' "$(cat "/sys/class/net/${iface}/mtu")" +fi +ip -brief addr || true +ip route || true +ip route get 1.1.1.1 || true + +section "dns" +if [ -r /etc/resolv.conf ]; then + sed -n '/^nameserver/p;/^search/p;/^options/p' /etc/resolv.conf +fi + +section "remote addresses" +for endpoint in "${endpoints[@]}"; do + host="$(endpoint_host "${endpoint}")" + printf '%s ' "${host}" + if have getent; then + getent ahosts "${host}" | awk '{print $1}' | sort -u | paste -sd ',' - + else + printf 'getent unavailable\n' + fi +done + +if have tracepath; then + section "path mtu" + for endpoint in "${endpoints[@]}"; do + host="$(endpoint_host "${endpoint}")" + printf '\n-- tracepath %s --\n' "${host}" + tracepath -n "${host}" || true + done +fi + +section "curl timings" +for endpoint in "${endpoints[@]}"; do + printf '\n-- %s --\n' "${endpoint}" + for attempt in $(seq 1 "${repeat_count}"); do + printf 'attempt=%s ' "${attempt}" + curl -sSIL --connect-timeout "${curl_timeout}" --max-time "${curl_timeout}" \ + -o /dev/null \ + -w 'http_code=%{http_code} remote_ip=%{remote_ip} time_connect=%{time_connect} time_appconnect=%{time_appconnect} time_starttransfer=%{time_starttransfer} 
time_total=%{time_total}\n' \ + "${endpoint}" || printf 'curl_failed rc=%s\n' "$?" + sleep 1 + done +done + +section "image pulls" +if have sudo && sudo -n true 2>/dev/null; then + sudo_cmd=(sudo) +else + sudo_cmd=() +fi + +if have k3s; then + pull_cmd=(timeout "${pull_timeout}s" ${sudo_cmd[@]+"${sudo_cmd[@]}"} k3s crictl pull) +elif have crictl; then + pull_cmd=(timeout "${pull_timeout}s" ${sudo_cmd[@]+"${sudo_cmd[@]}"} crictl pull) +elif have ctr; then + pull_cmd=(timeout "${pull_timeout}s" ${sudo_cmd[@]+"${sudo_cmd[@]}"} ctr -n k8s.io images pull) +else + printf 'No k3s, crictl, or ctr found; skipping image pulls.\n' + exit 0 +fi + +for image in "${images[@]}"; do + printf '\n-- %s --\n' "${image}" + for attempt in $(seq 1 "${repeat_count}"); do + printf 'attempt=%s\n' "${attempt}" + if "${pull_cmd[@]}" "${image}"; then printf 'pull_ok\n'; else printf 'pull_failed rc=%s\n' "$?"; fi + sleep 2 + done +done diff --git a/scripts/setup-proxmox-registry-cache.sh b/scripts/setup-proxmox-registry-cache.sh new file mode 100755 index 0000000..e66793c --- /dev/null +++ b/scripts/setup-proxmox-registry-cache.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +listen_ip="${REGISTRY_CACHE_LISTEN_IP:-10.27.27.239}" +storage_root="${REGISTRY_CACHE_STORAGE_ROOT:-/var/lib/docker-registry-cache}" + +if [ "$(id -u)" -ne 0 ]; then + echo "Run as root on the Proxmox host." 
>&2 + exit 1 +fi + +apt-get update +apt-get install -y docker-registry + +systemctl disable --now docker-registry.service || true + +mkdir -p /etc/docker/registry "${storage_root}" +chown docker-registry:docker-registry "${storage_root}" + +cat >/etc/systemd/system/docker-registry-cache@.service <<'UNIT' +[Unit] +Description=Docker registry pull-through cache for %i +After=network.target + +[Service] +User=docker-registry +Group=docker-registry +ExecStart=/usr/bin/docker-registry serve /etc/docker/registry/cache-%i.yml +Restart=on-failure +RestartSec=5s + +[Install] +WantedBy=multi-user.target +UNIT + +write_config() { + local name="$1" + local port="$2" + local remote="$3" + local dir="${storage_root}/${name}" + + mkdir -p "${dir}" + chown docker-registry:docker-registry "${dir}" + cat >"/etc/docker/registry/cache-${name}.yml" <