From 9c0523e880677b622ed1c13ab15b2a9872758f11 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Wed, 22 Apr 2026 11:00:54 +0000
Subject: [PATCH] fix: pre-pull Rancher images and reset Rancher release during bootstrap

Rancher installs were stalling on transient Docker Hub TLS handshake
timeouts for rancher shell, webhook, and system-upgrade-controller images.
Pre-pull the required images onto all nodes after k3s comes up, extend
the Rancher HelmRelease timeout, and reset/force the Rancher HelmRelease
before waiting on addon-rancher so bootstrap can recover from stale failed
remediation state.
---
Review amendments (this section is ignored by `git am`):

* ansible/roles/rancher-image-prepull/tasks/main.yml: list the images already
  in containerd first and pull only the missing ones, so playbook re-runs do
  not go back to Docker Hub or report a change on every run. Commands use
  `argv` so the templated image reference is passed as a single argument.
* .gitea/workflows/deploy.yml: a failed annotate is still non-fatal but now
  prints a warning instead of being silently discarded.
* Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch; the leading whitespace of context lines is inferred and
  should be checked against the target files. The `index` blob ids are the
  ones from the original commit.

 .gitea/workflows/deploy.yml                           | 15 +++++++++++++--
 ansible/roles/rancher-image-prepull/defaults/main.yml |  6 ++++++
 ansible/roles/rancher-image-prepull/tasks/main.yml    | 34 ++++++++++++++++++++++++++++++++++
 ansible/site.yml                                      |  7 +++++++
 .../addons/rancher/helmrelease-rancher.yaml           |  1 +
 5 files changed, 61 insertions(+), 2 deletions(-)
 create mode 100644 ansible/roles/rancher-image-prepull/defaults/main.yml
 create mode 100644 ansible/roles/rancher-image-prepull/tasks/main.yml

diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml
index 66ed8ff..653081e 100644
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -259,9 +259,20 @@ jobs:
           KUBECONFIG: outputs/kubeconfig
         run: |
           set -euo pipefail
+          # Ask Flux to reconcile the Rancher HelmRelease now, resetting its
+          # failure counters and forcing an upgrade. Flux only honors resetAt
+          # and forceAt when they match requestedAt, hence the shared $TS.
+          TS=$(date --iso-8601=seconds)
+          kubectl -n flux-system annotate helmrelease/rancher \
+            reconcile.fluxcd.io/requestedAt="$TS" \
+            reconcile.fluxcd.io/resetAt="$TS" \
+            reconcile.fluxcd.io/forceAt="$TS" \
+            --overwrite \
+            || echo "WARNING: could not annotate helmrelease/rancher; continuing without reset" >&2
+
           echo "Waiting for Rancher..."
-          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=600s
-          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher -n flux-system --timeout=300s
+          kubectl -n flux-system wait --for=condition=Ready helmrelease/rancher --timeout=900s
+          kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s
 
           echo "Waiting for rancher-backup operator..."
           kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=600s || true
diff --git a/ansible/roles/rancher-image-prepull/defaults/main.yml b/ansible/roles/rancher-image-prepull/defaults/main.yml
new file mode 100644
index 0000000..96129ad
--- /dev/null
+++ b/ansible/roles/rancher-image-prepull/defaults/main.yml
@@ -0,0 +1,6 @@
+---
+rancher_images_to_prepull:
+  - docker.io/rancher/rancher:v2.13.3
+  - docker.io/rancher/rancher-webhook:v0.9.3
+  - docker.io/rancher/system-upgrade-controller:v0.17.0
+  - docker.io/rancher/shell:v0.6.2
diff --git a/ansible/roles/rancher-image-prepull/tasks/main.yml b/ansible/roles/rancher-image-prepull/tasks/main.yml
new file mode 100644
index 0000000..7ee4d25
--- /dev/null
+++ b/ansible/roles/rancher-image-prepull/tasks/main.yml
@@ -0,0 +1,34 @@
+---
+# Read-only inventory of the node's containerd image store; it also runs in
+# check mode so the pull task below always has stdout_lines to test against.
+- name: List images already present in containerd
+  command:
+    argv:
+      - /usr/local/bin/ctr
+      - "-n"
+      - k8s.io
+      - images
+      - ls
+      - "-q"
+  register: rancher_images_present
+  changed_when: false
+  check_mode: false
+
+# Only images missing from the store are pulled, so re-runs neither contact
+# the registry again nor report a spurious change.
+- name: Pre-pull Rancher images into containerd
+  command:
+    argv:
+      - /usr/local/bin/ctr
+      - "-n"
+      - k8s.io
+      - images
+      - pull
+      - "{{ item }}"
+  register: rancher_image_pull
+  retries: 5
+  delay: 15
+  until: rancher_image_pull.rc == 0
+  loop: "{{ rancher_images_to_prepull }}"
+  when: item not in rancher_images_present.stdout_lines
+  changed_when: true
diff --git a/ansible/site.yml b/ansible/site.yml
index 58e808b..36ab096 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -93,6 +93,13 @@
   roles:
     - k3s-agent
 
+- name: Pre-pull Rancher bootstrap images
+  hosts: cluster
+  become: true
+
+  roles:
+    - rancher-image-prepull
+
 - name: Deploy observability stack
   hosts: control_plane[0]
   become: true
diff --git a/infrastructure/addons/rancher/helmrelease-rancher.yaml b/infrastructure/addons/rancher/helmrelease-rancher.yaml
index 4fbb59f..dd7b243 100644
--- a/infrastructure/addons/rancher/helmrelease-rancher.yaml
+++ b/infrastructure/addons/rancher/helmrelease-rancher.yaml
@@ -5,6 +5,7 @@ metadata:
   namespace: flux-system
 spec:
   interval: 10m
+  timeout: 15m
   targetNamespace: cattle-system
   chart:
     spec: