From 17182f84a9d3b51ec4544dbc4dcbf1396ad9e522 Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 2 May 2026 00:41:25 +0000 Subject: [PATCH] fix: remove runner image archive path --- .gitea/workflows/deploy.yml | 135 ------------------ AGENTS.md | 2 +- README.md | 4 +- .../bootstrap-image-prepull/tasks/main.yml | 40 +----- ansible/roles/kube-vip-deploy/tasks/main.yml | 19 --- .../rancher-image-prepull/tasks/main.yml | 40 +----- ansible/site.yml | 37 ----- 7 files changed, 7 insertions(+), 270 deletions(-) diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index fffc0f7..5712b0b 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -176,80 +176,10 @@ jobs: - name: Install Ansible Collections run: ansible-galaxy collection install -r ansible/requirements.yml - - name: Install skopeo - run: | - apt-get update - apt-get install -y skopeo - - name: Generate Ansible Inventory working-directory: ansible run: python3 generate_inventory.py - - name: Prepare kube-vip image archive - run: | - set -euo pipefail - mkdir -p outputs - for attempt in 1 2 3; do - if skopeo copy \ - docker://ghcr.io/kube-vip/kube-vip:v1.1.2 \ - docker-archive:outputs/kube-vip-bootstrap.tar:ghcr.io/kube-vip/kube-vip:v1.1.2; then - exit 0 - fi - sleep 10 - done - echo "Failed to prepare kube-vip image archive on runner" >&2 - exit 1 - - - name: Prepare bootstrap image archives - run: | - set -euo pipefail - archive_name() { - printf '%s' "$1" | tr '/:' '__' - } - - prepare_image_archive() { - local image="$1" - local archive="outputs/bootstrap-image-archives/$(archive_name "${image}").tar" - - mkdir -p outputs/bootstrap-image-archives - for attempt in 1 2 3; do - if skopeo copy "docker://${image}" "docker-archive:${archive}:${image}"; then - return 0 - fi - sleep 10 - done - - echo "Failed to prepare bootstrap image archive for ${image}" >&2 - return 1 - } - - for image in \ - ghcr.io/fluxcd/source-controller:v1.8.0 \ - ghcr.io/fluxcd/kustomize-controller:v1.8.1 \ - ghcr.io/fluxcd/helm-controller:v1.5.1 \ - ghcr.io/fluxcd/notification-controller:v1.8.1 \ - docker.io/rancher/mirrored-coredns-coredns:1.14.2 \ - docker.io/rancher/mirrored-metrics-server:v0.8.1 \ - docker.io/rancher/local-path-provisioner:v0.0.35 \ - docker.io/rancher/mirrored-library-traefik:3.6.10 \ - docker.io/rancher/klipper-helm:v0.9.14-build20260309 \ - oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 \ - ghcr.io/tailscale/k8s-operator:v1.96.5 \ - ghcr.io/tailscale/tailscale:v1.96.5 \ - registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 \ - docker.io/rancher/mirrored-pause:3.6 \ - docker.io/rancher/rancher:v2.13.3 \ - docker.io/rancher/rancher-webhook:v0.9.3 \ - docker.io/rancher/system-upgrade-controller:v0.17.0 \ - docker.io/rancher/shell:v0.6.2 \ - quay.io/jetstack/cert-manager-controller:v1.17.2 \ - quay.io/jetstack/cert-manager-cainjector:v1.17.2 \ - quay.io/jetstack/cert-manager-webhook:v1.17.2 \ - quay.io/jetstack/cert-manager-startupapicheck:v1.17.2 \ - docker.io/library/busybox:1.31.1; do - prepare_image_archive "${image}" - done - - name: Run Ansible Playbook working-directory: ansible run: | @@ -389,49 +319,6 @@ jobs: fi } - import_required_image() { - local image="$1" - local host_ip="$2" - local archive_name - local archive_path - archive_name="$(printf '%s' "${image}" | tr '/:' '__').tar" - archive_path="outputs/bootstrap-image-archives/${archive_name}" - - if [ ! 
-s "${archive_path}" ]; then - echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2 - exit 1 - fi - - if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ - "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then - return 0 - fi - - echo "Importing ${image} archive on ${host_ip}" - timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \ - "${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}" - timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \ - "set -euo pipefail; \ - if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \ - for attempt in 1 2 3 4 5; do \ - echo 'Importing ${image} archive with ctr'; \ - if sudo k3s ctr -n k8s.io images import '/tmp/${archive_name}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \ - sleep 10; \ - done; \ - sudo systemctl status k3s --no-pager -l || true; \ - sudo journalctl -u k3s -n 80 --no-pager || true; \ - exit 1" - } - - import_required_image_on_all_nodes() { - local image="$1" - local host_ip - - for host_ip in ${ALL_NODE_IPS}; do - import_required_image "${image}" "${host_ip}" - done - } - eso_diagnostics() { kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true kubectl -n flux-system describe kustomization addon-external-secrets || true @@ -558,23 +445,6 @@ jobs: --from-file=identity="$HOME/.ssh/id_ed25519" \ --from-file=known_hosts=/tmp/flux_known_hosts \ --dry-run=client -o yaml | kubectl apply -f - - PRIMARY_CP_IP=$(python3 -c 'import json; print(json.load(open("outputs/terraform_outputs.json"))["primary_control_plane_ip"]["value"])') - ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))') - for image in \ - ghcr.io/fluxcd/source-controller:v1.8.0 \ - ghcr.io/fluxcd/kustomize-controller:v1.8.1 \ - ghcr.io/fluxcd/helm-controller:v1.5.1 \ - ghcr.io/fluxcd/notification-controller:v1.8.1; do - import_required_image "${image}" "${PRIMARY_CP_IP}" - done - for image in \ - docker.io/rancher/mirrored-pause:3.6 \ - quay.io/jetstack/cert-manager-controller:v1.17.2 \ - quay.io/jetstack/cert-manager-cainjector:v1.17.2 \ - quay.io/jetstack/cert-manager-webhook:v1.17.2 \ - quay.io/jetstack/cert-manager-startupapicheck:v1.17.2; do - import_required_image_on_all_nodes "${image}" - done # Apply CRDs and controllers first kubectl apply -f clusters/prod/flux-system/gotk-components.yaml # Wait for CRDs to be established @@ -600,7 +470,6 @@ jobs: # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details. 
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600 reconcile_flux_resource flux-system kustomization/addon-external-secrets 900 - import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}" wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600 wait_for_resource "" crd/clustersecretstores.external-secrets.io 900 wait_for_resource "" crd/externalsecrets.external-secrets.io 900 @@ -615,8 +484,6 @@ jobs: reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 600 kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s # Wait for the storage layer and private access components - import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}" - import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}" reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 900 if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s; then kubectl -n flux-system describe kustomization/addon-tailscale-operator || true @@ -627,14 +494,12 @@ jobs: wait_for_helmrelease_ready tailscale-operator tailscale-system 900 kubectl wait --for=condition=Established crd/proxyclasses.tailscale.com --timeout=600s kubectl -n tailscale-system rollout status deployment/operator --timeout=600s - import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}" reconcile_flux_resource flux-system kustomization/addon-nfs-storage 600 kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite kubectl get storageclass flash-nfs - import_required_image docker.io/library/busybox:1.31.1 "${PRIMARY_CP_IP}" kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true kubectl apply -f - <<'EOF' apiVersion: v1 diff --git a/AGENTS.md b/AGENTS.md index 4105c07..56aee71 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -31,7 +31,7 @@ Compact repo guidance for OpenCode sessions. Trust executable sources over docs - Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate. - Keep `set -euo pipefail` in workflow shell blocks. - Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs. -- Fresh VMs have unreliable registry/chart egress, so critical images are prepared by `skopeo` on the runner and imported with `k3s ctr`; update the workflow archive lists when adding bootstrap-time images. +- Fresh VMs pull bootstrap images directly through containerd/K3s. Do not add runner-side `skopeo` archive/import paths; registry/network failures should surface directly in deploy logs. - CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap. 
## GitOps Addons diff --git a/README.md b/README.md index 8fd324d..99daf2f 100644 --- a/README.md +++ b/README.md @@ -148,8 +148,8 @@ Deploy sequence on push to `main`: 1. Terraform fmt/init/validate/plan/apply. 2. Cleanup/retry around known transient Proxmox clone and disk-update failures. 3. Generate Ansible inventory from Terraform outputs. -4. Prepare critical image archives with `skopeo` on the runner. -5. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig. +4. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig. +5. Pull bootstrap images directly through containerd/K3s on the target nodes. 6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph. 7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability. 8. Run post-deploy health checks and Tailscale service smoke checks. diff --git a/ansible/roles/bootstrap-image-prepull/tasks/main.yml b/ansible/roles/bootstrap-image-prepull/tasks/main.yml index ae2806c..72c2f3c 100644 --- a/ansible/roles/bootstrap-image-prepull/tasks/main.yml +++ b/ansible/roles/bootstrap-image-prepull/tasks/main.yml @@ -1,47 +1,11 @@ --- -- name: Check for runner-provided bootstrap image archives - stat: - path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar" - delegate_to: localhost - become: false - register: bootstrap_image_archive_stats - loop: "{{ bootstrap_prepull_images }}" - -- name: Ensure remote bootstrap image archive directory exists - file: - path: /tmp/bootstrap-image-archives - state: directory - mode: "0755" - -- name: Copy runner-provided bootstrap image archives - copy: - src: "{{ item.stat.path }}" - dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar" - mode: "0644" - loop: "{{ bootstrap_image_archive_stats.results }}" - loop_control: - label: "{{ item.item }}" - when: item.stat.exists - -- name: Import or pull bootstrap images into containerd +- name: Pull bootstrap images into containerd shell: | if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then echo "already present" exit 0 fi - archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar" - if [ -s "${archive}" ]; then - for attempt in 1 2 3; do - if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then - echo "imported image" - exit 0 - fi - - sleep 10 - done - fi - for attempt in 1 2 3 4 5; do if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then echo "pulled image" @@ -56,4 +20,4 @@ executable: /bin/bash register: bootstrap_image_pull loop: "{{ bootstrap_prepull_images }}" - changed_when: "'imported image' in bootstrap_image_pull.stdout or 'pulled image' in bootstrap_image_pull.stdout" + changed_when: "'pulled image' in bootstrap_image_pull.stdout" diff --git a/ansible/roles/kube-vip-deploy/tasks/main.yml b/ansible/roles/kube-vip-deploy/tasks/main.yml index 5c00bfe..f792f74 100644 --- a/ansible/roles/kube-vip-deploy/tasks/main.yml +++ b/ansible/roles/kube-vip-deploy/tasks/main.yml @@ -1,23 +1,4 @@ --- -- name: Check for runner-provided kube-vip image archive - stat: - path: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar" - delegate_to: localhost - become: false - register: kube_vip_bootstrap_archive - -- name: Copy runner-provided kube-vip image archive - copy: - src: "{{ playbook_dir 
}}/../outputs/kube-vip-bootstrap.tar" - dest: /tmp/kube-vip-bootstrap.tar - mode: "0644" - when: kube_vip_bootstrap_archive.stat.exists - -- name: Import runner-provided kube-vip image archive - command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar - changed_when: false - when: kube_vip_bootstrap_archive.stat.exists - - name: Pre-pull kube-vip bootstrap images into containerd shell: | if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then diff --git a/ansible/roles/rancher-image-prepull/tasks/main.yml b/ansible/roles/rancher-image-prepull/tasks/main.yml index 4148161..188963d 100644 --- a/ansible/roles/rancher-image-prepull/tasks/main.yml +++ b/ansible/roles/rancher-image-prepull/tasks/main.yml @@ -1,47 +1,11 @@ --- -- name: Check for runner-provided Rancher image archives - stat: - path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar" - delegate_to: localhost - become: false - register: rancher_image_archive_stats - loop: "{{ rancher_images_to_prepull }}" - -- name: Ensure remote Rancher image archive directory exists - file: - path: /tmp/bootstrap-image-archives - state: directory - mode: "0755" - -- name: Copy runner-provided Rancher image archives - copy: - src: "{{ item.stat.path }}" - dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar" - mode: "0644" - loop: "{{ rancher_image_archive_stats.results }}" - loop_control: - label: "{{ item.item }}" - when: item.stat.exists - -- name: Import or pull Rancher images into containerd +- name: Pull Rancher images into containerd shell: | if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then echo "already present" exit 0 fi - archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar" - if [ -s "${archive}" ]; then - for attempt in 1 2 3; do - if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then - echo "imported image" - exit 0 - fi - - sleep 10 - done - fi - for attempt in 1 2 3 4 5; do if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then echo "pulled image" @@ -56,4 +20,4 @@ executable: /bin/bash register: rancher_image_pull loop: "{{ rancher_images_to_prepull }}" - changed_when: "'imported image' in rancher_image_pull.stdout or 'pulled image' in rancher_image_pull.stdout" + changed_when: "'pulled image' in rancher_image_pull.stdout" diff --git a/ansible/site.yml b/ansible/site.yml index 38bb454..bb416f8 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -104,43 +104,6 @@ roles: - k3s-server -- name: Export kube-vip image from primary control plane - hosts: control_plane[0] - become: true - - tasks: - - name: Export kube-vip image for secondary control planes - command: >- - /usr/local/bin/ctr -n k8s.io images export - /tmp/kube-vip-bootstrap.tar - ghcr.io/kube-vip/kube-vip:v1.1.2 - changed_when: false - - - name: Fetch kube-vip image archive - fetch: - src: /tmp/kube-vip-bootstrap.tar - dest: ../outputs/kube-vip-bootstrap.tar - flat: true - -- name: Seed kube-vip image on secondary control planes - hosts: control_plane[1:] - become: true - - tasks: - - name: Copy kube-vip image archive - copy: - src: ../outputs/kube-vip-bootstrap.tar - dest: /tmp/kube-vip-bootstrap.tar - mode: "0644" - - - name: Import kube-vip image into containerd - command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar - 
register: kube_vip_secondary_import - until: kube_vip_secondary_import.rc == 0 - retries: 3 - delay: 10 - changed_when: false - - name: Wait for all control plane nodes to be Ready hosts: control_plane[0] become: true
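
For reference, the direct-pull path now shared by `bootstrap-image-prepull`, `rancher-image-prepull`, and `kube-vip-deploy` reduces to the same retry loop around `ctr -n k8s.io images pull` that the retained context lines above show. A minimal standalone sketch of that logic, assuming the k3s-bundled `ctr` at `/usr/local/bin/ctr` as in the roles (the trailing image argument is illustrative, not an addition to any role's image list):

    #!/usr/bin/env bash
    # Sketch of the node-side direct-pull loop kept by the prepull roles.
    set -euo pipefail

    CTR=/usr/local/bin/ctr  # path used throughout the roles above

    pull_bootstrap_image() {
      local image="$1"

      # Skip images already present in containerd's k8s.io namespace.
      if "${CTR}" -n k8s.io images ls -q | grep -Fx -- "${image}" >/dev/null; then
        echo "already present: ${image}"
        return 0
      fi

      # Pull straight from the registry with bounded retries; a persistent
      # registry/network failure surfaces here in the deploy logs instead of
      # being masked by a runner-side archive import.
      for attempt in 1 2 3 4 5; do
        if timeout 180s "${CTR}" -n k8s.io images pull "${image}"; then
          echo "pulled: ${image}"
          return 0
        fi
        sleep 10
      done

      echo "failed to pull ${image}" >&2
      return 1
    }

    pull_bootstrap_image "ghcr.io/kube-vip/kube-vip:v1.1.2"

Under this scheme the `changed_when` conditions in the roles only need to match "pulled image", since the "imported image" branch no longer exists, which is why both role diffs above trim that expression.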