fix: remove runner image archive path
Deploy Cluster / Terraform (push) Successful in 4m16s
Deploy Cluster / Ansible (push) Failing after 13m57s

This commit is contained in:
2026-05-02 00:41:25 +00:00
parent df3d49c0d4
commit 17182f84a9
7 changed files with 7 additions and 270 deletions
-135
View File
@@ -176,80 +176,10 @@ jobs:
- name: Install Ansible Collections - name: Install Ansible Collections
run: ansible-galaxy collection install -r ansible/requirements.yml run: ansible-galaxy collection install -r ansible/requirements.yml
# Install skopeo on the runner; later steps call `skopeo copy` to write
# registry images into docker-archive tarballs.
- name: Install skopeo
  run: |
    # Abort the step as soon as the index refresh or the install fails.
    set -euo pipefail
    apt-get update
    apt-get install -y skopeo
- name: Generate Ansible Inventory - name: Generate Ansible Inventory
working-directory: ansible working-directory: ansible
run: python3 generate_inventory.py run: python3 generate_inventory.py
# Copy the kube-vip image from its registry into a docker-archive tarball at
# outputs/kube-vip-bootstrap.tar, trying up to three times.
- name: Prepare kube-vip image archive
  run: |
    set -euo pipefail
    image="ghcr.io/kube-vip/kube-vip:v1.1.2"
    archive="outputs/kube-vip-bootstrap.tar"
    mkdir -p outputs
    for attempt in 1 2 3; do
      # skopeo will not write into an existing non-empty docker-archive, so a
      # stale or partially written tarball must be removed before each try.
      rm -f "${archive}"
      if skopeo copy "docker://${image}" "docker-archive:${archive}:${image}"; then
        exit 0
      fi
      echo "skopeo copy attempt ${attempt}/3 failed for ${image}" >&2
      sleep 10
    done
    echo "Failed to prepare kube-vip image archive on runner" >&2
    exit 1
# Write one docker-archive tarball per bootstrap image into
# outputs/bootstrap-image-archives/, named after the image reference with
# every '/' and ':' replaced by '_'.
- name: Prepare bootstrap image archives
  run: |
    set -euo pipefail

    # Print the archive base name for image reference $1 ('/' and ':' -> '_').
    archive_name() {
      printf '%s' "$1" | tr '/:' '__'
    }

    # Copy image $1 into its archive, trying up to three times.
    # Returns 0 on success, 1 after the final failed attempt.
    prepare_image_archive() {
      local image="$1"
      local archive
      local attempt
      # Assigned separately from `local` so a failing substitution is not
      # masked by the exit status of the `local` builtin.
      archive="outputs/bootstrap-image-archives/$(archive_name "${image}").tar"
      for attempt in 1 2 3; do
        # skopeo will not write into an existing non-empty docker-archive, so
        # a stale or partially written tarball must be removed before each try.
        rm -f "${archive}"
        if skopeo copy "docker://${image}" "docker-archive:${archive}:${image}"; then
          return 0
        fi
        sleep 10
      done
      echo "Failed to prepare bootstrap image archive for ${image}" >&2
      return 1
    }

    # The destination directory is the same for every image; create it once.
    mkdir -p outputs/bootstrap-image-archives

    for image in \
      ghcr.io/fluxcd/source-controller:v1.8.0 \
      ghcr.io/fluxcd/kustomize-controller:v1.8.1 \
      ghcr.io/fluxcd/helm-controller:v1.5.1 \
      ghcr.io/fluxcd/notification-controller:v1.8.1 \
      docker.io/rancher/mirrored-coredns-coredns:1.14.2 \
      docker.io/rancher/mirrored-metrics-server:v0.8.1 \
      docker.io/rancher/local-path-provisioner:v0.0.35 \
      docker.io/rancher/mirrored-library-traefik:3.6.10 \
      docker.io/rancher/klipper-helm:v0.9.14-build20260309 \
      oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 \
      ghcr.io/tailscale/k8s-operator:v1.96.5 \
      ghcr.io/tailscale/tailscale:v1.96.5 \
      registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 \
      docker.io/rancher/mirrored-pause:3.6 \
      docker.io/rancher/rancher:v2.13.3 \
      docker.io/rancher/rancher-webhook:v0.9.3 \
      docker.io/rancher/system-upgrade-controller:v0.17.0 \
      docker.io/rancher/shell:v0.6.2 \
      quay.io/jetstack/cert-manager-controller:v1.17.2 \
      quay.io/jetstack/cert-manager-cainjector:v1.17.2 \
      quay.io/jetstack/cert-manager-webhook:v1.17.2 \
      quay.io/jetstack/cert-manager-startupapicheck:v1.17.2 \
      docker.io/library/busybox:1.31.1; do
      prepare_image_archive "${image}"
    done
- name: Run Ansible Playbook - name: Run Ansible Playbook
working-directory: ansible working-directory: ansible
run: | run: |
@@ -389,49 +319,6 @@ jobs:
fi fi
} }
# Import a runner-side image archive into k3s on one node over SSH.
#   $1 - image reference
#   $2 - IP address of the target node (connects as user "ubuntu")
# Returns 0 without copying anything when the initial remote
# `crictl inspecti` check succeeds. Exits the whole script with status 1 when
# the local archive is missing or empty.
import_required_image() {
local image="$1"
local host_ip="$2"
local archive_name
local archive_path
# Archive file name: the image reference with '/' and ':' replaced by '_'.
archive_name="$(printf '%s' "${image}" | tr '/:' '__').tar"
archive_path="outputs/bootstrap-image-archives/${archive_name}"
# -s: the archive must exist and be non-empty.
if [ ! -s "${archive_path}" ]; then
echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2
# NOTE(review): this is `exit`, not `return`, so it terminates the calling script.
exit 1
fi
# Skip the transfer when the remote inspect of the image succeeds.
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then
return 0
fi
echo "Importing ${image} archive on ${host_ip}"
# Upload the archive to /tmp on the node; the copy is capped at 180 seconds.
timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \
"${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}"
# Remote script, capped at 300 seconds. ${image} and ${archive_name} sit inside
# a double-quoted string, so they are expanded locally before it is sent.
# It re-checks the image, then tries `ctr images import` up to five times,
# verifying each import with `crictl inspecti`. If every attempt fails it
# prints the k3s service status and the last 80 k3s journal lines, then exits 1.
timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \
"set -euo pipefail; \
if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \
for attempt in 1 2 3 4 5; do \
echo 'Importing ${image} archive with ctr'; \
if sudo k3s ctr -n k8s.io images import '/tmp/${archive_name}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \
sleep 10; \
done; \
sudo systemctl status k3s --no-pager -l || true; \
sudo journalctl -u k3s -n 80 --no-pager || true; \
exit 1"
}
# Run import_required_image for one image against every address in
# ALL_NODE_IPS.
#   $1 - image reference to import
import_required_image_on_all_nodes() {
  local required_image="$1" node_ip
  # Left unquoted on purpose so the whitespace-separated list splits into
  # one address per iteration.
  for node_ip in ${ALL_NODE_IPS}; do
    import_required_image "${required_image}" "${node_ip}"
  done
}
eso_diagnostics() { eso_diagnostics() {
kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true kubectl -n flux-system get kustomizations,ocirepositories,helmrepositories,helmcharts,helmreleases || true
kubectl -n flux-system describe kustomization addon-external-secrets || true kubectl -n flux-system describe kustomization addon-external-secrets || true
@@ -558,23 +445,6 @@ jobs:
--from-file=identity="$HOME/.ssh/id_ed25519" \ --from-file=identity="$HOME/.ssh/id_ed25519" \
--from-file=known_hosts=/tmp/flux_known_hosts \ --from-file=known_hosts=/tmp/flux_known_hosts \
--dry-run=client -o yaml | kubectl apply -f - --dry-run=client -o yaml | kubectl apply -f -
# Read node addresses from the Terraform outputs JSON.
PRIMARY_CP_IP=$(python3 -c 'import json; print(json.load(open("outputs/terraform_outputs.json"))["primary_control_plane_ip"]["value"])')
# Space-separated list: control plane IPs first, then worker IPs.
ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))')
# The Flux controller images are imported on the primary control plane only.
for image in \
ghcr.io/fluxcd/source-controller:v1.8.0 \
ghcr.io/fluxcd/kustomize-controller:v1.8.1 \
ghcr.io/fluxcd/helm-controller:v1.5.1 \
ghcr.io/fluxcd/notification-controller:v1.8.1; do
import_required_image "${image}" "${PRIMARY_CP_IP}"
done
# The pause and cert-manager images are imported on every node.
for image in \
docker.io/rancher/mirrored-pause:3.6 \
quay.io/jetstack/cert-manager-controller:v1.17.2 \
quay.io/jetstack/cert-manager-cainjector:v1.17.2 \
quay.io/jetstack/cert-manager-webhook:v1.17.2 \
quay.io/jetstack/cert-manager-startupapicheck:v1.17.2; do
import_required_image_on_all_nodes "${image}"
done
# Apply CRDs and controllers first # Apply CRDs and controllers first
kubectl apply -f clusters/prod/flux-system/gotk-components.yaml kubectl apply -f clusters/prod/flux-system/gotk-components.yaml
# Wait for CRDs to be established # Wait for CRDs to be established
@@ -600,7 +470,6 @@ jobs:
# Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details. # Wait directly on the ESO Helm objects; Kustomization readiness hides useful failure details.
wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600 wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-external-secrets 600
reconcile_flux_resource flux-system kustomization/addon-external-secrets 900 reconcile_flux_resource flux-system kustomization/addon-external-secrets 900
import_required_image oci.external-secrets.io/external-secrets/external-secrets:v2.1.0 "${PRIMARY_CP_IP}"
wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600 wait_for_flux_oci_helm_release external-secrets external-secrets external-secrets 600s 600
wait_for_resource "" crd/clustersecretstores.external-secrets.io 900 wait_for_resource "" crd/clustersecretstores.external-secrets.io 900
wait_for_resource "" crd/externalsecrets.external-secrets.io 900 wait_for_resource "" crd/externalsecrets.external-secrets.io 900
@@ -615,8 +484,6 @@ jobs:
reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 600 reconcile_flux_resource flux-system kustomization/addon-external-secrets-store 600
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets-store --timeout=600s
# Wait for the storage layer and private access components # Wait for the storage layer and private access components
import_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}"
import_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}"
reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 900 reconcile_flux_resource flux-system kustomization/addon-tailscale-operator 900
if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s; then if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=300s; then
kubectl -n flux-system describe kustomization/addon-tailscale-operator || true kubectl -n flux-system describe kustomization/addon-tailscale-operator || true
@@ -627,14 +494,12 @@ jobs:
wait_for_helmrelease_ready tailscale-operator tailscale-system 900 wait_for_helmrelease_ready tailscale-operator tailscale-system 900
kubectl wait --for=condition=Established crd/proxyclasses.tailscale.com --timeout=600s kubectl wait --for=condition=Established crd/proxyclasses.tailscale.com --timeout=600s
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
import_required_image registry.k8s.io/sig-storage/nfs-subdir-external-provisioner:v4.0.2 "${PRIMARY_CP_IP}"
reconcile_flux_resource flux-system kustomization/addon-nfs-storage 600 reconcile_flux_resource flux-system kustomization/addon-nfs-storage 600
kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=300s
kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s kubectl -n kube-system rollout status deployment/nfs-subdir-external-provisioner --timeout=300s
kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite
kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
kubectl get storageclass flash-nfs kubectl get storageclass flash-nfs
import_required_image docker.io/library/busybox:1.31.1 "${PRIMARY_CP_IP}"
kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true kubectl -n kube-system delete pod/nfs-smoke pvc/nfs-smoke --ignore-not-found=true
kubectl apply -f - <<'EOF' kubectl apply -f - <<'EOF'
apiVersion: v1 apiVersion: v1
+1 -1
View File
@@ -31,7 +31,7 @@ Compact repo guidance for OpenCode sessions. Trust executable sources over docs
- Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate. - Deploy and destroy workflows share `concurrency.group: prod-cluster`; destroy only requires workflow input `confirm: destroy` and has no backup gate.
- Keep `set -euo pipefail` in workflow shell blocks. - Keep `set -euo pipefail` in workflow shell blocks.
- Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs. - Terraform retry cleanup has hard-coded target VMIDs/names in `.gitea/workflows/deploy.yml`; update it when changing node counts, names, or VMIDs.
- Fresh VMs have unreliable registry/chart egress, so critical images are prepared by `skopeo` on the runner and imported with `k3s ctr`; update the workflow archive lists when adding bootstrap-time images. - Fresh VMs pull bootstrap images directly through containerd/K3s. Do not add runner-side `skopeo` archive/import paths; registry/network failures should surface directly in deploy logs.
- CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap. - CI applies `clusters/prod/flux-system/gotk-components.yaml` directly and then patches Flux controller deployments inline; changes only in `gotk-controller-cp1-patches.yaml` do not affect CI bootstrap.
## GitOps Addons ## GitOps Addons
+2 -2
View File
@@ -148,8 +148,8 @@ Deploy sequence on push to `main`:
1. Terraform fmt/init/validate/plan/apply. 1. Terraform fmt/init/validate/plan/apply.
2. Cleanup/retry around known transient Proxmox clone and disk-update failures. 2. Cleanup/retry around known transient Proxmox clone and disk-update failures.
3. Generate Ansible inventory from Terraform outputs. 3. Generate Ansible inventory from Terraform outputs.
4. Prepare critical image archives with `skopeo` on the runner. 4. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig.
5. Run `ansible/site.yml` to bootstrap nodes, K3s, kube-vip, prerequisite secrets, and kubeconfig. 5. Pull bootstrap images directly through containerd/K3s on the target nodes.
6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph. 6. Apply Flux CRDs/controllers and the `clusters/prod/flux-system` graph.
7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability. 7. Gate cert-manager, External Secrets, Tailscale, NFS, Rancher, and observability.
8. Run post-deploy health checks and Tailscale service smoke checks. 8. Run post-deploy health checks and Tailscale service smoke checks.
@@ -1,47 +1,11 @@
--- ---
- name: Check for runner-provided bootstrap image archives - name: Pull bootstrap images into containerd
stat:
path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
delegate_to: localhost
become: false
register: bootstrap_image_archive_stats
loop: "{{ bootstrap_prepull_images }}"
# Create the staging directory on the target host that receives the
# bootstrap image archives.
- name: Ensure remote bootstrap image archive directory exists
  file:
    state: directory
    path: /tmp/bootstrap-image-archives
    mode: "0755"
# Upload each archive that the earlier stat loop found on the controller.
# Results whose archive does not exist are skipped.
- name: Copy runner-provided bootstrap image archives
  when: item.stat.exists
  loop: "{{ bootstrap_image_archive_stats.results }}"
  loop_control:
    # Show the image reference instead of the whole stat result.
    label: "{{ item.item }}"
  copy:
    dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
    src: "{{ item.stat.path }}"
    mode: "0644"
- name: Import or pull bootstrap images into containerd
shell: | shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "already present" echo "already present"
exit 0 exit 0
fi fi
archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
if [ -s "${archive}" ]; then
for attempt in 1 2 3; do
if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "imported image"
exit 0
fi
sleep 10
done
fi
for attempt in 1 2 3 4 5; do for attempt in 1 2 3 4 5; do
if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
echo "pulled image" echo "pulled image"
@@ -56,4 +20,4 @@
executable: /bin/bash executable: /bin/bash
register: bootstrap_image_pull register: bootstrap_image_pull
loop: "{{ bootstrap_prepull_images }}" loop: "{{ bootstrap_prepull_images }}"
changed_when: "'imported image' in bootstrap_image_pull.stdout or 'pulled image' in bootstrap_image_pull.stdout" changed_when: "'pulled image' in bootstrap_image_pull.stdout"
@@ -1,23 +1,4 @@
--- ---
# Stat the kube-vip archive on the controller (delegated to localhost,
# without privilege escalation) and record the result for later tasks.
- name: Check for runner-provided kube-vip image archive
  delegate_to: localhost
  become: false
  stat:
    path: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
  register: kube_vip_bootstrap_archive
# Upload the kube-vip archive to /tmp on the target host, but only when the
# preceding stat found it on the controller.
- name: Copy runner-provided kube-vip image archive
  when: kube_vip_bootstrap_archive.stat.exists
  copy:
    dest: /tmp/kube-vip-bootstrap.tar
    src: "{{ playbook_dir }}/../outputs/kube-vip-bootstrap.tar"
    mode: "0644"
# Import the uploaded kube-vip archive into the k8s.io containerd namespace.
# Skipped when no archive was found on the controller.
- name: Import runner-provided kube-vip image archive
  command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
  register: kube_vip_bootstrap_import
  # Retry a failed import the same way the secondary control plane import
  # does, instead of failing the play on the first non-zero exit.
  until: kube_vip_bootstrap_import.rc == 0
  retries: 3
  delay: 10
  changed_when: false
  when: kube_vip_bootstrap_archive.stat.exists
- name: Pre-pull kube-vip bootstrap images into containerd - name: Pre-pull kube-vip bootstrap images into containerd
shell: | shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
@@ -1,47 +1,11 @@
--- ---
- name: Check for runner-provided Rancher image archives - name: Pull Rancher images into containerd
stat:
path: "{{ playbook_dir }}/../outputs/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
delegate_to: localhost
become: false
register: rancher_image_archive_stats
loop: "{{ rancher_images_to_prepull }}"
# Create the staging directory on the target host that receives the Rancher
# image archives.
- name: Ensure remote Rancher image archive directory exists
  file:
    state: directory
    path: /tmp/bootstrap-image-archives
    mode: "0755"
# Upload each Rancher archive that the earlier stat loop found on the
# controller. Results whose archive does not exist are skipped.
- name: Copy runner-provided Rancher image archives
  when: item.stat.exists
  loop: "{{ rancher_image_archive_stats.results }}"
  loop_control:
    # Show the image reference instead of the whole stat result.
    label: "{{ item.item }}"
  copy:
    dest: "/tmp/bootstrap-image-archives/{{ item.item | regex_replace('[/:]', '_') }}.tar"
    src: "{{ item.stat.path }}"
    mode: "0644"
- name: Import or pull Rancher images into containerd
shell: | shell: |
if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then if /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "already present" echo "already present"
exit 0 exit 0
fi fi
archive="/tmp/bootstrap-image-archives/{{ item | regex_replace('[/:]', '_') }}.tar"
if [ -s "${archive}" ]; then
for attempt in 1 2 3; do
if /usr/local/bin/ctr -n k8s.io images import "${archive}" && /usr/local/bin/ctr -n k8s.io images ls -q | grep -Fx -- "{{ item }}" >/dev/null; then
echo "imported image"
exit 0
fi
sleep 10
done
fi
for attempt in 1 2 3 4 5; do for attempt in 1 2 3 4 5; do
if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then if timeout 180s /usr/local/bin/ctr -n k8s.io images pull "{{ item }}"; then
echo "pulled image" echo "pulled image"
@@ -56,4 +20,4 @@
executable: /bin/bash executable: /bin/bash
register: rancher_image_pull register: rancher_image_pull
loop: "{{ rancher_images_to_prepull }}" loop: "{{ rancher_images_to_prepull }}"
changed_when: "'imported image' in rancher_image_pull.stdout or 'pulled image' in rancher_image_pull.stdout" changed_when: "'pulled image' in rancher_image_pull.stdout"
-37
View File
@@ -104,43 +104,6 @@
roles: roles:
- k3s-server - k3s-server
# Export the kube-vip image from containerd on the first control plane host
# and fetch the tarball back to ../outputs/ on the controller.
- name: Export kube-vip image from primary control plane
  hosts: control_plane[0]
  become: true
  tasks:
    - name: Export kube-vip image for secondary control planes
      command:
        # Same command line as before, spelled as an argument vector.
        argv:
          - /usr/local/bin/ctr
          - -n
          - k8s.io
          - images
          - export
          - /tmp/kube-vip-bootstrap.tar
          - ghcr.io/kube-vip/kube-vip:v1.1.2
      changed_when: false

    - name: Fetch kube-vip image archive
      fetch:
        # flat: store the file at dest exactly, with no per-host subdirectory.
        flat: true
        src: /tmp/kube-vip-bootstrap.tar
        dest: ../outputs/kube-vip-bootstrap.tar
# Push the fetched kube-vip tarball to every control plane host after the
# first and import it into the k8s.io containerd namespace.
- name: Seed kube-vip image on secondary control planes
  hosts: control_plane[1:]
  become: true
  tasks:
    - name: Copy kube-vip image archive
      copy:
        dest: /tmp/kube-vip-bootstrap.tar
        src: ../outputs/kube-vip-bootstrap.tar
        mode: "0644"

    - name: Import kube-vip image into containerd
      command:
        # Same command line as before, spelled as an argument vector.
        argv:
          - /usr/local/bin/ctr
          - -n
          - k8s.io
          - images
          - import
          - /tmp/kube-vip-bootstrap.tar
      register: kube_vip_secondary_import
      # Retry until the import exits 0: 3 retries, 10 seconds apart.
      retries: 3
      delay: 10
      until: kube_vip_secondary_import.rc == 0
      changed_when: false
- name: Wait for all control plane nodes to be Ready - name: Wait for all control plane nodes to be Ready
hosts: control_plane[0] hosts: control_plane[0]
become: true become: true