From cdb26904d2f1ae8ce5aaa233bc9dbf17da9dd7fb Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 25 Apr 2026 20:11:43 +0000 Subject: [PATCH] fix: retry Tailscale chart pulls during bootstrap --- .gitea/workflows/deploy.yml | 24 +++++++++++++------ .../helmrelease-tailscale-operator.yaml | 7 ++++++ 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 841b20f..783dddb 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -303,7 +303,7 @@ jobs: for attempt in $(seq 1 "${attempts}"); do echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})" if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ - "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || ((sudo k3s crictl pull --platform linux/amd64 '${image}' || sudo k3s crictl pull '${image}') && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then + "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then pulled=true break fi @@ -420,12 +420,20 @@ jobs: reconcile.fluxcd.io/forceAt="${reconcile_at}" \ --overwrite - if ! kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then - flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}" - exit 1 - fi + for attempt in $(seq 1 6); do + if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then + wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}" + return 0 + fi - wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}" + echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2 + reconcile_at="$(date +%s)" + kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite + done + + flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}" + exit 1 } kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f - @@ -495,7 +503,9 @@ jobs: namespace: external-secrets EOF # Wait for the storage layer and private access components - wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600 + pull_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale operator image" + pull_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale proxy image" + wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 900s 900 kubectl -n tailscale-system rollout status deployment/operator --timeout=600s wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600 kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s diff --git a/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml b/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml index deb37ce..17417ee 100644 --- a/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml +++ b/infrastructure/addons/tailscale-operator/helmrelease-tailscale-operator.yaml @@ -28,6 +28,10 @@ spec: operatorConfig: defaultTags: - tag:k8s + image: + repository: ghcr.io/tailscale/k8s-operator + tag: v1.96.5 + pullPolicy: IfNotPresent nodeSelector: kubernetes.io/hostname: k8s-cluster-cp-1 tolerations: @@ -37,3 +41,6 @@ spec: proxyConfig: defaultTags: tag:k8s defaultProxyClass: infra-stable + image: + repository: ghcr.io/tailscale/tailscale + tag: v1.96.5