fix: retry Tailscale chart pulls during bootstrap
Deploy Cluster / Terraform (push) Successful in 32s
Deploy Cluster / Ansible (push) Failing after 27m40s

This commit is contained in:
2026-04-25 20:11:43 +00:00
parent 3c06e046c2
commit cdb26904d2
2 changed files with 24 additions and 7 deletions
+16 -6
View File
@@ -303,7 +303,7 @@ jobs:
for attempt in $(seq 1 "${attempts}"); do
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || ((sudo k3s crictl pull --platform linux/amd64 '${image}' || sudo k3s crictl pull '${image}') && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
pulled=true
break
fi
@@ -420,12 +420,20 @@ jobs:
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
--overwrite
if ! kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
for attempt in $(seq 1 6); do
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
return 0
fi
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
reconcile_at="$(date +%s)"
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
done
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
exit 1
}
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
@@ -495,7 +503,9 @@ jobs:
namespace: external-secrets
EOF
# Wait for the storage layer and private access components
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
pull_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale operator image"
pull_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale proxy image"
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 900s 900
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
@@ -28,6 +28,10 @@ spec:
operatorConfig:
defaultTags:
- tag:k8s
image:
repository: ghcr.io/tailscale/k8s-operator
tag: v1.96.5
pullPolicy: IfNotPresent
nodeSelector:
kubernetes.io/hostname: k8s-cluster-cp-1
tolerations:
@@ -37,3 +41,6 @@ spec:
proxyConfig:
defaultTags: tag:k8s
defaultProxyClass: infra-stable
image:
repository: ghcr.io/tailscale/tailscale
tag: v1.96.5