fix: retry Tailscale chart pulls during bootstrap
This commit is contained in:
@@ -303,7 +303,7 @@ jobs:
|
|||||||
for attempt in $(seq 1 "${attempts}"); do
|
for attempt in $(seq 1 "${attempts}"); do
|
||||||
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
|
echo "Pre-pulling ${image} on ${host_ip} (${attempt}/${attempts})"
|
||||||
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
|
if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \
|
||||||
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || ((sudo k3s crictl pull --platform linux/amd64 '${image}' || sudo k3s crictl pull '${image}') && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
|
"sudo k3s crictl inspecti '${image}' >/dev/null 2>&1 || (sudo k3s crictl pull '${image}' && sudo k3s crictl inspecti '${image}' >/dev/null 2>&1)"; then
|
||||||
pulled=true
|
pulled=true
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
@@ -420,12 +420,20 @@ jobs:
|
|||||||
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
reconcile.fluxcd.io/forceAt="${reconcile_at}" \
|
||||||
--overwrite
|
--overwrite
|
||||||
|
|
||||||
if ! kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
|
for attempt in $(seq 1 6); do
|
||||||
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
if kubectl -n flux-system wait --for=condition=Ready "helmchart.source.toolkit.fluxcd.io/${chart_name}" --timeout="${chart_timeout}"; then
|
||||||
exit 1
|
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
||||||
fi
|
return 0
|
||||||
|
fi
|
||||||
|
|
||||||
wait_for_helmrelease_ready "${release_name}" "${target_namespace}" "${release_timeout}"
|
echo "HelmChart ${chart_name} did not become Ready after ${chart_timeout}; forcing retry (${attempt}/6)" >&2
|
||||||
|
reconcile_at="$(date +%s)"
|
||||||
|
kubectl -n flux-system annotate "helmchart.source.toolkit.fluxcd.io/${chart_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
||||||
|
kubectl -n flux-system annotate "helmrelease/${release_name}" reconcile.fluxcd.io/requestedAt="${reconcile_at}" --overwrite
|
||||||
|
done
|
||||||
|
|
||||||
|
flux_helm_diagnostics "${repo_name}" "${chart_name}" "${release_name}" "${target_namespace}"
|
||||||
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
kubectl create namespace flux-system --dry-run=client -o yaml | kubectl apply -f -
|
||||||
@@ -495,7 +503,9 @@ jobs:
|
|||||||
namespace: external-secrets
|
namespace: external-secrets
|
||||||
EOF
|
EOF
|
||||||
# Wait for the storage layer and private access components
|
# Wait for the storage layer and private access components
|
||||||
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 600s 600
|
pull_required_image ghcr.io/tailscale/k8s-operator:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale operator image"
|
||||||
|
pull_required_image ghcr.io/tailscale/tailscale:v1.96.5 "${PRIMARY_CP_IP}" 45 10 "Failed to pre-pull required Tailscale proxy image"
|
||||||
|
wait_for_flux_helm_release tailscale flux-system-tailscale-operator tailscale-operator tailscale-system 600s 900s 900
|
||||||
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
kubectl -n tailscale-system rollout status deployment/operator --timeout=600s
|
||||||
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
|
wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600
|
||||||
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s
|
||||||
|
|||||||
@@ -28,6 +28,10 @@ spec:
|
|||||||
operatorConfig:
|
operatorConfig:
|
||||||
defaultTags:
|
defaultTags:
|
||||||
- tag:k8s
|
- tag:k8s
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/tailscale/k8s-operator
|
||||||
|
tag: v1.96.5
|
||||||
|
pullPolicy: IfNotPresent
|
||||||
nodeSelector:
|
nodeSelector:
|
||||||
kubernetes.io/hostname: k8s-cluster-cp-1
|
kubernetes.io/hostname: k8s-cluster-cp-1
|
||||||
tolerations:
|
tolerations:
|
||||||
@@ -37,3 +41,6 @@ spec:
|
|||||||
proxyConfig:
|
proxyConfig:
|
||||||
defaultTags: tag:k8s
|
defaultTags: tag:k8s
|
||||||
defaultProxyClass: infra-stable
|
defaultProxyClass: infra-stable
|
||||||
|
image:
|
||||||
|
repository: ghcr.io/tailscale/tailscale
|
||||||
|
tag: v1.96.5
|
||||||
|
|||||||
Reference in New Issue
Block a user