diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 032fab8..0234439 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -246,17 +246,7 @@ jobs: quay.io/jetstack/cert-manager-cainjector:v1.17.2 \ quay.io/jetstack/cert-manager-webhook:v1.17.2 \ quay.io/jetstack/cert-manager-startupapicheck:v1.17.2 \ - docker.io/library/busybox:1.31.1 \ - docker.io/grafana/loki:3.5.7 \ - quay.io/kiwigrid/k8s-sidecar:1.28.0 \ - docker.io/kiwigrid/k8s-sidecar:1.30.10 \ - docker.io/grafana/promtail:3.0.0 \ - docker.io/grafana/grafana:11.4.0 \ - quay.io/prometheus-operator/prometheus-operator:v0.79.2 \ - quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \ - quay.io/prometheus/prometheus:v3.1.0 \ - registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \ - quay.io/prometheus/node-exporter:v1.8.2; do + docker.io/library/busybox:1.31.1; do prepare_image_archive "${image}" done @@ -883,16 +873,12 @@ jobs: kubectl -n cattle-system wait --for=condition=Ready issuer/cattle-system-rancher --timeout=900s kubectl -n cattle-system wait --for=condition=Ready certificate/tls-rancher-ingress --timeout=900s - - name: Seed observability runtime images + - name: Reconcile observability stack env: KUBECONFIG: outputs/kubeconfig run: | set -euo pipefail - archive_name() { - printf '%s' "$1" | tr '/:' '__' - } - wait_for_resource() { local namespace="$1" local resource="$2" @@ -954,86 +940,19 @@ jobs: wait_for_reconcile_handled "helmrelease/${release}" "${reconcile_at}" 300 } - import_required_image() { - local image="$1" - local host_ip="$2" - local archive_name - local archive_path - archive_name="$(archive_name "${image}").tar" - archive_path="outputs/bootstrap-image-archives/${archive_name}" - - if [ ! -s "${archive_path}" ]; then - echo "Missing required bootstrap image archive ${archive_path} for ${image}" >&2 - return 1 - fi - - if ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 "ubuntu@${host_ip}" \ - "sudo k3s crictl inspecti '${image}' >/dev/null 2>&1"; then - return 0 - fi - - echo "Importing ${image} archive on ${host_ip}" - timeout 180s scp -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 \ - "${archive_path}" "ubuntu@${host_ip}:/tmp/${archive_name}" - timeout 300s ssh -i "$HOME/.ssh/id_ed25519" -o StrictHostKeyChecking=no -o ConnectTimeout=10 -o ServerAliveInterval=15 -o ServerAliveCountMax=4 "ubuntu@${host_ip}" \ - "set -euo pipefail; \ - if sudo k3s crictl inspecti '${image}' >/dev/null 2>&1; then exit 0; fi; \ - for attempt in 1 2 3; do \ - echo 'Importing ${image} archive with ctr'; \ - if sudo k3s ctr -n k8s.io images import '/tmp/${archive_name}' && sudo k3s crictl inspecti '${image}' >/dev/null; then exit 0; fi; \ - sleep 10; \ - done; \ - sudo systemctl status k3s --no-pager -l || true; \ - sudo journalctl -u k3s -n 80 --no-pager || true; \ - exit 1" - } - - import_required_image_on_all_nodes() { - local image="$1" - local status_dir - local host_ip - local pid - local failed=false - status_dir="$(mktemp -d)" - - for host_ip in ${ALL_NODE_IPS}; do - ( - import_required_image "${image}" "${host_ip}" - ) >"${status_dir}/${host_ip}.log" 2>&1 & - done - - for pid in $(jobs -p); do - if ! wait "${pid}"; then - failed=true - fi - done - - for host_ip in ${ALL_NODE_IPS}; do - sed "s/^/[${host_ip}] /" "${status_dir}/${host_ip}.log" - done - - if [ "${failed}" = "true" ]; then - echo "Warning: failed to import ${image} on one or more nodes; continuing so Flux/Kubernetes can schedule on seeded nodes or retry pulls" >&2 - fi - } - - ALL_NODE_IPS=$(python3 -c 'import json; outputs = json.load(open("outputs/terraform_outputs.json")); print(" ".join(outputs["control_plane_ips"]["value"] + outputs["worker_ips"]["value"]))') - for image in \ - docker.io/library/busybox:1.31.1 \ - docker.io/grafana/loki:3.5.7 \ - quay.io/kiwigrid/k8s-sidecar:1.28.0 \ - docker.io/kiwigrid/k8s-sidecar:1.30.10 \ - docker.io/grafana/promtail:3.0.0 \ - docker.io/grafana/grafana:11.4.0 \ - quay.io/prometheus-operator/prometheus-operator:v0.79.2 \ - quay.io/prometheus-operator/prometheus-config-reloader:v0.79.2 \ - quay.io/prometheus/prometheus:v3.1.0 \ - registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 \ - quay.io/prometheus/node-exporter:v1.8.2; do - import_required_image_on_all_nodes "${image}" - done - reconcile_flux_resource kustomization/addon-observability 1200 - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s + wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability-secrets 600 + reconcile_flux_resource kustomization/addon-observability-secrets 600 + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=600s + wait_for_resource flux-system kustomization.kustomize.toolkit.fluxcd.io/addon-observability 600 + reconcile_flux_resource kustomization/addon-observability 1800 + if ! kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s; then + kubectl -n flux-system describe kustomization/addon-observability || true + kubectl -n flux-system describe helmrelease/kube-prometheus-stack || true + kubectl -n flux-system describe helmrelease/loki || true + kubectl -n flux-system describe helmrelease/promtail || true + kubectl -n observability get pods -o wide || true + exit 1 + fi for release in kube-prometheus-stack loki promtail; do reconcile_helmrelease "${release}" done @@ -1058,7 +977,7 @@ jobs: kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=900s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-secrets --timeout=300s - kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1800s kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite diff --git a/infrastructure/addons/kustomization-observability.yaml b/infrastructure/addons/kustomization-observability.yaml index f4ff8ff..82d5849 100644 --- a/infrastructure/addons/kustomization-observability.yaml +++ b/infrastructure/addons/kustomization-observability.yaml @@ -29,5 +29,5 @@ spec: kind: HelmRelease name: promtail namespace: flux-system - timeout: 15m + timeout: 30m suspend: false