fix: harden final health checks

2026-04-26 02:14:02 +00:00
parent a4f1d179e9
commit 46b2ff7d19
3 changed files with 48 additions and 3 deletions
@@ -779,6 +779,15 @@ jobs:
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=300s
+            reconcile_at=$(date +%s)  # one epoch-seconds token shared by every annotation below
+            kubectl -n flux-system annotate helmrelease/kube-prometheus-stack \
+              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
+              reconcile.fluxcd.io/resetAt="${reconcile_at}" \
+              reconcile.fluxcd.io/forceAt="${reconcile_at}" \
+              --overwrite  # NOTE(review): per Flux docs resetAt/forceAt only take effect when equal to requestedAt - hence the shared value
+            kubectl -n flux-system annotate kustomization/addon-observability \
+              reconcile.fluxcd.io/requestedAt="${reconcile_at}" \
+              --overwrite  # request a new reconcile of the kustomization before waiting on it
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=1200s  # NOTE(review): may return at once if Ready is still True from the previous reconcile - confirm
            kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=300s
            kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=1200s
@@ -786,13 +795,15 @@ jobs:
            kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite
            kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)"  # flash-nfs must be listed as (default)
            if kubectl get storageclass | grep -E "^local-path.*\\(default\\)"; then echo "local-path is still marked default" >&2; exit 1; fi  # errexit ignores "!"-negated pipelines, so fail explicitly
+            unhealthy_pods=$(mktemp)  # scratch file that receives the filtered pod list
            kubectl get pods -A --no-headers \
              | grep -Ev "[[:space:]](Running|Completed)[[:space:]]" \
              | grep -Ev "^cattle-system[[:space:]]+helm-operation-" \
+              | grep -Ev "^cattle-capi-system[[:space:]]+capi-controller-manager-" \
              | grep -Ev "^cattle-resources-system[[:space:]]+rancher-backup-patch-sa-" \
              | grep -Ev "^kube-system[[:space:]]+helm-install-" \
-              | tee /tmp/unhealthy-pods || true
-            test ! -s /tmp/unhealthy-pods
+              | tee "${unhealthy_pods}" || true  # "|| true" discards the pipeline status (NOTE(review): including a kubectl failure); only the file content is checked
+            test ! -s "${unhealthy_pods}"  # non-empty file = at least one pod line survived the filters, so the step fails
            kubectl -n kube-system get pods -o wide
            kubectl -n tailscale-system get pods -o wide
            kubectl -n external-secrets get pods -o wide
@@ -89,6 +89,39 @@
  roles:
    - k3s-server

+- name: Export kube-vip image from primary control plane
+  hosts: control_plane[0]  # first host of the control_plane group only
+  become: true
+
+  tasks:
+    - name: Export kube-vip image for secondary control planes
+      command: >-  # folded scalar: the three lines below are joined into one command line
+        /usr/local/bin/ctr -n k8s.io images export
+        /tmp/kube-vip-bootstrap.tar
+        ghcr.io/kube-vip/kube-vip:v1.1.2
+      changed_when: false  # the export is never reported as "changed"
+
+    - name: Fetch kube-vip image archive
+      fetch:
+        src: /tmp/kube-vip-bootstrap.tar  # archive written by the export task above
+        dest: ../outputs/kube-vip-bootstrap.tar  # same relative path the copy task in the next play uses as src
+        flat: true  # per fetch docs: store exactly at dest, without the per-host path prefix
+
+- name: Seed kube-vip image on secondary control planes
+  hosts: control_plane[1:]  # every control_plane host after the first
+  become: true
+
+  tasks:
+    - name: Copy kube-vip image archive
+      copy:
+        src: ../outputs/kube-vip-bootstrap.tar  # archive the preceding play fetched from control_plane[0]
+        dest: /tmp/kube-vip-bootstrap.tar
+        mode: "0644"  # quoted so YAML keeps it a string instead of parsing an octal number
+
+    - name: Import kube-vip image into containerd
+      command: /usr/local/bin/ctr -n k8s.io images import /tmp/kube-vip-bootstrap.tar
+      changed_when: false  # NOTE(review): always reports "ok" even though the import alters the node's image store
+
 - name: Setup workers
  hosts: workers
  become: true
@@ -23,6 +23,7 @@ spec:
      retries: 3
  values:
    hostname: rancher.silverside-gopher.ts.net
+    systemDefaultRegistry: registry.rancher.com  # NOTE(review): presumably prefixed to the chart's image repositories - confirm against the chart
    replicas: 1
    extraEnv:
      - name: CATTLE_PROMETHEUS_METRICS
@@ -31,7 +32,7 @@ spec:
        value: "managed-system-upgrade-controller=false"
    webhook:
      image:
-        repository: registry.rancher.com/rancher/rancher-webhook
+        repository: rancher/rancher-webhook  # registry host dropped; assumed to come from systemDefaultRegistry now - verify the rendered image
        tag: v0.9.3
        imagePullPolicy: IfNotPresent
    resources: