From bfcf57bcc5cd478ca3df1565a5aea1d85eea8a1c Mon Sep 17 00:00:00 2001 From: MichaelFisher1997 Date: Sat, 25 Apr 2026 02:22:16 +0000 Subject: [PATCH] fix: enforce post-deploy health checks --- .gitea/workflows/deploy.yml | 33 +++++++++++++++---- .../helmrelease-kube-prometheus-stack.yaml | 5 +-- .../observability/helmrelease-loki.yaml | 14 +++----- .../observability/helmrelease-promtail.yaml | 12 +++---- .../observability/helmrepository-grafana.yaml | 8 ----- .../addons/observability/kustomization.yaml | 3 +- .../observability/ocirepository-loki.yaml | 13 ++++++++ .../observability/ocirepository-promtail.yaml | 13 ++++++++ 8 files changed, 67 insertions(+), 34 deletions(-) delete mode 100644 infrastructure/addons/observability/helmrepository-grafana.yaml create mode 100644 infrastructure/addons/observability/ocirepository-loki.yaml create mode 100644 infrastructure/addons/observability/ocirepository-promtail.yaml diff --git a/.gitea/workflows/deploy.yml b/.gitea/workflows/deploy.yml index 1389016..7e29a8d 100644 --- a/.gitea/workflows/deploy.yml +++ b/.gitea/workflows/deploy.yml @@ -407,6 +407,8 @@ jobs: kubectl -n tailscale-system rollout status deployment/operator --timeout=600s wait_for_flux_helm_release nfs-subdir-external-provisioner flux-system-nfs-subdir-external-provisioner nfs-subdir-external-provisioner kube-system 600s 600s 600 kubectl -n kube-system rollout status deployment/kube-system-nfs-subdir-external-provisioner --timeout=600s + kubectl annotate storageclass local-path storageclass.kubernetes.io/is-default-class=false --overwrite + kubectl annotate storageclass flash-nfs storageclass.kubernetes.io/is-default-class=true --overwrite kubectl get storageclass flash-nfs - name: Wait for Rancher and backup operator @@ -595,12 +597,31 @@ jobs: - name: Post-deploy cluster health checks working-directory: ansible run: | - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get nodes -o wide" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n flux-system get gitrepositories,kustomizations,helmreleases" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n kube-system get pods -o wide" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl get storageclass flash-nfs" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n tailscale-system get pods -o wide" - ansible -i inventory.ini 'control_plane[0]' -m command -a "kubectl -n external-secrets get pods" + set -euo pipefail + ansible -i inventory.ini 'control_plane[0]' -m shell -a ' + set -euo pipefail + kubectl get nodes -o wide + kubectl -n flux-system get gitrepositories,kustomizations,helmreleases,ocirepositories + kubectl -n flux-system wait --for=condition=Ready kustomization/infrastructure --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-cert-manager --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-external-secrets --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-nfs-storage --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-operator --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-tailscale-proxyclass --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-config --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-rancher-backup-config --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability --timeout=60s + kubectl -n flux-system wait --for=condition=Ready kustomization/addon-observability-content --timeout=60s + kubectl -n flux-system wait --for=condition=Ready helmrelease --all --timeout=60s + kubectl get storageclass | grep -E "^flash-nfs.*\\(default\\)" + kubectl get pods -A --field-selector=status.phase!=Running,status.phase!=Succeeded --no-headers | tee /tmp/nonrunning-pods + test ! -s /tmp/nonrunning-pods + kubectl -n kube-system get pods -o wide + kubectl -n tailscale-system get pods -o wide + kubectl -n external-secrets get pods -o wide + ' env: ANSIBLE_HOST_KEY_CHECKING: "False" diff --git a/infrastructure/addons/observability/helmrelease-kube-prometheus-stack.yaml b/infrastructure/addons/observability/helmrelease-kube-prometheus-stack.yaml index ed069f5..d618e12 100644 --- a/infrastructure/addons/observability/helmrelease-kube-prometheus-stack.yaml +++ b/infrastructure/addons/observability/helmrelease-kube-prometheus-stack.yaml @@ -5,6 +5,7 @@ metadata: namespace: flux-system spec: interval: 10m + timeout: 15m targetNamespace: observability chart: spec: @@ -32,7 +33,7 @@ spec: serve_from_sub_path: false persistence: enabled: true - storageClassName: local-path + storageClassName: flash-nfs size: 5Gi service: type: ClusterIP @@ -55,7 +56,7 @@ spec: storageSpec: volumeClaimTemplate: spec: - storageClassName: local-path + storageClassName: flash-nfs accessModes: - ReadWriteOnce resources: diff --git a/infrastructure/addons/observability/helmrelease-loki.yaml b/infrastructure/addons/observability/helmrelease-loki.yaml index 8b08f56..d299a54 100644 --- a/infrastructure/addons/observability/helmrelease-loki.yaml +++ b/infrastructure/addons/observability/helmrelease-loki.yaml @@ -6,14 +6,10 @@ metadata: spec: interval: 10m targetNamespace: observability - chart: - spec: - chart: loki - version: 6.10.0 - sourceRef: - kind: HelmRepository - name: grafana - namespace: flux-system + chartRef: + kind: OCIRepository + name: loki + namespace: flux-system install: createNamespace: true remediation: @@ -50,7 +46,7 @@ spec: replicas: 1 persistence: size: 10Gi - storageClass: local-path + storageClass: flash-nfs resources: requests: cpu: 100m diff --git a/infrastructure/addons/observability/helmrelease-promtail.yaml b/infrastructure/addons/observability/helmrelease-promtail.yaml index 2fe09f0..39dd469 100644 --- a/infrastructure/addons/observability/helmrelease-promtail.yaml +++ b/infrastructure/addons/observability/helmrelease-promtail.yaml @@ -6,14 +6,10 @@ metadata: spec: interval: 10m targetNamespace: observability - chart: - spec: - chart: promtail - version: 6.16.6 - sourceRef: - kind: HelmRepository - name: grafana - namespace: flux-system + chartRef: + kind: OCIRepository + name: promtail + namespace: flux-system install: createNamespace: true remediation: diff --git a/infrastructure/addons/observability/helmrepository-grafana.yaml b/infrastructure/addons/observability/helmrepository-grafana.yaml deleted file mode 100644 index 1235012..0000000 --- a/infrastructure/addons/observability/helmrepository-grafana.yaml +++ /dev/null @@ -1,8 +0,0 @@ -apiVersion: source.toolkit.fluxcd.io/v1 -kind: HelmRepository -metadata: - name: grafana - namespace: flux-system -spec: - interval: 1h - url: https://grafana.github.io/helm-charts diff --git a/infrastructure/addons/observability/kustomization.yaml b/infrastructure/addons/observability/kustomization.yaml index 27173c3..d3e97df 100644 --- a/infrastructure/addons/observability/kustomization.yaml +++ b/infrastructure/addons/observability/kustomization.yaml @@ -4,7 +4,8 @@ resources: - namespace.yaml - grafana-admin-externalsecret.yaml - helmrepository-prometheus-community.yaml - - helmrepository-grafana.yaml + - ocirepository-loki.yaml + - ocirepository-promtail.yaml - helmrelease-kube-prometheus-stack.yaml - helmrelease-loki.yaml - helmrelease-promtail.yaml diff --git a/infrastructure/addons/observability/ocirepository-loki.yaml b/infrastructure/addons/observability/ocirepository-loki.yaml new file mode 100644 index 0000000..6a9bd98 --- /dev/null +++ b/infrastructure/addons/observability/ocirepository-loki.yaml @@ -0,0 +1,13 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: OCIRepository +metadata: + name: loki + namespace: flux-system +spec: + interval: 10m + url: oci://ghcr.io/grafana/helm-charts/loki + ref: + tag: 6.46.0 + layerSelector: + mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip + operation: copy diff --git a/infrastructure/addons/observability/ocirepository-promtail.yaml b/infrastructure/addons/observability/ocirepository-promtail.yaml new file mode 100644 index 0000000..064100c --- /dev/null +++ b/infrastructure/addons/observability/ocirepository-promtail.yaml @@ -0,0 +1,13 @@ +apiVersion: source.toolkit.fluxcd.io/v1 +kind: OCIRepository +metadata: + name: promtail + namespace: flux-system +spec: + interval: 10m + url: oci://ghcr.io/grafana/helm-charts/promtail + ref: + tag: 6.16.6 + layerSelector: + mediaType: application/vnd.cncf.helm.chart.content.v1.tar+gzip + operation: copy