From eb1ad0bea77fbd5e17a06e6db91abdded1092214 Mon Sep 17 00:00:00 2001
From: MichaelFisher1997
Date: Wed, 4 Mar 2026 19:22:31 +0000
Subject: [PATCH] fix: make grafana prometheus datasource resilient with
 nodeport fallback

Move the default Prometheus datasource out of the shared Grafana
datasources ConfigMap and into its own template, which is applied as the
ConfigMap kube-prometheus-stack-grafana-datasource. The Prometheus and
Loki datasource URLs become role defaults (grafana_prometheus_url,
grafana_loki_url) instead of being hard-coded in the templates.

After the Grafana rollout the role runs wget against
<grafana_prometheus_url>/-/ready from inside the Grafana pod. When that
probe exits non-zero and grafana_use_prometheus_nodeport_fallback is
true, the datasource URL is replaced with
http://<Prometheus pod hostIP>:<nodePort of the http-web service port>,
provided both lookups returned a value.

The shared datasources ConfigMap now only carries the Loki entry and is
written and applied only when loki_enabled is set. A ConfigMap named
grafana-datasource-loki is deleted if present. Grafana is restarted and
the rollout is awaited after the datasource ConfigMaps are applied, and
the final debug message reports the effective Prometheus URL.

---
Notes (ignored by git am; not part of the commit message):

* Line breaks and YAML indentation in this patch were reconstructed
  using the usual 2-space Ansible layout. Context lines must match the
  target tree byte for byte, so run `git apply --check` before applying
  and adjust whitespace if the check fails.
* The two added conditions are written as `when: loki_enabled | bool`
  rather than a bare `when: loki_enabled`. This matches the `| bool` cast
  already used for grafana_use_prometheus_nodeport_fallback in the same
  file, so that a string override such as `-e loki_enabled=false` is
  treated as false. Because of this the post-image blob id on the
  tasks/main.yml index line no longer matches the content.

 .../observability-content/defaults/main.yml   |  3 +
 .../observability-content/tasks/main.yml      | 68 +++++++++++++++++++
 .../templates/grafana-datasources.yaml.j2     |  7 +-
 ...fana-default-prometheus-datasource.yaml.j2 | 26 +++++++
 4 files changed, 98 insertions(+), 6 deletions(-)
 create mode 100644 ansible/roles/observability-content/templates/grafana-default-prometheus-datasource.yaml.j2

diff --git a/ansible/roles/observability-content/defaults/main.yml b/ansible/roles/observability-content/defaults/main.yml
index 1653d6a..c970e72 100644
--- a/ansible/roles/observability-content/defaults/main.yml
+++ b/ansible/roles/observability-content/defaults/main.yml
@@ -3,3 +3,6 @@ observability_namespace: "observability"
 grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
 grafana_datasource_configmap_name: "grafana-datasources-core"
 loki_enabled: true
+grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
+grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
+grafana_use_prometheus_nodeport_fallback: true
diff --git a/ansible/roles/observability-content/tasks/main.yml b/ansible/roles/observability-content/tasks/main.yml
index e84751e..1e2fbfa 100644
--- a/ansible/roles/observability-content/tasks/main.yml
+++ b/ansible/roles/observability-content/tasks/main.yml
@@ -9,15 +9,82 @@
   command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
   changed_when: false
 
+- name: Set default Prometheus datasource URL
+  set_fact:
+    grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
+
+- name: Get Grafana pod name
+  command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
+  register: grafana_pod_name
+  changed_when: false
+
+- name: Probe Prometheus from Grafana pod via default datasource URL
+  shell: >-
+    kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
+    sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
+  register: grafana_prometheus_probe
+  changed_when: false
+  failed_when: false
+
+- name: Get Prometheus pod host IP for fallback
+  command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
+  register: prometheus_host_ip
+  changed_when: false
+  when:
+    - grafana_use_prometheus_nodeport_fallback | bool
+    - grafana_prometheus_probe.rc != 0
+
+- name: Get Prometheus service NodePort for fallback
+  command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
+  register: prometheus_nodeport
+  changed_when: false
+  when:
+    - grafana_use_prometheus_nodeport_fallback | bool
+    - grafana_prometheus_probe.rc != 0
+
+- name: Enable Prometheus NodePort fallback datasource URL
+  set_fact:
+    grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
+  when:
+    - grafana_use_prometheus_nodeport_fallback | bool
+    - grafana_prometheus_probe.rc != 0
+    - prometheus_host_ip.stdout | length > 0
+    - prometheus_nodeport.stdout | length > 0
+
+- name: Write default Prometheus datasource ConfigMap patch
+  template:
+    src: grafana-default-prometheus-datasource.yaml.j2
+    dest: /tmp/grafana-default-prometheus-datasource.yaml
+    mode: "0644"
+
+- name: Apply default Prometheus datasource ConfigMap patch
+  command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
+  changed_when: true
+
+- name: Remove legacy Loki datasource ConfigMap
+  command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
+  changed_when: false
+  failed_when: false
+
 - name: Write Grafana datasources ConfigMap
   template:
     src: grafana-datasources.yaml.j2
     dest: /tmp/grafana-datasources.yaml
     mode: "0644"
+  when: loki_enabled | bool
 
 - name: Apply Grafana datasources ConfigMap
   command: kubectl apply -f /tmp/grafana-datasources.yaml
   changed_when: true
+  when: loki_enabled | bool
+
+- name: Restart Grafana to load datasource updates deterministically
+  command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
+  changed_when: true
+
+- name: Wait for Grafana rollout after datasource update
+  command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
+  changed_when: false
 
 - name: Write Grafana dashboard ConfigMap
   template:
@@ -34,4 +101,5 @@
     msg: |
       Grafana content applied.
       Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
+      Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
       Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
diff --git a/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2 b/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2
index 37f8bd8..bdf09d3 100644
--- a/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2
+++ b/ansible/roles/observability-content/templates/grafana-datasources.yaml.j2
@@ -9,15 +9,10 @@ data:
   datasources.yaml: |
     apiVersion: 1
     datasources:
-      - name: Prometheus
-        type: prometheus
-        access: proxy
-        url: http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
-        isDefault: true
 {% if loki_enabled %}
       - name: Loki
         type: loki
         access: proxy
-        url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
+        url: "{{ grafana_loki_url }}"
         isDefault: false
 {% endif %}
diff --git a/ansible/roles/observability-content/templates/grafana-default-prometheus-datasource.yaml.j2 b/ansible/roles/observability-content/templates/grafana-default-prometheus-datasource.yaml.j2
new file mode 100644
index 0000000..956ecc7
--- /dev/null
+++ b/ansible/roles/observability-content/templates/grafana-default-prometheus-datasource.yaml.j2
@@ -0,0 +1,26 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kube-prometheus-stack-grafana-datasource
+  namespace: {{ observability_namespace }}
+data:
+  datasource.yaml: |-
+    apiVersion: 1
+    datasources:
+    - name: "Prometheus"
+      type: prometheus
+      uid: prometheus
+      url: {{ grafana_prometheus_effective_url }}/
+      access: proxy
+      isDefault: true
+      jsonData:
+        httpMethod: POST
+        timeInterval: 30s
+    - name: "Alertmanager"
+      type: alertmanager
+      uid: alertmanager
+      url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
+      access: proxy
+      jsonData:
+        handleGrafanaManagedAlerts: false
+        implementation: prometheus