fix: make grafana prometheus datasource resilient with nodeport fallback
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled

This commit is contained in:
2026-03-04 19:22:31 +00:00
parent 9ff9d1e633
commit eb1ad0bea7
4 changed files with 98 additions and 6 deletions

View File

@@ -3,3 +3,6 @@ observability_namespace: "observability"
grafana_dashboard_configmap_name: "grafana-dashboard-k8s-overview"
grafana_datasource_configmap_name: "grafana-datasources-core"
loki_enabled: true
grafana_prometheus_url: "http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090"
grafana_loki_url: "http://loki.{{ observability_namespace }}.svc.cluster.local:3100"
grafana_use_prometheus_nodeport_fallback: true

View File

@@ -9,15 +9,82 @@
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Set default Prometheus datasource URL
set_fact:
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
- name: Get Grafana pod name
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
register: grafana_pod_name
changed_when: false
- name: Probe Prometheus from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
register: grafana_prometheus_probe
changed_when: false
failed_when: false
- name: Get Prometheus pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
register: prometheus_host_ip
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Get Prometheus service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
register: prometheus_nodeport
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Enable Prometheus NodePort fallback datasource URL
set_fact:
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- prometheus_host_ip.stdout | length > 0
- prometheus_nodeport.stdout | length > 0
- name: Write default Prometheus datasource ConfigMap patch
template:
src: grafana-default-prometheus-datasource.yaml.j2
dest: /tmp/grafana-default-prometheus-datasource.yaml
mode: "0644"
- name: Apply default Prometheus datasource ConfigMap patch
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
changed_when: true
- name: Remove legacy Loki datasource ConfigMap
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
changed_when: false
failed_when: false
- name: Write Grafana datasources ConfigMap
template:
src: grafana-datasources.yaml.j2
dest: /tmp/grafana-datasources.yaml
mode: "0644"
when: loki_enabled
- name: Apply Grafana datasources ConfigMap
command: kubectl apply -f /tmp/grafana-datasources.yaml
changed_when: true
when: loki_enabled
- name: Restart Grafana to load datasource updates deterministically
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
changed_when: true
- name: Wait for Grafana rollout after datasource update
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Write Grafana dashboard ConfigMap
template:
@@ -34,4 +101,5 @@
msg: |
Grafana content applied.
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}

View File

@@ -9,15 +9,10 @@ data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
access: proxy
url: http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
isDefault: true
{% if loki_enabled %}
- name: Loki
type: loki
access: proxy
url: http://loki.{{ observability_namespace }}.svc.cluster.local:3100
url: "{{ grafana_loki_url }}"
isDefault: false
{% endif %}

View File

@@ -0,0 +1,26 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: kube-prometheus-stack-grafana-datasource
namespace: {{ observability_namespace }}
data:
datasource.yaml: |-
apiVersion: 1
datasources:
- name: "Prometheus"
type: prometheus
uid: prometheus
url: {{ grafana_prometheus_effective_url }}/
access: proxy
isDefault: true
jsonData:
httpMethod: POST
timeInterval: 30s
- name: "Alertmanager"
type: alertmanager
uid: alertmanager
url: http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/
access: proxy
jsonData:
handleGrafanaManagedAlerts: false
implementation: prometheus