fix: make grafana prometheus datasource resilient with nodeport fallback
Some checks failed
Deploy Cluster / Terraform (push) Successful in 45s
Deploy Grafana Content / Grafana Content (push) Successful in 1m46s
Deploy Cluster / Ansible (push) Has been cancelled

This commit is contained in:
2026-03-04 19:22:31 +00:00
parent 9ff9d1e633
commit eb1ad0bea7
4 changed files with 98 additions and 6 deletions

View File

@@ -9,15 +9,82 @@
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Set default Prometheus datasource URL
set_fact:
grafana_prometheus_effective_url: "{{ grafana_prometheus_url }}"
- name: Get Grafana pod name
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}'
register: grafana_pod_name
changed_when: false
- name: Probe Prometheus from Grafana pod via default datasource URL
shell: >-
kubectl -n {{ observability_namespace }} exec {{ grafana_pod_name.stdout }} -c grafana --
sh -c 'wget -qO- --timeout=5 {{ grafana_prometheus_url }}/-/ready >/dev/null'
register: grafana_prometheus_probe
changed_when: false
failed_when: false
- name: Get Prometheus pod host IP for fallback
command: kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=prometheus -o jsonpath='{.items[0].status.hostIP}'
register: prometheus_host_ip
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Get Prometheus service NodePort for fallback
command: kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus -o jsonpath='{.spec.ports[?(@.name=="http-web")].nodePort}'
register: prometheus_nodeport
changed_when: false
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- name: Enable Prometheus NodePort fallback datasource URL
set_fact:
grafana_prometheus_effective_url: "http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}"
when:
- grafana_use_prometheus_nodeport_fallback | bool
- grafana_prometheus_probe.rc != 0
- prometheus_host_ip.stdout | length > 0
- prometheus_nodeport.stdout | length > 0
- name: Write default Prometheus datasource ConfigMap patch
template:
src: grafana-default-prometheus-datasource.yaml.j2
dest: /tmp/grafana-default-prometheus-datasource.yaml
mode: "0644"
- name: Apply default Prometheus datasource ConfigMap patch
command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
changed_when: true
- name: Remove legacy Loki datasource ConfigMap
command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
changed_when: false
failed_when: false
- name: Write Grafana datasources ConfigMap
template:
src: grafana-datasources.yaml.j2
dest: /tmp/grafana-datasources.yaml
mode: "0644"
when: loki_enabled
- name: Apply Grafana datasources ConfigMap
command: kubectl apply -f /tmp/grafana-datasources.yaml
changed_when: true
when: loki_enabled
- name: Restart Grafana to load datasource updates deterministically
command: kubectl -n {{ observability_namespace }} rollout restart deployment/kube-prometheus-stack-grafana
changed_when: true
- name: Wait for Grafana rollout after datasource update
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
changed_when: false
- name: Write Grafana dashboard ConfigMap
template:
@@ -34,4 +101,5 @@
msg: |
Grafana content applied.
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}