fix: make grafana prometheus datasource resilient with nodeport fallback
This commit is contained in:
@@ -3,3 +3,6 @@ observability_namespace: "observability"
|
||||
# Names of the ConfigMaps this role reports in its final summary message.
grafana_dashboard_configmap_name: 'grafana-dashboard-k8s-overview'
grafana_datasource_configmap_name: 'grafana-datasources-core'

# Loki: toggle for the extra datasources ConfigMap, and the URL Grafana
# should use to reach Loki.
loki_enabled: true
grafana_loki_url: >-
  http://loki.{{ observability_namespace }}.svc.cluster.local:3100

# Prometheus: preferred in-cluster URL. The tasks probe it from the Grafana
# pod and, when the toggle below is on and the probe fails, switch the
# datasource to a <hostIP>:<nodePort> address instead.
grafana_prometheus_url: >-
  http://kube-prometheus-stack-prometheus.{{ observability_namespace }}.svc.cluster.local:9090
grafana_use_prometheus_nodeport_fallback: true
|
||||
|
||||
@@ -9,15 +9,82 @@
|
||||
command: kubectl -n {{ observability_namespace }} rollout status deployment/kube-prometheus-stack-grafana --timeout=5m
|
||||
changed_when: false
|
||||
|
||||
# Seed the effective URL with the configured in-cluster address; the
# NodePort fallback task further down may overwrite this fact.
- name: Set default Prometheus datasource URL
  set_fact:
    grafana_prometheus_effective_url: '{{ grafana_prometheus_url }}'
|
||||
|
||||
# Look up the name of the first pod carrying the Grafana label; the probe
# task execs into that pod.
- name: Get Grafana pod name
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - get
      - pod
      - "-l"
      - app.kubernetes.io/name=grafana
      - "-o"
      - "jsonpath={.items[0].metadata.name}"
  changed_when: false
  register: grafana_pod_name
|
||||
|
||||
# Check from inside the Grafana container whether the configured Prometheus
# URL answers on /-/ready. The result only feeds the fallback decision, so
# a non-zero exit code must not fail the play (failed_when: false).
#
# `command` with `argv` is used instead of `shell`: nothing here needs a
# local shell, and passing each value as its own argument keeps the
# namespace, pod name and URL from being re-split or re-interpreted.
# The URL reaches the in-pod `sh -c` script as "$1" rather than being
# pasted into the script text.
- name: Probe Prometheus from Grafana pod via default datasource URL
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - exec
      - "{{ grafana_pod_name.stdout }}"
      - "-c"
      - grafana
      - "--"
      - sh
      - "-c"
      - 'wget -qO- --timeout=5 "$1/-/ready" >/dev/null'
      - sh
      - "{{ grafana_prometheus_url }}"
  register: grafana_prometheus_probe
  changed_when: false
  failed_when: false
|
||||
|
||||
# Fallback discovery, part 1: host IP of the node running the first pod
# that matches the Prometheus label. Runs only when the fallback is
# enabled and the in-cluster probe returned a non-zero exit code.
- name: Get Prometheus pod host IP for fallback
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - get
      - pod
      - "-l"
      - app.kubernetes.io/name=prometheus
      - "-o"
      - "jsonpath={.items[0].status.hostIP}"
  changed_when: false
  register: prometheus_host_ip
|
||||
|
||||
# Fallback discovery, part 2: nodePort of the service port named
# "http-web". Output is empty when that port has no nodePort assigned.
- name: Get Prometheus service NodePort for fallback
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - get
      - svc
      - kube-prometheus-stack-prometheus
      - "-o"
      - 'jsonpath={.spec.ports[?(@.name=="http-web")].nodePort}'
  changed_when: false
  register: prometheus_nodeport
|
||||
|
||||
# Replace the effective datasource URL with http://<hostIP>:<nodePort>,
# but only if the fallback is enabled, the probe failed, and both lookups
# returned a value. The order of the conditions matters: the last two read
# results that exist only when the first two are true.
- name: Enable Prometheus NodePort fallback datasource URL
  when:
    - grafana_use_prometheus_nodeport_fallback | bool
    - grafana_prometheus_probe.rc != 0
    - prometheus_host_ip.stdout | length > 0
    - prometheus_nodeport.stdout | length > 0
  set_fact:
    grafana_prometheus_effective_url: 'http://{{ prometheus_host_ip.stdout }}:{{ prometheus_nodeport.stdout }}'
|
||||
|
||||
# Render the Prometheus datasource manifest to a scratch file so that the
# following task can hand it to `kubectl apply`.
- name: Write default Prometheus datasource ConfigMap patch
  template:
    dest: /tmp/grafana-default-prometheus-datasource.yaml
    src: grafana-default-prometheus-datasource.yaml.j2
    mode: '0644'
|
||||
|
||||
# Apply the rendered Prometheus datasource manifest.
# kubectl prints "<resource> unchanged" when the live object already
# matches, so the task reports "changed" only when something was actually
# created or reconfigured instead of on every run.
- name: Apply default Prometheus datasource ConfigMap patch
  command: kubectl apply -f /tmp/grafana-default-prometheus-datasource.yaml
  register: grafana_default_prometheus_datasource_apply
  changed_when: "'unchanged' not in grafana_default_prometheus_datasource_apply.stdout"
|
||||
|
||||
# Best-effort cleanup of the old per-datasource Loki ConfigMap.
# --ignore-not-found makes an absent ConfigMap a no-op, and
# failed_when: false keeps any other kubectl error from aborting the play.
# The task reports "changed" only when kubectl says it deleted something,
# rather than always claiming nothing changed.
- name: Remove legacy Loki datasource ConfigMap
  command: kubectl -n {{ observability_namespace }} delete configmap grafana-datasource-loki --ignore-not-found=true
  register: grafana_legacy_loki_configmap_delete
  changed_when: "'deleted' in grafana_legacy_loki_configmap_delete.stdout"
  failed_when: false
|
||||
|
||||
# Render the additional datasources manifest (Loki) to a scratch file.
# `| bool` matches how the other toggle in this file is evaluated and makes
# string overrides such as -e loki_enabled=false behave as expected.
- name: Write Grafana datasources ConfigMap
  template:
    src: grafana-datasources.yaml.j2
    dest: /tmp/grafana-datasources.yaml
    mode: "0644"
  when: loki_enabled | bool
|
||||
|
||||
# Apply the rendered datasources manifest when Loki is enabled.
# Change reporting follows kubectl's own output: "unchanged" means the live
# ConfigMap already matched. `| bool` keeps the toggle handling consistent
# with the NodePort-fallback toggle used elsewhere in this file.
- name: Apply Grafana datasources ConfigMap
  command: kubectl apply -f /tmp/grafana-datasources.yaml
  register: grafana_datasources_apply
  changed_when: "'unchanged' not in grafana_datasources_apply.stdout"
  when: loki_enabled | bool
|
||||
|
||||
# Trigger a rolling restart of the Grafana deployment so the updated
# datasource files are loaded. Always reported as a change.
- name: Restart Grafana to load datasource updates deterministically
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - rollout
      - restart
      - deployment/kube-prometheus-stack-grafana
  changed_when: true
|
||||
|
||||
# Block until the restarted Grafana deployment finishes rolling out;
# kubectl gives up after five minutes.
- name: Wait for Grafana rollout after datasource update
  command:
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - rollout
      - status
      - deployment/kube-prometheus-stack-grafana
      - "--timeout=5m"
  changed_when: false
|
||||
|
||||
- name: Write Grafana dashboard ConfigMap
|
||||
template:
|
||||
@@ -34,4 +101,5 @@
|
||||
msg: |
|
||||
Grafana content applied.
|
||||
Datasources ConfigMap: {{ grafana_datasource_configmap_name }}
|
||||
Prometheus datasource URL: {{ grafana_prometheus_effective_url }}
|
||||
Dashboard ConfigMap: {{ grafana_dashboard_configmap_name }}
|
||||
|
||||
@@ -9,15 +9,10 @@ data:
|
||||
# Additional Grafana datasources. Prometheus is intentionally not declared
# here: it is provisioned by the kube-prometheus-stack-grafana-datasource
# ConfigMap, and a second "Prometheus" entry marked isDefault would collide
# with that one. The Loki URL comes from grafana_loki_url only; the
# duplicated hard-coded `url` key has been removed.
datasources.yaml: |
  apiVersion: 1
  datasources:
{% if loki_enabled %}
    - name: Loki
      type: loki
      access: proxy
      url: "{{ grafana_loki_url }}"
      isDefault: false
{% endif %}
|
||||
|
||||
@@ -0,0 +1,26 @@
|
||||
# ConfigMap carrying Grafana's default Prometheus and Alertmanager
# datasources. The Prometheus URL is the effective URL chosen by the role
# (in-cluster address, or the NodePort fallback when the probe failed).
# NOTE(review): the name presumably matches the ConfigMap shipped by the
# kube-prometheus-stack chart so that this manifest overrides it - confirm.
apiVersion: v1
kind: ConfigMap
metadata:
  name: kube-prometheus-stack-grafana-datasource
  namespace: "{{ observability_namespace }}"
data:
  datasource.yaml: |-
    apiVersion: 1
    datasources:
      - name: "Prometheus"
        type: prometheus
        uid: prometheus
        # Trailing slashes are stripped before one is re-added, so a
        # configured URL that already ends in "/" does not render as "//".
        url: "{{ grafana_prometheus_effective_url | regex_replace('/+$', '') }}/"
        access: proxy
        isDefault: true
        jsonData:
          httpMethod: POST
          timeInterval: 30s
      - name: "Alertmanager"
        type: alertmanager
        uid: alertmanager
        url: "http://kube-prometheus-stack-alertmanager.{{ observability_namespace }}:9093/"
        access: proxy
        jsonData:
          handleGrafanaManagedAlerts: false
          implementation: prometheus
|
||||
Reference in New Issue
Block a user