Files
HetznerTerra/ansible/roles/observability/tasks/main.yml
MichaelFisher1997 6177b581e4
Some checks failed
Deploy Cluster / Terraform (push) Successful in 44s
Deploy Grafana Content / Grafana Content (push) Successful in 1m29s
Deploy Cluster / Ansible (push) Failing after 11m11s
fix: correct dashboard verification checks and retry helm upgrade lock
2026-03-04 08:48:30 +00:00

239 lines
8.9 KiB
YAML

---
# Probe for an existing Helm binary. The task never fails and never reports
# a change; only the registered return code matters, which the "Install Helm"
# task inspects.
- name: Check if Helm is installed
  command:
    cmd: helm version --short
  failed_when: false
  changed_when: false
  register: helm_check
# Install Helm with the upstream installer script. Runs only when the
# preceding `helm version` probe returned a non-zero exit code.
#
# `set -o pipefail` is required: without it the pipeline's exit status is
# that of `bash` alone, so a failed download (curl -f exits non-zero and
# prints nothing) would feed empty input to bash, exit 0, and the task would
# report success without installing anything. pipefail is a bash feature,
# hence the explicit executable.
- name: Install Helm
  shell: |
    set -o pipefail
    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  args:
    executable: /bin/bash
  when: helm_check.rc != 0
  changed_when: true
# Create the target namespace imperatively. A non-zero exit is tolerated
# when stderr contains "AlreadyExists", which keeps the task re-runnable;
# "changed" is reported only when the create call itself succeeded.
- name: Ensure observability namespace exists
  command:
    cmd: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  changed_when: create_observability_ns.rc == 0
  failed_when:
    - create_observability_ns.rc != 0
    - "'AlreadyExists' not in create_observability_ns.stderr"
# Resolve the Grafana admin password used by the later tasks in this file:
# the caller-supplied `grafana_admin_password` when it is non-empty,
# otherwise a generated 32-character string of ASCII letters and digits.
# The password lookup targets /dev/null, so the generated value is not
# stored in a file.
#
# `default('', true)` in the condition makes an undefined or null
# `grafana_admin_password` fall through to the generated password instead of
# aborting the play when `length` is applied to it.
- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: "{{ grafana_admin_password if (grafana_admin_password | default('', true) | length > 0) else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"
# Render the chart values template to a scratch file; the
# "Install kube-prometheus-stack" task passes this path to Helm via --values.
# NOTE(review): the file is created world-readable (0644). If the template
# embeds the Grafana admin password, a tighter mode may be appropriate -
# confirm against the template contents.
- name: Write kube-prometheus-stack values
  template:
    dest: /tmp/kube-prometheus-stack-values.yaml
    src: kube-prometheus-stack-values.yaml.j2
    mode: "0644"
# Register the chart repositories used by this role. For both "add" tasks a
# non-zero exit whose stderr mentions "already exists" is accepted, and
# "changed" is reported only when the add call returned 0.
- name: Add Prometheus Helm repo
  command:
    cmd: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  changed_when: add_prom_repo.rc == 0
  failed_when:
    - add_prom_repo.rc != 0
    - "'already exists' not in add_prom_repo.stderr"

- name: Add Grafana Helm repo
  command:
    cmd: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  changed_when: add_grafana_repo.rc == 0
  failed_when:
    - add_grafana_repo.rc != 0
    - "'already exists' not in add_grafana_repo.stderr"

# Refresh the local chart index; never reported as a change.
- name: Update Helm repos
  command:
    cmd: helm repo update
  changed_when: false
# Install or upgrade the kube-prometheus-stack release from the rendered
# values file, blocking (--wait) for up to 10 minutes per attempt.
# The command is retried until it exits 0: up to 6 retries, 20 seconds apart.
# Always reported as changed.
- name: Install kube-prometheus-stack
  command:
    cmd: >-
      helm upgrade --install kube-prometheus-stack prometheus-community/kube-prometheus-stack
      --namespace {{ observability_namespace }}
      --version {{ prometheus_chart_version }}
      --values /tmp/kube-prometheus-stack-values.yaml
      --wait --timeout 10m
  register: kube_prom_install
  until: kube_prom_install.rc == 0
  retries: 6
  delay: 20
  changed_when: true
# Block until the Grafana deployment has finished rolling out (5 minute
# limit). Read-only, so never reported as a change.
- name: Wait for Grafana deployment rollout
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} rollout status
      deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false
# Reset the admin password inside the running Grafana container with
# `grafana cli admin reset-admin-password`, using the password resolved by
# the "Set Grafana admin password" task. The pod name is looked up at run
# time by the app.kubernetes.io/name=grafana label (first match).
#
# The password is passed through the `quote` filter rather than being pasted
# between literal single quotes: a password containing `'` (or other shell
# metacharacters) would otherwise terminate the quoting and break the
# command or inject shell syntax.
- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod -l app.kubernetes.io/name=grafana -o jsonpath='{.items[0].metadata.name}')"
    -c grafana -- grafana cli admin reset-admin-password {{ grafana_password_effective | quote }}
  changed_when: true
# Render the Loki chart values to a scratch file (only when Loki is enabled).
- name: Write Loki values
  when: loki_enabled
  template:
    dest: /tmp/loki-values.yaml
    src: loki-values.yaml.j2
    mode: "0644"

# Dry-render the chart with those values before touching the cluster. The
# task fails if rendering fails or if the rendered manifests contain no
# "kind: StatefulSet" entry.
- name: Validate Loki chart produces resources
  when: loki_enabled
  command:
    cmd: >-
      helm template loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
  register: loki_template
  failed_when: >-
    loki_template.rc != 0
    or 'kind: StatefulSet' not in loki_template.stdout
  changed_when: false
# Best-effort delete of a fixed list of Loki workloads and
# PodDisruptionBudgets. Missing objects are ignored by kubectl, and any
# other error is suppressed as well (failed_when: false).
# NOTE(review): the list includes statefulset/loki, which is also the object
# the later "Wait for Loki StatefulSet" task waits on - confirm that deleting
# it on every run is intended.
- name: Remove legacy Loki resources
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} delete
      deployment/loki-gateway
      statefulset/loki
      statefulset/loki-chunks-cache
      statefulset/loki-results-cache
      statefulset/loki-backend
      statefulset/loki-read
      statefulset/loki-write
      poddisruptionbudget/loki-memcached-chunks-cache
      poddisruptionbudget/loki-memcached-results-cache
      --ignore-not-found=true
  failed_when: false
  changed_when: false
# Best-effort cleanup of Helm state for the "loki" release before it is
# (re)installed. Both tasks swallow every error and never report a change.
# NOTE(review): despite the "stuck"/"if stuck" wording, neither task checks
# the release status first - both run on every play where loki_enabled is
# true. Confirm this is the intended behaviour.

# Deletes the revision-1 release secret of the "loki" release.
- name: Clear stuck Helm lock for Loki
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} delete secret
      sh.helm.release.v1.loki.v1 --ignore-not-found=true
  failed_when: false
  changed_when: false

- name: Uninstall failed Loki release (if stuck)
  when: loki_enabled
  command:
    cmd: helm uninstall loki -n {{ observability_namespace }}
  failed_when: false
  changed_when: false
# Install or upgrade the Loki release from the rendered values file. Helm is
# not asked to wait here; readiness is checked by the rollout task below.
- name: Install Loki
  when: loki_enabled
  command:
    cmd: >-
      helm upgrade --install loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
  register: loki_install
  changed_when: true

# Block until the Loki StatefulSet has rolled out (10 minute limit).
- name: Wait for Loki StatefulSet
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} rollout status
      statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false
# Capture the wide pod listing for pods labelled app.kubernetes.io/name=loki
# and echo it into the play output for diagnostics.
- name: Show Loki pod status
  when: loki_enabled
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} get pods
      -l app.kubernetes.io/name=loki -o wide
  register: loki_pods
  changed_when: false

- name: Debug Loki pods
  when: loki_enabled
  debug:
    msg: "{{ loki_pods.stdout }}"
# Promtail is deployed only alongside Loki, so both tasks are gated on
# loki_enabled.
- name: Write Promtail values
  when: loki_enabled
  template:
    dest: /tmp/promtail-values.yaml
    src: promtail-values.yaml.j2
    mode: "0644"

# Install or upgrade the Promtail release and wait up to 10 minutes for it.
- name: Install Promtail
  when: loki_enabled
  command:
    cmd: >-
      helm upgrade --install promtail grafana/promtail
      --namespace {{ observability_namespace }}
      --version {{ promtail_chart_version }}
      --values /tmp/promtail-values.yaml
      --wait --timeout 10m
  changed_when: true
# Read the status of the "TailscaleProxyReady" condition from each Service.
# Informational only: errors are ignored and nothing is reported as changed.
# Both checks run only when Tailscale exposure is requested and the operator
# has been flagged ready (tailscale_operator_ready defaults to false).
- name: Check Tailscale service readiness for Grafana
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
      -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: grafana_tailscale_ready
  failed_when: false
  changed_when: false

- name: Check Tailscale service readiness for Prometheus
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  command:
    cmd: >-
      kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
      -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: prometheus_tailscale_ready
  failed_when: false
  changed_when: false
# Print each Service's load-balancer ingress entries, preferring .ip and
# falling back to .hostname. The go-template braces are written as
# {{"{{"}} / {{"}}"}} so that Jinja emits literal {{ and }} for kubectl
# instead of trying to evaluate them itself - do not "simplify" them.
# Informational only: errors are ignored and nothing is reported as changed.
- name: Check Tailscale endpoint (IP/hostname) for Grafana
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  shell:
    cmd: >-
      kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
      -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: grafana_lb_ip
  failed_when: false
  changed_when: false

- name: Check Tailscale endpoint (IP/hostname) for Prometheus
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  shell:
    cmd: >-
      kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
      -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: prometheus_lb_ip
  failed_when: false
  changed_when: false
# Summary for the Tailscale-exposed case: hostnames, the direct endpoint when
# one was discovered, and the proxy readiness values gathered above.
# Every `.stdout` read is guarded with default(), so an empty or missing
# result renders as "pending" / "(pending)" instead of failing.
# NOTE(review): the message prints the Grafana admin password in clear text
# into the play output - confirm that is acceptable for the CI logs.
- name: Show Tailscale access details
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool
  debug:
    msg: |
      Observability stack deployed with Tailscale access!
      Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}
      Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}
      Login: admin / {{ grafana_password_effective }}
      Tailscale readiness:
      - Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
      - Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}
      Access via:
      - MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
      - Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
      - Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
# Summary for every case the Tailscale summary does not cover: the condition
# is the exact negation of that task's two `when` clauses. Prints
# port-forward commands for Grafana and Prometheus plus the Loki state.
# NOTE(review): the message prints the Grafana admin password in clear text
# into the play output - confirm that is acceptable for the CI logs.
- name: Show observability access details (fallback)
  when: not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}
      {% if loki_enabled %}
      Loki: Enabled - logs available in Grafana
      {% else %}
      Loki: Disabled
      {% endif %}