---
# Observability stack tasks.
#
# Installs Helm when it is missing, then deploys kube-prometheus-stack and,
# when loki_enabled is true, Loki and Promtail into the namespace named by
# observability_namespace. The final tasks print how to reach Grafana and
# Prometheus (through Tailscale when exposed, otherwise via port-forward).
#
# Variables read here: observability_namespace, grafana_admin_password,
# prometheus_chart_version, loki_enabled, loki_chart_version,
# promtail_chart_version, observability_tailscale_expose,
# tailscale_operator_ready, grafana_tailscale_hostname,
# prometheus_tailscale_hostname, tailscale_tailnet.

# --- Helm bootstrap ---------------------------------------------------------

# Probe only: a failure is tolerated so the next task can react to the rc.
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

# NOTE(review): the installer script is fetched from the `main` branch and
# piped straight into bash, so it is not pinned to a reviewed revision.
- name: Install Helm
  shell: curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  when: helm_check.rc != 0
  changed_when: true

# --- Namespace and credentials ----------------------------------------------

# `kubectl create` is not idempotent, so an "AlreadyExists" error is treated
# as success and only a real creation is reported as a change.
- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: >-
    create_observability_ns.rc != 0
    and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

# Use the supplied password when it is non-empty; otherwise generate a random
# 32-character one. default('', true) keeps an undefined or null variable from
# raising inside the length test. Writing to /dev/null means the generated
# value is not persisted, so a fresh password is produced on each run.
- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: "{{ grafana_admin_password if (grafana_admin_password | default('', true) | length > 0) else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}"

# --- kube-prometheus-stack --------------------------------------------------

# NOTE(review): if the rendered values embed the Grafana password, a
# world-readable file in /tmp exposes it - confirm against the template.
- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0644"

# `helm repo add` errors when the repo is already registered; that specific
# error is accepted so reruns stay green.
- name: Add Prometheus Helm repo
  command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  failed_when: >-
    add_prom_repo.rc != 0
    and "already exists" not in add_prom_repo.stderr
  changed_when: add_prom_repo.rc == 0

- name: Add Grafana Helm repo
  command: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  failed_when: >-
    add_grafana_repo.rc != 0
    and "already exists" not in add_grafana_repo.stderr
  changed_when: add_grafana_repo.rc == 0

- name: Update Helm repos
  command: helm repo update
  changed_when: false

# Best-effort cleanup of Helm release secrets left in a pending-* state.
# A single set-based label selector covers all three pending states; with no
# matches kubectl simply reports that nothing was found instead of erroring
# on an empty resource list.
- name: Clear stale pending Helm revision secrets for kube-prometheus-stack
  command: >-
    kubectl -n {{ observability_namespace }} delete secret
    -l 'owner=helm,name=kube-prometheus-stack,status in (pending-upgrade,pending-install,pending-rollback)'
    --ignore-not-found=true
  changed_when: false
  failed_when: false

# Each attempt may block for up to 10 minutes (--wait --timeout 10m) and is
# retried up to 12 times with a 15 second pause between attempts.
- name: Install kube-prometheus-stack
  command: >-
    helm upgrade --install kube-prometheus-stack
    prometheus-community/kube-prometheus-stack
    --namespace {{ observability_namespace }}
    --version {{ prometheus_chart_version }}
    --values /tmp/kube-prometheus-stack-values.yaml
    --wait --timeout 10m
  register: kube_prom_install
  retries: 12
  delay: 15
  until: kube_prom_install.rc == 0
  changed_when: true

- name: Wait for Grafana deployment rollout
  command: >-
    kubectl -n {{ observability_namespace }} rollout status
    deployment/kube-prometheus-stack-grafana --timeout=5m
  changed_when: false

# Runs the Grafana CLI inside the first pod labelled
# app.kubernetes.io/name=grafana. The password goes through the `quote`
# filter so quotes or other shell metacharacters in a user-supplied value
# cannot break out of the argument on the controlling shell.
- name: Reset Grafana admin password in Grafana database
  shell: >-
    kubectl -n {{ observability_namespace }} exec
    "$(kubectl -n {{ observability_namespace }} get pod
    -l app.kubernetes.io/name=grafana
    -o jsonpath='{.items[0].metadata.name}')"
    -c grafana --
    grafana cli admin reset-admin-password {{ grafana_password_effective | quote }}
  changed_when: true

# --- Loki (only when loki_enabled) ------------------------------------------

- name: Write Loki values
  template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: "0644"
  when: loki_enabled

# Render the chart locally and fail early unless the output contains a
# StatefulSet, which the later rollout wait depends on.
- name: Validate Loki chart produces resources
  command: >-
    helm template loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false
  failed_when: "loki_template.rc != 0 or 'kind: StatefulSet' not in loki_template.stdout"
  when: loki_enabled

# Best-effort removal of workloads and PodDisruptionBudgets that may remain
# from an earlier Loki layout; errors are ignored.
- name: Remove legacy Loki resources
  command: >-
    kubectl -n {{ observability_namespace }} delete
    deployment/loki-gateway
    statefulset/loki
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    poddisruptionbudget/loki-memcached-chunks-cache
    poddisruptionbudget/loki-memcached-results-cache
    --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

# Deletes the revision-1 release secret of the `loki` release; best-effort.
- name: Clear stuck Helm lock for Loki
  command: >-
    kubectl -n {{ observability_namespace }} delete secret
    sh.helm.release.v1.loki.v1 --ignore-not-found=true
  changed_when: false
  failed_when: false
  when: loki_enabled

# Prints the release name only when `loki` is in a failed, pending-* or
# uninstalling state; empty output means there is nothing to clean up.
- name: Detect failed or pending Loki release
  command: >-
    helm list --namespace {{ observability_namespace }}
    --failed --pending --uninstalling
    --filter '^loki$' --short
  register: loki_stuck_release
  changed_when: false
  failed_when: false
  when: loki_enabled

# Only uninstall when the probe above actually found a stuck release, so a
# healthy Loki deployment is upgraded in place instead of being torn down on
# every run.
- name: Uninstall failed Loki release (if stuck)
  command: helm uninstall loki -n {{ observability_namespace }}
  register: loki_uninstall
  changed_when: loki_uninstall.rc == 0
  failed_when: false
  when:
    - loki_enabled
    - loki_stuck_release.stdout | default('') | trim | length > 0

# No --wait here: readiness is checked by the rollout task that follows.
- name: Install Loki
  command: >-
    helm upgrade --install loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_install
  changed_when: true
  when: loki_enabled

- name: Wait for Loki StatefulSet
  command: >-
    kubectl -n {{ observability_namespace }} rollout status
    statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false
  when: loki_enabled

- name: Show Loki pod status
  command: >-
    kubectl -n {{ observability_namespace }} get pods
    -l app.kubernetes.io/name=loki -o wide
  register: loki_pods
  changed_when: false
  when: loki_enabled

- name: Debug Loki pods
  debug:
    msg: "{{ loki_pods.stdout }}"
  when: loki_enabled

# --- Promtail (only when loki_enabled) --------------------------------------

- name: Write Promtail values
  template:
    src: promtail-values.yaml.j2
    dest: /tmp/promtail-values.yaml
    mode: "0644"
  when: loki_enabled

- name: Install Promtail
  command: >-
    helm upgrade --install promtail grafana/promtail
    --namespace {{ observability_namespace }}
    --version {{ promtail_chart_version }}
    --values /tmp/promtail-values.yaml
    --wait --timeout 10m
  changed_when: true
  when: loki_enabled

# --- Tailscale exposure probes ----------------------------------------------
# These run only when exposure is requested and the operator is reported
# ready. They are informational: failures are ignored and the results feed
# the summary message below.

- name: Check Tailscale service readiness for Grafana
  command: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
    -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: grafana_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale service readiness for Prometheus
  command: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
    -o jsonpath='{.status.conditions[?(@.type=="TailscaleProxyReady")].status}'
  register: prometheus_tailscale_ready
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

# The {{"{{"}} / {{"}}"}} sequences emit literal Go-template braces through
# Jinja. The Go template prints each load-balancer ingress entry's IP, or its
# hostname when no IP is set.
- name: Check Tailscale endpoint (IP/hostname) for Grafana
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-grafana
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: grafana_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Check Tailscale endpoint (IP/hostname) for Prometheus
  shell: >-
    kubectl -n {{ observability_namespace }} get svc kube-prometheus-stack-prometheus
    -o go-template='{{"{{"}}range .status.loadBalancer.ingress{{"}}"}}{{"{{"}}if .ip{{"}}"}}{{"{{"}}.ip{{"}}"}}{{"{{"}}else{{"}}"}}{{"{{"}}.hostname{{"}}"}}{{"{{"}}end{{"}}"}}{{"{{"}}end{{"}}"}}'
  register: prometheus_lb_ip
  changed_when: false
  failed_when: false
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

# --- Summary output ---------------------------------------------------------
# Exactly one of the two messages below is shown, depending on whether the
# Tailscale path is active. Both print the Grafana admin password in clear
# text to the Ansible output.

- name: Show Tailscale access details
  debug:
    msg: |
      Observability stack deployed with Tailscale access!

      Grafana: http://{{ grafana_tailscale_hostname }}{% if grafana_lb_ip.stdout | default('') | length > 0 %} (or http://{{ grafana_lb_ip.stdout }}){% endif %}

      Prometheus: http://{{ prometheus_tailscale_hostname }}{% if prometheus_lb_ip.stdout | default('') | length > 0 %} (or http://{{ prometheus_lb_ip.stdout }}){% endif %}

      Login: admin / {{ grafana_password_effective }}

      Tailscale readiness:
        - Grafana proxy ready: {{ grafana_tailscale_ready.stdout | default('pending') }}
        - Prometheus proxy ready: {{ prometheus_tailscale_ready.stdout | default('pending') }}

      Access via:
        - MagicDNS: http://{{ grafana_tailscale_hostname }} and http://{{ prometheus_tailscale_hostname }}
        - Tailnet FQDN: http://{{ grafana_tailscale_hostname }}.{{ tailscale_tailnet | default('tailnet.ts.net') }}
        - Direct endpoint: {% if grafana_lb_ip.stdout | default('') | length > 0 %}http://{{ grafana_lb_ip.stdout }}{% else %}(pending){% endif %} / {% if prometheus_lb_ip.stdout | default('') | length > 0 %}http://{{ prometheus_lb_ip.stdout }}{% else %}(pending){% endif %}
  when:
    - observability_tailscale_expose | bool
    - tailscale_operator_ready | default(false) | bool

- name: Show observability access details (fallback)
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}

      Grafana (tailnet):
        kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet):
        kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090

      Grafana admin password: {{ grafana_password_effective }}

      {% if loki_enabled %}
      Loki: Enabled - logs available in Grafana
      {% else %}
      Loki: Disabled
      {% endif %}
  when:
    - not (observability_tailscale_expose | bool and (tailscale_operator_ready | default(false) | bool))