---
# Deploys the observability stack with Helm: kube-prometheus-stack, Loki,
# Promtail and a Grafana datasource for Loki.
#
# Variables read by these tasks (defined outside this file):
#   observability_namespace, grafana_admin_password,
#   prometheus_chart_version, loki_chart_version, promtail_chart_version
# Facts set by these tasks:
#   grafana_password_effective

# --- Helm bootstrap ---------------------------------------------------------

# Probe only: a non-zero rc is recorded in helm_check instead of failing.
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  changed_when: false
  failed_when: false

# pipefail makes a failed download fail the task; without it the pipeline's
# status is that of `bash`, which exits 0 on empty input.
- name: Install Helm
  shell: |
    set -o pipefail
    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  args:
    executable: /bin/bash
  when: helm_check.rc != 0
  changed_when: true

# --- Namespace and credentials ----------------------------------------------

# An "AlreadyExists" error from kubectl is treated as success (unchanged).
- name: Ensure observability namespace exists
  command: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  failed_when: create_observability_ns.rc != 0 and "AlreadyExists" not in create_observability_ns.stderr
  changed_when: create_observability_ns.rc == 0

# Uses the supplied password when it is non-empty, otherwise generates a
# 32-character alphanumeric one. '/dev/null' as the lookup path means the
# generated value is not stored on disk.
- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: >-
      {{ grafana_admin_password
      if grafana_admin_password | length > 0
      else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}

# --- kube-prometheus-stack --------------------------------------------------

# NOTE(review): the rendered values presumably embed the Grafana admin
# password (the template itself is not visible here), so the file is kept
# owner-only rather than world-readable in /tmp.
- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0600"

# "already exists" in stderr is tolerated so reruns do not fail.
- name: Add Prometheus Helm repo
  command: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  failed_when: add_prom_repo.rc != 0 and "already exists" not in add_prom_repo.stderr
  changed_when: add_prom_repo.rc == 0

- name: Add Grafana Helm repo
  command: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  failed_when: add_grafana_repo.rc != 0 and "already exists" not in add_grafana_repo.stderr
  changed_when: add_grafana_repo.rc == 0

- name: Update Helm repos
  command: helm repo update
  changed_when: false

- name: Install kube-prometheus-stack
  command: >-
    helm upgrade --install kube-prometheus-stack
    prometheus-community/kube-prometheus-stack
    --namespace {{ observability_namespace }}
    --version {{ prometheus_chart_version }}
    --values /tmp/kube-prometheus-stack-values.yaml
    --wait --timeout 10m
  changed_when: true

# --- Loki: render and validate ----------------------------------------------

- name: Write Loki values
  template:
    src: loki-values.yaml.j2
    dest: /tmp/loki-values.yaml
    mode: "0644"

# Renders the chart locally; the output is inspected by the next two tasks.
- name: Validate Loki Helm values
  command: >-
    helm template loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false

# Only runs when the rendered chart has no StatefulSet.
- name: Show Loki template output (check for StatefulSet)
  debug:
    msg: "Loki template contains StatefulSet: {{ 'kind: StatefulSet' in loki_template.stdout }}"
  when: "'kind: StatefulSet' not in loki_template.stdout"

- name: Fail if Loki template produces no StatefulSet
  fail:
    msg: |
      Loki Helm template produces no StatefulSet.
      Check values configuration.
      Template output (first 100 lines):
      {{ loki_template.stdout.split('\n')[:100] | join('\n') }}
  when: "'kind: StatefulSet' not in loki_template.stdout"

# --- Loki: remove legacy workloads ------------------------------------------

# Best-effort cleanup: errors are ignored and the task never reports changed.
- name: Remove legacy Loki scalable workloads (if present)
  command: >-
    kubectl -n {{ observability_namespace }} delete
    deployment/loki-gateway
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    --ignore-not-found=true
  changed_when: false
  failed_when: false

- name: Remove legacy Loki PDBs (if present)
  command: >-
    kubectl -n {{ observability_namespace }} delete
    poddisruptionbudget/loki-memcached-chunks-cache
    poddisruptionbudget/loki-memcached-results-cache
    --ignore-not-found=true
  changed_when: false
  failed_when: false

# Lists whichever legacy objects still exist and retries until the list is
# empty, so the wait ends only when every one of them is gone. Up to 12
# attempts, 5 seconds apart. ignore_errors keeps the wait best-effort even
# when the retries run out.
- name: Wait for legacy Loki resources to be fully removed
  command: >-
    kubectl -n {{ observability_namespace }} get
    deployment/loki-gateway
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    --ignore-not-found=true
    -o name
  register: legacy_cleanup
  retries: 12
  delay: 5
  until:
    - legacy_cleanup.rc == 0
    - legacy_cleanup.stdout | trim | length == 0
  changed_when: false
  failed_when: false
  ignore_errors: true

# --- Loki: install and diagnose ---------------------------------------------

# Never fails here: the rc is examined by the follow-up tasks so diagnostics
# can be collected before the play is failed.
- name: Install Loki
  command: >-
    helm upgrade --install loki grafana/loki
    --namespace {{ observability_namespace }}
    --version {{ loki_chart_version }}
    --values /tmp/loki-values.yaml
    --create-namespace
  register: loki_install
  failed_when: false
  changed_when: true

- name: Show Loki Helm install result
  debug:
    msg: |
      Helm install rc: {{ loki_install.rc }}
      Helm install stdout: {{ loki_install.stdout }}
      Helm install stderr: {{ loki_install.stderr }}

- name: Show all Loki resources after install
  command: kubectl -n {{ observability_namespace }} get all -l app.kubernetes.io/name=loki
  register: loki_resources
  changed_when: false
  failed_when: false

- name: Debug Loki resources
  debug:
    msg: "{{ loki_resources.stdout }}"
  when: loki_resources.stdout | length > 0

# Skipped when the Helm install itself failed; later conditions then fall
# back to `loki_rollout.rc | default(1)`.
- name: Wait for Loki statefulset rollout
  command: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
  register: loki_rollout
  failed_when: false
  changed_when: false
  when: loki_install.rc == 0

# The jsonpath expression is single-quoted so the inner double quotes around
# "loki" reach kubectl intact; the command module splits its argument string
# shell-style and would otherwise strip them.
- name: Check Loki core container readiness
  command: >-
    kubectl -n {{ observability_namespace }} get pod loki-0
    -o jsonpath='{.status.containerStatuses[?(@.name=="loki")].ready}'
  register: loki_core_ready
  failed_when: false
  changed_when: false
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0

# The tasks below only gather diagnostics; each runs solely when the install
# or the rollout did not succeed, and none of them can fail the play.
- name: Show Loki pods on install failure
  command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
  register: loki_pods_status
  changed_when: false
  failed_when: false
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0

- name: Show observability PVCs on Loki install failure
  command: kubectl -n {{ observability_namespace }} get pvc -o wide
  register: loki_pvc_status
  changed_when: false
  failed_when: false
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0

- name: Get Loki pod name on install failure
  shell: >-
    kubectl -n {{ observability_namespace }} get pods
    -l app.kubernetes.io/name=loki
    -o jsonpath='{.items[0].metadata.name}'
  register: loki_pod_name
  changed_when: false
  failed_when: false
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0

- name: Describe Loki pod on install failure
  command: kubectl -n {{ observability_namespace }} describe pod {{ loki_pod_name.stdout }}
  register: loki_pod_describe
  changed_when: false
  failed_when: false
  when:
    - loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
    - loki_pod_name.stdout | length > 0

- name: Show Loki pod logs on install failure
  command: kubectl -n {{ observability_namespace }} logs {{ loki_pod_name.stdout }} --tail=200
  register: loki_pod_logs
  changed_when: false
  failed_when: false
  when:
    - loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
    - loki_pod_name.stdout | length > 0

- name: Show observability events on Loki install failure
  command: kubectl -n {{ observability_namespace }} get events --sort-by=.lastTimestamp
  register: loki_events
  changed_when: false
  failed_when: false
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0

# Fails the play only when install/rollout failed AND the readiness probe
# above did not report the "loki" container in pod loki-0 as ready.
- name: Fail with Loki diagnostics
  fail:
    msg: |
      Loki install failed.
      Helm stderr:
      {{ loki_install.stderr | default('') }}
      Rollout status stderr:
      {{ loki_rollout.stderr | default('') }}
      Loki pods:
      {{ loki_pods_status.stdout | default('n/a') }}
      PVCs:
      {{ loki_pvc_status.stdout | default('n/a') }}
      Loki pod describe:
      {{ loki_pod_describe.stdout | default('n/a') }}
      Loki pod logs:
      {{ loki_pod_logs.stdout | default('n/a') }}
      Events:
      {{ loki_events.stdout | default('n/a') }}
  when:
    - loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
    - (loki_core_ready.stdout | default('false') | trim) != 'true'

# --- Promtail ---------------------------------------------------------------

- name: Write Promtail values
  template:
    src: promtail-values.yaml.j2
    dest: /tmp/promtail-values.yaml
    mode: "0644"

- name: Install Promtail
  command: >-
    helm upgrade --install promtail grafana/promtail
    --namespace {{ observability_namespace }}
    --version {{ promtail_chart_version }}
    --values /tmp/promtail-values.yaml
    --wait --timeout 10m
  changed_when: true

# --- Grafana datasource and summary -----------------------------------------

- name: Write Grafana Loki datasource manifest
  template:
    src: grafana-datasource-loki.yaml.j2
    dest: /tmp/grafana-datasource-loki.yaml
    mode: "0644"

- name: Create Grafana Loki datasource
  command: kubectl apply -f /tmp/grafana-datasource-loki.yaml
  changed_when: true

# NOTE(review): this prints the Grafana admin password in clear text to the
# Ansible output. When the password was auto-generated this is the only place
# it is shown, so it is left as is — confirm this is acceptable for your logs.
- name: Show observability access details
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}