Files
HetznerTerra/ansible/roles/observability/tasks/main.yml
MichaelFisher1997 71a1495fbc
Some checks failed
Deploy Cluster / Ansible (push) Has been cancelled
Deploy Cluster / Terraform (push) Has been cancelled
fix: add Loki template validation and resource debugging
2026-03-02 13:56:48 +00:00

275 lines
8.8 KiB
YAML

---
# Bootstrap Helm: probe for an existing binary and run the upstream installer
# script only when the probe does not succeed.
- name: Check if Helm is installed
  command: helm version --short
  register: helm_check
  # Pure probe: it never reports a change and never fails the play; only its
  # return code is consumed by the install task below.
  changed_when: false
  failed_when: false

- name: Install Helm
  # pipefail makes a failed download fail the task. Without it the pipeline's
  # exit status is bash's alone, and bash exits 0 when it is handed no script.
  shell: |
    set -o pipefail
    curl -fsSL https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
  args:
    # `set -o pipefail` is not available in every /bin/sh, so pin bash.
    executable: /bin/bash
  when: helm_check.rc != 0
  changed_when: true
# Create the target namespace imperatively. A non-zero exit is tolerated only
# when kubectl reports that the namespace is already there.
- name: Ensure observability namespace exists
  command:
    cmd: kubectl create namespace {{ observability_namespace }}
  register: create_observability_ns
  # Changed only when kubectl actually created the namespace.
  changed_when: create_observability_ns.rc == 0
  # List items are AND-ed: fail on a non-zero exit that is not "AlreadyExists".
  failed_when:
    - create_observability_ns.rc != 0
    - "'AlreadyExists' not in create_observability_ns.stderr"
# Resolve the Grafana admin password used by the rest of this file: take
# grafana_admin_password when it is non-empty, otherwise generate a random
# 32-character alphanumeric one (the lookup targets /dev/null, so the generated
# value is not stored in a file).
- name: Set Grafana admin password
  set_fact:
    grafana_password_effective: >-
      {{ grafana_admin_password if grafana_admin_password | length > 0 else lookup('password', '/dev/null length=32 chars=ascii_letters,digits') }}
# Render the Helm values consumed by the kube-prometheus-stack install task.
# NOTE(review): the rendered file presumably embeds grafana_password_effective
# (it is computed immediately before this task) - confirm against the template.
# Because the file lives in the shared /tmp directory it is restricted to its
# owner; the helm command that reads it runs from this same task file.
- name: Write kube-prometheus-stack values
  template:
    src: kube-prometheus-stack-values.yaml.j2
    dest: /tmp/kube-prometheus-stack-values.yaml
    mode: "0600"
# Register the two chart repositories used by this file and refresh the local
# index. An "already exists" message on stderr is treated as success so the
# tasks can be re-run.
- name: Add Prometheus Helm repo
  command:
    cmd: helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  register: add_prom_repo
  changed_when: add_prom_repo.rc == 0
  # List items are AND-ed: fail on a non-zero exit that is not "already exists".
  failed_when:
    - add_prom_repo.rc != 0
    - "'already exists' not in add_prom_repo.stderr"

- name: Add Grafana Helm repo
  command:
    cmd: helm repo add grafana https://grafana.github.io/helm-charts
  register: add_grafana_repo
  changed_when: add_grafana_repo.rc == 0
  failed_when:
    - add_grafana_repo.rc != 0
    - "'already exists' not in add_grafana_repo.stderr"

# Refreshing the index is never reported as a change.
- name: Update Helm repos
  command:
    cmd: helm repo update
  changed_when: false
# Install or upgrade the kube-prometheus-stack release from the rendered values
# file, blocking (up to 10 minutes) until Helm considers the release ready.
- name: Install kube-prometheus-stack
  command:
    # Folded scalar: the lines below are joined with single spaces.
    cmd: >-
      helm upgrade --install
      kube-prometheus-stack prometheus-community/kube-prometheus-stack
      --namespace {{ observability_namespace }}
      --version {{ prometheus_chart_version }}
      --values /tmp/kube-prometheus-stack-values.yaml
      --wait --timeout 10m
  # Always reported as changed; the Helm output is not inspected.
  changed_when: true
# Render the Loki values file, then dry-render the chart locally and abort
# before touching the cluster if the output contains no StatefulSet.
- name: Write Loki values
  template:
    dest: /tmp/loki-values.yaml
    src: loki-values.yaml.j2
    mode: "0644"

# `helm template` only renders manifests, so it is never reported as a change.
# A rendering error fails the play here.
- name: Validate Loki Helm values
  command:
    cmd: >-
      helm template loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
  register: loki_template
  changed_when: false

# Both tasks below run only when the rendered manifests lack a StatefulSet;
# the shared condition is applied to each task in the block.
- when: "'kind: StatefulSet' not in loki_template.stdout"
  block:
    - name: Show Loki template output (check for StatefulSet)
      debug:
        msg: "Loki template contains StatefulSet: {{ 'kind: StatefulSet' in loki_template.stdout }}"

    # The failure message carries the first 100 rendered lines for triage.
    - name: Fail if Loki template produces no StatefulSet
      fail:
        msg: |
          Loki Helm template produces no StatefulSet. Check values configuration.
          Template output (first 100 lines):
          {{ loki_template.stdout.split('\n')[:100] | join('\n') }}
# Best-effort removal of the workloads and PodDisruptionBudgets named below.
# Errors are deliberately ignored (failed_when: false) and missing objects are
# skipped by kubectl itself. The change status is derived from kubectl's output
# so that a run which really deleted something is reported as changed instead
# of always "ok".
- name: Remove legacy Loki scalable workloads (if present)
  command: >-
    kubectl -n {{ observability_namespace }} delete
    deployment/loki-gateway
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    --ignore-not-found=true
  register: legacy_loki_workloads_delete
  # kubectl prints one '... deleted' line per object it removed.
  changed_when: "'deleted' in legacy_loki_workloads_delete.stdout | default('')"
  failed_when: false

- name: Remove legacy Loki PDBs (if present)
  command: >-
    kubectl -n {{ observability_namespace }} delete
    poddisruptionbudget/loki-memcached-chunks-cache
    poddisruptionbudget/loki-memcached-results-cache
    --ignore-not-found=true
  register: legacy_loki_pdbs_delete
  changed_when: "'deleted' in legacy_loki_pdbs_delete.stdout | default('')"
  failed_when: false
# Poll (12 attempts, 5 s apart) until none of the legacy objects is left.
# With --ignore-not-found, absent objects print nothing and do not cause a
# non-zero exit, so an empty `-o name` listing means every object is gone.
# Grepping the combined output for a "not found" line is not sufficient: that
# matches as soon as a single object is missing while others still exist.
- name: Wait for legacy Loki resources to be fully removed
  command: >-
    kubectl -n {{ observability_namespace }} get
    deployment/loki-gateway
    statefulset/loki-chunks-cache
    statefulset/loki-results-cache
    statefulset/loki-backend
    statefulset/loki-read
    statefulset/loki-write
    --ignore-not-found=true
    --output=name
  register: legacy_cleanup
  retries: 12
  delay: 5
  # Both conditions must hold: the query itself succeeded and listed nothing.
  until:
    - legacy_cleanup.rc == 0
    - legacy_cleanup.stdout | trim | length == 0
  changed_when: false
  failed_when: false
# Install or upgrade the Loki release. A Helm failure does not stop the play
# here: the result is registered and later tasks branch on loki_install.rc.
- name: Install Loki
  command:
    cmd: >-
      helm upgrade --install loki grafana/loki
      --namespace {{ observability_namespace }}
      --version {{ loki_chart_version }}
      --values /tmp/loki-values.yaml
      --create-namespace
  register: loki_install
  changed_when: true
  failed_when: false

# Echo Helm's exit code and both output streams into the run log.
- name: Show Loki Helm install result
  debug:
    msg: |
      Helm install rc: {{ loki_install.rc }}
      Helm install stdout:
      {{ loki_install.stdout }}
      Helm install stderr:
      {{ loki_install.stderr }}

# Read-only listing of everything labelled app.kubernetes.io/name=loki;
# errors are ignored because the output is informational only.
- name: Show all Loki resources after install
  command:
    cmd: kubectl -n {{ observability_namespace }} get all -l app.kubernetes.io/name=loki
  register: loki_resources
  failed_when: false
  changed_when: false

# Print the listing only when kubectl produced any output.
- name: Debug Loki resources
  when: loki_resources.stdout | length > 0
  debug:
    msg: "{{ loki_resources.stdout }}"
# After a successful Helm run, wait up to 10 minutes for statefulset/loki to
# finish rolling out. A failed or timed-out rollout is recorded in
# loki_rollout rather than failing the play; later tasks branch on its rc.
- name: Wait for Loki statefulset rollout
  when: loki_install.rc == 0
  command:
    cmd: kubectl -n {{ observability_namespace }} rollout status statefulset/loki --timeout=10m
  register: loki_rollout
  changed_when: false
  failed_when: false
# When the Helm install or the rollout did not succeed, ask whether the "loki"
# container of pod loki-0 is nevertheless ready. The final failure task reads
# loki_core_ready.stdout and only aborts when it is not 'true'.
- name: Check Loki core container readiness
  command:
    # argv is used instead of a free-form string: a free-form command line is
    # split shell-style, which strips the double quotes around "loki" and
    # leaves kubectl with a JSONPath filter it cannot evaluate.
    argv:
      - kubectl
      - "-n"
      - "{{ observability_namespace }}"
      - get
      - pod
      - loki-0
      - "-o"
      - 'jsonpath={.status.containerStatuses[?(@.name=="loki")].ready}'
  register: loki_core_ready
  failed_when: false
  changed_when: false
  # loki_rollout has no rc when the rollout task was skipped; default(1) treats
  # that case as "rollout not successful".
  when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
# Diagnostics gathered only when the Helm install or the statefulset rollout
# did not succeed. Every task is read-only and best-effort (never changed,
# never failed); the registered results feed the final failure message.
# The block-level condition is applied to each task inside the block.
# default(1) covers the case where the rollout task was skipped and
# loki_rollout therefore has no rc.
- when: loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
  block:
    - name: Show Loki pods on install failure
      command:
        cmd: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
      register: loki_pods_status
      failed_when: false
      changed_when: false

    - name: Show observability PVCs on Loki install failure
      command:
        cmd: kubectl -n {{ observability_namespace }} get pvc -o wide
      register: loki_pvc_status
      failed_when: false
      changed_when: false

    # Name of the first pod carrying the Loki label; stdout stays empty when
    # the query fails, which the two tasks below check for.
    - name: Get Loki pod name on install failure
      shell: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o jsonpath='{.items[0].metadata.name}'
      register: loki_pod_name
      failed_when: false
      changed_when: false

    - name: Describe Loki pod on install failure
      when: loki_pod_name.stdout | length > 0
      command:
        cmd: kubectl -n {{ observability_namespace }} describe pod {{ loki_pod_name.stdout }}
      register: loki_pod_describe
      failed_when: false
      changed_when: false

    # Last 200 log lines of the pod found above.
    - name: Show Loki pod logs on install failure
      when: loki_pod_name.stdout | length > 0
      command:
        cmd: kubectl -n {{ observability_namespace }} logs {{ loki_pod_name.stdout }} --tail=200
      register: loki_pod_logs
      failed_when: false
      changed_when: false

    - name: Show observability events on Loki install failure
      command:
        cmd: kubectl -n {{ observability_namespace }} get events --sort-by=.lastTimestamp
      register: loki_events
      failed_when: false
      changed_when: false
# Abort the play with everything gathered above, but only when BOTH hold:
#   1. the Helm install or the rollout did not succeed, and
#   2. the readiness probe did not report 'true' for the Loki container.
# Each field falls back to a default because the task that registers it may
# have been skipped.
- name: Fail with Loki diagnostics
  when:
    - loki_install.rc != 0 or (loki_rollout.rc | default(1)) != 0
    - (loki_core_ready.stdout | default('false') | trim) != 'true'
  fail:
    msg: |
      Loki install failed.
      Helm stderr:
      {{ loki_install.stderr | default('') }}
      Rollout status stderr:
      {{ loki_rollout.stderr | default('') }}
      Loki pods:
      {{ loki_pods_status.stdout | default('n/a') }}
      PVCs:
      {{ loki_pvc_status.stdout | default('n/a') }}
      Loki pod describe:
      {{ loki_pod_describe.stdout | default('n/a') }}
      Loki pod logs:
      {{ loki_pod_logs.stdout | default('n/a') }}
      Events:
      {{ loki_events.stdout | default('n/a') }}
# Render the Promtail values file and install or upgrade the release, blocking
# (up to 10 minutes) until Helm considers it ready.
- name: Write Promtail values
  template:
    dest: /tmp/promtail-values.yaml
    src: promtail-values.yaml.j2
    mode: "0644"

- name: Install Promtail
  command:
    # Folded scalar: the lines below are joined with single spaces.
    cmd: >-
      helm upgrade --install promtail grafana/promtail
      --namespace {{ observability_namespace }}
      --version {{ promtail_chart_version }}
      --values /tmp/promtail-values.yaml
      --wait --timeout 10m
  # Always reported as changed; the Helm output is not inspected.
  changed_when: true
# Render the Loki datasource manifest and apply it to the cluster.
- name: Write Grafana Loki datasource manifest
  template:
    dest: /tmp/grafana-datasource-loki.yaml
    src: grafana-datasource-loki.yaml.j2
    mode: "0644"

# Always reported as changed; kubectl's output is not inspected.
- name: Create Grafana Loki datasource
  command:
    cmd: kubectl apply -f /tmp/grafana-datasource-loki.yaml
  changed_when: true
# Final summary: the namespace, port-forward commands for Grafana and
# Prometheus, and the Grafana admin password chosen earlier in this file.
# NOTE(review): this prints the admin password into the Ansible output, so it
# ends up in any log that captures the run - confirm that is acceptable.
- name: Show observability access details
  debug:
    msg: |
      Observability stack deployed.
      Namespace: {{ observability_namespace }}
      Grafana (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-grafana 3000:80
      Prometheus (tailnet): kubectl -n {{ observability_namespace }} port-forward svc/kube-prometheus-stack-prometheus 9090:9090
      Grafana admin password: {{ grafana_password_effective }}