Observability role: surface Loki diagnostics when the Helm install fails.

* The Helm --timeout goes from 10m to 20m. The install result is registered
  as loki_install and the task no longer fails on its own.
* When helm returned non-zero, follow-up tasks collect the Loki pods, the
  namespace PVCs, the first Loki pod's describe output and last 200 log
  lines, and the namespace events. A final fail task prints all of it.
  Every diagnostic task sets failed_when: false, so a broken kubectl call
  cannot hide the original Helm error.
* The pod-name lookup uses command instead of shell: it needs no shell
  features, and the quoted jsonpath argument is split the same way.
* Loki values: limits_config.retention_period is set to 168h and the
  singleBinary deployment gets CPU/memory requests and limits.

NOTE(review): line breaks and indentation were reconstructed from a
whitespace-collapsed copy of this patch using the hunk line counts. Confirm
that the context lines match the target files before applying.
NOTE(review): retention_period presumably only takes effect when retention
is enabled on the compactor; that setting is not visible in this patch, so
verify it against the full values file.

diff --git a/ansible/roles/observability/tasks/main.yml b/ansible/roles/observability/tasks/main.yml
index 47fa014..deeeead 100644
--- a/ansible/roles/observability/tasks/main.yml
+++ b/ansible/roles/observability/tasks/main.yml
@@ -73,9 +73,81 @@
     --version {{ loki_chart_version }}
     --values /tmp/loki-values.yaml
     --wait
-    --timeout 10m
+    --timeout 20m
+  register: loki_install
+  # Defer the failure: the diagnostic tasks below must run before the play stops.
+  failed_when: false
   changed_when: true
 
+- name: Show Loki pods on install failure
+  command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o wide
+  register: loki_pods_status
+  changed_when: false
+  failed_when: false
+  when: loki_install.rc != 0
+
+- name: Show observability PVCs on Loki install failure
+  command: kubectl -n {{ observability_namespace }} get pvc -o wide
+  register: loki_pvc_status
+  changed_when: false
+  failed_when: false
+  when: loki_install.rc != 0
+
+- name: Get Loki pod name on install failure
+  command: kubectl -n {{ observability_namespace }} get pods -l app.kubernetes.io/name=loki -o jsonpath='{.items[0].metadata.name}'
+  register: loki_pod_name
+  changed_when: false
+  failed_when: false
+  when: loki_install.rc != 0
+
+- name: Describe Loki pod on install failure
+  command: kubectl -n {{ observability_namespace }} describe pod {{ loki_pod_name.stdout }}
+  register: loki_pod_describe
+  changed_when: false
+  failed_when: false
+  when:
+    - loki_install.rc != 0
+    - loki_pod_name.stdout | length > 0
+
+- name: Show Loki pod logs on install failure
+  command: kubectl -n {{ observability_namespace }} logs {{ loki_pod_name.stdout }} --tail=200
+  register: loki_pod_logs
+  changed_when: false
+  failed_when: false
+  when:
+    - loki_install.rc != 0
+    - loki_pod_name.stdout | length > 0
+
+- name: Show observability events on Loki install failure
+  command: kubectl -n {{ observability_namespace }} get events --sort-by=.lastTimestamp
+  register: loki_events
+  changed_when: false
+  failed_when: false
+  when: loki_install.rc != 0
+
+- name: Fail with Loki diagnostics
+  fail:
+    msg: |
+      Loki install failed.
+      Helm stderr:
+      {{ loki_install.stderr | default('') }}
+
+      Loki pods:
+      {{ loki_pods_status.stdout | default('n/a') }}
+
+      PVCs:
+      {{ loki_pvc_status.stdout | default('n/a') }}
+
+      Loki pod describe:
+      {{ loki_pod_describe.stdout | default('n/a') }}
+
+      Loki pod logs:
+      {{ loki_pod_logs.stdout | default('n/a') }}
+
+      Events:
+      {{ loki_events.stdout | default('n/a') }}
+  when: loki_install.rc != 0
+
 - name: Write Promtail values
   template:
     src: promtail-values.yaml.j2
diff --git a/ansible/roles/observability/templates/loki-values.yaml.j2 b/ansible/roles/observability/templates/loki-values.yaml.j2
index 7370ca2..5f62f24 100644
--- a/ansible/roles/observability/templates/loki-values.yaml.j2
+++ b/ansible/roles/observability/templates/loki-values.yaml.j2
@@ -4,6 +4,8 @@ loki:
   auth_enabled: false
   commonConfig:
     replication_factor: 1
+  limits_config:
+    retention_period: 168h
   schemaConfig:
     configs:
       - from: "2024-01-01"
@@ -27,6 +29,13 @@ write:
 
 singleBinary:
   replicas: 1
+  resources:
+    requests:
+      cpu: 100m
+      memory: 256Mi
+    limits:
+      cpu: 500m
+      memory: 1Gi
   persistence:
     enabled: true
     storageClass: {{ loki_storage_class }}