# Loki alerting rules template (source listing: 112 lines, 4.3 KiB, highlighted as Smarty).
---
{{- /*
Prometheus alerting rule groups for Loki, driven by .Values.monitoring.rules.

Every alert is emitted only when it is not listed under .disabled AND its entry
under .configs has enabled set. Inside each alert the context is rebound to that
alert's own config, so .for, .severity, .threshold and .lookbackPeriod are read
from .configs.<AlertName>.

.additionalRuleAnnotations and .additionalRuleLabels are appended to the
annotations / labels of every alert. They are rendered with nindent 10, which is
the column of the keys directly below "annotations:" and "labels:".

The backtick-quoted brace pairs in the messages emit literal template delimiters,
so the $labels / $value expressions pass through Helm rendering untouched and are
left for the alerting engine to expand.
*/}}
groups:
  - name: "loki_alerts"
    rules:
{{- with .Values.monitoring.rules }}
{{- $additionalAnnotations := .additionalRuleAnnotations }}
{{- $additionalLabels := .additionalRuleLabels }}

{{- /* LokiRequestErrors: percentage of requests answered with a 5xx status, per (namespace, job, route). */}}
{{- if and (not .disabled.LokiRequestErrors) .configs.LokiRequestErrors.enabled }}
{{- with .configs.LokiRequestErrors }}
      - alert: "LokiRequestErrors"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% errors.
          {{- with $additionalAnnotations }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        expr: |
          100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[{{ .lookbackPeriod }}])) by (namespace, job, route)
            /
          sum(rate(loki_request_duration_seconds_count[{{ .lookbackPeriod }}])) by (namespace, job, route)
            > {{ .threshold }}
        for: {{ .for }}
        labels:
          severity: {{ .severity }}
          {{- with $additionalLabels }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
{{- end }}
{{- end }}

{{- /* LokiRequestPanics: increase of loki_panic_total over the lookback period, per (namespace, job). This rule has no "for" clause. */}}
{{- if and (not .disabled.LokiRequestPanics) .configs.LokiRequestPanics.enabled }}
{{- with .configs.LokiRequestPanics }}
      - alert: "LokiRequestPanics"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}% increase of panics.
          {{- with $additionalAnnotations }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        expr: |
          sum(increase(loki_panic_total[{{ .lookbackPeriod }}])) by (namespace, job) > {{ .threshold }}
        labels:
          severity: {{ .severity }}
          {{- with $additionalLabels }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
{{- end }}
{{- end }}

{{- /*
LokiRequestLatency: compares the 99quantile series against the threshold, skipping
routes whose name contains "tail" (case-insensitive).
NOTE(review): the series name looks like a recording rule defined elsewhere - confirm it is deployed.
*/}}
{{- if and (not .disabled.LokiRequestLatency) .configs.LokiRequestLatency.enabled }}
{{- with .configs.LokiRequestLatency }}
      - alert: "LokiRequestLatency"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} {{`{{`}} $labels.route {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
          {{- with $additionalAnnotations }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        expr: |
          namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*"} > {{ .threshold }}
        for: {{ .for }}
        labels:
          severity: {{ .severity }}
          {{- with $additionalLabels }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
{{- end }}
{{- end }}

{{- /*
LokiTooManyCompactorsRunning: more than one running compactor per (cluster, namespace).
The message interpolates the configured "for" duration so the text always matches the rule.
*/}}
{{- if and (not .disabled.LokiTooManyCompactorsRunning) .configs.LokiTooManyCompactorsRunning.enabled }}
{{- with .configs.LokiTooManyCompactorsRunning }}
      - alert: "LokiTooManyCompactorsRunning"
        annotations:
          message: |
            {{`{{`}} $labels.cluster {{`}}`}} {{`{{`}} $labels.namespace {{`}}`}} has had {{`{{`}} printf "%.0f" $value {{`}}`}} compactors running for more than {{ .for }}. Only one compactor should run at a time.
          {{- with $additionalAnnotations }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        expr: |
          sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
        for: {{ .for }}
        labels:
          severity: {{ .severity }}
          {{- with $additionalLabels }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
{{- end }}
{{- end }}

{{- /*
LokiCanaryLatency: lives in its own rule group, which is emitted only when the alert is enabled.
The rate window in the expression is fixed at 5m; only the threshold is configurable here.
*/}}
{{- if and (not .disabled.LokiCanaryLatency) .configs.LokiCanaryLatency.enabled }}
{{- with .configs.LokiCanaryLatency }}
  - name: "loki_canaries_alerts"
    rules:
      - alert: "LokiCanaryLatency"
        annotations:
          message: |
            {{`{{`}} $labels.job {{`}}`}} is experiencing {{`{{`}} printf "%.2f" $value {{`}}`}}s 99th percentile latency.
          {{- with $additionalAnnotations }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        expr: |
          histogram_quantile(0.99, sum(rate(loki_canary_response_latency_seconds_bucket[5m])) by (le, namespace, job)) > {{ .threshold }}
        for: {{ .for }}
        labels:
          severity: {{ .severity }}
          {{- with $additionalLabels }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
{{- end }}
{{- end }}
{{- end }}