forked from grafana/loki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalerts.yaml
45 lines (45 loc) · 2 KB
/
alerts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
groups:
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (cluster, namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (cluster, namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
sum(increase(loki_panic_total[10m])) by (cluster, namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
for: 15m
labels:
severity: critical
- alert: LokiTooManyCompactorsRunning
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (cluster, namespace) > 1
for: 5m
labels:
severity: warning