forked from grafana/loki
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalerts.libsonnet
73 lines (73 loc) · 2.37 KB
/
alerts.libsonnet
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
// Loki alerting rules.
//
// Appends one rule group, 'loki_alerts', to the hidden `prometheusAlerts`
// field of whatever object this file is mixed into. Every `expr` is a format
// string interpolated with `$._config`, so the root object must provide a
// `_config` carrying at least:
//   - group_by_cluster:  label list used in the `by (...)` aggregations
//   - group_prefix_jobs: prefix of the latency series name used below
// `_config` is not defined in this file.
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'loki_alerts',
        rules: [
          // Share of requests answered with a 5xx status code, as a
          // percentage of all requests, per (group_by_cluster, job, route).
          // Fires once the ratio has stayed above 10% for 15 minutes.
          {
            alert: 'LokiRequestErrors',
            expr: |||
              100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (%(group_by_cluster)s, job, route)
              /
              sum(rate(loki_request_duration_seconds_count[2m])) by (%(group_by_cluster)s, job, route)
              > 10
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
              |||,
            },
          },
          // Any growth of loki_panic_total within the last 10 minutes, per
          // (group_by_cluster, job). There is no 'for' clause, so the rule
          // carries no pending period.
          {
            alert: 'LokiRequestPanics',
            expr: |||
              sum(increase(loki_panic_total[10m])) by (%(group_by_cluster)s, job) > 0
            ||| % $._config,
            labels: {
              severity: 'critical',
            },
            annotations: {
              // $value is the absolute increase of the counter over the 10m
              // window, not a ratio, so it is reported without a percent sign.
              message: |||
                {{ $labels.job }} is experiencing an increase of {{ printf "%.2f" $value }} panics over the last 10m.
              |||,
            },
          },
          // 99th percentile request latency above 1 (the message reports the
          // value in seconds) for 15 minutes. Routes whose name contains
          // "tail" (case-insensitive) and the scheduler QuerierLoop route are
          // excluded from the check.
          // NOTE(review): the series name looks like a recording rule defined
          // elsewhere - confirm it exists for the configured prefix.
          {
            alert: 'LokiRequestLatency',
            expr: |||
              %(group_prefix_jobs)s_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              message: |||
                {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
              |||,
            },
          },
          // More than one compactor reported as running, summed per
          // group_by_cluster, for 5 minutes.
          // NOTE(review): the message reads $labels.cluster and
          // $labels.namespace directly; those labels are only present if
          // group_by_cluster includes them - verify against _config.
          {
            alert: 'LokiTooManyCompactorsRunning',
            expr: |||
              sum(loki_boltdb_shipper_compactor_running) by (%(group_by_cluster)s) > 1
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              message: |||
                {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
              |||,
            },
          },
        ],
      },
    ],
  },
}