Skip to content

Commit da04f50

Browse files
feat: mixin / add loki compaction not successful alert (#14239)
Co-authored-by: Ashwanth <[email protected]>
1 parent f917cf3 commit da04f50

File tree

3 files changed

+113
-0
lines changed

3 files changed

+113
-0
lines changed

production/loki-mixin-compiled-ssd/alerts.yaml

+33
Original file line number | Diff line number | Diff line change
@@ -43,3 +43,36 @@ groups:
4343
for: 5m
4444
labels:
4545
severity: warning
46+
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
47+
annotations:
48+
description: |
49+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
50+
summary: Loki compaction has not run in the last 3 hours since the last compaction.
51+
expr: |
52+
# The "last successful run" metric is updated even if the compactor owns no tenants,
53+
# so this alert correctly doesn't fire if compactor has nothing to do.
54+
min (
55+
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
56+
)
57+
by (cluster, namespace)
58+
> 60 * 60 * 3
59+
for: 1h
60+
labels:
61+
severity: critical
62+
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
63+
annotations:
64+
description: |
65+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
66+
summary: Loki compaction has not run in the last 3h since startup.
67+
expr: |
68+
# The "last successful run" metric is updated even if the compactor owns no tenants,
69+
# so this alert correctly doesn't fire if compactor has nothing to do.
70+
max(
71+
max_over_time(
72+
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
73+
)
74+
) by (cluster, namespace)
75+
== 0
76+
for: 1h
77+
labels:
78+
severity: critical

production/loki-mixin-compiled/alerts.yaml

+33
Original file line number | Diff line number | Diff line change
@@ -43,3 +43,36 @@ groups:
4343
for: 5m
4444
labels:
4545
severity: warning
46+
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
47+
annotations:
48+
description: |
49+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
50+
summary: Loki compaction has not run in the last 3 hours since the last compaction.
51+
expr: |
52+
# The "last successful run" metric is updated even if the compactor owns no tenants,
53+
# so this alert correctly doesn't fire if compactor has nothing to do.
54+
min (
55+
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
56+
)
57+
by (cluster, namespace)
58+
> 60 * 60 * 3
59+
for: 1h
60+
labels:
61+
severity: critical
62+
- alert: LokiCompactorHasNotSuccessfullyRunCompaction
63+
annotations:
64+
description: |
65+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
66+
summary: Loki compaction has not run in the last 3h since startup.
67+
expr: |
68+
# The "last successful run" metric is updated even if the compactor owns no tenants,
69+
# so this alert correctly doesn't fire if compactor has nothing to do.
70+
max(
71+
max_over_time(
72+
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
73+
)
74+
) by (cluster, namespace)
75+
== 0
76+
for: 1h
77+
labels:
78+
severity: critical

production/loki-mixin/alerts.libsonnet

+47
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,53 @@
7070
|||, 'cluster', $._config.per_cluster_label),
7171
},
7272
},
73+
{
74+
// Alert if the compactor has not successfully run compaction in the last 3h since the last compaction.
75+
alert: 'LokiCompactorHasNotSuccessfullyRunCompaction',
76+
expr: |||
77+
# The "last successful run" metric is updated even if the compactor owns no tenants,
78+
# so this alert correctly doesn't fire if compactor has nothing to do.
79+
min (
80+
time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0)
81+
)
82+
by (%s, namespace)
83+
> 60 * 60 * 3
84+
||| % $._config.per_cluster_label,
85+
'for': '1h',
86+
labels: {
87+
severity: 'critical',
88+
},
89+
annotations: {
90+
summary: 'Loki compaction has not run in the last 3 hours since the last compaction.',
91+
description: std.strReplace(|||
92+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor.
93+
|||, 'cluster', $._config.per_cluster_label),
94+
},
95+
},
96+
{
97+
// Alert if the compactor has not successfully run compaction in the last 3h since startup.
98+
alert: 'LokiCompactorHasNotSuccessfullyRunCompaction',
99+
expr: |||
100+
# The "last successful run" metric is updated even if the compactor owns no tenants,
101+
# so this alert correctly doesn't fire if compactor has nothing to do.
102+
max(
103+
max_over_time(
104+
loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h]
105+
)
106+
) by (%s, namespace)
107+
== 0
108+
||| % $._config.per_cluster_label,
109+
'for': '1h',
110+
labels: {
111+
severity: 'critical',
112+
},
113+
annotations: {
114+
summary: 'Loki compaction has not run in the last 3h since startup.',
115+
description: std.strReplace(|||
116+
{{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor.
117+
|||, 'cluster', $._config.per_cluster_label),
118+
},
119+
},
73120
],
74121
},
75122
],

0 commit comments

Comments
 (0)