|
70 | 70 | |||, 'cluster', $._config.per_cluster_label),
|
71 | 71 | },
|
72 | 72 | },
|
| 73 | + { |
| 74 | + // Alert when the most recent successful compaction finished more than 3h ago: the expr takes time() minus the last-successful-run timestamp (samples equal to 0 are filtered out), keeps the minimum per cluster label and namespace, and the result must stay above 3h for the 1h 'for' window before the alert fires. |
| 75 | + alert: 'LokiCompactorHasNotSuccessfullyRunCompaction', |
| 76 | + expr: ||| |
| 77 | + # The "last successful run" metric is updated even if the compactor owns no tenants, |
| 78 | + # so this alert correctly doesn't fire if compactor has nothing to do. |
| 79 | + min ( |
| 80 | + time() - (loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{} > 0) |
| 81 | + ) |
| 82 | + by (%s, namespace) |
| 83 | + > 60 * 60 * 3 |
| 84 | + ||| % $._config.per_cluster_label, |
| 85 | + 'for': '1h', |
| 86 | + labels: { |
| 87 | + severity: 'critical', |
| 88 | + }, |
| 89 | + annotations: { |
| 90 | + summary: 'Loki compaction has not run in the last 3 hours since the last compaction.', |
| 91 | + description: std.strReplace(||| |
| 92 | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3 hours since the last compaction. This may indicate a problem with the compactor. |
| 93 | + |||, 'cluster', $._config.per_cluster_label), |
| 94 | + }, |
| 95 | + }, |
| 96 | + { |
| 97 | + // Alert when the last-successful-run timestamp has stayed at 0 for the whole trailing 3h window (max of max_over_time(...[3h]) per cluster label and namespace == 0), presumably meaning no compaction has succeeded since startup; this covers the case the rule above skips by filtering out 0-valued samples. The condition must hold for the 1h 'for' window before the alert fires. |
| 98 | + alert: 'LokiCompactorHasNotSuccessfullyRunCompaction', |
| 99 | + expr: ||| |
| 100 | + # The "last successful run" metric is updated even if the compactor owns no tenants, |
| 101 | + # so this alert correctly doesn't fire if compactor has nothing to do. |
| 102 | + max( |
| 103 | + max_over_time( |
| 104 | + loki_boltdb_shipper_compact_tables_operation_last_successful_run_timestamp_seconds{}[3h] |
| 105 | + ) |
| 106 | + ) by (%s, namespace) |
| 107 | + == 0 |
| 108 | + ||| % $._config.per_cluster_label, |
| 109 | + 'for': '1h', |
| 110 | + labels: { |
| 111 | + severity: 'critical', |
| 112 | + }, |
| 113 | + annotations: { |
| 114 | + summary: 'Loki compaction has not run in the last 3h since startup.', |
| 115 | + description: std.strReplace(||| |
| 116 | + {{ $labels.cluster }} {{ $labels.namespace }} has not run compaction in the last 3h since startup. This may indicate a problem with the compactor. |
| 117 | + |||, 'cluster', $._config.per_cluster_label), |
| 118 | + }, |
| 119 | + }, |
73 | 120 | ],
|
74 | 121 | },
|
75 | 122 | ],
|
|
0 commit comments