Skip to content

Commit

Permalink
Block builder: update alerts and add critical alerts (#10270)
Browse files Browse the repository at this point in the history
* Block builder: update alerts and add critical alerts

Signed-off-by: Ganesh Vernekar <[email protected]>

* Execute the jsonnet

Signed-off-by: Ganesh Vernekar <[email protected]>

* Update alerts

Signed-off-by: Ganesh Vernekar <[email protected]>

* make mixin

Signed-off-by: Ganesh Vernekar <[email protected]>

* Fix helm

Signed-off-by: Ganesh Vernekar <[email protected]>

---------

Signed-off-by: Ganesh Vernekar <[email protected]>
  • Loading branch information
codesome authored Jan 24, 2025
1 parent 5ff8293 commit 0c923bb
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 21 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1181,27 +1181,35 @@ spec:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing
expr: |
max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0
for: 5m
for: 20m
labels:
severity: warning
severity: critical
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%.
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 75m
labels:
severity: warning
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 140m
labels:
severity: critical
- alert: MimirBlockBuilderCompactAndUploadFailed
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed
expr: |
sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0
for: 5m
labels:
severity: warning
severity: critical
- name: mimir_continuous_test
rules:
- alert: MimirContinuousTestNotRunningOnWrites
Expand Down
18 changes: 13 additions & 5 deletions operations/mimir-mixin-compiled-baremetal/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1155,27 +1155,35 @@ groups:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing
expr: |
max by(cluster, namespace, instance) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0
for: 5m
for: 20m
labels:
severity: warning
severity: critical
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%.
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 75m
labels:
severity: warning
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, instance) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 140m
labels:
severity: critical
- alert: MimirBlockBuilderCompactAndUploadFailed
annotations:
message: Mimir {{ $labels.instance }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed
expr: |
sum by (cluster, namespace, instance) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0
for: 5m
labels:
severity: warning
severity: critical
- name: mimir_continuous_test
rules:
- alert: MimirContinuousTestNotRunningOnWrites
Expand Down
18 changes: 13 additions & 5 deletions operations/mimir-mixin-compiled/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1169,27 +1169,35 @@ groups:
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildernocycleprocessing
expr: |
max by(cluster, namespace, pod) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0
for: 5m
for: 20m
labels:
severity: warning
severity: critical
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}%.
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 75m
labels:
severity: warning
- alert: MimirBlockBuilderLagging
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} reports partition lag of {{ printf "%.2f" $value }}.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuilderlagging
expr: |
max by(cluster, namespace, pod) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
for: 140m
labels:
severity: critical
- alert: MimirBlockBuilderCompactAndUploadFailed
annotations:
message: Mimir {{ $labels.pod }} in {{ $labels.cluster }}/{{ $labels.namespace }} fails to compact and upload blocks.
runbook_url: https://grafana.com/docs/mimir/latest/operators-guide/mimir-runbooks/#mimirblockbuildercompactanduploadfailed
expr: |
sum by (cluster, namespace, pod) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0
for: 5m
labels:
severity: warning
severity: critical
- name: mimir_continuous_test
rules:
- alert: MimirContinuousTestNotRunningOnWrites
Expand Down
24 changes: 18 additions & 6 deletions operations/mimir-mixin/alerts/ingest-storage.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -230,12 +230,12 @@
// Alert if block-builder didn't process cycles in the past hour.
{
alert: $.alertName('BlockBuilderNoCycleProcessing'),
'for': '5m',
'for': '20m',
expr: |||
max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (histogram_count(increase(cortex_blockbuilder_consume_cycle_duration_seconds[60m]))) == 0
||| % $._config,
labels: {
severity: 'warning',
severity: 'critical',
},
annotations: {
message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s has not processed cycles in the past hour.' % $._config,
Expand All @@ -255,19 +255,31 @@
severity: 'warning',
},
annotations: {
message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}%%.' % $._config,
message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config,
},
},
{
alert: $.alertName('BlockBuilderLagging'),
'for': '140m', // 2h20m. Indicating the lag did not come down for ~2 consumption cycles.
expr: |||
max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(cortex_blockbuilder_consumer_lag_records[10m])) > 4e6
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s reports partition lag of {{ printf "%%.2f" $value }}.' % $._config,
},
},

// Alert if block-builder is failing to compact and upload any blocks.
// Alert immediately if block-builder is failing to compact and upload any blocks.
{
alert: $.alertName('BlockBuilderCompactAndUploadFailed'),
'for': '5m',
expr: |||
sum by (%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_blockbuilder_tsdb_compact_and_upload_failed_total[1m])) > 0
||| % $._config,
labels: {
severity: 'warning',
severity: 'critical',
},
annotations: {
message: '%(product)s {{ $labels.%(per_instance_label)s }} in %(alert_aggregation_variables)s fails to compact and upload blocks.' % $._config,
Expand Down

0 comments on commit 0c923bb

Please sign in to comment.