From fd26edbdaa89befbf57d78bf4ea1f9fa7e0a13d9 Mon Sep 17 00:00:00 2001
From: Steve Simpson
Date: Tue, 24 Aug 2021 13:15:54 +0200
Subject: [PATCH] Add new alerts for alertmanager sharding mode of operation.

---
 jsonnet/mimir-mixin/alerts.libsonnet          |   1 +
 .../mimir-mixin/alerts/alertmanager.libsonnet | 102 +++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 jsonnet/mimir-mixin/alerts/alertmanager.libsonnet

diff --git a/jsonnet/mimir-mixin/alerts.libsonnet b/jsonnet/mimir-mixin/alerts.libsonnet
index 771c62c89d5..4dc1f85c247 100644
--- a/jsonnet/mimir-mixin/alerts.libsonnet
+++ b/jsonnet/mimir-mixin/alerts.libsonnet
@@ -1,6 +1,7 @@
 {
   prometheusAlerts+::
     (import 'alerts/alerts.libsonnet') +
+    (import 'alerts/alertmanager.libsonnet') +
 
     (if std.member($._config.storage_engine, 'blocks')
      then
diff --git a/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet b/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet
new file mode 100644
index 00000000000..e73d04b3e1a
--- /dev/null
+++ b/jsonnet/mimir-mixin/alerts/alertmanager.libsonnet
@@ -0,0 +1,102 @@
+// Alerting rules for the Cortex Alertmanager sharding mode of operation.
+// All rules carry severity 'critical'; messages identify the affected job/instance.
+{
+  groups+: [
+    {
+      name: 'alertmanager_alerts',
+      rules: [
+        {
+          alert: 'CortexAlertmanagerSyncConfigsFailing',
+          expr: |||
+            rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
+          |||,
+          'for': '30m',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage.
+            |||,
+          },
+        },
+        {
+          alert: 'CortexAlertmanagerRingCheckFailing',
+          expr: |||
+            rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
+          |||,
+          'for': '10m',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring.
+            |||,
+          },
+        },
+        {
+          alert: 'CortexAlertmanagerPartialStateMergeFailing',
+          expr: |||
+            rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
+          |||,
+          'for': '10m',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica.
+            |||,
+          },
+        },
+        {
+          alert: 'CortexAlertmanagerReplicationFailing',
+          expr: |||
+            rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
+          |||,
+          'for': '10m',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas.
+            |||,
+          },
+        },
+        {
+          alert: 'CortexAlertmanagerPersistStateFailing',
+          expr: |||
+            rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
+          |||,
+          'for': '1h',
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage.
+            |||,
+          },
+        },
+        // No 'for' clause on this rule: any failed initial sync counted in the
+        // last minute makes the expression true without a pending period.
+        {
+          alert: 'CortexAlertmanagerInitialSyncFailed',
+          expr: |||
+            increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
+          |||,
+          labels: {
+            severity: 'critical',
+          },
+          annotations: {
+            message: |||
+              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up.
+            |||,
+          },
+        },
+      ],
+    },
+  ],
+}