From 0e78e942917a9797b2446ff5562e314a438221e2 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 24 Aug 2021 13:15:54 +0200 Subject: [PATCH] Add new alerts for alertmanager sharding mode of operation. --- CHANGELOG.md | 1 + cortex-mixin/alerts.libsonnet | 1 + cortex-mixin/alerts/alertmanager.libsonnet | 98 ++++++++++++++++++++++ cortex-mixin/docs/playbooks.md | 24 ++++++ 4 files changed, 124 insertions(+) create mode 100644 cortex-mixin/alerts/alertmanager.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index d404093c..2af44cd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ * [ENHANCEMENT] Added 256MB memory ballast to querier. #369 * [ENHANCEMENT] Update gsutil command for `not healthy index found` playbook #370 * [ENHANCEMENT] Update `etcd-operator` to latest version (see https://github.com/grafana/jsonnet-libs/pull/480). #263 +* [ENHANCEMENT] Added alertmanager alerts covering configuration syncs and sharding operation. #377 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. 
#335 diff --git a/cortex-mixin/alerts.libsonnet b/cortex-mixin/alerts.libsonnet index 771c62c8..4dc1f85c 100644 --- a/cortex-mixin/alerts.libsonnet +++ b/cortex-mixin/alerts.libsonnet @@ -1,6 +1,7 @@ { prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + + (import 'alerts/alertmanager.libsonnet') + (if std.member($._config.storage_engine, 'blocks') then diff --git a/cortex-mixin/alerts/alertmanager.libsonnet b/cortex-mixin/alerts/alertmanager.libsonnet new file mode 100644 index 00000000..e73d04b3 --- /dev/null +++ b/cortex-mixin/alerts/alertmanager.libsonnet @@ -0,0 +1,98 @@ +{ + groups+: [ + { + name: 'alertmanager_alerts', + rules: [ + { + alert: 'CortexAlertmanagerSyncConfigsFailing', + expr: ||| + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + |||, + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerRingCheckFailing', + expr: ||| + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. + |||, + }, + }, + { + alert: 'CortexAlertmanagerPartialStateMergeFailing', + expr: ||| + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. 
+ |||, + }, + }, + { + alert: 'CortexAlertmanagerReplicationFailing', + expr: ||| + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas. + |||, + }, + }, + { + alert: 'CortexAlertmanagerPersistStateFailing', + expr: ||| + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + |||, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerInitialSyncFailed', + expr: ||| + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + |||, + }, + }, + ], + }, + ], +} diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 5dcf8d52..e94b9656 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -636,6 +636,30 @@ This can be triggered if there are too many HA dedupe keys in etcd. We saw this }, ``` +### CortexAlertmanagerSyncConfigsFailing + +Work in progress. + +### CortexAlertmanagerRingCheckFailing + +Work in progress. + +### CortexAlertmanagerPartialStateMergeFailing + +Work in progress. + +### CortexAlertmanagerReplicationFailing + +Work in progress. + +### CortexAlertmanagerPersistStateFailing + +Work in progress. + +### CortexAlertmanagerInitialSyncFailed + +Work in progress. + ## Cortex routes by path **Write path**: