From 0e78e942917a9797b2446ff5562e314a438221e2 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 24 Aug 2021 13:15:54 +0200 Subject: [PATCH] Add new alerts for alertmanager sharding mode of operation. --- CHANGELOG.md | 1 + cortex-mixin/alerts.libsonnet | 1 + cortex-mixin/alerts/alertmanager.libsonnet | 98 ++++++++++++++++++++++ cortex-mixin/docs/playbooks.md | 24 ++++++ 4 files changed, 124 insertions(+) create mode 100644 cortex-mixin/alerts/alertmanager.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index d404093c..2af44cd6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ * [ENHANCEMENT] Added 256MB memory ballast to querier. #369 * [ENHANCEMENT] Update gsutil command for `not healthy index found` playbook #370 * [ENHANCEMENT] Update `etcd-operator` to latest version (see https://github.com/grafana/jsonnet-libs/pull/480). #263 +* [ENHANCEMENT] Added alertmanager alerts covering configuration syncs and sharding operation. #377 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. 
#335 diff --git a/cortex-mixin/alerts.libsonnet b/cortex-mixin/alerts.libsonnet index 771c62c8..4dc1f85c 100644 --- a/cortex-mixin/alerts.libsonnet +++ b/cortex-mixin/alerts.libsonnet @@ -1,6 +1,7 @@ { prometheusAlerts+:: (import 'alerts/alerts.libsonnet') + + (import 'alerts/alertmanager.libsonnet') + (if std.member($._config.storage_engine, 'blocks') then diff --git a/cortex-mixin/alerts/alertmanager.libsonnet b/cortex-mixin/alerts/alertmanager.libsonnet new file mode 100644 index 00000000..e73d04b3 --- /dev/null +++ b/cortex-mixin/alerts/alertmanager.libsonnet @@ -0,0 +1,98 @@ +{ + groups+: [ + { + name: 'alertmanager_alerts', + rules: [ + { + alert: 'CortexAlertmanagerSyncConfigsFailing', + expr: ||| + rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0 + |||, + 'for': '30m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerRingCheckFailing', + expr: ||| + rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring. + |||, + }, + }, + { + alert: 'CortexAlertmanagerPartialStateMergeFailing', + expr: ||| + rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica. 
+ |||, + }, + }, + { + alert: 'CortexAlertmanagerReplicationFailing', + expr: ||| + rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0 + |||, + 'for': '10m', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas. + |||, + }, + }, + { + alert: 'CortexAlertmanagerPersistStateFailing', + expr: ||| + rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0 + |||, + 'for': '1h', + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage. + |||, + }, + }, + { + alert: 'CortexAlertmanagerInitialSyncFailed', + expr: ||| + increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: ||| + Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up. + |||, + }, + }, + ], + }, + ], +} diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 5dcf8d52..e94b9656 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -636,6 +636,30 @@ This can be triggered if there are too many HA dedupe keys in etcd. We saw this }, ``` +### CortexAlertmanagerSyncConfigsFailing + +Work in progress. + +### CortexAlertmanagerRingCheckFailing + +Work in progress. + +### CortexAlertmanagerPartialStateMergeFailing + +Work in progress. + +### CortexAlertmanagerReplicationFailing + +Work in progress. + +### CortexAlertmanagerPersistStateFailing + +Work in progress. + +### CortexAlertmanagerInitialSyncFailed + +Work in progress. + ## Cortex routes by path **Write path**: