Skip to content

Commit

Permalink
Merge pull request grafana/cortex-jsonnet#377 from stevesg/alertmanag…
Browse files Browse the repository at this point in the history
…er-sharding-alerts

Add new alerts for alertmanager sharding mode of operation.
  • Loading branch information
stevesg authored Aug 25, 2021
2 parents 05cdd4a + fd26edb commit cea7f02
Show file tree
Hide file tree
Showing 2 changed files with 99 additions and 0 deletions.
1 change: 1 addition & 0 deletions jsonnet/mimir-mixin/alerts.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{
prometheusAlerts+::
(import 'alerts/alerts.libsonnet') +
(import 'alerts/alertmanager.libsonnet') +

(if std.member($._config.storage_engine, 'blocks')
then
Expand Down
98 changes: 98 additions & 0 deletions jsonnet/mimir-mixin/alerts/alertmanager.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
// Prometheus alerting rules for the Cortex Alertmanager.
//
// The object contributes a single rule group, 'alertmanager_alerts', by
// appending to `groups` (`groups+:`), so it is meant to be merged with other
// mixin objects that also define `groups`.
//
// Every rule is 'critical' severity and uses the same shape: a counter of
// failures is turned into a rate (or an increase) over a lookback window, and
// the alert fires when that value stays above zero for the `for` duration.
// Annotation messages are Prometheus templates; `$labels.job` and
// `$labels.instance` are filled in from the labels of the firing series.
{
  groups+: [
    {
      name: 'alertmanager_alerts',
      rules: [
        {
          // Tenant configuration sync keeps failing: failure counter has a
          // non-zero 5m rate continuously for 30m.
          alert: 'CortexAlertmanagerSyncConfigsFailing',
          expr: |||
            rate(cortex_alertmanager_sync_configs_failed_total[5m]) > 0
          |||,
          'for': '30m',
          labels: {
            severity: 'critical',
          },
          annotations: {
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to read tenant configurations from storage.
            |||,
          },
        },
        {
          // Ring ownership checks keep erroring: error counter has a
          // non-zero 2m rate continuously for 10m.
          alert: 'CortexAlertmanagerRingCheckFailing',
          expr: |||
            rate(cortex_alertmanager_ring_check_errors_total[2m]) > 0
          |||,
          'for': '10m',
          labels: {
            severity: 'critical',
          },
          annotations: {
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to check tenants ownership via the ring.
            |||,
          },
        },
        {
          // Merging partial state received from a replica keeps failing:
          // non-zero 2m failure rate continuously for 10m.
          alert: 'CortexAlertmanagerPartialStateMergeFailing',
          expr: |||
            rate(cortex_alertmanager_partial_state_merges_failed_total[2m]) > 0
          |||,
          'for': '10m',
          labels: {
            severity: 'critical',
          },
          annotations: {
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to merge partial state changes received from a replica.
            |||,
          },
        },
        {
          // Replicating partial state to other replicas keeps failing:
          // non-zero 2m failure rate continuously for 10m.
          alert: 'CortexAlertmanagerReplicationFailing',
          expr: |||
            rate(cortex_alertmanager_state_replication_failed_total[2m]) > 0
          |||,
          'for': '10m',
          labels: {
            severity: 'critical',
          },
          annotations: {
            // Message grammar fixed: "failing to replicating" -> "failing to replicate".
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is failing to replicate partial state to its replicas.
            |||,
          },
        },
        {
          // Persisting full state snapshots keeps failing. This rule uses the
          // longest window of the group: 15m rate, sustained for 1h.
          alert: 'CortexAlertmanagerPersistStateFailing',
          expr: |||
            rate(cortex_alertmanager_state_persist_failed_total[15m]) > 0
          |||,
          'for': '1h',
          labels: {
            severity: 'critical',
          },
          annotations: {
            // Message spelling fixed: "snaphots" -> "snapshots".
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} is unable to persist full state snapshots to remote storage.
            |||,
          },
        },
        {
          // Initial state sync finished with outcome="failed" within the last
          // minute. There is deliberately no 'for' clause here in the
          // original rule, so the alert is not held pending for a sustained
          // duration the way the other rules in this group are.
          alert: 'CortexAlertmanagerInitialSyncFailed',
          expr: |||
            increase(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed"}[1m]) > 0
          |||,
          labels: {
            severity: 'critical',
          },
          annotations: {
            message: |||
              Cortex Alertmanager {{ $labels.job }}/{{ $labels.instance }} was unable to obtain some initial state when starting up.
            |||,
          },
        },
      ],
    },
  ],
}

0 comments on commit cea7f02

Please sign in to comment.