diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fc3cc32..b0d814ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,12 @@ * [ENHANCEMENT] Ruler dashboard: added "Per route p99 latency" panel in the "Configuration API" row. #353 * [ENHANCEMENT] Increased the `for` duration of the `CortexIngesterReachingSeriesLimit` warning alert to 3h. #362 * [ENHANCEMENT] Added a new tier (`medium_small_user`) so we have another tier between 100K and 1Mil active series. #364 +* [ENHANCEMENT] Extend Alertmanager dashboard: #313 + * "Tenants" stat panel - shows number of discovered tenant configurations. + * "Replication" row - information about the replication of tenants/alerts/silences over instances. + * "Tenant Configuration Sync" row - information about the configuration sync procedure. + * "Sharding Initial State Sync" row - information about the initial state sync procedure when sharding is enabled. + * "Sharding Runtime State Sync" row - information about various state operations which occur when sharding is enabled (replication, fetch, merge, persist). * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. 
#335 diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index b329ce6b..6f578b11 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Total Silences') + $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') ) + .addPanel( + $.panel('Tenants') + + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short') + ) ) .addRow( $.row('Alerts Received') @@ -86,5 +90,136 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRows( $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage') + ) + .addRow( + $.row('Replication') + .addPanel( + $.panel('Per %s Tenants' % $._config.per_instance_label) + + $.queryPanel( + 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label + ) + + $.stack + ) + .addPanel( + $.panel('Per %s Alerts' % $._config.per_instance_label) + + $.queryPanel( + 'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label + ) + + $.stack + ) + .addPanel( + $.panel('Per %s Silences' % $._config.per_instance_label) + + $.queryPanel( + 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label + ) + + $.stack + ) + ) + .addRow( + $.row('Tenant Configuration Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), 
$.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Reason)') + + $.queryPanel( + 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{reason}}' + ) + ) + .addPanel( + $.panel('Ring Check Errors/sec') + + $.queryPanel( + 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'errors' + ) + ) + ) + .addRow( + $.row('Sharding Initial State Sync') + .addPanel( + $.panel('Initial syncs/sec') + + $.queryPanel( + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{outcome}}' + ) + ) + .addPanel( + $.panel('Initial sync duration') + + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + ) + .addPanel( + $.panel('Fetch state from other alertmanagers /sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + ) + .addRow( + $.row('Sharding Runtime State Sync') + .addPanel( + $.panel('Replicate state to other alertmanagers /sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 
'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Merge state from other alertmanagers /sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Persist state to remote storage /sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) ), }