diff --git a/CHANGELOG.md b/CHANGELOG.md index 027037d..60e5cc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ * [ENHANCEMENT] Add support for Azure storage in Alertmanager configuration. #381 * [ENHANCEMENT] Add support for running Alertmanager in sharding mode. #394 * [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. #399 +* [ENHANCEMENT] Add recording rules to improve responsiveness of the Alertmanager dashboard. #387 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 7e2e3c5..8897034 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -11,11 +11,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; }) .addPanel( $.panel('Total Alerts') + - $.statPanel('sum(cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Total Silences') + - $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short') ) .addPanel( $.panel('Tenants') + @@ -29,11 +29,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_alerts_received_total{%s}[$__rate_interval])) + 
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_alerts_invalid_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -46,11 +46,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -61,13 +61,13 @@ local utils = import 'mixin-utils/utils.libsonnet'; [ ||| ( - sum(rate(cortex_alertmanager_notifications_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s}) by(integration) - - sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration) + sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration) ) > 0 or on () vector(0) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_notifications_failed_total{%s}[$__rate_interval])) by(integration)' % $.jobMatcher('alertmanager'), + 
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'), ], ['success - {{ integration }}', 'failed - {{ integration }}'] ) @@ -104,7 +104,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_alertmanager_alerts{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -112,7 +112,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addPanel( $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')], '{{%s}}' % $._config.per_instance_label ) + $.stack @@ -205,11 +205,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel( [ ||| - sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) @@ -219,11 +219,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; 
$.queryPanel( [ ||| - sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s}) - - sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s}) ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] ) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 1ce2519..a438cab 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -366,6 +366,72 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, ], }, + { + name: 'cortex_alertmanager_rules', + rules: [ + // Aggregations of per-user Alertmanager metrics used in dashboards. 
+ { + record: 'cluster_job_%s:cortex_alertmanager_alerts:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_alerts) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job_%s:cortex_alertmanager_silences:sum' % $._config.per_instance_label, + expr: ||| + sum by (cluster, job, %s) (cortex_alertmanager_silences) + ||| % $._config.per_instance_label, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_received_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) + |||, + }, + { + record: 'cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m', + expr: ||| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) + |||, + }, + { + record: 'cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m', + expr: ||| + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) + |||, + }, + ], + }, ], }, }