From 795b0bac1ae41828a3bc03e0fa2ba17fe3431c50 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 2 Sep 2021 12:16:27 +0200 Subject: [PATCH] Fixes to initial state sync panels on alertmanager dashboard. 1) Change minimal interval to 1m for sync duration and fetch state panels. This is in order to show infrequent events in smaller time windows. 2) Change syncs/sec panel to reflect the absolute value of the metric, not its rate. The initial sync only occurs once per tenant, so the counter value is essentially 0 or 1. Due to how per-tenant metrics are aggregated, the externally facing metric really acts more like a gauge reflecting the number of tenants that achieved each outcome. Also, stack this panel as it becomes easier to visually see when the initial syncs have completed for all tenants (e.g. during a rollout). --- .../dashboards/alertmanager.libsonnet | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 6f578b11..33b257b5 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -152,15 +152,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs/sec') + + $.panel('Tenant initial sync outcomes') + $.queryPanel( - 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'sum by(outcome) (cortex_alertmanager_state_initial_sync_completed_total{%s})' % $.jobMatcher('alertmanager'), '{{outcome}}' - ) + ) + $.stack ) .addPanel( $.panel('Initial sync duration') + - $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + { + targets: [ + target {
interval: '1m', + } + for target in super.targets + ], + } ) .addPanel( $.panel('Fetch state from other alertmanagers /sec') + @@ -174,7 +182,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), ], ['success', 'failed'] - ) + ) + { + targets: [ + target { + interval: '1m', + } + for target in super.targets + ], + } ) ) .addRow(