Skip to content

Commit

Permalink
Merge pull request #411 from aallawala/aja_alertmanager
Browse files Browse the repository at this point in the history
use alertmanager jobname for alertmanager dashboard panels
  • Loading branch information
pracucci authored Oct 22, 2021
2 parents b9d0544 + d804539 commit b8901a9
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406
* [ENHANCEMENT] Use configured `ruler` jobname for ruler dashboard panels. #409
* [ENHANCEMENT] Add ability to override `datasource` for generated dashboards. #407
* [ENHANCEMENT] Use configured `alertmanager` jobname for alertmanager dashboard panels. #411
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329
* [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335
Expand Down
1 change: 1 addition & 0 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
store_gateway: '(store-gateway|cortex$)',
gateway: '(gateway|cortex-gw|cortex-gw-internal)',
compactor: 'compactor.*', // Match also custom compactor deployments.
alertmanager: 'alertmanager',
},

// Grouping labels, to uniquely identify and group by {jobs, clusters}
Expand Down
54 changes: 27 additions & 27 deletions cortex-mixin/dashboards/alertmanager.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
})
.addPanel(
$.panel('Total Alerts') +
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
)
.addPanel(
$.panel('Total Silences') +
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], format='short')
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
)
.addPanel(
$.panel('Tenants') +
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short')
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short')
)
)
.addRow(
Expand All @@ -32,8 +32,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(cluster_job:cortex_alertmanager_alerts_received_total:rate5m{%s})
-
sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
Expand All @@ -49,8 +49,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(cluster_job_integration:cortex_alertmanager_notifications_total:rate5m{%s})
-
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
Expand All @@ -66,15 +66,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)
) > 0
or on () vector(0)
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m{%s}) by(integration)' % $.jobMatcher($._config.job_names.alertmanager),
],
['success - {{ integration }}', 'failed - {{ integration }}']
)
)
.addPanel(
$.panel('Latency') +
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher('alertmanager'))
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager))
)
)
.addRow(
Expand All @@ -96,23 +96,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.panel('Per %s Tenants' % $._config.per_instance_label) +
$.queryPanel(
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')],
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
)
.addPanel(
$.panel('Per %s Alerts' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
)
.addPanel(
$.panel('Per %s Silences' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher('alertmanager')],
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
Expand All @@ -128,23 +128,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))
-
sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
)
.addPanel(
$.panel('Syncs/sec (By Reason)') +
$.queryPanel(
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'{{reason}}'
)
)
.addPanel(
$.panel('Ring Check Errors/sec') +
$.queryPanel(
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'errors'
)
)
Expand All @@ -154,7 +154,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.panel('Initial syncs /sec') +
$.queryPanel(
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'{{outcome}}'
) + {
targets: [
Expand All @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Initial sync duration') +
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) + {
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + {
targets: [
target {
interval: '1m',
Expand All @@ -184,8 +184,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval]))
-
sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
) + {
Expand All @@ -208,8 +208,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(cluster_job:cortex_alertmanager_state_replication_total:rate5m{%s})
-
sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
Expand All @@ -222,8 +222,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m{%s})
-
sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m{%s})' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
Expand All @@ -236,8 +236,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval]))
-
sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))
||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')],
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'),
||| % [$.jobMatcher($._config.job_names.alertmanager), $.jobMatcher($._config.job_names.alertmanager)],
'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
],
['success', 'failed']
)
Expand Down

0 comments on commit b8901a9

Please sign in to comment.