From bd0dad687bb1d13d82570db893a89fc09edf5b77 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 7 Jun 2021 08:53:23 +0200 Subject: [PATCH] Lower CortexIngesterRestarts severity Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e6e4479..1979469a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. #311 * [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315 * [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316 +* [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. #321 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 7568b4fd..276e8842 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -198,10 +198,13 @@ { alert: 'CortexIngesterRestarts', expr: ||| - changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) > 1 + changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2 |||, labels: { - severity: 'critical', + // This alert is on a cause, not a symptom. A couple of ingester restarts may be suspicious but + // not necessarily an issue (e.g. may happen because of the K8S node autoscaler), so we're + // keeping the alert as a warning, as a signal in case of an outage. 
+ severity: 'warning', }, annotations: { message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.',