From 4d5df5c44fb8bbbf041d685027d653ae5a8f7b75 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:22:43 +0200 Subject: [PATCH 1/3] Added CortexFailingToTalkToConsul alert Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 26 ++++++++++++++++++++++++++ cortex-mixin/docs/playbooks.md | 12 ++++++++++++ 3 files changed, 39 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 090e857..d3ae42d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,6 +66,7 @@ * [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. #399 * [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387 * [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405 +* [ENHANCEMENT] Added `CortexFailingToTalkToConsul` alert. #406 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 993323e..d042ce0 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -715,5 +715,31 @@ }, ], }, + { + name: 'cortex-consul-alerts', + rules: [ + { + alert: 'CortexFailingToTalkToConsul', + expr: ||| + ( + sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. 
+ == 1 + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store ${{ labels.kv_name }}. + ||| % $._config, + }, + }, + ], + }, ], } diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index e61f24f..1a92546 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -734,6 +734,18 @@ How to **investigate**: - Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) - Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information +### CortexFailingToTalkToConsul + +This alert fires if a Cortex instance is failing to run any operation on Consul. + +How it **works**: +- Consul is typically used to store the hash ring state. +- If an instance is failing to talk to Consul, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. + +How to **investigate**: +- Ensure Consul is up and running. +- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul. 
+ ## Cortex routes by path **Write path**: From c6e8d4e297ef4a1ad757b333550235f980b4bf82 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Wed, 13 Oct 2021 11:28:39 +0200 Subject: [PATCH 2/3] Fixed alert message Signed-off-by: Marco Pracucci --- cortex-mixin/alerts/alerts.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index d042ce0..2553d79 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -735,7 +735,7 @@ }, annotations: { message: ||| - Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store ${{ labels.kv_name }}. + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store {{ $labels.kv_name }}. ||| % $._config, }, }, From be5af202202726ac47d8594f2a3bf686a7f6f0ff Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Thu, 14 Oct 2021 09:45:09 +0200 Subject: [PATCH 3/3] Update alert to be generic to KV stores Signed-off-by: Marco Pracucci --- CHANGELOG.md | 2 +- cortex-mixin/alerts/alerts.libsonnet | 47 +++++++++++++--------------- cortex-mixin/docs/playbooks.md | 12 ++++--- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d3ae42d..698d38c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -66,7 +66,7 @@ * [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. #399 * [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387 * [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405 -* [ENHANCEMENT] Added `CortexFailingToTalkToConsul` alert. #406 +* [ENHANCEMENT] Added `CortexKVStoreFailure` alert. #406 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. 
#308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 2553d79..59022dd 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -235,6 +235,27 @@ |||, }, }, + { + alert: 'CortexKVStoreFailure', + expr: ||| + ( + sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count{status_code!~"2.+"}[1m])) + / + sum by(%s, pod, status_code, kv_name) (rate(cortex_kv_request_duration_seconds_count[1m])) + ) + # We want to get alerted only in case there's a constant failure. + == 1 + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to the KV store {{ $labels.kv_name }}. + ||| % $._config, + }, + }, { alert: 'CortexMemoryMapAreasTooHigh', expr: ||| @@ -715,31 +736,5 @@ }, ], }, - { - name: 'cortex-consul-alerts', - rules: [ - { - alert: 'CortexFailingToTalkToConsul', - expr: ||| - ( - sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count{status_code!~"2.+"}[1m])) - / - sum by(%s, pod, status_code, kv_name) (rate(cortex_consul_request_duration_seconds_count[1m])) - ) - # We want to get alerted only in case there's a constant failure. - == 1 - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Cortex {{ $labels.pod }} in %(alert_aggregation_variables)s is failing to talk to Consul store {{ $labels.kv_name }}. 
- ||| % $._config, - }, - }, - ], - }, ], } diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 1a92546..180ed50 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -734,17 +734,19 @@ How to **investigate**: - Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. `1/1` or `2/2`) - Run `kubectl -n describe statefulset ` or `kubectl -n describe deployment ` and look at "Pod Status" and "Events" to get more information -### CortexFailingToTalkToConsul +### CortexKVStoreFailure -This alert fires if a Cortex instance is failing to run any operation on Consul. +This alert fires if a Cortex instance is failing to run any operation on a KV store (eg. consul or etcd). How it **works**: - Consul is typically used to store the hash ring state. -- If an instance is failing to talk to Consul, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. +- Etcd is typically used by the HA tracker (distributor) to deduplicate samples. +- If an instance is failing operations on the **hash ring**, either the instance can't update the heartbeat in the ring or is failing to receive ring updates. +- If an instance is failing operations on the **HA tracker** backend, either the instance can't update the authoritative replica or is failing to receive updates. How to **investigate**: -- Ensure Consul is up and running. -- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul. +- Ensure Consul/Etcd is up and running. +- Investigate the logs of the affected instance to find the specific error occurring when talking to Consul/Etcd. ## Cortex routes by path