diff --git a/CHANGELOG.md b/CHANGELOG.md index 43efe95..090e857 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -65,6 +65,7 @@ * [ENHANCEMENT] Add support for running Alertmanager in sharding mode. #394 * [ENHANCEMENT] Allow to customize PromQL engine settings via `queryEngineConfig`. #399 * [ENHANCEMENT] Add recording rules to improve responsiveness of Alertmanager dashboard. #387 +* [ENHANCEMENT] Add `CortexRolloutStuck` alert. #405 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 * [BUGFIX] Fixed `CortexInconsistentRuntimeConfig` metric. #335 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index c8e925f..993323e 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -412,6 +412,67 @@ }, ], }, + { + name: 'cortex-rollout-alerts', + rules: [ + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + max without (revision) ( + kube_statefulset_status_current_revision + unless + kube_statefulset_status_update_revision + ) + * + ( + kube_statefulset_replicas + != + kube_statefulset_status_replicas_updated + ) + ) and ( + changes(kube_statefulset_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.statefulset }} rollout is stuck in %(alert_aggregation_variables)s. 
+ ||| % $._config, + }, + }, + { + alert: 'CortexRolloutStuck', + expr: ||| + ( + kube_deployment_spec_replicas + != + kube_deployment_status_replicas_updated + ) and ( + changes(kube_deployment_status_replicas_updated[15m]) + == + 0 + ) + * on(%s) group_left max by(%s) (cortex_build_info) + ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], + 'for': '15m', + labels: { + severity: 'warning', + }, + annotations: { + message: ||| + The {{ $labels.deployment }} rollout is stuck in %(alert_aggregation_variables)s. + ||| % $._config, + }, + }, + ], + }, { name: 'cortex-provisioning', rules: [ diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 98416a6..e61f24f 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -724,6 +724,15 @@ When an alertmanager cannot read the state for a tenant from storage it gets log - The state could not be merged because it might be invalid and could not be decoded. This could indicate data corruption and therefore a bug in the reading or writing of the state, and would need further investigation. - The state could not be read from storage. This could be due to a networking issue such as a timeout or an authentication and authorization issue with the remote object store. +### CortexRolloutStuck + +This alert fires when a Cortex service rollout is stuck, which means the number of updated replicas doesn't match the expected one and it looks like there's no progress in the rollout. The alert monitors services deployed as Kubernetes `StatefulSet` and `Deployment`. + +How to **investigate**: +- Run `kubectl -n <namespace> get pods -l name=<statefulset|deployment>` to get a list of running pods +- Ensure there's no pod in a failing state (eg. `Error`, `OOMKilled`, `CrashLoopBackOff`) +- Ensure there's no pod `NotReady` (the number of ready containers should match the total number of containers, eg. 
`1/1` or `2/2`) +- Run `kubectl -n <namespace> describe statefulset <name>` or `kubectl -n <namespace> describe deployment <name>` and look at "Pod Status" and "Events" to get more information ## Cortex routes by path