diff --git a/CHANGELOG.md b/CHANGELOG.md index c78f9b89..e4223a3a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] `namespace` template variable in dashboards now only selects namespaces for selected clusters. #311 * [CHANGE] Alertmanager: mounted overrides configmap to alertmanager too. #315 * [CHANGE] Memcached: upgraded memcached from `1.5.17` to `1.6.9`. #316 +* [CHANGE] `CortexIngesterRestarts` alert severity changed from `critical` to `warning`. #321 * [CHANGE] Store-gateway: increased memory request and limit respectively from 6GB / 6GB to 12GB / 18GB. #322 * [CHANGE] Store-gateway: increased `-blocks-storage.bucket-store.max-chunk-pool-bytes` from 2GB (default) to 12GB. #322 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 38c9d00e..a8c4d784 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -198,10 +198,13 @@ { alert: 'CortexIngesterRestarts', expr: ||| - changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) > 1 + changes(process_start_time_seconds{job=~".+(cortex|ingester.*)"}[30m]) >= 2 |||, labels: { - severity: 'critical', + // This alert is on a cause, not a symptom. A couple of ingester restarts may be suspicious but + // not necessarily an issue (e.g. may happen because of the K8S node autoscaler), so we're + // keeping the alert as a warning, to serve as a signal in case of an outage. + severity: 'warning', }, annotations: { message: '{{ $labels.job }}/{{ $labels.instance }} has restarted {{ printf "%.2f" $value }} times in the last 30 mins.',