From 2a350559fa3b0ee3533c982fb208c1c7575bbb6c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 12:45:01 +0200 Subject: [PATCH 1/3] Improve compactor alerts and playbooks Signed-off-by: Marco Pracucci --- CHANGELOG.md | 2 ++ cortex-mixin/alerts/compactor.libsonnet | 30 ++++++++++++------------- cortex-mixin/docs/playbooks.md | 24 +++++++++----------- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbab7c7..723b0287 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ * [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319 * [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319 * [CHANGE] Ingester/Ruler: set `-server.grpc-max-send-msg-size-bytes` and `-server.grpc-max-send-msg-size-bytes` to sensible default values (10MB). #326 +* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` into `CortexCompactorHasNotUploadedBlocks`. #334 +* [CHANGE] Renamed `CortexCompactorRunFailed` into `CortexCompactorHasNotSuccessfullyRunCompaction`. #334 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 diff --git a/cortex-mixin/alerts/compactor.libsonnet b/cortex-mixin/alerts/compactor.libsonnet index be3de8c0..1f28a7e5 100644 --- a/cortex-mixin/alerts/compactor.libsonnet +++ b/cortex-mixin/alerts/compactor.libsonnet @@ -47,6 +47,19 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', }, }, + { + // Alert if compactor failed to run 2 consecutive compactions. + alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + }, + }, { // Alert if the compactor has not uploaded anything in the last 24h. alert: 'CortexCompactorHasNotUploadedBlocks', @@ -65,7 +78,7 @@ }, { // Alert if the compactor has not uploaded anything since its start. - alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', + alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '24h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0 @@ -77,21 +90,6 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, - { - // Alert if compactor fails. - alert: 'CortexCompactorRunFailed', - expr: ||| - increase(cortex_compactor_runs_failed_total[2h]) >= 2 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to run compaction. - |||, - }, - }, ], }, ], diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index cc3a3ad9..daf5eaed 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -272,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` have fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### CortexCompactorHasNotSuccessfullyRunCompaction + +This alert fires if the compactor is not able to successfully run a full compaction. + +When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. + +How to **investigate**: +- Look for any error in the compactor logs + - Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found) + #### Compactor is failing because of `not healthy index found` The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: @@ -301,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI: gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` -### CortexCompactorHasNotUploadedBlocksSinceStart - -Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). - -### CortexCompactorHasNotSuccessfullyRunCompaction - -_TODO: this playbook has not been written yet._ - -### CortexCompactorRunFailed - -_TODO: this playbook has not been written yet._ - ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. From 11f54955bb9accb568eb47199eb89288802cbb14 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 15:10:24 +0200 Subject: [PATCH 2/3] Addressed review comments Signed-off-by: Marco Pracucci --- CHANGELOG.md | 4 ++-- cortex-mixin/docs/playbooks.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 723b0287..fefd22e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,8 @@ * [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319 * [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319 * [CHANGE] Ingester/Ruler: set `-server.grpc-max-send-msg-size-bytes` and `-server.grpc-max-send-msg-size-bytes` to sensible default values (10MB). #326 -* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` into `CortexCompactorHasNotUploadedBlocks`. #334 -* [CHANGE] Renamed `CortexCompactorRunFailed` into `CortexCompactorHasNotSuccessfullyRunCompaction`. #334 +* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` to `CortexCompactorHasNotUploadedBlocks`. #334 +* [CHANGE] Renamed `CortexCompactorRunFailed` to `CortexCompactorHasNotSuccessfullyRunCompaction`. #334 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index daf5eaed..1ad67bfe 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -272,14 +272,14 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs ### CortexCompactorHasNotSuccessfullyRunCompaction -This alert fires if the compactor is not able to successfully run a full compaction. +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks. When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. From 700dae23799b02b7f18b47d8eb3dc8de1e1a5807 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 21 Jun 2021 15:54:21 +0200 Subject: [PATCH 3/3] Update cortex-mixin/docs/playbooks.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Marco Pracucci Co-authored-by: Peter Štibraný --- cortex-mixin/docs/playbooks.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 1ad67bfe..704b6492 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -279,7 +279,7 @@ How to **investigate**: ### CortexCompactorHasNotSuccessfullyRunCompaction -This alert fires if the compactor is not able to successfully compact all discovered compactable blocks. +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants). When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, other blocks compaction is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block.