diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbab7c7..fefd22e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,8 @@ * [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319 * [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319 * [CHANGE] Ingester/Ruler: set `-server.grpc-max-send-msg-size-bytes` and `-server.grpc-max-send-msg-size-bytes` to sensible default values (10MB). #326 +* [CHANGE] Renamed `CortexCompactorHasNotUploadedBlocksSinceStart` to `CortexCompactorHasNotUploadedBlocks`. #334 +* [CHANGE] Renamed `CortexCompactorRunFailed` to `CortexCompactorHasNotSuccessfullyRunCompaction`. #334 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308 * [BUGFIX] Alertmanager: fixed `--alertmanager.cluster.peers` CLI flag passed to alertmanager when HA is enabled. #329 diff --git a/cortex-mixin/alerts/compactor.libsonnet b/cortex-mixin/alerts/compactor.libsonnet index be3de8c0..1f28a7e5 100644 --- a/cortex-mixin/alerts/compactor.libsonnet +++ b/cortex-mixin/alerts/compactor.libsonnet @@ -47,6 +47,19 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', }, }, + { + // Alert if compactor failed to run 2 consecutive compactions. 
+ alert: 'CortexCompactorHasNotSuccessfullyRunCompaction', + expr: ||| + increase(cortex_compactor_runs_failed_total[2h]) >= 2 + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + }, + }, { // Alert if the compactor has not uploaded anything in the last 24h. alert: 'CortexCompactorHasNotUploadedBlocks', @@ -65,7 +78,7 @@ }, { // Alert if the compactor has not uploaded anything since its start. - alert: 'CortexCompactorHasNotUploadedBlocksSinceStart', + alert: 'CortexCompactorHasNotUploadedBlocks', 'for': '24h', expr: ||| thanos_objstore_bucket_last_successful_upload_time{job=~".+/%(compactor)s"} == 0 @@ -77,21 +90,6 @@ message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', }, }, - { - // Alert if compactor fails. - alert: 'CortexCompactorRunFailed', - expr: ||| - increase(cortex_compactor_runs_failed_total[2h]) >= 2 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to run compaction. - |||, - }, - }, ], }, ], diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index cc3a3ad9..704b6492 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -272,11 +272,21 @@ Same as [`CortexCompactorHasNotSuccessfullyCleanedUpBlocks`](#CortexCompactorHas This alert fires when a Cortex compactor is not uploading any compacted blocks to the storage since a long time. 
How to **investigate**: -- If the alert `CortexCompactorHasNotSuccessfullyRun` or `CortexCompactorHasNotSuccessfullyRunSinceStart` have fired as well, then investigate that issue first +- If the alert `CortexCompactorHasNotSuccessfullyRunCompaction` has fired as well, then investigate that issue first - If the alert `CortexIngesterHasNotShippedBlocks` or `CortexIngesterHasNotShippedBlocksSinceStart` have fired as well, then investigate that issue first - Ensure ingesters are successfully shipping blocks to the storage - Look for any error in the compactor logs +### CortexCompactorHasNotSuccessfullyRunCompaction + +This alert fires if the compactor is not able to successfully compact all discovered compactable blocks (across all tenants). + +When this alert fires, the compactor may still have successfully compacted some blocks but, for some reason, the compaction of other blocks is consistently failing. A common case is when the compactor is trying to compact a corrupted block for a single tenant: in this case the compaction of blocks for other tenants is still working, but compaction for the affected tenant is blocked by the corrupted block. + +How to **investigate**: +- Look for any error in the compactor logs + - Corruption: [`not healthy index found`](#compactor-is-failing-because-of-not-healthy-index-found) + #### Compactor is failing because of `not healthy index found` The compactor may fail to compact blocks due a corrupted block index found in one of the source blocks: @@ -301,18 +311,6 @@ To rename a block stored on GCS you can use the `gsutil` CLI: gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ``` -### CortexCompactorHasNotUploadedBlocksSinceStart - -Same as [`CortexCompactorHasNotUploadedBlocks`](#CortexCompactorHasNotUploadedBlocks). 
- -### CortexCompactorHasNotSuccessfullyRunCompaction - -_TODO: this playbook has not been written yet._ - -### CortexCompactorRunFailed - -_TODO: this playbook has not been written yet._ - ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store.