From 9d80934b3f380cf3adc8e69dbd3592ca7b371dbd Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Fri, 2 Jul 2021 14:07:37 +0200 Subject: [PATCH] Replaced CortexCacheRequestErrors with CortexMemcachedRequestErrors Signed-off-by: Marco Pracucci --- jsonnet/mimir-mixin/alerts/alerts.libsonnet | 14 +++++------ jsonnet/mimir-mixin/docs/playbooks.md | 28 +++++++++++++++++++-- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/jsonnet/mimir-mixin/alerts/alerts.libsonnet b/jsonnet/mimir-mixin/alerts/alerts.libsonnet index 71655505973..ad24ac8ebf1 100644 --- a/jsonnet/mimir-mixin/alerts/alerts.libsonnet +++ b/jsonnet/mimir-mixin/alerts/alerts.libsonnet @@ -180,20 +180,20 @@ }, }, { - alert: 'CortexCacheRequestErrors', + alert: 'CortexMemcachedRequestErrors', expr: ||| - 100 * sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count{status_code=~"5.."}[1m])) - / - sum by (%s, method) (rate(cortex_cache_request_duration_seconds_count[1m])) - > 1 + ( + sum by(%s, name, operation) (rate(thanos_memcached_operation_failures_total[1m])) / + sum by(%s, name, operation) (rate(thanos_memcached_operations_total[1m])) + ) * 100 > 5 ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', + 'for': '5m', labels: { severity: 'warning', }, annotations: { message: ||| - Cache {{ $labels.method }} is experiencing {{ printf "%.2f" $value }}% errors. + Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. 
|||, }, }, diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index dc5058529b0..5c4cbd4336d 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -414,9 +414,33 @@ _TODO: this playbook has not been written yet._ _TODO: this playbook has not been written yet._ -### CortexCacheRequestErrors +### CortexMemcachedRequestErrors -_TODO: this playbook has not been written yet._ +This alert fires if the Cortex memcached client is experiencing a high error rate for a specific cache and operation. + +How to **investigate**: +- The alert reports which cache is experiencing issues + - `metadata-cache`: object store metadata cache + - `index-cache`: TSDB index cache + - `chunks-cache`: TSDB chunks cache +- Check which specific error is occurring + - Run the following query to find out the reason (replace `<namespace>` with the actual Cortex cluster namespace) + ``` + sum by(name, operation, reason) (rate(thanos_memcached_operation_failures_total{namespace="<namespace>"}[1m])) > 0 + ``` +- Based on the **`reason`**: + - `timeout` + - Scale up the memcached replicas + - `server-error` + - Check both Cortex and memcached logs to find more details + - `network-error` + - Check Cortex logs to find more details + - `malformed-key` + - The key is too long or contains invalid characters + - Check Cortex logs to find the offending key + - Fixing this will require changes to the application code + - `other` + - Check both Cortex and memcached logs to find more details ### CortexOldChunkInMemory