From c891992752c0119eb11a1779e649c8ed55ae1562 Mon Sep 17 00:00:00 2001
From: Marco Pracucci
For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.
-
- The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
- the query results cache, the metadata cache, the chunks cache, and the index cache.
+ The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
+ the query results cache, the metadata cache, the chunks cache, and the index cache.
- These panels will show “no data” if the caches are not deployed.
+ These panels will show “no data” if the caches are not deployed.
- Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage.
+ Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage.
|||), ) @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; route=~"(prometheus|api_prom)_api_v1_query" }[$__rate_interval] ) - ) + + ) + sum( rate( cortex_prometheus_rule_evaluations_total{ @@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Instant queries per second', ||| Rate of instant queries per second being made to the system. - Includes both queries made to the /prometheus API as + Includes both queries made to the /prometheus API as well as queries from the ruler. ||| ), @@ -83,8 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Range queries per second', ||| - Rate of range queries per second being made to - Cortex via the /prometheus API. + Rate of range queries per second being made to + Cortex via the /prometheus API. ||| ), ) @@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; The query scheduler is an optional service that moves the internal queue from the query-frontend into a separate component. - If this service is not deployed, + If this service is not deployed, these panels will show "No data." ||| @@ -286,8 +286,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; %s }[$__rate_interval] ) - ) - / + ) + / sum by(item_type) ( rate( thanos_store_index_cache_requests_total{ @@ -307,7 +307,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'Hit Ratio', ||| Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an - in-memory blocks index cache. + in-memory blocks index cache. ||| ), ) diff --git a/jsonnet/mimir-mixin/dashboards/writes.libsonnet b/jsonnet/mimir-mixin/dashboards/writes.libsonnet index 8a77be1c16d..e99faee4c4e 100644 --- a/jsonnet/mimir-mixin/dashboards/writes.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/writes.libsonnet @@ -11,16 +11,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.textPanel('', |||
This dashboard shows various health metrics for the Cortex write path.
- It is broken into sections for each service on the write path,
+ It is broken into sections for each service on the write path,
and organized by the order in which the write request flows.
Incoming metrics data travels from the gateway → distributor → ingester.
For each service, there are 3 panels showing
- (1) requests per second to that service,
- (2) average, median, and p99 latency of requests to that service, and
+ (1) requests per second to that service,
+ (2) average, median, and p99 latency of requests to that service, and
(3) p99 latency of requests to each instance of that service.
-
It also includes metrics for the key-value (KV) stores used to manage the high-availability tracker and the ingesters. @@ -216,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Uploaded blocks / sec', ||| - The rate of blocks being uploaded from the ingesters + The rate of blocks being uploaded from the ingesters to object storage. ||| ), @@ -227,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Upload latency', ||| - The average, median (50th percentile), and 99th percentile time + The average, median (50th percentile), and 99th percentile time the ingesters take to upload blocks to object storage. ||| ), @@ -247,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each active time series; these blocks get periodically compacted (by default, every 2h). - This panel shows the rate of compaction operations across all TSDBs on all ingesters. + This panel shows the rate of compaction operations across all TSDBs on all ingesters. ||| ), ) @@ -275,7 +275,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations per second', ||| - The WAL is truncated each time a new TSDB block is written. This panel measures the rate of + The WAL is truncated each time a new TSDB block is written. This panel measures the rate of truncations. ||| ), @@ -289,7 +289,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'Checkpoints created per second', ||| - Checkpoints are created as part of the WAL truncation process. + Checkpoints are created as part of the WAL truncation process. This metric measures the rate of checkpoint creation. 
||| ), @@ -301,7 +301,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| - Average time taken to perform a full WAL truncation, + Average time taken to perform a full WAL truncation, including the time taken for the checkpointing to complete. ||| ), diff --git a/jsonnet/mimir-mixin/docs/playbooks.md b/jsonnet/mimir-mixin/docs/playbooks.md index 2dc2e26b493..292d932af13 100644 --- a/jsonnet/mimir-mixin/docs/playbooks.md +++ b/jsonnet/mimir-mixin/docs/playbooks.md @@ -16,7 +16,7 @@ In events you're looking for things like: ``` 57m Normal NodeControllerEviction Pod Marking for deletion Pod ingester-01 from Node cloud-provider-node-01 37m Normal SuccessfulDelete ReplicaSet (combined from similar events): Deleted pod: ingester-01 -32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady +32m Normal NodeNotReady Node Node cloud-provider-node-01 status is now: NodeNotReady 28m Normal DeletingAllPods Node Node cloud-provider-node-01 event: Deleting all Pods from Node cloud-provider-node-01. ``` @@ -313,7 +313,7 @@ gsutil mv gs://BUCKET/TENANT/BLOCK gs://BUCKET/TENANT/corrupted-BLOCK ### CortexBucketIndexNotUpdated -This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. +This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. 
How to **investigate**: - Ensure the compactor is successfully running @@ -557,7 +557,7 @@ metadata: spec: accessModes: - ReadWriteOnce - capacity: + capacity: storage: 150Gi gcePersistentDisk: fsType: ext4 diff --git a/jsonnet/mimir-mixin/groups.libsonnet b/jsonnet/mimir-mixin/groups.libsonnet index 630766722f4..6d33ea3661d 100644 --- a/jsonnet/mimir-mixin/groups.libsonnet +++ b/jsonnet/mimir-mixin/groups.libsonnet @@ -29,7 +29,7 @@ if alert_aggregation_labels_override != null then std.trace( ||| - Deprecated: _config.alert_aggregation_labels + Deprecated: _config.alert_aggregation_labels This field has been explicitly overridden to "%s". Instead, express the override in terms of _config.cluster_labels. E.g., cluster_labels: %s will automatically convert to "%s".