From 4642b5c2c6bfe91a9b72c49afb41961d67ccce3e Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 8 Jun 2021 15:57:47 -0400
Subject: [PATCH 01/35] feature: add some text boxes and descriptions
Focussing on the reads and writes dashboards,
added some info panels and hover-over descriptions
for some of the panels.
Some common code used by the compactor also
received additional text content.
New functions:
- addRows
- addRowsIf
...to add a list of rows to a dashboard.
The `thanosMemcachedCache` function has had some of its query text
spread out over multiple lines for easier reading and comparison with
similar dashboard queries.
---
cortex-mixin/dashboards/compactor.libsonnet | 3 +-
.../dashboards/dashboard-utils.libsonnet | 184 +++++++--
cortex-mixin/dashboards/reads.libsonnet | 357 ++++++++++++++++--
cortex-mixin/dashboards/writes.libsonnet | 203 +++++++++-
4 files changed, 658 insertions(+), 89 deletions(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 657cfce7..a1d5ea90 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -103,6 +103,5 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)),
)
)
- .addRow($.objectStorePanels1('Object Store', 'compactor'))
- .addRow($.objectStorePanels2('', 'compactor')),
+ .addRows($.getObjectStoreRows('Object Store', 'compactor')),
}
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index ded63ddc..57ae82b3 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -14,6 +14,24 @@ local utils = import 'mixin-utils/utils.libsonnet';
then self.addRow(row)
else self,
+ addRowsIf(condition, rows)::  // Appends every row in `rows`, in order, when `condition` is true; otherwise returns the dashboard unchanged.
+ if condition
+ then
+ local reduceRows(dashboard, remainingRows) =  // Recursive left fold: add the head row, then recurse on the tail.
+ if (std.length(remainingRows) == 0)
+ then dashboard
+ else
+ reduceRows(
+ dashboard.addRow(remainingRows[0]),
+ std.slice(remainingRows, 1, std.length(remainingRows), 1)  // Tail of the list (index 1 to the end, step 1).
+ )
+ ;
+ reduceRows(self, rows)
+ else self,
+
+ addRows(rows)::  // Unconditionally appends every row in `rows`, in order, by delegating to addRowsIf.
+ self.addRowsIf(true, rows),  // `self.` is required: sibling object fields are not in scope as bare variables in Jsonnet.
+
addClusterSelectorTemplates(multi=true)::
local d = self {
tags: $._config.tags,
@@ -43,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
else d
.addTemplate('cluster', 'cortex_build_info', 'cluster')
.addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'),
-
+ editable: true,
},
// The mixin allow specialism of the job selector depending on if its a single binary
@@ -274,8 +292,21 @@ local utils = import 'mixin-utils/utils.libsonnet';
type: 'text',
} + options,
- objectStorePanels1(title, component)::
- super.row(title)
+
+ getObjectStoreRows(title, component):: [
+ ($.row(title) { height: '25px' })
+ .addPanel(
+ $.textPanel(
+ '',
+ |||
+ - The panels below summarize the rate of requests issued by %s
+ to object storage, separated by operation type.
+ - It also includes the average, median, and 99th percentile latency
+ of each operation and the error rate of each operation.
+ ||| % component
+ )
+ ),
+ $.row('')
.addPanel(
$.panel('Operations / sec') +
$.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') +
@@ -288,62 +319,163 @@ local utils = import 'mixin-utils/utils.libsonnet';
{ yaxes: $.yaxes('percentunit') },
)
.addPanel(
- $.panel('Op: Attributes') +
+ $.panel('Latency of Op: Attributes') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]),
)
.addPanel(
- $.panel('Op: Exists') +
+ $.panel('Latency of Op: Exists') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]),
),
-
- // Second row of Object Store stats
- objectStorePanels2(title, component)::
- super.row(title)
+ $.row('')
.addPanel(
- $.panel('Op: Get') +
+ $.panel('Latency of Op: Get') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]),
)
.addPanel(
- $.panel('Op: GetRange') +
+ $.panel('Latency of Op: GetRange') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]),
)
.addPanel(
- $.panel('Op: Upload') +
+ $.panel('Latency of Op: Upload') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]),
)
.addPanel(
- $.panel('Op: Delete') +
+ $.panel('Latency of Op: Delete') +
$.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]),
),
+ ],
thanosMemcachedCache(title, jobName, component, cacheName)::
+ local config = {
+ jobMatcher: $.jobMatcher(jobName),
+ component: component,
+ cacheName: cacheName,
+ cacheNameReadable: std.strReplace(cacheName, '-', ' '),
+ };
+ local panelText = {
+ 'metadata-cache':
+ |||
+ The metadata cache
+ is an optional component that the
+ store-gateway and querier
+ will check before going to object storage.
+ This set of panels focuses on the
+ %s’s use of the metadata cache.
+ ||| % component,
+ 'chunks-cache':
+ |||
+ The chunks cache
+ is an optional component that the
+ store-gateway
+ will check before going to object storage.
+ This helps reduce calls to the object store.
+ |||,
+ }[cacheName];
+
super.row(title)
+ .addPanel(
+ $.textPanel(
+ '', panelText
+ )
+ )
.addPanel(
$.panel('QPS') +
- $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{%s,component="%s",name="%s"}[$__rate_interval]))' % [$.jobMatcher(jobName), component, cacheName], '{{operation}}') +
+ $.queryPanel(
+ |||
+ sum by(operation) (
+ rate(
+ thanos_memcached_operations_total{
+ %(jobMatcher)s,
+ component="%(component)s",
+ name="%(cacheName)s"
+ }[$__rate_interval]
+ )
+ )
+ ||| % config,
+ '{{operation}}'
+ ) +
$.stack +
- { yaxes: $.yaxes('ops') },
+ { yaxes: $.yaxes('ops') } +
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Requests per second made to
+ the %(cacheNameReadable)s
+ from the %(component)s,
+ separated into request type.
+ ||| % config
+ ),
)
.addPanel(
$.panel('Latency (getmulti)') +
- $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="%s",name="%s"}' % [$.jobMatcher(jobName), component, cacheName])
+ $.latencyPanel(
+ 'thanos_memcached_operation_duration_seconds',
+ |||
+ {
+ %(jobMatcher)s,
+ operation="getmulti",
+ component="%(component)s",
+ name="%(cacheName)s"
+ }
+ ||| % config
+ ) +
+ $.panelDescription(
+ 'Latency (getmulti)',
+ |||
+ The average, median (50th percentile) and 99th percentile
+ time to satisfy a “getmulti” request
+ made by the %(component)s,
+ which retrieves multiple items from the cache.
+ ||| % config
+ )
)
.addPanel(
$.panel('Hit ratio') +
- $.queryPanel('sum(rate(thanos_cache_memcached_hits_total{%s,component="%s",name="%s"}[$__rate_interval])) / sum(rate(thanos_cache_memcached_requests_total{%s,component="%s",name="%s"}[$__rate_interval]))' %
- [
- $.jobMatcher(jobName),
- component,
- cacheName,
- $.jobMatcher(jobName),
- component,
- cacheName,
- ], 'items') +
- { yaxes: $.yaxes('percentunit') },
+ $.queryPanel(
+ |||
+ sum(
+ rate(
+ thanos_cache_memcached_hits_total{
+ %(jobMatcher)s,
+ component="%(component)s",
+ name="%(cacheName)s"
+ }[$__rate_interval]
+ )
+ )
+ /
+ sum(
+ rate(
+ thanos_cache_memcached_requests_total{
+ %(jobMatcher)s,
+ component="%(component)s",
+ name="%(cacheName)s"
+ }[$__rate_interval]
+ )
+ )
+ ||| % config,
+ 'items'
+ ) +
+ { yaxes: $.yaxes('percentunit') } +
+ $.panelDescription(
+ 'Hit Ratio',
+ |||
+ The fraction of %(component)s requests to the
+ %(cacheNameReadable)s that successfully return data.
+ Requests that miss the cache must go to
+ object storage for the underlying data.
+ ||| % config
+ ),
),
filterNodeDiskContainer(containerName)::
|||
ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0)
||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName],
+
+ panelDescription(title, description):: {  // Returns a mixin object whose `description` field is Markdown: a `###` heading with `title`, followed by `description`.
+ description: |||
+ ### %s
+ %s
+ ||| % [title, description],
+ },  // Intended to be merged into a panel with `+`, as the callers in this patch do.
}
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 9f98308c..f741b188 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -4,117 +4,270 @@ local utils = import 'mixin-utils/utils.libsonnet';
'cortex-reads.json':
($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' })
.addClusterSelectorTemplates()
+ .addRow(
+ ($.row('Reads Summary') { height: '175px', showTitle: false })
+ .addPanel(
+ $.textPanel('', |||
+
+ This dashboard shows various health metrics for the Cortex read path.
+ It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
+
+ Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query).
+
+
+ The dashboard shows metrics for the 4 optional caches that can be deployed with Cortex:
+ the query results cache, the metadata cache, the chunks cache, and the index cache.
+
+ These panels will show “no data” if the caches are not deployed.
+
+
+ Lastly, it also includes metrics for how the ingester and store-gateway interact with object storage.
+
+ |||),
+ )
+ )
+ .addRow(
+ ($.row('Headlines') +
+ {
+ height: '100px',
+ showTitle: false,
+ })
+ .addPanel(
+ $.panel('Instant Queries / s') +
+ $.statPanel(|||
+ sum(
+ rate(
+ cortex_request_duration_seconds_count{
+ %(queryFrontend)s,
+ route=~"(prometheus|api_prom)_api_v1_query"
+ }[1h]
+ )
+ ) +
+ sum(
+ rate(
+ cortex_prometheus_rule_evaluations_total{
+ %(ruler)s
+ }[1h]
+ )
+ )
+ ||| % {
+ queryFrontend: $.jobMatcher($._config.job_names.query_frontend),
+ ruler: $.jobMatcher($._config.job_names.ruler),
+ }, format='reqps') +
+ $.panelDescription(
+ 'Instant Queries Per Second',
+ |||
+ Rate of instant queries per second being made to the system.
+ Includes both queries made to the /prometheus API as
+ well as queries from the ruler.
+ |||
+ ),
+ )
+ .addPanel(
+ $.panel('Range Queries / s') +
+ $.statPanel(|||
+ sum(
+ rate(
+ cortex_request_duration_seconds_count{
+ %(queryFrontend)s,
+ route=~"(prometheus|api_prom)_api_v1_query_range"
+ }[1h]
+ )
+ )
+ ||| % {
+ queryFrontend: $.jobMatcher($._config.job_names.query_frontend),
+ }, format='reqps') +
+ $.panelDescription(
+ 'Range Queries Per Second',
+ |||
+ Rate of range queries per second being made to
+ Cortex via the /prometheus API.
+ (The ruler does not issue range queries).
+ |||
+ ),
+ )
+ )
.addRow(
$.row('Gateway')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) +
+ $.panelDescriptionRps('gateway')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
+ $.panelDescriptionLatency('gateway')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('gateway')
)
)
.addRow(
$.row('Query Frontend')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) +
+ $.panelDescriptionRps('query frontend')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
+ $.panelDescriptionLatency('query frontend')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('query frontend')
)
)
.addRow(
$.row('Query Scheduler')
+ .addPanel(
+ $.textPanel(
+ '',
+ |||
+
+ The query scheduler is an optional service that moves
+ the internal queue from the query frontend into a
+ separate component.
+ If this service is not deployed,
+ these panels will show "No Data."
+
+ |||
+ )
+ )
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
+ $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) +
+ $.panelDescriptionRps('query scheduler')
)
.addPanel(
$.panel('Latency (Time in Queue)') +
- $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
+ $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) +
+ $.panelDescriptionLatency('query scheduler')
)
)
.addRow(
$.row('Cache - Query Results')
+ .addPanel(  // Intro text panel for the query results cache row.
+ $.textPanel('', |||
+
+ The query results cache is one of 4
+ optional caches that can be deployed as part of a Cortex
+ cluster to improve query performance.
+ It is used by the query-frontend to cache entire results
+ of queries.
+
+ |||)
+ )
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend))
+ $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) +
+ $.panelDescriptionRps('query results')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')])
+ utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) +
+ $.panelDescriptionLatency('query results')
)
)
.addRow(
$.row('Querier')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
+ $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) +
+ $.panelDescriptionRps(
+ 'querier'
+ )
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
+ utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
+ $.panelDescriptionLatency('querier')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('querier')
)
)
.addRow(
$.row('Ingester')
+ .addPanel(
+ $.textPanel(
+ '',
+ |||
+
+ For short term queries, queriers go
+ to the ingester to fetch the data.
+
+ |||
+ )
+ )
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) +
+ $.panelDescriptionRps('ingester')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) +
+ $.panelDescriptionLatency('ingester')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('ingester')
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.row('Store-gateway')
+ .addPanel(
+ $.textPanel(
+ '',
+ |||
+
+ For longer term queries, queriers go to the store-gateways to
+ fetch the data.
+ Store-gateways are responsible for fetching the data from object
+ storage.
+
+ |||
+ )
+ )
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) +
+ $.panelDescriptionRps('store gateway')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) +
+ $.panelDescriptionLatency('store gateway')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('store gateway')
)
)
.addRowIf(
@@ -143,34 +296,134 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Memcached – Blocks Storage – Block Index (Store-gateway)')
+ $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') // Resembles thanosMemcachedCache
+ .addPanel(
+ $.textPanel(
+ '',
+ |||
+
+ The block index cache is an optional component that the
+ store-gateway will check before going to object storage.
+ This helps reduce calls to the object store.
+
+ |||
+ )
+ )
.addPanel(
$.panel('QPS') +
- $.queryPanel('sum by(operation) (rate(thanos_memcached_operations_total{component="store-gateway",name="index-cache", %s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}') +
+ $.queryPanel(
+ |||
+ sum by(operation) (
+ rate(
+ thanos_memcached_operations_total{
+ component="store-gateway",
+ name="index-cache",
+ %s
+ }[$__rate_interval]
+ )
+ )
+ ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}'
+ ) +
$.stack +
- { yaxes: $.yaxes('ops') },
+ { yaxes: $.yaxes('ops') } +
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Requests per second made to
+ the block index cache
+ from the store-gateway,
+ separated into request type.
+ |||
+ ),
)
.addPanel(
$.panel('Latency (getmulti)') +
- $.latencyPanel('thanos_memcached_operation_duration_seconds', '{%s,operation="getmulti",component="store-gateway",name="index-cache"}' % $.jobMatcher($._config.job_names.store_gateway))
+ $.latencyPanel(
+ 'thanos_memcached_operation_duration_seconds',
+ |||
+ {
+ %s,
+ operation="getmulti",
+ component="store-gateway",
+ name="index-cache"
+ }
+ ||| % $.jobMatcher($._config.job_names.store_gateway)
+ ) +
+ $.panelDescription(
+ 'Latency (getmulti)',
+ |||
+ The average, median (50th percentile) and 99th percentile
+ time to satisfy a “getmulti” request
+ from the store-gateway,
+ which retrieves multiple items from the cache.
+ |||
+ )
)
.addPanel(
$.panel('Hit ratio') +
- $.queryPanel('sum by(item_type) (rate(thanos_store_index_cache_hits_total{component="store-gateway",%s}[$__rate_interval])) / sum by(item_type) (rate(thanos_store_index_cache_requests_total{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{item_type}}') +
- { yaxes: $.yaxes('percentunit') },
+ $.queryPanel(
+ |||
+ sum by(item_type) (
+ rate(
+ thanos_store_index_cache_hits_total{
+ component="store-gateway",
+ %s
+ }[$__rate_interval]
+ )
+ )
+ /
+ sum by(item_type) (
+ rate(
+ thanos_store_index_cache_requests_total{
+ component="store-gateway",
+ %s
+ }[$__rate_interval]
+ )
+ )
+ ||| % [
+ $.jobMatcher($._config.job_names.store_gateway),
+ $.jobMatcher($._config.job_names.store_gateway),
+ ],
+ '{{item_type}}'
+ ) +
+ { yaxes: $.yaxes('percentunit') } +
+ $.panelDescription(
+ 'Hit Ratio',
+ |||
+ The fraction of requests to the
+ block index cache that successfully return data.
+ Requests that miss the cache must go to
+ object storage for the underlying data.
+ |||
+ ),
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.thanosMemcachedCache('Memcached – Blocks Storage – Chunks (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'chunks-cache')
+ $.thanosMemcachedCache(
+ 'Memcached – Blocks Storage – Chunks (Store-gateway)',
+ $._config.job_names.store_gateway,
+ 'store-gateway',
+ 'chunks-cache'
+ )
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Store-gateway)', $._config.job_names.store_gateway, 'store-gateway', 'metadata-cache')
+ $.thanosMemcachedCache(
+ 'Memcached – Blocks Storage – Metadata (Store-gateway)',
+ $._config.job_names.store_gateway,
+ 'store-gateway',
+ 'metadata-cache'
+ )
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.thanosMemcachedCache('Memcached – Blocks Storage – Metadata (Querier)', $._config.job_names.querier, 'querier', 'metadata-cache')
+ $.thanosMemcachedCache(
+ 'Memcached – Blocks Storage – Metadata (Querier)',
+ $._config.job_names.querier,
+ 'querier',
+ 'metadata-cache'
+ )
)
.addRowIf(
std.member($._config.storage_engine, 'chunks') &&
@@ -225,21 +478,43 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
// Object store metrics for the store-gateway.
- .addRowIf(
+ .addRowsIf(
std.member($._config.storage_engine, 'blocks'),
- $.objectStorePanels1('Store-gateway - Blocks Object Store', 'store-gateway'),
- )
- .addRowIf(
- std.member($._config.storage_engine, 'blocks'),
- $.objectStorePanels2('', 'store-gateway'),
+ $.getObjectStoreRows('Store-gateway - Blocks Object Store', 'store-gateway')
)
// Object store metrics for the querier.
- .addRowIf(
+ .addRowsIf(
std.member($._config.storage_engine, 'blocks'),
- $.objectStorePanels1('Querier - Blocks Object Store', 'querier'),
- )
- .addRowIf(
- std.member($._config.storage_engine, 'blocks'),
- $.objectStorePanels2('', 'querier'),
+ $.getObjectStoreRows('Querier - Blocks Object Store', 'querier')
),
-}
+} +
+(
+ {
+ panelDescriptionRps(service)::  // Description mixin for the QPS panels; `service` is interpolated into the text.
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Read requests per second made to the %s(s).
+ ||| % service
+ ),
+
+ panelDescriptionLatency(service)::  // Description mixin for the aggregate latency panels (average, median, 99th percentile).
+ $.panelDescription(
+ 'Latency',
+ |||
+ Across all %s instances, the average, median
+ (50th percentile), and 99th percentile time to respond
+ to a request.
+ ||| % service
+ ),
+
+ panelDescriptionP99Latency(service)::  // Description mixin for the per-instance p99 latency panels.
+ $.panelDescription(
+ 'Per Instance P99 Latency',
+ |||
+ The 99th percentile latency for each individual
+ instance of the %s service.
+ ||| % service
+ ),
+ }
+)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index ea2ce3c3..cdf442ac 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -5,6 +5,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' })
.addClusterSelectorTemplates()
.addRow(
+ ($.row('Writes Summary') { height: '125px', showTitle: false })
+ .addPanel(
+ $.textPanel('', |||
+
+ This dashboard shows various health metrics for the Cortex write path.
+ It is broken into sections for each service on the write path,
+ and organized by the order in which the write request flows.
+
+ Incoming metrics data travels from the gateway → distributor → ingester.
+
+
+ It also includes metrics for the key-value (KV) stores used to manage
+ the High Availability Tracker and the Ingesters.
+
+ |||),
+ )
+ ).addRow(
($.row('Headlines') +
{
height: '100px',
@@ -18,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
job: $.jobMatcher($._config.job_names.distributor),
}
),
- format='reqps'
+ format='short'
)
)
.addPanel(
@@ -37,7 +54,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short')
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps')
)
)
@@ -45,76 +62,89 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Gateway')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) +
+ $.panelDescriptionRps('gateway')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) +
+ $.panelDescriptionLatency('gateway')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('gateway')
)
)
.addRow(
$.row('Distributor')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) +
+ $.panelDescriptionRps('distributor')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) +
+ $.panelDescriptionLatency('distributor')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('distributor')
)
)
.addRow(
$.row('KV Store (HA Dedupe)')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
+ $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) +
+ $.panelDescriptionRpsKvStoreDedupe()
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor))
+ utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) +
+ $.panelDescriptionLatencyKvStore()
)
)
.addRow(
$.row('Ingester')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) +
+ $.panelDescriptionRps('ingester')
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) +
+ $.panelDescriptionLatency('ingester')
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') } +
+ $.panelDescriptionP99Latency('ingester')
)
)
.addRow(
$.row('KV Store (Ring)')
.addPanel(
$.panel('QPS') +
- $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
+ $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) +
+ $.panelDescriptionRpsKvStoreRing()
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester))
+ utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) +
+ $.panelDescriptionLatencyKvStore()
)
)
.addRowIf(
@@ -189,36 +219,91 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Uploaded blocks / sec',
'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
+ ) +
+ $.panelDescription(
+ 'Uploaded blocks / sec',
+ |||
+ The rate of blocks being uploaded from the ingesters
+ to the long term storage/object store.
+ |||
),
)
.addPanel(
$.panel('Upload latency') +
- $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)),
+ $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) +
+ $.panelDescription(
+ 'Upload latency',
+ |||
+ The average, median (50th percentile), and 99th percentile time
+ the ingester takes to upload blocks to the long term storage/object store.
+ |||
+ ),
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.row('Ingester - Blocks storage - TSDB Head')
+ .addPanel(
+ $.textPanel('', |||
+
+ The ingester(s) maintain a local TSDB per-tenant on disk.
+ These panels contain metrics specific to the rate of
+ compaction of data on the ingesters’ local TSDBs.
+
+ |||),
+ )
.addPanel(
$.successFailurePanel(
'Compactions / sec',
'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
+ ) +
+ $.panelDescription(
+ 'Compactions / sec',
+ |||
+ This is the rate of compaction operations local to the ingesters,
+ where every 2 hours by default, a new TSDB block is created
+ by compacting the head block.
+ |||
),
)
.addPanel(
$.panel('Compactions latency') +
- $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)),
+ $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) +
+ $.panelDescription(
+ 'Compaction Latency',
+ |||
+ The average, median (50th percentile), and 99th percentile time
+ the ingester takes to compact the head block into a new TSDB block
+ on its local filesystem.
+ |||
+ ),
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.row('Ingester - Blocks storage - TSDB WAL')
+ .addPanel(
+ $.textPanel('', |||
+
+ These panels contain metrics for the optional write-ahead-log (WAL)
+ that can be enabled for the local TSDBs on the ingesters.
+
+ |||),
+ )
.addPanel(
$.successFailurePanel(
'WAL truncations / sec',
'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
+ ) +
+ $.panelDescription(
+ 'WAL Truncations / sec',
+ |||
+ The WAL is truncated each time a new TSDB block is written
+ (by default this is every 2h). This panel measures the rate of
+ truncations.
+ |||
),
)
.addPanel(
@@ -226,12 +311,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Checkpoints created / sec',
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
+ ) +
+ $.panelDescription(
+ 'Checkpoints created / sec',
+ |||
+ Checkpoints are created as part of the WAL truncation process.
+ This metric measures the rate of checkpoint creation.
+ |||
),
)
.addPanel(
$.panel('WAL truncations latency (includes checkpointing)') +
$.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') +
- { yaxes: $.yaxes('s') },
+ { yaxes: $.yaxes('s') } +
+ $.panelDescription(
+ 'WAL Truncations Latency (including checkpointing)',
+ |||
+ Average time taken to perform a full WAL truncation,
+ including the time taken for the checkpointing to complete.
+ |||
+ ),
)
.addPanel(
$.panel('Corruptions / sec') +
@@ -248,7 +347,71 @@ local utils = import 'mixin-utils/utils.libsonnet';
WAL: '#E24D42',
'mmap-ed chunks': '#E28A42',
},
- },
+ } +
+ $.panelDescription(
+ 'Corruptions / sec',
+ |||
+ Rate of corrupted WAL and mmap-ed chunks.
+ |||
+ ),
)
),
-}
+} +
+(
+ {
+ panelDescriptionRps(service)::
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Write requests per second made to the %s(s).
+ ||| % service
+ ),
+
+ panelDescriptionRpsKvStoreDedupe()::
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Requests per second made to the key-value store
+ that manages high-availability deduplication.
+ |||
+ ),
+
+ panelDescriptionRpsKvStoreRing()::
+ $.panelDescription(
+ 'Requests Per Second',
+ |||
+ Requests per second made to the key-value store
+ used to manage which ingesters own which metrics series.
+ |||
+ ),
+
+
+ panelDescriptionLatency(service)::
+ $.panelDescription(
+ 'Latency',
+ |||
+ Across all %s instances, the average, median
+ (50th percentile), and 99th percentile time to respond
+ to a request.
+ ||| % service
+ ),
+
+ panelDescriptionLatencyKvStore()::
+ $.panelDescription(
+ 'Latency',
+ |||
+ The average, median (50th percentile), and 99th percentile time
+ the KV store takes to respond to a request.
+ |||
+ ),
+
+ panelDescriptionP99Latency(service)::
+ $.panelDescription(
+ 'Per Instance P99 Latency',
+ |||
+ The 99th percentile latency for each individual
+ instance of the %s service.
+ ||| % service
+ ),
+ }
+)
From 77f8609226bb4a25f41926d503573a3f22d8322f Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Wed, 9 Jun 2021 00:28:41 -0400
Subject: [PATCH 02/35] fix: text replacements, repair addRows
---
.../dashboards/dashboard-utils.libsonnet | 4 +--
cortex-mixin/dashboards/reads.libsonnet | 36 +++++++++----------
cortex-mixin/dashboards/writes.libsonnet | 20 +++++------
3 files changed, 29 insertions(+), 31 deletions(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 57ae82b3..ddff873b 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -30,7 +30,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
else self,
addRows(rows)::
- addRowsIf(true, rows),
+ self.addRowsIf(true, rows),
addClusterSelectorTemplates(multi=true)::
local d = self {
@@ -379,7 +379,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.queryPanel(
|||
sum by(operation) (
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index f741b188..927086f4 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -90,7 +90,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) +
$.panelDescriptionRps('gateway')
)
@@ -111,7 +111,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Query Frontend')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) +
$.panelDescriptionRps('query frontend')
)
@@ -146,7 +146,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) +
$.panelDescriptionRps('query scheduler')
)
@@ -161,16 +161,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.textPanel('', |||
- The query results is an optional service is one of 4
- optional caches that can be deployed as part of a Cortex
- cluster to improve query performance.
- It is used by the query-frontend to cache entire results
- of queries.
+ The query results cache is one of 4 optional caches
+ that can be deployed as part of a GEM cluster to improve query performance.
+ It is used by the query-frontend to cache entire results of queries.
|||)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) +
$.panelDescriptionRps('query results')
)
@@ -183,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Querier')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) +
$.panelDescriptionRps(
'querier'
@@ -217,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) +
$.panelDescriptionRps('ingester')
)
@@ -252,7 +250,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) +
$.panelDescriptionRps('store gateway')
)
@@ -274,7 +272,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Index')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -286,7 +284,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Chunks')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -310,7 +308,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.queryPanel(
|||
sum by(operation) (
@@ -430,7 +428,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -443,7 +441,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -456,7 +454,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -469,7 +467,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index cdf442ac..84c9ab6c 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -61,7 +61,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) +
$.panelDescriptionRps('gateway')
)
@@ -82,7 +82,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Distributor')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) +
$.panelDescriptionRps('distributor')
)
@@ -103,7 +103,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('KV Store (HA Dedupe)')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) +
$.panelDescriptionRpsKvStoreDedupe()
)
@@ -116,7 +116,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Ingester')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) +
$.panelDescriptionRps('ingester')
)
@@ -137,7 +137,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('KV Store (Ring)')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) +
$.panelDescriptionRpsKvStoreRing()
)
@@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -190,7 +190,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -203,7 +203,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('QPS') +
+ $.panel('Requests Per Second') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
From c4db3e1c980fda7d9277e559d293cea5b98eaad1 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Wed, 9 Jun 2021 00:31:42 -0400
Subject: [PATCH 03/35] fix: changelog
---
CHANGELOG.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 21bca0aa..25e0f3f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,6 +12,7 @@
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319
* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319
+* [ENHANCEMENT] Added documentation text panels and descriptions to Reads and Writes dashboards.
## 1.9.0 / 2021-05-18
From 9e6c2f4f18f469a6d307de678ecf12105b01b0c6 Mon Sep 17 00:00:00 2001
From: Jennifer Villa
Date: Sun, 13 Jun 2021 14:23:58 -0500
Subject: [PATCH 04/35] Changing copy to add 'latency' as well.
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index ddff873b..7d5f8047 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -299,7 +299,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.textPanel(
'',
|||
- - The panels below summarize the rate of requests issued by %s
+ - The panels below summarize the latency and rate of requests issued by %s
to object storage, separated by operation type.
- It also includes the average, median, and 99th percentile latency
of each operation and the error rate of each operation.
From 7a7b13cecedf40c8790ca9edebb4ec8f36e91e40 Mon Sep 17 00:00:00 2001
From: Jennifer Villa
Date: Sun, 13 Jun 2021 17:04:01 -0500
Subject: [PATCH 05/35] Cut down on text from initial PR. Tucked existing text
from the compactor dashboard under tooltips, rather than making them text
boxes.
---
cortex-mixin/dashboards/compactor.libsonnet | 59 +++---
.../dashboards/dashboard-utils.libsonnet | 84 +-------
cortex-mixin/dashboards/reads.libsonnet | 185 ++++--------------
cortex-mixin/dashboards/writes.libsonnet | 155 +++------------
4 files changed, 113 insertions(+), 370 deletions(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index a1d5ea90..4be906a7 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -6,12 +6,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addClusterSelectorTemplates()
.addRow(
$.row('Summary')
- .addPanel(
- $.textPanel('', |||
- - **Per-instance runs**: number of times a compactor instance triggers a compaction across all tenants its shard manage.
- - **Tenants compaction progress**: in a multi-tenant cluster it shows the progress of tenants compacted while compaction is running. Reset to 0 once the compaction run is completed for all tenants in the shard.
- |||),
- )
.addPanel(
$.startedCompletedFailedPanel(
'Per-instance runs / sec',
@@ -20,7 +14,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor)
) +
$.bars +
- { yaxes: $.yaxes('ops') },
+ { yaxes: $.yaxes('ops') } +
+ $.panelDescription(
+ 'Per-instance runs',
+ |||
+ Number of times a compactor instance triggers a compaction across all tenants that its shard manages.
+ |||
+ ),
)
.addPanel(
$.panel('Tenants compaction progress') +
@@ -31,42 +31,55 @@ local utils = import 'mixin-utils/utils.libsonnet';
cortex_compactor_tenants_skipped{%s}
) / cortex_compactor_tenants_discovered{%s}
||| % [$.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor), $.jobMatcher($._config.job_names.compactor)], '{{%s}}' % $._config.per_instance_label) +
- { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) },
+ { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) } +
+ $.panelDescription(
+ 'Tenants compaction progress',
+ |||
+ In a multi-tenant cluster this shows the progress of tenants compacted while compaction is running.
+ Reset to 0 once the compaction run is completed for all tenants in the shard.
+ |||
+ ),
)
)
.addRow(
$.row('')
- .addPanel(
- $.textPanel('', |||
- - **Compacted blocks**: number of blocks generated as a result of a compaction operation.
- - **Per-block compaction duration**: time taken to generate a single compacted block.
- |||),
- )
.addPanel(
$.panel('Compacted blocks / sec') +
$.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
- { yaxes: $.yaxes('ops') },
+ { yaxes: $.yaxes('ops') } +
+ $.panelDescription(
+ 'Compacted Blocks / Sec',
+ |||
+ Rate of blocks generated as a result of a compaction operation
+ |||
+ ),
)
.addPanel(
$.panel('Per-block compaction duration') +
- $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor))
+ $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) +
+ $.panelDescription(
+ 'Per-block compaction duration',
+ |||
+ Time taken to generate a single compacted block
+ |||
+ ),
)
)
.addRow(
$.row('')
- .addPanel(
- $.textPanel('', |||
- - **Average blocks / tenant**: the average number of blocks per tenant.
- - **Tenants with largest number of blocks**: the 10 tenants with the largest number of blocks.
- |||),
- )
.addPanel(
$.panel('Average blocks / tenant') +
$.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'),
)
.addPanel(
$.panel('Tenants with largest number of blocks') +
- $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}'),
+ $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') +
+ $.panelDescription(
+ 'Tenants with largest number of blocks',
+ |||
+ The 10 tenants with the largest number of blocks
+ |||
+ ),
)
)
.addRow(
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 7d5f8047..8641bf3d 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -294,19 +294,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
getObjectStoreRows(title, component):: [
- ($.row(title) { height: '25px' })
- .addPanel(
- $.textPanel(
- '',
- |||
- - The panels below summarize the latency and rate of requests issued by %s
- to object storage, separated by operation type.
- - It also includes the average, median, and 99th percentile latency
- of each operation and the error rate of each operation.
- ||| % component
- )
- ),
- $.row('')
+ ($.row(title))
.addPanel(
$.panel('Operations / sec') +
$.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') +
@@ -346,39 +334,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
],
thanosMemcachedCache(title, jobName, component, cacheName)::
- local config = {
- jobMatcher: $.jobMatcher(jobName),
- component: component,
- cacheName: cacheName,
- cacheNameReadable: std.strReplace(cacheName, '-', ' '),
- };
- local panelText = {
- 'metadata-cache':
- |||
- The metadata cache
- is an optional component that the
- store-gateway and querier
- will check before going to object storage.
- This set of panels focuses on the
- %s’s use of the metadata cache.
- ||| % component,
- 'chunks-cache':
- |||
- The chunks cache
- is an optional component that the
- store-gateway
- will check before going to object storage.
- This helps reduce calls to the object store.
- |||,
- }[cacheName];
-
+ local config = {
+ jobMatcher: $.jobMatcher(jobName),
+ component: component,
+ cacheName: cacheName,
+ cacheNameReadable: std.strReplace(cacheName, '-', ' '),
+ };
super.row(title)
- .addPanel(
- $.textPanel(
- '', panelText
- )
- )
- .addPanel(
+ .addPanel(
$.panel('Requests Per Second') +
$.queryPanel(
|||
@@ -395,16 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'{{operation}}'
) +
$.stack +
- { yaxes: $.yaxes('ops') } +
- $.panelDescription(
- 'Requests Per Second',
- |||
- Requests per second made to
- the %(cacheNameReadable)s
- from the %(component)s,
- separated into request type.
- ||| % config
- ),
+ { yaxes: $.yaxes('ops') }
)
.addPanel(
$.panel('Latency (getmulti)') +
@@ -418,15 +372,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
name="%(cacheName)s"
}
||| % config
- ) +
- $.panelDescription(
- 'Latency (getmulti)',
- |||
- The average, median (50th percentile) and 99th percentile
- time to satisfy a “getmulti” request
- made by the %(component)s,
- which retrieves multiple items from the cache.
- ||| % config
)
)
.addPanel(
@@ -455,16 +400,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
||| % config,
'items'
) +
- { yaxes: $.yaxes('percentunit') } +
- $.panelDescription(
- 'Hit Ratio',
- |||
- The fraction of %(component)s requests to the
- %(cacheNameReadable)s that successfully return data.
- Requests that miss the cache must go to
- object storage for the underlying data.
- ||| % config
- ),
+ { yaxes: $.yaxes('percentunit') }
),
filterNodeDiskContainer(containerName)::
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 927086f4..af384f4f 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -5,17 +5,19 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' })
.addClusterSelectorTemplates()
.addRow(
- ($.row('Reads Summary') { height: '175px', showTitle: false })
+ ($.row('Reads Dashboard Description') { height: '175px', showTitle: false })
.addPanel(
$.textPanel('', |||
- This dashboard shows various health metrics for the Cortex read path.
+ This dashboard shows health metrics for the Cortex read path.
It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query).
+
+ For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.
- The dashboard shows metrics for the 4 optional caches that can be deployed with Cortex:
+ The dashboard also shows metrics for the 4 optional caches that can be deployed with Cortex:
the query results cache, the metadata cache, the chunks cache, and the index cache.
These panels will show “no data” if the caches are not deployed.
@@ -82,7 +84,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||
Rate of range queries per second being made to
Cortex via the /prometheus API.
- (The ruler does not issue range queries).
|||
),
)
@@ -91,42 +92,36 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Gateway')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) +
- $.panelDescriptionRps('gateway')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
- $.panelDescriptionLatency('gateway')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('gateway')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
$.row('Query Frontend')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) +
- $.panelDescriptionRps('query frontend')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
- $.panelDescriptionLatency('query frontend')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('query frontend')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
@@ -147,125 +142,77 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) +
- $.panelDescriptionRps('query scheduler')
+ $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
.addPanel(
$.panel('Latency (Time in Queue)') +
- $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) +
- $.panelDescriptionLatency('query scheduler')
+ $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
)
.addRow(
$.row('Cache - Query Results')
- .addPanel(
- $.textPanel('', |||
-
- The query results cache is one of 4 optional caches
- that can be deployed as part of a GEM cluster to improve query performance.
- It is used by the query-frontend to cache entire results of queries.
-
- |||)
- )
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) +
- $.panelDescriptionRps('query results')
+ $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) +
- $.panelDescriptionLatency('query results')
+ utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')])
)
)
.addRow(
$.row('Querier')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) +
- $.panelDescriptionRps(
- 'querier'
+ $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
)
- )
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) +
- $.panelDescriptionLatency('querier')
+ utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('querier')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
$.row('Ingester')
- .addPanel(
- $.textPanel(
- '',
- |||
-
- For short term queries, queriers go
- to the ingester to fetch the data.
-
- |||
- )
- )
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) +
- $.panelDescriptionRps('ingester')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) +
- $.panelDescriptionLatency('ingester')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('ingester')
+ { yaxes: $.yaxes('s') }
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.row('Store-gateway')
- .addPanel(
- $.textPanel(
- '',
- |||
-
- For longer term queries, queriers go to the store-gateways to
- fetch the data.
- Store-gateways are responsible for fetching the data from object
- storage.
-
- |||
- )
- )
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) +
- $.panelDescriptionRps('store gateway')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) +
- $.panelDescriptionLatency('store gateway')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('store gateway')
+ { yaxes: $.yaxes('s') }
)
)
.addRowIf(
@@ -294,19 +241,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Memcached – Blocks Storage – Block Index (Store-gateway)') // Resembles thanosMemcachedCache
- .addPanel(
- $.textPanel(
- '',
- |||
-
- The block index cache is an optional component that the
- store-gateway will check before going to object storage.
- This helps reduce calls to the object store.
-
- |||
- )
- )
+ $.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache
.addPanel(
$.panel('Requests Per Second') +
$.queryPanel(
@@ -323,16 +258,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}'
) +
$.stack +
- { yaxes: $.yaxes('ops') } +
- $.panelDescription(
- 'Requests Per Second',
- |||
- Requests per second made to
- the block index cache
- from the store-gateway,
- separated into request type.
- |||
- ),
+ { yaxes: $.yaxes('ops') }
)
.addPanel(
$.panel('Latency (getmulti)') +
@@ -346,15 +272,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
name="index-cache"
}
||| % $.jobMatcher($._config.job_names.store_gateway)
- ) +
- $.panelDescription(
- 'Latency (getmulti)',
- |||
- The average, median (50th percentile) and 99th percentile
- time to satisfy a “getmulti” request
- from the store-gateway,
- which retrieves multiple items from the cache.
- |||
)
)
.addPanel(
@@ -384,14 +301,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
],
'{{item_type}}'
) +
- { yaxes: $.yaxes('percentunit') } +
+ { yaxes: $.yaxes('percentunit') } +
$.panelDescription(
'Hit Ratio',
|||
- The fraction of requests to the
- block index cache that successfully return data.
- Requests that miss the cache must go to
- object storage for the underlying data.
+ Even if you do not set up memcached for the blocks index cache, you will still see data in this panel because Cortex by default has an
+ in-memory blocks index cache.
|||
),
)
@@ -399,7 +314,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Chunks (Store-gateway)',
+ 'Memcached – Blocks Storage – Chunks Cache (Store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'chunks-cache'
@@ -408,7 +323,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Metadata (Store-gateway)',
+ 'Memcached – Blocks Storage – Metadata Cache (Store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'metadata-cache'
@@ -417,7 +332,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Metadata (Querier)',
+ 'Memcached – Blocks Storage – Metadata Cache (Querier accesses)',
$._config.job_names.querier,
'querier',
'metadata-cache'
@@ -478,41 +393,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
// Object store metrics for the store-gateway.
.addRowsIf(
std.member($._config.storage_engine, 'blocks'),
- $.getObjectStoreRows('Store-gateway - Blocks Object Store', 'store-gateway')
+ $.getObjectStoreRows('Blocks Object Store (Store-gateway accesses)', 'store-gateway')
)
// Object store metrics for the querier.
.addRowsIf(
std.member($._config.storage_engine, 'blocks'),
- $.getObjectStoreRows('Querier - Blocks Object Store', 'querier')
+ $.getObjectStoreRows('Blocks Object Store (Querier accesses)', 'querier')
),
-} +
-(
- {
- panelDescriptionRps(service)::
- $.panelDescription(
- 'Requests Per Second',
- |||
- Read requests per second made to the %s(s).
- ||| % service
- ),
-
- panelDescriptionLatency(service)::
- $.panelDescription(
- 'Latency',
- |||
- Across all %s instances, the average, median
- (50th percentile), and 99th percentile time to respond
- to a request.
- ||| % service
- ),
-
- panelDescriptionP99Latency(service)::
- $.panelDescription(
- 'Per Instance P99 Latency',
- |||
- The 99th percentile latency for each individual
- instance of the %s service.
- ||| % service
- ),
- }
-)
+}
\ No newline at end of file
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 84c9ab6c..5486fd87 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' })
.addClusterSelectorTemplates()
.addRow(
- ($.row('Writes Summary') { height: '125px', showTitle: false })
+ ($.row('Writes Dashboard Description') { height: '125px', showTitle: false })
.addPanel(
$.textPanel('', |||
@@ -14,6 +14,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
and organized by the order in which the write request flows.
Incoming metrics data travels from the gateway → distributor → ingester.
+
+ For each service, there are 3 panels showing
+ (1) requests per second to that service,
+ (2) average, median, and p99 latency of requests to that service, and
+ (3) p99 latency of requests to each instance of that service.
It also includes metrics for the key-value (KV) stores used to manage
@@ -62,89 +67,76 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Gateway')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) +
- $.panelDescriptionRps('gateway')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) +
- $.panelDescriptionLatency('gateway')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('gateway')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
$.row('Distributor')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) +
- $.panelDescriptionRps('distributor')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) +
- $.panelDescriptionLatency('distributor')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('distributor')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
- $.row('KV Store (HA Dedupe)')
+ $.row('Key-Value store for high-availability (HA) deduplication')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) +
- $.panelDescriptionRpsKvStoreDedupe()
+ $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) +
- $.panelDescriptionLatencyKvStore()
+ utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor))
)
)
.addRow(
$.row('Ingester')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) +
- $.panelDescriptionRps('ingester')
+ $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) +
- $.panelDescriptionLatency('ingester')
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], ''
) +
- { yaxes: $.yaxes('s') } +
- $.panelDescriptionP99Latency('ingester')
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
- $.row('KV Store (Ring)')
+ $.row('Key-Value store for the ingester ring')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) +
- $.panelDescriptionRpsKvStoreRing()
+ $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) +
- $.panelDescriptionLatencyKvStore()
+ utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester))
)
)
.addRowIf(
@@ -224,7 +216,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Uploaded blocks / sec',
|||
The rate of blocks being uploaded from the ingesters
- to the long term storage/object store.
+ to object storage.
|||
),
)
@@ -235,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Upload latency',
|||
The average, median (50th percentile), and 99th percentile time
- the ingester takes to upload blocks to the long term storage/object store.
+ the ingester takes to upload blocks to object storage.
|||
),
)
@@ -243,15 +235,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.row('Ingester - Blocks storage - TSDB Head')
- .addPanel(
- $.textPanel('', |||
-
- The ingester(s) maintain a local TSDB per-tenant on disk.
- These panels contain metrics specific to the rate of
- compaction of data on the ingesters’ local TSDBs.
-
- |||),
- )
.addPanel(
$.successFailurePanel(
'Compactions / sec',
@@ -261,9 +244,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Compactions / sec',
|||
- This is the rate of compaction operations local to the ingesters,
- where every 2 hours by default, a new TSDB block is created
- by compacting the head block.
+ Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block that holds the most recent samples
+ of its active time series; the head block gets periodically compacted into a new TSDB block (by default, every 2h).
+ This panel shows the rate of compaction operations across all TSDBs on all ingesters.
|||
),
)
@@ -273,24 +256,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Compaction Latency',
|||
- The average, median (50th percentile), and 99th percentile time
- the ingester takes to compact the head block into a new TSDB block
- on its local filesystem.
+ The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks
+ on the local filesystem.
|||
),
)
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Ingester - Blocks storage - TSDB WAL')
- .addPanel(
- $.textPanel('', |||
-
- These panels contain metrics for the optional write-ahead-log (WAL)
- that can be enabled for the local TSDBs on the ingesters.
-
- |||),
- )
+ $.row('Ingester - Blocks storage - TSDB Write Ahead Log (WAL)')
.addPanel(
$.successFailurePanel(
'WAL truncations / sec',
@@ -300,8 +274,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'WAL Truncations / sec',
|||
- The WAL is truncated each time a new TSDB block is written
- (by default this is every 2h). This panel measures the rate of
+ The WAL is truncated each time a new TSDB block is written. This panel measures the rate of
truncations.
|||
),
@@ -347,71 +320,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
WAL: '#E24D42',
'mmap-ed chunks': '#E28A42',
},
- } +
- $.panelDescription(
- 'Corruptions / sec',
- |||
- Rate of corrupted WAL and mmap-ed chunks.
- |||
- ),
+ },
)
),
-} +
-(
- {
- panelDescriptionRps(service)::
- $.panelDescription(
- 'Requests Per Second',
- |||
- Write requests per second made to the %s(s).
- ||| % service
- ),
-
- panelDescriptionRpsKvStoreDedupe()::
- $.panelDescription(
- 'Requests Per Second',
- |||
- Requests per second made to the key-value store
- that manages high-availability deduplication.
- |||
- ),
-
- panelDescriptionRpsKvStoreRing()::
- $.panelDescription(
- 'Requests Per Second',
- |||
- Requests per second made to the key-value store
- used to manage which ingesters own which metrics series.
- |||
- ),
-
-
- panelDescriptionLatency(service)::
- $.panelDescription(
- 'Latency',
- |||
- Across all %s instances, the average, median
- (50th percentile), and 99th percentile time to respond
- to a request.
- ||| % service
- ),
-
- panelDescriptionLatencyKvStore()::
- $.panelDescription(
- 'Latency',
- |||
- The average, median (50th percentile), and 99th percentile time
- the KV store takes to respond to a request.
- |||
- ),
-
- panelDescriptionP99Latency(service)::
- $.panelDescription(
- 'Per Instance P99 Latency',
- |||
- The 99th percentile latency for each individual
- instance of the %s service.
- ||| % service
- ),
- }
-)
+}
\ No newline at end of file
From acc320a49bae613b543761d3c11aec88f108b263 Mon Sep 17 00:00:00 2001
From: Jennifer Villa
Date: Sun, 13 Jun 2021 17:18:20 -0500
Subject: [PATCH 06/35] Getting rid of a few space/comma errors.
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 3 +--
cortex-mixin/dashboards/reads.libsonnet | 14 +++++++-------
cortex-mixin/dashboards/writes.libsonnet | 2 +-
3 files changed, 9 insertions(+), 10 deletions(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 8641bf3d..6a6845e7 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -292,9 +292,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
type: 'text',
} + options,
-
getObjectStoreRows(title, component):: [
- ($.row(title))
+ super.row(title)
.addPanel(
$.panel('Operations / sec') +
$.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') +
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index af384f4f..d606eb61 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
@@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Query Frontend')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
$.panel('Latency') +
@@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], ''
) +
- { yaxes: $.yaxes('s') }
+ { yaxes: $.yaxes('s') }
)
)
.addRow(
@@ -146,7 +146,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Latency (Time in Queue)') +
- $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
+ $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
)
.addRow(
@@ -165,7 +165,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addPanel(
$.panel('Requests Per Second') +
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
- )
+ )
.addPanel(
$.panel('Latency') +
utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
@@ -258,7 +258,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}'
) +
$.stack +
- { yaxes: $.yaxes('ops') }
+ { yaxes: $.yaxes('ops') },
)
.addPanel(
$.panel('Latency (getmulti)') +
@@ -400,4 +400,4 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'blocks'),
$.getObjectStoreRows('Blocks Object Store (Querier accesses)', 'querier')
),
-}
\ No newline at end of file
+}
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 5486fd87..2815da0d 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -323,4 +323,4 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
)
),
-}
\ No newline at end of file
+}
From 8368248da2cec53d978b86cf9a4af305959932c4 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:27:39 -0400
Subject: [PATCH 07/35] Update CHANGELOG.md
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
CHANGELOG.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25e0f3f8..2da99946 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319
* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319
-* [ENHANCEMENT] Added documentation text panels and descriptions to Reads and Writes dashboards.
+* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards.
## 1.9.0 / 2021-05-18
From 6ad57cd0400ef0d506586e2d1a77064f9c74a7a6 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:29:38 -0400
Subject: [PATCH 08/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 4be906a7..28c55a89 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Tenants compaction progress',
|||
- In a multi-tenant cluster this shows the progress of tenants compacted while compaction is running.
+ In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running.
Reset to 0 once the compaction run is completed for all tenants in the shard.
|||
),
From c33303a9ddc26e920c29bd3d7ae2b7ee0c8fb99f Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:29:54 -0400
Subject: [PATCH 09/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 28c55a89..1867f45f 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Tenants compaction progress',
|||
In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running.
- Reset to 0 once the compaction run is completed for all tenants in the shard.
+ Reset to `0` after the compaction run is completed for all tenants in the shard.
|||
),
)
From cb7054c5d11c22e8a972a95353a7fea502307633 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:30:04 -0400
Subject: [PATCH 10/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 1867f45f..7e5b6c23 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -48,7 +48,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
{ yaxes: $.yaxes('ops') } +
$.panelDescription(
- 'Compacted Blocks / Sec',
+ 'Compacted blocks / sec',
|||
Time taken to generate a single compacted block
|||
From 357db43d38dbfcbaf9713f9783075f88c8330b6d Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:30:48 -0400
Subject: [PATCH 11/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 7e5b6c23..027297cf 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -50,7 +50,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Compacted blocks / sec',
|||
- Time taken to generate a single compacted block
+ Display the rate of blocks that are generated as a result of a compaction operation.
|||
),
)
From 19cb601ee0c98862c0a68867910172aa305cadfe Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:30:59 -0400
Subject: [PATCH 12/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 027297cf..98df4965 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Per-block compaction duration',
|||
- Rate of blocks generated as a result of a compaction operation
+ Rate of blocks that are generated as a result of a compaction operation.
|||
),
)
From 4735870909420e8ae265d14da345a5794607780d Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:31:09 -0400
Subject: [PATCH 13/35] Update cortex-mixin/dashboards/compactor.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 98df4965..c4b77074 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -77,7 +77,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Tenants with largest number of blocks',
|||
- The 10 tenants with the largest number of blocks
+ The 10 tenants with the largest number of blocks.
|||
),
)
From fa48a91df6143f5af397ff419ed4f30ce7de3b82 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 13:33:41 -0400
Subject: [PATCH 14/35] fix: formatting - limit to 4 panels per row
---
cortex-mixin/dashboards/writes.libsonnet | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 84c9ab6c..e766ad64 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -282,7 +282,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Ingester - Blocks storage - TSDB WAL')
+ ($.row('Ingester - Blocks storage - TSDB WAL') {height: "32px"})
.addPanel(
$.textPanel('', |||
@@ -291,6 +291,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||),
)
+ )
+ .addRowIf(
+ std.member($._config.storage_engine, 'blocks'),
+ ($.row('') {showTitle: false})
.addPanel(
$.successFailurePanel(
'WAL truncations / sec',
From c7b787124c3a22bc76a15b19ddd06dbcfed30d06 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 13:39:28 -0400
Subject: [PATCH 15/35] fmt
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 14 +++++++-------
cortex-mixin/dashboards/reads.libsonnet | 8 ++++----
2 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 6a6845e7..4433e28a 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -333,14 +333,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
],
thanosMemcachedCache(title, jobName, component, cacheName)::
- local config = {
- jobMatcher: $.jobMatcher(jobName),
- component: component,
- cacheName: cacheName,
- cacheNameReadable: std.strReplace(cacheName, '-', ' '),
- };
+ local config = {
+ jobMatcher: $.jobMatcher(jobName),
+ component: component,
+ cacheName: cacheName,
+ cacheNameReadable: std.strReplace(cacheName, '-', ' '),
+ };
super.row(title)
- .addPanel(
+ .addPanel(
$.panel('Requests Per Second') +
$.queryPanel(
|||
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index d606eb61..c6004e04 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -92,7 +92,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Gateway')
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
+ $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
$.panel('Latency') +
@@ -114,7 +114,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Latency') +
- utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
+ utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
@@ -142,7 +142,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.panel('Requests Per Second') +
- $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
+ $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
.addPanel(
$.panel('Latency (Time in Queue)') +
@@ -301,7 +301,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
],
'{{item_type}}'
) +
- { yaxes: $.yaxes('percentunit') } +
+ { yaxes: $.yaxes('percentunit') } +
$.panelDescription(
'Hit Ratio',
|||
From 6c0066c58a8a3781561480e95c70c186ab9b0080 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 13:40:49 -0400
Subject: [PATCH 16/35] fix: remove accidental line
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 1 -
1 file changed, 1 deletion(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 4433e28a..5627522f 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -61,7 +61,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
else d
.addTemplate('cluster', 'cortex_build_info', 'cluster')
.addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'),
- editable: true,
},
// The mixin allow specialism of the job selector depending on if its a single binary
From 773926aa3d799fac1e1bfbfcb7386d9443b85ecc Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:43:11 -0400
Subject: [PATCH 17/35] Update cortex-mixin/dashboards/dashboard-utils.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 5627522f..a1c25996 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -340,7 +340,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
};
super.row(title)
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.queryPanel(
|||
sum by(operation) (
From a12d8157a7d6676a6c657f7839be3b1711c80705 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:47:24 -0400
Subject: [PATCH 18/35] Update cortex-mixin/dashboards/reads.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/reads.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index c6004e04..b44a4501 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' })
.addClusterSelectorTemplates()
.addRow(
- ($.row('Reads Dashboard Description') { height: '175px', showTitle: false })
+ ($.row('Reads dashboard description') { height: '175px', showTitle: false })
.addPanel(
$.textPanel('', |||
From b335df9ba44b185220643b3997e68a74c351fb02 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:47:56 -0400
Subject: [PATCH 19/35] Update cortex-mixin/dashboards/reads.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/reads.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index b44a4501..9d95cb8c 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
- $.panel('Instant Queries / s') +
+ $.panel('Instant queries / sec') +
$.statPanel(|||
sum(
rate(
From 73e65cfa034919a8f5eba0adbc794ae9fa5d7043 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:48:13 -0400
Subject: [PATCH 20/35] Update cortex-mixin/dashboards/writes.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/writes.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 2815da0d..eae9436f 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -298,7 +298,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') +
{ yaxes: $.yaxes('s') } +
$.panelDescription(
- 'WAL Truncations Latency (including checkpointing)',
+ 'WAL truncations latency (including checkpointing)',
|||
Average time taken to perform a full WAL truncation,
including the time taken for the checkpointing to complete.
From 13f0fa369ed8ce4eb690d752c6f7d7cc0b60d0e4 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:49:04 -0400
Subject: [PATCH 21/35] Update cortex-mixin/dashboards/writes.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/writes.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index eae9436f..d45a6eb4 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -272,7 +272,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'WAL Truncations / sec',
+ 'WAL truncations / sec',
|||
The WAL is truncated each time a new TSDB block is written. This panel measures the rate of
truncations.
From 6c0ebb81408603f44e4da1358a16e9daad15e0ca Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:49:15 -0400
Subject: [PATCH 22/35] Update cortex-mixin/dashboards/writes.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/writes.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index d45a6eb4..df7af240 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -264,7 +264,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Ingester - Blocks storage - TSDB Write Ahead Log (WAL)')
+ $.row('Ingester - blocks storage - TSDB write ahead log (WAL)')
.addPanel(
$.successFailurePanel(
'WAL truncations / sec',
From ea7d87d10127d3c63b0007f8b65b1f9415b8bbbf Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:49:21 -0400
Subject: [PATCH 23/35] Update cortex-mixin/dashboards/writes.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/writes.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index df7af240..dc62e4c5 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -254,7 +254,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panel('Compactions latency') +
$.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) +
$.panelDescription(
- 'Compaction Latency',
+ 'Compaction latency',
|||
The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks
on the local filesystem.
From 4aed69699a08fd1f2041322c78349353ba7c1d71 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:49:28 -0400
Subject: [PATCH 24/35] Update cortex-mixin/dashboards/writes.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/writes.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index dc62e4c5..3c87f193 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -195,7 +195,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
From c411115031eef228a7f51e9459200925b4f5f068 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:50:13 -0400
Subject: [PATCH 25/35] Update cortex-mixin/dashboards/reads.libsonnet
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/reads.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 9d95cb8c..8ef201c5 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -57,7 +57,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
ruler: $.jobMatcher($._config.job_names.ruler),
}, format='reqps') +
$.panelDescription(
- 'Instant Queries Per Second',
+ 'Instant Queries per second',
|||
Rate of instant queries per second being made to the system.
Includes both queries made to the /prometheus API as
From 0c17f02c2e19fd4184282d7bfc3bd18e384da1a6 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 13:51:45 -0400
Subject: [PATCH 26/35] fix: Requests per second
---
cortex-mixin/dashboards/reads.libsonnet | 28 ++++++++++++------------
cortex-mixin/dashboards/writes.libsonnet | 20 ++++++++---------
2 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 8ef201c5..d5817222 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -91,7 +91,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
@@ -109,7 +109,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Query Frontend')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
@@ -141,7 +141,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
.addPanel(
@@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Cache - Query Results')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
@@ -163,7 +163,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Querier')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Ingester')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -200,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'blocks'),
$.row('Store-gateway')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway))
)
.addPanel(
@@ -219,7 +219,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Index')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -231,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Chunks')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -243,7 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'blocks'),
$.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.queryPanel(
|||
sum by(operation) (
@@ -343,7 +343,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -356,7 +356,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -369,7 +369,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -382,7 +382,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 3c87f193..01f116a4 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -59,14 +59,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short')
)
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps')
)
)
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
@@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Distributor')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
@@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Key-Value store for high-availability (HA) deduplication')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
@@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Ingester')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -131,7 +131,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Key-Value store for the ingester ring')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -156,7 +156,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -169,7 +169,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -182,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('Requests Per Second') +
+ $.panel('Requests per second') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
From b8ccaccfd7cee9819a20751b05a2344f0c2ec180 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 13:53:19 -0400
Subject: [PATCH 27/35] fix: text
---
cortex-mixin/dashboards/reads.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index d5817222..971c51df 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
- $.panel('Range Queries / s') +
+ $.panel('Range queries / s') +
$.statPanel(|||
sum(
rate(
From d5b14c11897ed0aecb77d9c5374ad14bcf8aa24a Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 13:58:33 -0400
Subject: [PATCH 28/35] Apply suggestions from code review as per @osg-grafana
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/reads.libsonnet | 4 ++--
cortex-mixin/dashboards/writes.libsonnet | 8 ++++----
2 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 971c51df..e73afc22 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -80,7 +80,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
queryFrontend: $.jobMatcher($._config.job_names.query_frontend),
}, format='reqps') +
$.panelDescription(
- 'Range Queries Per Second',
+ 'Range queries per second',
|||
Rate of range queries per second being made to
Cortex via the /prometheus API.
@@ -135,7 +135,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
the internal queue from the query frontend into a
separate component.
If this service is not deployed,
- these panels will show "No Data."
+ these panels will show "No data."
|||
)
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 01f116a4..ae36445b 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -5,7 +5,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' })
.addClusterSelectorTemplates()
.addRow(
- ($.row('Writes Dashboard Description') { height: '125px', showTitle: false })
+ ($.row('Writes dashboard description') { height: '125px', showTitle: false })
.addPanel(
$.textPanel('', |||
@@ -22,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
It also includes metrics for the key-value (KV) stores used to manage
- the High Availability Tracker and the Ingesters.
+ the high-availability tracker and the ingesters.
|||),
)
@@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addRow(
- $.row('Key-Value store for high-availability (HA) deduplication')
+ $.row('Key-value store for high-availability (HA) deduplication')
.addPanel(
$.panel('Requests per second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
@@ -129,7 +129,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addRow(
- $.row('Key-Value store for the ingester ring')
+ $.row('Key-value store for the ingester ring')
.addPanel(
$.panel('Requests per second') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
From b22d22e4843136a1a513c8ebcd843e7b38c4bfcd Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Tue, 15 Jun 2021 16:03:56 -0400
Subject: [PATCH 29/35] fix: clarity
---
cortex-mixin/dashboards/compactor.libsonnet | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index c4b77074..18065547 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -18,7 +18,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Per-instance runs',
|||
- Number of times a compactor instance triggers a compaction across all tenants its shard manage.
+ Number of times a compactor instance triggers a compaction across all tenants that it manages.
|||
),
)
From dffe62a8758e31048db6f747c0d586de9582b7c7 Mon Sep 17 00:00:00 2001
From: Darren Janeczek <38694490+darrenjaneczek@users.noreply.github.com>
Date: Tue, 15 Jun 2021 16:06:48 -0400
Subject: [PATCH 30/35] Apply suggestions from code review as per @osg-grafana
Co-authored-by: Ursula Kallio <73951760+osg-grafana@users.noreply.github.com>
---
cortex-mixin/dashboards/reads.libsonnet | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index e73afc22..84026a88 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -241,7 +241,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Memcached – Blocks Storage – Block Index Cache (Store-gateway accesses)') // Resembles thanosMemcachedCache
+ $.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache
.addPanel(
$.panel('Requests per second') +
$.queryPanel(
@@ -314,7 +314,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Chunks Cache (Store-gateway accesses)',
+ 'Memcached – blocks storage – chunks cache (store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'chunks-cache'
@@ -323,7 +323,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Metadata Cache (Store-gateway accesses)',
+ 'Memcached – blocks storage – metadata cache (store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'metadata-cache'
@@ -332,7 +332,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – Blocks Storage – Metadata Cache (Querier accesses)',
+ 'Memcached – blocks storage – metadata cache (querier accesses)',
$._config.job_names.querier,
'querier',
'metadata-cache'
From eafdbfc012a8c181dd9487651aed8fb649e1df19 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Thu, 17 Jun 2021 13:34:32 -0400
Subject: [PATCH 31/35] fix: query formatting to aid in merge
---
.../dashboards/dashboard-utils.libsonnet | 21 ++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index a1c25996..ea0f1592 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -403,7 +403,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
filterNodeDiskContainer(containerName)::
|||
- ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0)
+ ignoring(%s) group_right() (
+ label_replace(
+ count by(
+ %s,
+ %s,
+ device
+ )
+ (
+ container_fs_writes_bytes_total{
+ %s,
+ container="%s",
+ device!~".*sda.*"
+ }
+ ),
+ "device",
+ "$1",
+ "device",
+ "/dev/(.*)"
+ ) * 0
+ )
||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName],
panelDescription(title, description):: {
From dddd6e720842bcd78067ec701e3ec5135a61fded Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Thu, 17 Jun 2021 13:36:43 -0400
Subject: [PATCH 32/35] fix: query formatting to aid in merge
---
cortex-mixin/dashboards/dashboard-utils.libsonnet | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index ea0f1592..254619d4 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -423,7 +423,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
"/dev/(.*)"
) * 0
)
- ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName],
+ ||| % [
+ $._config.per_instance_label,
+ $._config.per_node_label,
+ $._config.per_instance_label,
+ $.namespaceMatcher(),
+ containerName,
+ ],
panelDescription(title, description):: {
description: |||
From fcc48964df5f7254c741669a24755dfca447226c Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Thu, 17 Jun 2021 15:07:29 -0400
Subject: [PATCH 33/35] fix: consistent labelling
---
cortex-mixin/dashboards/reads.libsonnet | 6 +++---
cortex-mixin/dashboards/writes.libsonnet | 20 ++++++++++----------
2 files changed, 13 insertions(+), 13 deletions(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index 84026a88..aa7d1b26 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
- $.panel('Instant queries / sec') +
+ $.panel('Instant queries per second') +
$.statPanel(|||
sum(
rate(
@@ -57,7 +57,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
ruler: $.jobMatcher($._config.job_names.ruler),
}, format='reqps') +
$.panelDescription(
- 'Instant Queries per second',
+ 'Instant queries per second',
|||
Rate of instant queries per second being made to the system.
Includes both queries made to the /prometheus API as
@@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
- $.panel('Range queries / s') +
+ $.panel('Range queries per second') +
$.statPanel(|||
sum(
rate(
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index ae36445b..799fcc4f 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -33,7 +33,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
- $.panel('Samples / s') +
+ $.panel('Samples per second') +
$.statPanel(
'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % (
$._config {
@@ -208,12 +208,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - Blocks storage - Shipper')
.addPanel(
$.successFailurePanel(
- 'Uploaded blocks / sec',
+ 'Uploaded blocks per second',
'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'Uploaded blocks / sec',
+ 'Uploaded blocks per second',
|||
The rate of blocks being uploaded from the ingesters
to object storage.
@@ -237,12 +237,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - Blocks storage - TSDB Head')
.addPanel(
$.successFailurePanel(
- 'Compactions / sec',
+ 'Compactions per second',
'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'Compactions / sec',
+ 'Compactions per second',
|||
Ingesters maintain a local TSDB per-tenant on disk. Each TSDB maintains a head block for each
active time series; these blocks get periodically compacted (by default, every 2h).
@@ -267,12 +267,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - blocks storage - TSDB write ahead log (WAL)')
.addPanel(
$.successFailurePanel(
- 'WAL truncations / sec',
+ 'WAL truncations per second',
'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'WAL truncations / sec',
+ 'WAL truncations per second',
|||
The WAL is truncated each time a new TSDB block is written. This panel measures the rate of
truncations.
@@ -281,12 +281,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.successFailurePanel(
- 'Checkpoints created / sec',
+ 'Checkpoints created per second',
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'Checkpoints created / sec',
+ 'Checkpoints created per second',
|||
Checkpoints are created as part of the WAL truncation process.
This metric measures the rate of checkpoint creation.
@@ -306,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
- $.panel('Corruptions / sec') +
+ $.panel('Corruptions per second') +
$.queryPanel([
'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
From 513b096f7087a66e739de618626c8cd2d55e3a8f Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Thu, 17 Jun 2021 15:22:50 -0400
Subject: [PATCH 34/35] fix: ensure panel titles are consistent
- Most existing "per second" panel titles in `main` are written "/ sec";
the titles changed in recent commits have been corrected to match.
---
cortex-mixin/dashboards/reads.libsonnet | 32 ++++++++++-----------
cortex-mixin/dashboards/writes.libsonnet | 36 ++++++++++++------------
2 files changed, 34 insertions(+), 34 deletions(-)
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index aa7d1b26..cb2411d4 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -35,7 +35,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
- $.panel('Instant queries per second') +
+ $.panel('Instant queries / sec') +
$.statPanel(|||
sum(
rate(
@@ -66,7 +66,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
- $.panel('Range queries per second') +
+ $.panel('Range queries / sec') +
$.statPanel(|||
sum(
rate(
@@ -91,7 +91,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
@@ -109,7 +109,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Query Frontend')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
@@ -141,7 +141,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler))
)
.addPanel(
@@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Cache - Query Results')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend))
)
.addPanel(
@@ -163,7 +163,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Querier')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -181,7 +181,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Ingester')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -200,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'blocks'),
$.row('Store-gateway')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway))
)
.addPanel(
@@ -219,7 +219,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Index')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -231,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached - Chunks storage - Chunks')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -243,7 +243,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'blocks'),
$.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.queryPanel(
|||
sum by(operation) (
@@ -343,7 +343,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -356,7 +356,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -369,7 +369,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
@@ -382,7 +382,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier))
)
.addPanel(
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index 799fcc4f..cf49e0da 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -33,7 +33,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
- $.panel('Samples per second') +
+ $.panel('Samples / sec') +
$.statPanel(
'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % (
$._config {
@@ -59,14 +59,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short')
)
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps')
)
)
.addRow(
$.row('Gateway')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
@@ -84,7 +84,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Distributor')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
@@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Key-value store for high-availability (HA) deduplication')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor))
)
.addPanel(
@@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Ingester')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -131,7 +131,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Key-value store for the ingester ring')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -143,7 +143,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.storage_engine, 'chunks'),
$.row('Memcached')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -156,7 +156,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'),
$.row('Cassandra')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -169,7 +169,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'),
$.row('BigTable')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -182,7 +182,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'),
$.row('DynamoDB')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -195,7 +195,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
std.member($._config.chunk_store_backend, 'gcs'),
$.row('GCS')
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester))
)
.addPanel(
@@ -208,12 +208,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - Blocks storage - Shipper')
.addPanel(
$.successFailurePanel(
- 'Uploaded blocks per second',
+ 'Uploaded blocks / sec',
'sum(rate(cortex_ingester_shipper_uploads_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_shipper_upload_failures_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
$.panelDescription(
- 'Uploaded blocks per second',
+ 'Uploaded blocks / sec',
|||
The rate of blocks being uploaded from the ingesters
to object storage.
@@ -237,7 +237,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - Blocks storage - TSDB Head')
.addPanel(
$.successFailurePanel(
- 'Compactions per second',
+ 'Compactions / sec',
'sum(rate(cortex_ingester_tsdb_compactions_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_compactions_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
@@ -267,7 +267,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Ingester - blocks storage - TSDB write ahead log (WAL)')
.addPanel(
$.successFailurePanel(
- 'WAL truncations per second',
+ 'WAL truncations / sec',
'sum(rate(cortex_ingester_tsdb_wal_truncations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_wal_truncations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
@@ -281,7 +281,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addPanel(
$.successFailurePanel(
- 'Checkpoints created per second',
+ 'Checkpoints created / sec',
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_total{%s}[$__rate_interval])) - sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)],
'sum(rate(cortex_ingester_tsdb_checkpoint_creations_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
) +
@@ -306,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
- $.panel('Corruptions per second') +
+ $.panel('Corruptions / sec') +
$.queryPanel([
'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),
From 5794607f1cff8140d9db9cb676b47c4db4b74936 Mon Sep 17 00:00:00 2001
From: Darren Janeczek
Date: Mon, 21 Jun 2021 10:02:15 -0400
Subject: [PATCH 35/35] fix: resolve review feedback
---
CHANGELOG.md | 2 +-
cortex-mixin/config.libsonnet | 6 ++++++
cortex-mixin/dashboards/compactor.libsonnet | 6 +++---
.../dashboards/dashboard-utils.libsonnet | 3 +--
cortex-mixin/dashboards/reads.libsonnet | 21 ++++++++++---------
cortex-mixin/dashboards/writes.libsonnet | 11 +++++-----
6 files changed, 28 insertions(+), 21 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2da99946..442c4ede 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@
* [BUGFIX] Fixed `CortexIngesterHasNotShippedBlocks` alert false positive in case an ingester instance had ingested samples in the past, then no traffic was received for a long period and then it started receiving samples again. #308
* [CHANGE] Dashboards: added overridable `job_labels` and `cluster_labels` to the configuration object as label lists to uniquely identify jobs and clusters in the metric names and group-by lists in dashboards. #319
* [CHANGE] Dashboards: `alert_aggregation_labels` has been removed from the configuration and overriding this value has been deprecated. Instead the labels are now defined by the `cluster_labels` list, and should be overridden accordingly through that list. #319
-* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards.
+* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
## 1.9.0 / 2021-05-18
diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet
index d07c61be..3e884292 100644
--- a/cortex-mixin/config.libsonnet
+++ b/cortex-mixin/config.libsonnet
@@ -58,5 +58,11 @@
// The label used to differentiate between different nodes (i.e. servers).
per_node_label: 'instance',
+
+ // Whether to show the description row at the top of each dashboard, keyed by dashboard (writes, reads).
+ show_dashboard_descriptions: {
+ writes: true,
+ reads: true,
+ },
},
}
diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet
index 18065547..aeb64491 100644
--- a/cortex-mixin/dashboards/compactor.libsonnet
+++ b/cortex-mixin/dashboards/compactor.libsonnet
@@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Tenants compaction progress',
|||
In a multi-tenant cluster, display the progress of tenants that are compacted while compaction is running.
- Reset to `0` after the compaction run is completed for all tenants in the shard.
+ Reset to 0 after the compaction run is completed for all tenants in the shard.
|||
),
)
@@ -50,7 +50,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Compacted blocks / sec',
|||
- Display the amount of time that it’s taken to generate a single compacted block.
+ Rate of blocks that are generated as a result of a compaction operation.
|||
),
)
@@ -60,7 +60,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Per-block compaction duration',
|||
- Rate of blocks that are generated as a result of a compaction operation.
+ Display the amount of time that it has taken to generate a single compacted block.
|||
),
)
diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet
index 254619d4..c965b265 100644
--- a/cortex-mixin/dashboards/dashboard-utils.libsonnet
+++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet
@@ -336,11 +336,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
jobMatcher: $.jobMatcher(jobName),
component: component,
cacheName: cacheName,
- cacheNameReadable: std.strReplace(cacheName, '-', ' '),
};
super.row(title)
.addPanel(
- $.panel('Requests per second') +
+ $.panel('Requests / sec') +
$.queryPanel(
|||
sum by(operation) (
diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet
index cb2411d4..965e0e76 100644
--- a/cortex-mixin/dashboards/reads.libsonnet
+++ b/cortex-mixin/dashboards/reads.libsonnet
@@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
'cortex-reads.json':
($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' })
.addClusterSelectorTemplates()
- .addRow(
+ .addRowIf(
+ $._config.show_dashboard_descriptions.reads,
($.row('Reads dashboard description') { height: '175px', showTitle: false })
.addPanel(
$.textPanel('', |||
@@ -12,7 +13,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
This dashboard shows health metrics for the Cortex read path.
It is broken into sections for each service on the read path, and organized by the order in which the read request flows.
- Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the age of the query).
+ Incoming queries travel from the gateway → query frontend → query scheduler → querier → ingester and/or store-gateway (depending on the time range of the query).
For each service, there are 3 panels showing (1) requests per second to that service, (2) average, median, and p99 latency of requests to that service, and (3) p99 latency of requests to each instance of that service.
@@ -42,14 +43,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
cortex_request_duration_seconds_count{
%(queryFrontend)s,
route=~"(prometheus|api_prom)_api_v1_query"
- }[1h]
+ }[$__rate_interval]
)
) +
sum(
rate(
cortex_prometheus_rule_evaluations_total{
%(ruler)s
- }[1h]
+ }[$__rate_interval]
)
)
||| % {
@@ -73,7 +74,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
cortex_request_duration_seconds_count{
%(queryFrontend)s,
route=~"(prometheus|api_prom)_api_v1_query_range"
- }[1h]
+ }[$__rate_interval]
)
)
||| % {
@@ -132,7 +133,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||
The query scheduler is an optional service that moves
- the internal queue from the query frontend into a
+ the internal queue from the query-frontend into a
separate component.
If this service is not deployed,
these panels will show "No data."
@@ -241,7 +242,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Memcached – blocks storage – block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache
+ $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache
.addPanel(
$.panel('Requests / sec') +
$.queryPanel(
@@ -314,7 +315,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – blocks storage – chunks cache (store-gateway accesses)',
+ 'Memcached – Blocks storage – Chunks cache (store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'chunks-cache'
@@ -323,7 +324,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – blocks storage – metadata cache (store-gateway accesses)',
+ 'Memcached – Blocks storage – Metadata cache (store-gateway accesses)',
$._config.job_names.store_gateway,
'store-gateway',
'metadata-cache'
@@ -332,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
$.thanosMemcachedCache(
- 'Memcached – blocks storage – metadata cache (querier accesses)',
+ 'Memcached – Blocks storage – Metadata cache (querier accesses)',
$._config.job_names.querier,
'querier',
'metadata-cache'
diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet
index cf49e0da..8a77be1c 100644
--- a/cortex-mixin/dashboards/writes.libsonnet
+++ b/cortex-mixin/dashboards/writes.libsonnet
@@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
'cortex-writes.json':
($.dashboard('Cortex / Writes') + { uid: '0156f6d15aa234d452a33a4f13c838e3' })
.addClusterSelectorTemplates()
- .addRow(
+ .addRowIf(
+ $._config.show_dashboard_descriptions.writes,
($.row('Writes dashboard description') { height: '125px', showTitle: false })
.addPanel(
$.textPanel('', |||
@@ -129,7 +130,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addRow(
- $.row('Key-value store for the ingester ring')
+ $.row('Key-value store for the ingesters ring')
.addPanel(
$.panel('Requests / sec') +
$.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester))
@@ -227,7 +228,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
'Upload latency',
|||
The average, median (50th percentile), and 99th percentile time
- the ingester takes to upload blocks to object storage.
+ the ingesters take to upload blocks to object storage.
|||
),
)
@@ -256,7 +257,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.panelDescription(
'Compaction latency',
|||
- The average, median (50th percentile), and 99th percentile time ingesters take to compact head blocks
+ The average, median (50th percentile), and 99th percentile time ingesters take to compact TSDB head blocks
on the local filesystem.
|||
),
@@ -264,7 +265,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
.addRowIf(
std.member($._config.storage_engine, 'blocks'),
- $.row('Ingester - blocks storage - TSDB write ahead log (WAL)')
+ $.row('Ingester - Blocks storage - TSDB write ahead log (WAL)')
.addPanel(
$.successFailurePanel(
'WAL truncations / sec',