From c1a711f3abb838f6c0c885d5de66da8cf46d7b7b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 10 Dec 2020 16:38:19 +0000 Subject: [PATCH 1/3] [Ruler] Dashboard changes Adds several improvements to the visibility and experience of the Ruler dashboard: - [chunks] Cache information - [chunks] Index and Chunk information based on queries - [blocks] Store-gateway information based on queries - [both] Makes the group and rule evaluation panels collapsed by default --- cortex-mixin/dashboards/ruler.libsonnet | 94 ++++++++++++++++++++----- 1 file changed, 77 insertions(+), 17 deletions(-) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index 6f19d61d..5383b52b 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -135,8 +135,83 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher('ruler')) ) ) + .addRowIf( + std.member($._config.storage_engine, 'chunks'), + $.row('Ruler - Chunks storage - Index Cache') + .addPanel( + $.panel('Total entries') + + $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Entries'), + ) + .addPanel( + $.panel('Cache Hit %') + + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + .addPanel( + 
$.panel('Churn Rate') + + $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.ruler), 'churn rate'), + ) + ) + .addRowIf( + std.member($._config.storage_engine, 'chunks'), + $.row('Ruler - Chunks storage - Store') + .addPanel( + $.panel('Index Lookups per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Series (pre-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Series (post-intersection) per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Chunks per Query') + + utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + ) + .addRowIf( + std.member($._config.storage_engine, 'blocks'), + $.row('Ruler - Blocks storage') + .addPanel( + $.panel('Number of store-gateways hit per Query') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Refetches of missing blocks per Query') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + + { yaxes: $.yaxes('short') }, + ) + .addPanel( + $.panel('Consistency checks failed') + + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / 
sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, + ) + ) .addRow( - $.row('Group Evaluations') + $.row('Notifications') + .addPanel( + $.panel('Delivery Errors') + + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') + ) + .addPanel( + $.panel('Queue Length') + + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') + ) + .addPanel( + $.panel('Dropped') + + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}') + ) + ) + .addRow( + ($.row('Group Evaluations') + { collapse: true }) .addPanel( $.panel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher('ruler'), '{{ user }}'), @@ -156,7 +231,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Rule Evaluation per User') + ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( $.panel('Latency') + $.queryPanel( @@ -164,20 +239,5 @@ local utils = import 'mixin-utils/utils.libsonnet'; '{{ user }}' ) ) - ) - .addRow( - $.row('Notifications') - .addPanel( - $.panel('Delivery Errors') + - $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') - ) - .addPanel( - $.panel('Queue Length') + - $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher('ruler'), $.jobMatcher('ruler')], '{{ user }}') - ) - .addPanel( - $.panel('Dropped') + - $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher('ruler'), '{{ user }}') - ) ), } From e1fd343c7ea6f8691388281cd9f66ee75adb1b5b Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 10 Dec 2020 18:40:09 +0000 Subject: [PATCH 2/3] Add changelog entry --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) 
diff --git a/CHANGELOG.md b/CHANGELOG.md index f799348b..23ee1ee0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] Add default present for ruler limits on all 'user' types. #221, #222 * [CHANGE] Enabled sharding for the blocks storage compactor. #218 * [ENHANCEMENT] Introduce a resources dashboard for the Alertmanager. #219 +* [ENHANCEMENT] Improves query visibility in the Ruler Dashboard for both chunks and blocks storage. #226 ## 1.5.0 / 2020-11-12 From 10902c48afd2e306e8a276e8d59578194341f333 Mon Sep 17 00:00:00 2001 From: gotjosh Date: Thu, 10 Dec 2020 18:44:17 +0000 Subject: [PATCH 3/3] Use the ruler job matcher in the Cache Hit % selector that still referenced the querier job --- cortex-mixin/dashboards/ruler.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index 5383b52b..b9347f7f 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -144,7 +144,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') + $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) 
.addPanel(