Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update dashboards to support Helm chart deployment #427

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## master / unreleased

* [ENHANCEMENT] Update cortex-mixin to support Cortex deployment from the Helm chart. #361

## 1.11.0 / 2021-12-30

* [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`,
Expand Down
14 changes: 12 additions & 2 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,14 @@
// Enables chunks- or blocks- specific panels and dashboards.
storage_engine: ['blocks'],

// HTTP URL prefix under which the Prometheus API is available.
prometheus_http_prefix: 'prometheus',

// Set any of these to false to hide the dashboard rows of a component that is not installed.
cortex_gw_enabled: true,
query_scheduler_enabled: true,
ruler_enabled: true,

// For chunks backend, switch for chunk index type.
// May contain 'bigtable', 'dynamodb' or 'cassandra'.
chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'],
Expand Down Expand Up @@ -54,8 +62,10 @@

// Name selectors for different application instances, using the "per_instance_label".
instance_names: {
compactor: 'compactor.*',
alertmanager: 'alertmanager.*',
alertmanager: '.*alertmanager.*',
compactor: '.*compactor.*',
ingester: '.*ingester.*',
store_gateway: '.*store-gateway.*',
},

// The label used to differentiate between different nodes (i.e. servers).
Expand Down
5 changes: 3 additions & 2 deletions cortex-mixin/dashboards/alertmanager-resources.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
'alertmanager-resources.json':
($.dashboard('Cortex / Alertmanager Resources') + { uid: '68b66aed90ccab448009089544a8d6c6' })
.addClusterSelectorTemplates(false)
.addRow(
.addRowIf(
$._config.cortex_gw_enabled,
$.row('Gateway')
.addPanel(
$.containerCPUUsagePanel('CPU', $._config.job_names.gateway),
Expand Down Expand Up @@ -61,7 +62,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('')
.addPanel(
$.containerDiskSpaceUtilization('Disk Space Utilization', 'alertmanager'),
$.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.alertmanager),
)
),
}
2 changes: 1 addition & 1 deletion cortex-mixin/dashboards/compactor-resources.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.containerDiskReadsPanel('Disk Reads', 'compactor'),
)
.addPanel(
$.containerDiskSpaceUtilization('Disk Space Utilization', 'compactor'),
$.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.compactor),
)
) + {
templating+: {
Expand Down
52 changes: 17 additions & 35 deletions cortex-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
// - some links that propagate the selected cluster.
dashboard(title)::
super.dashboard(title=title, datasource=$._config.dashboard_datasource) + {
refresh: '30s',
timezone: 'browser',

addRowIf(condition, row)::
if condition
then self.addRow(row)
Expand Down Expand Up @@ -73,7 +76,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
namespaceMatcher()::
if $._config.singleBinary
then 'job=~"$job"'
else 'cluster=~"$cluster", namespace=~"$namespace"',
else 'namespace=~"$namespace"',

jobSelector(job)::
if $._config.singleBinary
Expand Down Expand Up @@ -204,15 +207,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||
sum by(%s, %s, device) (
rate(
node_disk_written_bytes_total[$__rate_interval]
%s[$__rate_interval]
)
)
+
%s
||| % [
$._config.per_node_label,
$._config.per_instance_label,
$.filterNodeDiskContainer(containerName),
$.nodeDiskContainerBytesTotal(containerName, 'writes'),
],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
Expand All @@ -225,13 +226,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
|||
sum by(%s, %s, device) (
rate(
node_disk_read_bytes_total[$__rate_interval]
%s[$__rate_interval]
)
) + %s
)
||| % [
$._config.per_node_label,
$._config.per_instance_label,
$.filterNodeDiskContainer(containerName),
$.nodeDiskContainerBytesTotal(containerName, 'reads'),
],
'{{%s}} - {{device}}' % $._config.per_instance_label
) +
Expand Down Expand Up @@ -261,9 +262,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
{ yaxes: $.yaxes('percentunit') },

containerLabelMatcher(containerName)::
if containerName == 'ingester'
then 'label_name=~"ingester.*"'
else 'label_name="%s"' % containerName,
'persistentvolumeclaim=~"%s"' % containerName,

goHeapInUsePanel(title, jobName)::
$.panel(title) +
Expand Down Expand Up @@ -467,32 +466,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
{ yaxes: $.yaxes('percentunit') }
),

filterNodeDiskContainer(containerName)::
nodeDiskContainerBytesTotal(containerName, op)::
|||
ignoring(%s) group_right() (
label_replace(
count by(
%s,
%s,
device
)
(
container_fs_writes_bytes_total{
%s,
container="%s",
device!~".*sda.*"
}
),
"device",
"$1",
"device",
"/dev/(.*)"
) * 0
)
container_fs_%s_bytes_total{
%s,
container="%s",
device!~".*sda.*|.*nvme0.*"
}
||| % [
$._config.per_instance_label,
$._config.per_node_label,
$._config.per_instance_label,
op,
$.namespaceMatcher(),
containerName,
],
Expand Down
3 changes: 2 additions & 1 deletion cortex-mixin/dashboards/queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label),
)
)
.addRow(
.addRowIf(
$._config.query_scheduler_enabled,
$.row('Query Scheduler')
.addPanel(
$.panel('Queue Duration') +
Expand Down
14 changes: 9 additions & 5 deletions cortex-mixin/dashboards/reads-resources.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
'cortex-reads-resources.json':
($.dashboard('Cortex / Reads Resources') + { uid: '2fd2cda9eea8d8af9fbc0a5960425120' })
.addClusterSelectorTemplates(false)
.addRow(
.addRowIf(
$._config.cortex_gw_enabled,
$.row('Gateway')
.addPanel(
$.containerCPUUsagePanel('CPU', $._config.job_names.gateway),
Expand All @@ -28,7 +29,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.query_frontend),
)
)
.addRow(
.addRowIf(
$._config.query_scheduler_enabled,
$.row('Query Scheduler')
.addPanel(
$.containerCPUUsagePanel('CPU', 'query-scheduler'),
Expand Down Expand Up @@ -64,7 +66,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.goHeapInUsePanel('Memory (go heap inuse)', $._config.job_names.ingester),
)
)
.addRow(
.addRowIf(
$._config.ruler_enabled,
$.row('Ruler')
.addPanel(
$.panel('Rules') +
Expand All @@ -77,7 +80,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.containerCPUUsagePanel('CPU', 'ruler'),
)
)
.addRow(
.addRowIf(
$._config.ruler_enabled,
$.row('')
.addPanel(
$.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'),
Expand Down Expand Up @@ -109,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.containerDiskReadsPanel('Disk Reads', 'store-gateway'),
)
.addPanel(
$.containerDiskSpaceUtilization('Disk Space Utilization', 'store-gateway'),
$.containerDiskSpaceUtilization('Disk Space Utilization', $._config.instance_names.store_gateway),
)
) + {
templating+: {
Expand Down
36 changes: 22 additions & 14 deletions cortex-mixin/dashboards/reads.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'dashboard-utils.libsonnet') {
local config = {
gateway_read_routes_regex: '(%s|api_prom)_api_v1_.+' % $._config.prometheus_http_prefix,
},

'cortex-reads.json':
($.dashboard('Cortex / Reads') + { uid: '8d6ba60eccc4b6eedfa329b24b1bd339' })
.addClusterSelectorTemplates()
Expand Down Expand Up @@ -42,7 +46,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
rate(
cortex_request_duration_seconds_count{
%(queryFrontend)s,
route=~"(prometheus|api_prom)_api_v1_query"
route=~"%(query_routes_regex)s"
}[$__rate_interval]
)
) +
Expand All @@ -51,10 +55,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
cortex_prometheus_rule_evaluations_total{
%(ruler)s
}[$__rate_interval]
)
) or on() vector(0)
)
||| % {
queryFrontend: $.jobMatcher($._config.job_names.query_frontend),
query_routes_regex: '(%s|api_prom)_api_v1_query' % $._config.prometheus_http_prefix,
ruler: $.jobMatcher($._config.job_names.ruler),
}, format='reqps') +
$.panelDescription(
Expand All @@ -73,12 +78,13 @@ local utils = import 'mixin-utils/utils.libsonnet';
rate(
cortex_request_duration_seconds_count{
%(queryFrontend)s,
route=~"(prometheus|api_prom)_api_v1_query_range"
route=~"%(query_range_routes_regex)s"
}[$__rate_interval]
)
)
||| % {
queryFrontend: $.jobMatcher($._config.job_names.query_frontend),
query_range_routes_regex: '(%s|api_prom)_api_v1_query_range' % $._config.prometheus_http_prefix,
}, format='reqps') +
$.panelDescription(
'Range queries per second',
Expand All @@ -89,20 +95,21 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
)
.addRow(
.addRowIf(
$._config.cortex_gw_enabled,
$.row('Gateway')
.addPanel(
$.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway))
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), config.gateway_read_routes_regex])
)
.addPanel(
$.panel('Latency') +
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', config.gateway_read_routes_regex)])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], ''
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway), config.gateway_read_routes_regex], ''
) +
{ yaxes: $.yaxes('s') }
)
Expand All @@ -111,21 +118,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Query Frontend')
.addPanel(
$.panel('Requests / sec') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend))
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.query_frontend), config.gateway_read_routes_regex])
)
.addPanel(
$.panel('Latency') +
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', config.gateway_read_routes_regex)])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], ''
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend), config.gateway_read_routes_regex], ''
) +
{ yaxes: $.yaxes('s') }
)
)
.addRow(
.addRowIf(
$._config.query_scheduler_enabled,
$.row('Query Scheduler')
.addPanel(
$.textPanel(
Expand Down Expand Up @@ -165,16 +173,16 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.row('Querier')
.addPanel(
$.panel('Requests / sec') +
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier))
$.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.querier), config.gateway_read_routes_regex])
)
.addPanel(
$.panel('Latency') +
utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')])
utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', config.gateway_read_routes_regex)])
)
.addPanel(
$.panel('Per %s p99 Latency' % $._config.per_instance_label) +
$.hiddenLegendQueryPanel(
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], ''
'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"%s"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier), config.gateway_read_routes_regex], ''
) +
{ yaxes: $.yaxes('s') }
)
Expand Down
Loading