Add recording rules to calculate Cortex scaling
- Update dashboard so it only shows under provisioned services and why
- Add sizing rules based on limits.
- Add some docs to the dashboard.

Signed-off-by: Tom Wilkie <[email protected]>
tomwilkie committed Mar 19, 2021
1 parent 5cf0c4f commit fbf3f98
Showing 4 changed files with 249 additions and 99 deletions.
125 changes: 37 additions & 88 deletions cortex-mixin/dashboards/scaling.libsonnet
@@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet';
($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' })
.addClusterSelectorTemplates()
.addRow(
$.row('Workload-based scaling')
.addPanel(
$.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } +
$.tablePanel([
|||
sum by (cluster, namespace, deployment) (
kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}
or
label_replace(
kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"},
"deployment", "$1", "statefulset", "(.*)"
)
)
|||,
|||
quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:])
* 3 / 80e3
|||,
|||
label_replace(
sum by(cluster, namespace) (
cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"}
) / 1e+6,
"deployment", "ingester", "cluster", ".*"
)
or
label_replace(
sum by (cluster, namespace) (
4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
*
cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
/
cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
)
/
avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}),
"deployment", "memcached", "namespace", ".*"
)
($.row('Cortex Service Scaling') + { height: '200px' })
.addPanel({
type: 'text',
title: '',
options: {
content: |||
This dashboard shows any services which are not scaled correctly.
The table below gives the required number of replicas and the reason why.
We only show services without enough replicas.
Reasons:
- **sample_rate**: There are not enough replicas to handle the
sample rate. Applies to distributor and ingesters.
- **active_series**: There are not enough replicas
to handle the number of active series. Applies to ingesters.
- **cpu_usage**: There are not enough replicas
based on the CPU usage of the jobs vs the resource requests.
Applies to all jobs.
- **memory_usage**: There are not enough replicas based on the memory
usage vs the resource requests. Applies to all jobs.
- **active_series_limits**: There are not enough replicas to hold 60% of the
sum of all the per tenant series limits.
- **sample_rate_limits**: There are not enough replicas to handle 60% of the
sum of all the per tenant rate limits.
|||,
], {
cluster: { alias: 'Cluster' },
namespace: { alias: 'Namespace' },
deployment: { alias: 'Deployment' },
'Value #A': { alias: 'Current Replicas', decimals: 0 },
'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 },
'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 },
})
)
mode: 'markdown',
},
})
)
.addRow(
($.row('Resource-based scaling') + { height: '500px' })
($.row('Scaling') + { height: '400px' })
.addPanel(
$.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } +
$.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } +
$.tablePanel([
|||
sum by (cluster, namespace, deployment) (
kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
or
label_replace(
kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
"deployment", "$1", "statefulset", "(.*)"
)
)
|||,
|||
sum by (cluster, namespace, deployment) (
kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
or
label_replace(
kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
"deployment", "$1", "statefulset", "(.*)"
)
)
*
quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:])
/
sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
|||,
|||
sum by (cluster, namespace, deployment) (
kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
or
label_replace(
kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
"deployment", "$1", "statefulset", "(.*)"
)
sort_desc(
cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
> ignoring(reason) group_left
cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
)
*
quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m])
/
sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
|||,
], {
'__name__': { alias: 'Cluster', type: 'hidden' },
cluster: { alias: 'Cluster' },
namespace: { alias: 'Namespace' },
deployment: { alias: 'Deployment' },
'Value #A': { alias: 'Current Replicas', decimals: 0 },
'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 },
'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 },
deployment: { alias: 'Service' },
reason: { alias: 'Reason' },
'Value': { alias: 'Required Replicas', decimals: 0 },
})
)
),
201 changes: 201 additions & 0 deletions cortex-mixin/recording_rules.libsonnet
@@ -58,6 +58,207 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
],
},
{
local _config = {
max_series_per_ingester: 1.5e6,
max_samples_per_sec_per_ingester: 80e3,
max_samples_per_sec_per_distributor: 240e3,
limit_utilisation_target: 0.6,
},
name: 'cortex_scaling_rules',
rules: [
{
// Convenience rule to get the number of replicas for both a deployment and a statefulset.
record: 'cluster_namespace_deployment:actual_replicas:count',
expr: |||
sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas)
or
sum by (cluster, namespace, deployment) (
label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)")
)
|||,
},
{
// Distributors should be able to deal with 240k samples/s.
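// Worked example (hypothetical numbers): if the 24h p99 of the summed
// receive rate is 1.2M samples/s, this gives ceil(1.2e6 / 240e3) = 5 distributors.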
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'distributor',
reason: 'sample_rate',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
/ %(max_samples_per_sec_per_distributor)s
)
||| % _config,
},
{
// We should be able to cover 60% of our per-tenant limits,
// and each distributor can handle 240k samples/s.
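// Worked example (hypothetical numbers): if the per-tenant ingestion_rate
// limits sum to 4M samples/s, this gives ceil(4e6 * 0.6 / 240e3) = 10 distributors.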
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'distributor',
reason: 'sample_rate_limits',
},
expr: |||
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
* %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
)
||| % _config,
},
{
// We want each ingester to handle 80k samples/s.
// NB we measure this at the distributors and multiply by the replication factor (3).
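// Worked example (hypothetical numbers): a 24h p99 receive rate of 1.2M samples/s
// at the distributors gives ceil(1.2e6 * 3 / 80e3) = 45 ingesters.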
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'ingester',
reason: 'sample_rate',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by (cluster, namespace) (
cluster_namespace_job:cortex_distributor_received_samples:rate5m
)[24h:]
)
* 3 / %(max_samples_per_sec_per_ingester)s
)
||| % _config,
},
{
// Each ingester should hold at most 1.5M series in memory.
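// Worked example (hypothetical numbers): a 24h p99 of 42M in-memory series
// gives ceil(42e6 / 1.5e6) = 28 ingesters.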
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'ingester',
reason: 'active_series',
},
expr: |||
ceil(
quantile_over_time(0.99,
sum by(cluster, namespace) (
cortex_ingester_memory_series
)[24h:]
)
/ %(max_series_per_ingester)s
)
||| % _config,
},
{
// We should be able to cover 60% of our per-tenant limits,
// and each ingester can hold 1.5M series in memory.
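// Worked example (hypothetical numbers): if the per-tenant series limits sum
// to 50M, this gives ceil(50e6 * 3 * 0.6 / 1.5e6) = 60 ingesters.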
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'ingester',
reason: 'active_series_limits',
},
expr: |||
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
* 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
)
||| % _config,
},
{
// We should be able to cover 60% of our per-tenant limits,
// and each ingester can handle 80k samples/s.
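// Worked example (hypothetical numbers): if the per-tenant ingestion_rate
// limits sum to 4M samples/s, this gives ceil(4e6 * 0.6 / 80e3) = 30 ingesters.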
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'ingester',
reason: 'sample_rate_limits',
},
expr: |||
ceil(
sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
* %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
)
||| % _config,
},
{
// Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
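// Worked example (hypothetical numbers): 2 TiB of TSDB blocks across the
// ingesters and a 4 GiB limit per memcached replica give ceil((2048 / 4) / 4) = 128 replicas.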
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
deployment: 'memcached',
reason: 'active_series',
},
expr: |||
ceil(
(sum by (cluster, namespace) (
cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"}
) / 4)
/
avg by (cluster, namespace) (
memcached_limit_bytes{job=~".+/memcached"}
)
)
|||,
},
{
// Jobs should be sized to their CPU usage.
// We do this by comparing 99th percentile usage over the last 24hrs to
// their current provisioned #replicas and resource requests.
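// Worked example (hypothetical numbers): 10 replicas whose 24h p99 CPU usage
// totals 15 cores against 10 requested cores give ceil(10 * 15 / 10) = 15 replicas.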
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
reason: 'cpu_usage',
},
expr: |||
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99,
sum by (cluster, namespace, deployment) (
label_replace(
node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
)
)[24h:5m]
)
/
sum by (cluster, namespace, deployment) (
label_replace(
kube_pod_container_resource_requests_cpu_cores,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
)
)
)
|||,
},
{
// Jobs should be sized to their Memory usage.
// We do this by comparing 99th percentile usage over the last 24hrs to
// their current provisioned #replicas and resource requests.
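// Worked example (hypothetical numbers): 10 replicas whose 24h p99 memory usage
// totals 60GiB against 40GiB of requests give ceil(10 * 60 / 40) = 15 replicas.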
record: 'cluster_namespace_deployment_reason:required_replicas:count',
labels: {
reason: 'memory_usage',
},
expr: |||
ceil(
cluster_namespace_deployment:actual_replicas:count
*
quantile_over_time(0.99,
sum by (cluster, namespace, deployment) (
label_replace(
container_memory_usage_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
)
)[24h:5m]
)
/
sum by (cluster, namespace, deployment) (
label_replace(
kube_pod_container_resource_requests_memory_bytes,
"deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
)
)
)
|||,
},
],
},
],
},
}
2 changes: 1 addition & 1 deletion cortex/ingester.libsonnet
@@ -43,7 +43,7 @@
),

ingester_statefulset_args::
$._config.grpcConfig +
$._config.grpcConfig
{
'ingester.wal-enabled': true,
'ingester.checkpoint-enabled': true,
20 changes: 10 additions & 10 deletions cortex/query-frontend.libsonnet
@@ -2,7 +2,7 @@
local container = $.core.v1.container,

query_frontend_args::
$._config.grpcConfig +
$._config.grpcConfig
{
target: 'query-frontend',

@@ -38,17 +38,17 @@
'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
} + (
if $._config.queryFrontend.sharded_queries_enabled then
{
'querier.parallelise-shardable-queries': 'true',
{
'querier.parallelise-shardable-queries': 'true',

// in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
// basically base * shard_factor * query_split_factor / num_frontends where
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
// in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
// basically base * shard_factor * query_split_factor / num_frontends where
'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),

'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
} + $._config.storageConfig
else {}
),
'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
} + $._config.storageConfig
else {}
),

query_frontend_container::
container.new('query-frontend', $._images.query_frontend) +
