From 564a959652c385ab606fb27b25597f8e2c064cf5 Mon Sep 17 00:00:00 2001
From: Tom Wilkie <tom.wilkie@gmail.com>
Date: Thu, 18 Mar 2021 12:58:12 +0000
Subject: [PATCH] Add recording rules to calculate Cortex scaling

- Update dashboard so it only shows under-provisioned services and why.
- Add sizing rules based on limits.
- Add some docs to the dashboard.

Signed-off-by: Tom Wilkie <tom@grafana.com>
---
 cortex-mixin/dashboards/scaling.libsonnet | 125 ++++----------
 cortex-mixin/recording_rules.libsonnet    | 201 ++++++++++++++++++++++
 cortex/ingester.libsonnet                 |   2 +-
 cortex/query-frontend.libsonnet           |  20 +--
 4 files changed, 249 insertions(+), 99 deletions(-)

diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet
index d1ff7bd3..11e1f795 100644
--- a/cortex-mixin/dashboards/scaling.libsonnet
+++ b/cortex-mixin/dashboards/scaling.libsonnet
@@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet';
     ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' })
     .addClusterSelectorTemplates()
     .addRow(
-      $.row('Workload-based scaling')
-      .addPanel(
-        $.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } +
-        $.tablePanel([
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-          |||,
-          |||
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:])
-            * 3 / 80e3
-          |||,
-          |||
-            label_replace(
-              sum by(cluster, namespace) (
-                cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"}
-              ) / 1e+6,
-              "deployment", "ingester", "cluster", ".*"
-            )
-            or
-            label_replace(
-              sum by (cluster, namespace) (
-                4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-                *
-                cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-                /
-                cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-              )
-              /
-              avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}),
-              "deployment", "memcached", "namespace", ".*"
-            )
+      ($.row('Cortex Service Scaling') + { height: '200px' })
+      .addPanel({
+        type: 'text',
+        title: '',
+        options: {
+          content: |||
+            This dashboard shows any services which are not scaled correctly.
+            The table below gives the required number of replicas and the reason why.
+            We only show services without enough replicas.
+
+            Reasons:
+            - **sample_rate**: There are not enough replicas to handle the
+              sample rate. Applies to distributors and ingesters.
+            - **active_series**: There are not enough replicas
+              to handle the number of active series. Applies to ingesters.
+            - **cpu_usage**: There are not enough replicas
+              based on the CPU usage of the jobs vs the resource requests.
+              Applies to all jobs.
+            - **memory_usage**: There are not enough replicas based on the memory
+              usage vs the resource requests. Applies to all jobs.
+            - **active_series_limits**: There are not enough replicas to hold 60% of the
+              sum of all the per-tenant series limits.
+            - **sample_rate_limits**: There are not enough replicas to handle 60% of the
+              sum of all the per-tenant rate limits.
          |||,
-        ], {
-          cluster: { alias: 'Cluster' },
-          namespace: { alias: 'Namespace' },
-          deployment: { alias: 'Deployment' },
-          'Value #A': { alias: 'Current Replicas', decimals: 0 },
-          'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 },
-          'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 },
-        })
-      )
+          mode: 'markdown',
+        },
+      })
     )
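+    // NB the "Required Replicas" column in the table below is computed by the
+    // cluster_namespace_deployment_reason:required_replicas:count recording
+    // rules added to recording_rules.libsonnet in this patch.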
     .addRow(
-      ($.row('Resource-based scaling') + { height: '500px' })
+      ($.row('Scaling') + { height: '400px' })
       .addPanel(
-        $.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } +
+        $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } +
         $.tablePanel([
           |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-          |||,
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-            *
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:])
-            /
-            sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
-          |||,
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-            *
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m])
-            /
-            sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
+            sort_desc(
+              cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
+                > ignoring(reason) group_left
+              cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
+            )
           |||,
         ], {
+          '__name__': { alias: 'Cluster', type: 'hidden' },
           cluster: { alias: 'Cluster' },
           namespace: { alias: 'Namespace' },
-          deployment: { alias: 'Deployment' },
-          'Value #A': { alias: 'Current Replicas', decimals: 0 },
-          'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 },
-          'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 },
+          deployment: { alias: 'Service' },
+          reason: { alias: 'Reason' },
+          'Value': { alias: 'Required Replicas', decimals: 0 },
         })
       )
   ),
diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet
index 54544d0a..85ad6d39 100644
--- a/cortex-mixin/recording_rules.libsonnet
+++ b/cortex-mixin/recording_rules.libsonnet
@@ -58,6 +58,207 @@ local utils = import 'mixin-utils/utils.libsonnet';
           },
         ],
       },
+      {
+        local _config = {
+          max_series_per_ingester: 1.5e6,
+          max_samples_per_sec_per_ingester: 80e3,
+          max_samples_per_sec_per_distributor: 240e3,
+          limit_utilisation_target: 0.6,
+        },
+        name: 'cortex_scaling_rules',
+        rules: [
+          {
+            // Convenience rule to get the number of replicas for both a deployment and a statefulset.
+            record: 'cluster_namespace_deployment:actual_replicas:count',
+            expr: |||
+              sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas)
+              or
+              sum by (cluster, namespace, deployment) (
+                label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)")
+              )
+            |||,
+          },
+          {
+            // Distributors should be able to deal with 240k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each distributor can handle 240k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
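+          // Worked example for the two distributor rules above: a p99 ingest
+          // rate of 1.2e6 samples/s needs ceil(1.2e6 / 240e3) = 5 replicas,
+          // and a summed ingestion_rate limit of 10e6 samples/s needs
+          // ceil(10e6 * 0.6 / 240e3) = 25 replicas.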
+          {
+            // We want each ingester to deal with 80k samples/s.
+            // NB we measure this at the distributors and multiply by the replication factor (3).
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                * 3 / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // Each ingester should hold no more than 1.5M series in memory.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cortex_ingester_memory_series
+                  )[24h:]
+                )
+                / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each ingester can hold 1.5M series in memory.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
+                * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of our limits,
+            // and each ingester can handle 80k samples/s.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
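+          // Worked example for the two limit-based ingester rules above: a
+          // summed max_global_series_per_user of 300e6 needs
+          // ceil(300e6 * 3 * 0.6 / 1.5e6) = 360 replicas to hold 60% of the
+          // series limits at replication factor 3.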
+          {
+            // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'memcached',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                (sum by (cluster, namespace) (
+                  cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"}
+                ) / 4)
+                /
+                avg by (cluster, namespace) (
+                  memcached_limit_bytes{job=~".+/memcached"}
+                )
+              )
+            |||,
+          },
+          {
+            // Jobs should be sized to their CPU usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'cpu_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_cpu_cores,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
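+          // Worked example for the usage-based rules: a deployment with 10
+          // replicas whose p99 summed CPU usage is 15 cores against summed
+          // requests of 10 cores needs ceil(10 * 15 / 10) = 15 replicas.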
+          {
+            // Jobs should be sized to their memory usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'memory_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      container_memory_usage_bytes,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_memory_bytes,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
+        ],
+      },
     ],
   },
 }
diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet
index a0b6ac7d..e11bee0c 100644
--- a/cortex/ingester.libsonnet
+++ b/cortex/ingester.libsonnet
@@ -43,7 +43,7 @@
     ),
 
   ingester_statefulset_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       'ingester.wal-enabled': true,
       'ingester.checkpoint-enabled': true,
diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet
index 3386a312..d64e205b 100644
--- a/cortex/query-frontend.libsonnet
+++ b/cortex/query-frontend.libsonnet
@@ -2,7 +2,7 @@
   local container = $.core.v1.container,
 
   query_frontend_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       target: 'query-frontend',
@@ -38,17 +38,17 @@
       'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
     } + (
       if $._config.queryFrontend.sharded_queries_enabled then
-      {
-        'querier.parallelise-shardable-queries': 'true',
+        {
+          'querier.parallelise-shardable-queries': 'true',
 
-        // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
-        // basically base * shard_factor * query_split_factor / num_frontends where
-        'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
+          // In-process tenant queues on frontends. We divide by the number of
+          // frontends (2 in this case) in order to apply the global limit in aggregate:
+          // basically base * shard_factor * query_split_factor / num_frontends, where base = 200.
+          'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
 
-        'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
-      } + $._config.storageConfig
-      else {}
-    ),
+          'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
+        } + $._config.storageConfig
+        else {}
+      ),
 
   query_frontend_container::
     container.new('query-frontend', $._images.query_frontend) +