From 564a959652c385ab606fb27b25597f8e2c064cf5 Mon Sep 17 00:00:00 2001
From: Tom Wilkie <tom.wilkie@gmail.com>
Date: Thu, 18 Mar 2021 12:58:12 +0000
Subject: [PATCH] Add recording rules to calculate Cortex scaling

- Update dashboard so it only shows under-provisioned services and why.
- Add sizing rules based on limits.
- Add some docs to the dashboard.

Signed-off-by: Tom Wilkie <tom@grafana.com>
---
 cortex-mixin/dashboards/scaling.libsonnet | 125 ++++----------
 cortex-mixin/recording_rules.libsonnet    | 201 ++++++++++++++++++++++
 cortex/ingester.libsonnet                 |   2 +-
 cortex/query-frontend.libsonnet           |  20 +--
 4 files changed, 249 insertions(+), 99 deletions(-)

diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet
index d1ff7bd3..11e1f795 100644
--- a/cortex-mixin/dashboards/scaling.libsonnet
+++ b/cortex-mixin/dashboards/scaling.libsonnet
@@ -6,105 +6,54 @@ local utils = import 'mixin-utils/utils.libsonnet';
     ($.dashboard('Cortex / Scaling') + { uid: '88c041017b96856c9176e07cf557bdcf' })
     .addClusterSelectorTemplates()
     .addRow(
-      $.row('Workload-based scaling')
-      .addPanel(
-        $.panel('Workload-based scaling') + { sort: { col: 1, desc: false } } +
-        $.tablePanel([
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace", deployment=~"ingester|memcached"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-          |||,
-          |||
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(cortex_distributor_received_samples_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "ingester", "cluster", ".*"))[1h:])
-              * 3 / 80e3
-          |||,
-          |||
-            label_replace(
-              sum by(cluster, namespace) (
-                cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace"}
-              ) / 1e+6,
-              "deployment", "ingester", "cluster", ".*"
-            )
-              or
-            label_replace(
-              sum by (cluster, namespace) (
-                4 * cortex_ingester_memory_series{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-                  *
-                cortex_ingester_chunk_size_bytes_sum{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-                  /
-                cortex_ingester_chunk_size_bytes_count{cluster=~"$cluster", namespace=~"$namespace", job=~".+/ingester"}
-              )
-                /
-              avg by (cluster, namespace) (memcached_limit_bytes{cluster=~"$cluster", namespace=~"$namespace", job=~".+/memcached"}),
-              "deployment", "memcached", "namespace", ".*"
-            )
+      ($.row('Cortex Service Scaling') + { height: '200px' })
+      .addPanel({
+        type: 'text',
+        title: '',
+        options: {
+          content: |||
+            This dashboard shows any services which are not scaled correctly.
+            The table below gives the required number of replicas and the reason why.
+            We only show services without enough replicas.
+
+            Reasons:
+            - **sample_rate**: There are not enough replicas to handle the
+              sample rate.  Applies to distributors and ingesters.
+            - **active_series**: There are not enough replicas
+              to handle the number of active series.  Applies to ingesters.
+            - **cpu_usage**: There are not enough replicas
+              based on the CPU usage of the jobs vs the resource requests.
+              Applies to all jobs.
+            - **memory_usage**: There are not enough replicas based on the memory
+              usage vs the resource requests.  Applies to all jobs.
+            - **active_series_limits**: There are not enough replicas to hold 60% of the
+              sum of all the per-tenant series limits.
+            - **sample_rate_limits**: There are not enough replicas to handle 60% of the
+              sum of all the per-tenant rate limits.
           |||,
-        ], {
-          cluster: { alias: 'Cluster' },
-          namespace: { alias: 'Namespace' },
-          deployment: { alias: 'Deployment' },
-          'Value #A': { alias: 'Current Replicas', decimals: 0 },
-          'Value #B': { alias: 'Required Replicas, by ingestion rate', decimals: 0 },
-          'Value #C': { alias: 'Required Replicas, by active series', decimals: 0 },
-        })
-      )
+          mode: 'markdown',
+        },
+      })
     )
     .addRow(
-      ($.row('Resource-based scaling') + { height: '500px' })
+      ($.row('Scaling') + { height: '400px' })
       .addPanel(
-        $.panel('Resource-based scaling') + { sort: { col: 1, desc: false } } +
+        $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } +
         $.tablePanel([
           |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-          |||,
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
-            )
-              *
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(rate(container_cpu_usage_seconds_total{cluster=~"$cluster", namespace=~"$namespace"}[1m]), "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:])
-              /
-            sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_cpu_cores{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
-          |||,
-          |||
-            sum by (cluster, namespace, deployment) (
-              kube_deployment_spec_replicas{cluster=~"$cluster", namespace=~"$namespace"}
-              or
-              label_replace(
-                kube_statefulset_replicas{cluster=~"$cluster", namespace=~"$namespace"},
-                "deployment", "$1", "statefulset", "(.*)"
-              )
+            sort_desc(
+              cluster_namespace_deployment_reason:required_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
+                > ignoring(reason) group_left
+              cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"}
             )
-              *
-            quantile_over_time(0.99, sum by (cluster, namespace, deployment) (label_replace(container_memory_usage_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))[24h:1m])
-              /
-            sum by (cluster, namespace, deployment) (label_replace(kube_pod_container_resource_requests_memory_bytes{cluster=~"$cluster", namespace=~"$namespace"}, "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"))
           |||,
         ], {
+          '__name__': { alias: 'Cluster', type: 'hidden' },
           cluster: { alias: 'Cluster' },
           namespace: { alias: 'Namespace' },
-          deployment: { alias: 'Deployment' },
-          'Value #A': { alias: 'Current Replicas', decimals: 0 },
-          'Value #B': { alias: 'Required Replicas, by CPU usage', decimals: 0 },
-          'Value #C': { alias: 'Required Replicas, by Memory usage', decimals: 0 },
+          deployment: { alias: 'Service' },
+          reason: { alias: 'Reason' },
+          'Value': { alias: 'Required Replicas', decimals: 0 },
         })
       )
     ),
diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet
index 54544d0a..85ad6d39 100644
--- a/cortex-mixin/recording_rules.libsonnet
+++ b/cortex-mixin/recording_rules.libsonnet
@@ -58,6 +58,207 @@ local utils = import 'mixin-utils/utils.libsonnet';
           },
         ],
       },
+      {
+        local _config = {
+          max_series_per_ingester: 1.5e6,
+          max_samples_per_sec_per_ingester: 80e3,
+          max_samples_per_sec_per_distributor: 240e3,
+          limit_utilisation_target: 0.6,
+        },
+        name: 'cortex_scaling_rules',
+        rules: [
+          {
+            // Convenience rule to get the number of replicas for both a deployment and a statefulset.
+            record: 'cluster_namespace_deployment:actual_replicas:count',
+            expr: |||
+              sum by (cluster, namespace, deployment) (kube_deployment_spec_replicas)
+                or
+              sum by (cluster, namespace, deployment) (
+                label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*)")
+              )
+            |||,
+          },
+          {
+            // Distributors should be able to deal with 240k samples/s.
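+            // Illustrative arithmetic (hypothetical numbers): at a p99 received rate of
+            // 1.2M samples/s, this asks for ceil(1.2e6 / 240e3) = 5 distributors.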
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of the per-tenant limits,
+            // and each distributor can handle 240k samples/s.
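+            // Illustrative arithmetic (hypothetical numbers): with per-tenant ingestion_rate
+            // limits summing to 10M samples/s, this asks for ceil(10e6 * 0.6 / 240e3) = 25 distributors.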
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'distributor',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
+              )
+            ||| % _config,
+          },
+          {
+            // We want each ingester to handle 80k samples/s.
+            // NB we measure this at the distributors and multiply by the replication factor (3).
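+            // Illustrative arithmetic (hypothetical numbers): at a p99 received rate of
+            // 1.2M samples/s, this asks for ceil(1.2e6 * 3 / 80e3) = 45 ingesters.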
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace) (
+                    cluster_namespace_job:cortex_distributor_received_samples:rate5m
+                  )[24h:]
+                )
+                * 3 / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // Each ingester should hold at most 1.5M series in memory.
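+            // Illustrative arithmetic (hypothetical numbers): at a p99 of 90M in-memory
+            // series, this asks for ceil(90e6 / 1.5e6) = 60 ingesters.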
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                quantile_over_time(0.99,
+                  sum by(cluster, namespace) (
+                    cortex_ingester_memory_series
+                  )[24h:]
+                )
+                / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of the per-tenant limits,
+            // and each ingester can hold 1.5M series in memory.
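+            // Illustrative arithmetic (hypothetical numbers): with per-tenant series limits
+            // summing to 50M, this asks for ceil(50e6 * 3 * 0.6 / 1.5e6) = 60 ingesters.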
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'active_series_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="max_global_series_per_user"})
+                * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // We should be able to cover 60% of the per-tenant limits,
+            // and each ingester can handle 80k samples/s.
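+            // Illustrative arithmetic (hypothetical numbers): with per-tenant ingestion_rate
+            // limits summing to 10M samples/s, this asks for ceil(10e6 * 0.6 / 80e3) = 75 ingesters.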
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'ingester',
+              reason: 'sample_rate_limits',
+            },
+            expr: |||
+              ceil(
+                sum by (cluster, namespace) (cortex_overrides{limit_name="ingestion_rate"})
+                * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
+              )
+            ||| % _config,
+          },
+          {
+            // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
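+            // Illustrative arithmetic (hypothetical numbers): with 4TiB of TSDB blocks on the
+            // ingesters' disks and 100GiB memcached instances, this asks for ceil((4096GiB / 4) / 100GiB) = 11 replicas.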
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              deployment: 'memcached',
+              reason: 'active_series',
+            },
+            expr: |||
+              ceil(
+                (sum by (cluster, namespace) (
+                  cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester"}
+                ) / 4)
+                  /
+                avg by (cluster, namespace) (
+                  memcached_limit_bytes{job=~".+/memcached"}
+                )
+              )
+            |||,
+          },
+          {
+            // Jobs should be sized to their CPU usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
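+            // Illustrative arithmetic (hypothetical numbers): with 10 replicas, a summed p99 CPU
+            // usage of 15 cores and 10 cores of requests, this asks for ceil(10 * 15 / 10) = 15 replicas.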
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'cpu_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                  *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                  /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_cpu_cores,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
+          {
+            // Jobs should be sized to their Memory usage.
+            // We do this by comparing 99th percentile usage over the last 24hrs to
+            // their current provisioned #replicas and resource requests.
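+            // Illustrative arithmetic (hypothetical numbers): with 10 replicas, a summed p99 memory
+            // usage of 60GiB and 40GiB of requests, this asks for ceil(10 * 60 / 40) = 15 replicas.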
+            record: 'cluster_namespace_deployment_reason:required_replicas:count',
+            labels: {
+              reason: 'memory_usage',
+            },
+            expr: |||
+              ceil(
+                cluster_namespace_deployment:actual_replicas:count
+                  *
+                quantile_over_time(0.99,
+                  sum by (cluster, namespace, deployment) (
+                    label_replace(
+                      container_memory_usage_bytes,
+                      "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                    )
+                  )[24h:5m]
+                )
+                  /
+                sum by (cluster, namespace, deployment) (
+                  label_replace(
+                    kube_pod_container_resource_requests_memory_bytes,
+                    "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))"
+                  )
+                )
+              )
+            |||,
+          },
+        ],
+      },
     ],
   },
 }
diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet
index a0b6ac7d..e11bee0c 100644
--- a/cortex/ingester.libsonnet
+++ b/cortex/ingester.libsonnet
@@ -43,7 +43,7 @@
     ),
 
   ingester_statefulset_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       'ingester.wal-enabled': true,
       'ingester.checkpoint-enabled': true,
diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet
index 3386a312..d64e205b 100644
--- a/cortex/query-frontend.libsonnet
+++ b/cortex/query-frontend.libsonnet
@@ -2,7 +2,7 @@
   local container = $.core.v1.container,
 
   query_frontend_args::
-    $._config.grpcConfig +
+    $._config.grpcConfig
     {
       target: 'query-frontend',
 
@@ -38,17 +38,17 @@
       'limits.per-user-override-config': '/etc/cortex/overrides.yaml',
     } + (
       if $._config.queryFrontend.sharded_queries_enabled then
-      {
-        'querier.parallelise-shardable-queries': 'true',
+        {
+          'querier.parallelise-shardable-queries': 'true',
 
-        // in process tenant queues on frontends. We divide by the number of frontends; 2 in this case in order to apply the global limit in aggregate.
-        // basically base * shard_factor * query_split_factor / num_frontends where
-        'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
+          // In-process tenant queues on the frontends. We divide by the number of frontends (2 in this case) in order to apply the global limit in aggregate.
+          // Basically: base * shard_factor * query_split_factor / num_frontends.
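+          // For illustration (hypothetical values): with shard_factor=16, query_split_factor=2
+          // and 2 frontends, this gives std.floor(200 * 16 * 2 / 2) = 3200 outstanding requests per tenant.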
+          'querier.max-outstanding-requests-per-tenant': std.floor(200 * $._config.queryFrontend.shard_factor * $._config.queryFrontend.query_split_factor / $._config.queryFrontend.replicas),
 
-        'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
-      } + $._config.storageConfig
-    else {}
-  ),
+          'querier.query-ingesters-within': $._config.queryConfig['querier.query-ingesters-within'],
+        } + $._config.storageConfig
+      else {}
+    ),
 
   query_frontend_container::
     container.new('query-frontend', $._images.query_frontend) +