Use alert_aggregation at the obvious locations

Signed-off-by: Whyeasy <[email protected]>
grafana · Apr 8, 2022 · 7b32d0a · 7b32d0a
1 parent ca1b6d6
commit 7b32d0a
Show file tree

Hide file tree

Showing 5 changed files with 52 additions and 43 deletions.
diff --git a/operations/mimir-mixin/alerts/alerts.libsonnet b/operations/mimir-mixin/alerts/alerts.libsonnet
@@ -435,7 +435,7 @@
           alert: $.alertName('ProvisioningTooManyWrites'),
           // 80k writes / s per ingester max.
           expr: |||
-            avg by (%(alert_aggregation_labels)s) (%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
+            avg by (%(alert_aggregation_labels)s) (%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m) > 80e3
           ||| % $._config,
           'for': '15m',
           labels: {

diff --git a/operations/mimir-mixin/alerts/blocks.libsonnet b/operations/mimir-mixin/alerts/blocks.libsonnet
@@ -14,13 +14,13 @@
             (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
             and
             # Only if the ingester has ingested samples over the last 4h.
-            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
+            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
             and
             # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
             # had ingested samples in the past, then no traffic was received for a long period and then it starts
             # receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
             # samples, while the a block shipping is expected within the next 4h.
-            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
+            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[1h] offset 4h)) > 0)
           ||| % $._config,
           labels: {
             severity: 'critical',
@@ -37,7 +37,7 @@
           expr: |||
             (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
             and
-            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(clusterLabel)s_namespace_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
+            (max by(%(alert_aggregation_labels)s, %(per_instance_label)s) (max_over_time(%(alert_aggregation_rule_prefix)s_%(per_instance_label)s:cortex_ingester_ingested_samples_total:rate1m[4h])) > 0)
           ||| % $._config,
           labels: {
             severity: 'critical',

diff --git a/operations/mimir-mixin/dashboards/scaling.libsonnet b/operations/mimir-mixin/dashboards/scaling.libsonnet
@@ -41,9 +41,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
         $.tablePanel([
           |||
             sort_desc(
-              %(clusterLabel)s_namespace_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
+              %(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
                 > ignoring(reason) group_left
-              %(clusterLabel)s_namespace_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
+              %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count{%(clusterLabel)s=~"$cluster", namespace=~"$namespace"}
             )
           ||| % $._config,
         ], {

diff --git a/operations/mimir-mixin/groups.libsonnet b/operations/mimir-mixin/groups.libsonnet
@@ -58,5 +58,14 @@
           ),
         ),
       ),
+    alert_aggregation_rule_prefix:
+      std.join(
+        '_',
+          // Split the configured labels by comma and remove whitespaces.
+          std.map(
+            function(l) std.strReplace(l, ' ', ''),
+            std.split($._config.alert_aggregation_labels, ',')
+          ),
+      ),
   },
 }
diff --git a/operations/mimir-mixin/recording_rules.libsonnet b/operations/mimir-mixin/recording_rules.libsonnet
@@ -113,9 +113,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // Convenience rule to get the number of replicas for both a deployment and a statefulset.
             // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-            record: '%(clusterLabel)s_namespace_deployment:actual_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count' % _config,
             expr: |||
-              sum by (%(clusterLabel)s, namespace, deployment) (
+              sum by (%(alert_aggregation_labels)s, deployment) (
                 label_replace(
                   kube_deployment_spec_replicas,
                   # The question mark in "(.*?)" is used to make it non-greedy, otherwise it
@@ -124,22 +124,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
                 )
               )
               or
-              sum by (%(clusterLabel)s, namespace, deployment) (
+              sum by (%(alert_aggregation_labels)s, deployment) (
                 label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?")
               )
             ||| % _config,
           },
           {
             // Distributors should be able to deal with 240k samples/s.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'distributor',
               reason: 'sample_rate',
             },
             expr: |||
               ceil(
                 quantile_over_time(0.99,
-                  sum by (%(clusterLabel)s, namespace) (
+                  sum by (%(alert_aggregation_labels)s) (
                     %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
                   )[24h:]
                 )
@@ -150,30 +150,30 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // We should be about to cover 80% of our limits,
             // and ingester can have 80k samples/s.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'distributor',
               reason: 'sample_rate_limits',
             },
             expr: |||
               ceil(
-                sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+                sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
                 * %(limit_utilisation_target)s / %(max_samples_per_sec_per_distributor)s
               )
             ||| % _config,
           },
           {
             // We want ingesters each ingester to deal with 80k samples/s.
             // NB we measure this at the distributors and multiple by RF (3).
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'ingester',
               reason: 'sample_rate',
             },
             expr: |||
               ceil(
                 quantile_over_time(0.99,
-                  sum by (%(clusterLabel)s, namespace) (
+                  sum by (%(alert_aggregation_labels)s) (
                     %(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m
                   )[24h:]
                 )
@@ -183,15 +183,15 @@ local utils = import 'mixin-utils/utils.libsonnet';
           },
           {
             // Ingester should have 1.5M series in memory
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'ingester',
               reason: 'active_series',
             },
             expr: |||
               ceil(
                 quantile_over_time(0.99,
-                  sum by(%(clusterLabel)s, namespace) (
+                  sum by(%(alert_aggregation_labels)s) (
                     cortex_ingester_memory_series
                   )[24h:]
                 )
@@ -202,47 +202,47 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // We should be about to cover 60% of our limits,
             // and ingester can have 1.5M series in memory
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'ingester',
               reason: 'active_series_limits',
             },
             expr: |||
               ceil(
-                sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
+                sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="max_global_series_per_user"})
                 * 3 * %(limit_utilisation_target)s / %(max_series_per_ingester)s
               )
             ||| % _config,
           },
           {
             // We should be about to cover 60% of our limits,
             // and ingester can have 80k samples/s.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'ingester',
               reason: 'sample_rate_limits',
             },
             expr: |||
               ceil(
-                sum by (%(clusterLabel)s, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"})
+                sum by (%(alert_aggregation_labels)s) (cortex_limits_overrides{limit_name="ingestion_rate"})
                 * %(limit_utilisation_target)s / %(max_samples_per_sec_per_ingester)s
               )
             ||| % _config,
           },
           {
             // Ingesters store 96h of data on disk - we want memcached to store 1/4 of that.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               deployment: 'memcached',
               reason: 'active_series',
             },
             expr: |||
               ceil(
-                (sum by (%(clusterLabel)s, namespace) (
+                (sum by (%(alert_aggregation_labels)s) (
                   cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"}
                 ) / 4)
                   /
-                avg by (%(clusterLabel)s, namespace) (
+                avg by (%(alert_aggregation_labels)s) (
                   memcached_limit_bytes{job=~".+/memcached"}
                 )
               )
@@ -251,9 +251,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // Convenience rule to get the CPU utilization for both a deployment and a statefulset.
             // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-            record: '%(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate' % _config,
             expr: |||
-              sum by (%(clusterLabel)s, namespace, deployment) (
+              sum by (%(alert_aggregation_labels)s, deployment) (
                 label_replace(
                   label_replace(
                     node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate,
@@ -269,7 +269,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // Convenience rule to get the CPU request for both a deployment and a statefulset.
             // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-            record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum' % _config,
             expr: |||
               # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
               # that remove resource metrics, ref:
@@ -279,7 +279,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
               # This is the old expression, compatible with kube-state-metrics < v2.0.0,
               # where kube_pod_container_resource_requests_cpu_cores was removed:
               (
-                sum by (%(clusterLabel)s, namespace, deployment) (
+                sum by (%(alert_aggregation_labels)s, deployment) (
                   label_replace(
                     label_replace(
                       kube_pod_container_resource_requests_cpu_cores,
@@ -295,7 +295,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
               # This expression is compatible with kube-state-metrics >= v1.4.0,
               # where kube_pod_container_resource_requests was introduced.
               (
-                sum by (%(clusterLabel)s, namespace, deployment) (
+                sum by (%(alert_aggregation_labels)s, deployment) (
                   label_replace(
                     label_replace(
                       kube_pod_container_resource_requests{resource="cpu"},
@@ -313,26 +313,26 @@ local utils = import 'mixin-utils/utils.libsonnet';
             // Jobs should be sized to their CPU usage.
             // We do this by comparing 99th percentile usage over the last 24hrs to
             // their current provisioned #replicas and resource requests.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               reason: 'cpu_usage',
             },
             expr: |||
               ceil(
-                %(clusterLabel)s_namespace_deployment:actual_replicas:count
+                %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
                   *
-                quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
+                quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_cpu_usage_seconds_total:sum_rate[24h])
                   /
-                %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum
+                %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_cpu_cores:sum
               )
             ||| % _config,
           },
           {
             // Convenience rule to get the Memory utilization for both a deployment and a statefulset.
             // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-            record: '%(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum' % _config,
             expr: |||
-              sum by (%(clusterLabel)s, namespace, deployment) (
+              sum by (%(alert_aggregation_labels)s, deployment) (
                 label_replace(
                   label_replace(
                     container_memory_usage_bytes,
@@ -348,7 +348,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
           {
             // Convenience rule to get the Memory request for both a deployment and a statefulset.
             // Multi-zone deployments are grouped together removing the "zone-X" suffix.
-            record: '%(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum' % _config,
             expr: |||
               # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2
               # that remove resource metrics, ref:
@@ -358,7 +358,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
               # This is the old expression, compatible with kube-state-metrics < v2.0.0,
               # where kube_pod_container_resource_requests_memory_bytes was removed:
               (
-                sum by (%(clusterLabel)s, namespace, deployment) (
+                sum by (%(alert_aggregation_labels)s, deployment) (
                   label_replace(
                     label_replace(
                       kube_pod_container_resource_requests_memory_bytes,
@@ -374,7 +374,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
               # This expression is compatible with kube-state-metrics >= v1.4.0,
               # where kube_pod_container_resource_requests was introduced.
               (
-                sum by (%(clusterLabel)s, namespace, deployment) (
+                sum by (%(alert_aggregation_labels)s, deployment) (
                   label_replace(
                     label_replace(
                       kube_pod_container_resource_requests{resource="memory"},
@@ -392,17 +392,17 @@ local utils = import 'mixin-utils/utils.libsonnet';
             // Jobs should be sized to their Memory usage.
             // We do this by comparing 99th percentile usage over the last 24hrs to
             // their current provisioned #replicas and resource requests.
-            record: '%(clusterLabel)s_namespace_deployment_reason:required_replicas:count' % _config,
+            record: '%(alert_aggregation_rule_prefix)s_deployment_reason:required_replicas:count' % _config,
             labels: {
               reason: 'memory_usage',
             },
             expr: |||
               ceil(
-                %(clusterLabel)s_namespace_deployment:actual_replicas:count
+                %(alert_aggregation_rule_prefix)s_deployment:actual_replicas:count
                   *
-                quantile_over_time(0.99, %(clusterLabel)s_namespace_deployment:container_memory_usage_bytes:sum[24h])
+                quantile_over_time(0.99, %(alert_aggregation_rule_prefix)s_deployment:container_memory_usage_bytes:sum[24h])
                   /
-                %(clusterLabel)s_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum
+                %(alert_aggregation_rule_prefix)s_deployment:kube_pod_container_resource_requests_memory_bytes:sum
               )
             ||| % _config,
           },
@@ -479,7 +479,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
         rules: [
           {
             // cortex_ingester_ingested_samples_total is per user, in this rule we want to see the sum per cluster/namespace/instance
-            record: '%s_namespace_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.clusterLabel, $._config.per_instance_label],
+            record: '%s_%s:cortex_ingester_ingested_samples_total:rate1m' % [$._config.alert_aggregation_rule_prefix, $._config.per_instance_label],
             expr: |||
               sum by(%(alert_aggregation_labels)s, %(per_instance_label)s) (rate(cortex_ingester_ingested_samples_total[1m]))
             ||| % $._config,