From ef99b2a6eeb73160efc5dbfcd1f0c948956842d4 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 25 Sep 2024 17:31:42 +0200 Subject: [PATCH 1/5] WIP Signed-off-by: Dimitar Dimitrov --- .../dashboards/mimir-remote-ruler-reads.json | 6 +++--- .../dashboards/mimir-writes.json | 4 ++-- .../dashboards/mimir-remote-ruler-reads.json | 6 +++--- .../dashboards/mimir-writes.json | 4 ++-- .../dashboards/dashboard-utils.libsonnet | 15 ++++++++++----- 5 files changed, 20 insertions(+), 15 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index 7f71bcc5ff1..0d03ee4584f 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -1497,7 +1497,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1546,7 +1546,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1595,7 +1595,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 028f6fd5716..95825e7a9ed 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -1275,7 +1275,7 @@ "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1324,7 +1324,7 @@ "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index 9b43b3abfb0..544f156e2b6 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -1497,7 +1497,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1546,7 +1546,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1595,7 +1595,7 @@ "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index bda925b0957..41438d5066f 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -1275,7 +1275,7 @@ "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -1324,7 +1324,7 @@ "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index a9d815d76bb..fed89173933 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -701,13 +701,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| sum by (scaler) ( label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, scaledObject, metric, scaler) keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}, "namespace", "$1", "exported_namespace", "(.*)" ) / on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}), "metric", "$1", "metric_name", "(.+)" ), "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" @@ -744,20 +746,23 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| sum by (scaler) ( label_replace( - keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (namespace, scaler, %(aggregation_labels)s, scaledObject, metric) (keda_scaler_metrics_value{%(cluster_label)s=~"$cluster", exported_namespace=~"$namespace", scaler=~"%(scaler_selector)s"}), "namespace", "$1", "exported_namespace", "(.*)" ) / on(%(aggregation_labels)s, scaledObject, metric) group_left label_replace( label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}), "metric", "$1", "metric_name", "(.+)" ), "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ) * on(%(aggregation_labels)s, scaledObject) group_left label_replace( - kube_horizontalpodautoscaler_status_current_replicas{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, horizontalpodautoscaler) (kube_horizontalpodautoscaler_status_current_replicas{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"}), "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" ) ) From 5e41f391bea6e0d0afee39a49f95271b8b6b6961 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 25 Sep 2024 17:31:42 +0200 Subject: [PATCH 2/5] mixin: fix errors on autoscaling metrics after series churn The promQL joins would fail when the series churn. For example, when there are labels added by scrapers or when the K8s exporter restarts and gets a new `pod` label. Signed-off-by: Dimitar Dimitrov --- .../mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-remote-ruler-reads.json | 2 +- .../dashboards/mimir-writes.json | 2 +- operations/mimir-mixin-compiled/dashboards/mimir-reads.json | 2 +- .../dashboards/mimir-remote-ruler-reads.json | 2 +- operations/mimir-mixin-compiled/dashboards/mimir-writes.json | 2 +- operations/mimir-mixin/dashboards/dashboard-utils.libsonnet | 3 ++- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json index 28c6bf3049e..2f1090cb231 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-reads.json @@ -2613,7 +2613,7 @@ "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json index 0d03ee4584f..c6c9d180c8b 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-remote-ruler-reads.json @@ -1436,7 +1436,7 @@ "span": 6, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json index 95825e7a9ed..482683e7016 100644 --- a/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json @@ -1373,7 +1373,7 @@ "span": 3, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json index e59ce6c7f19..452576ef574 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-reads.json @@ -2613,7 +2613,7 @@ "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json index 544f156e2b6..b12b0723d8a 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-remote-ruler-reads.json @@ -1436,7 +1436,7 @@ "span": 6, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json index 41438d5066f..176fd2f6621 100644 --- a/operations/mimir-mixin-compiled/dashboards/mimir-writes.json +++ b/operations/mimir-mixin-compiled/dashboards/mimir-writes.json @@ -1373,7 +1373,7 @@ "span": 3, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet index fed89173933..c2dd1509f3e 100644 --- a/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet +++ b/operations/mimir-mixin/dashboards/dashboard-utils.libsonnet @@ -1438,7 +1438,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; on(%(aggregation_labels)s, metric, scaledObject) group_left label_replace( label_replace( - kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"} * 0, + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{%(namespace)s, horizontalpodautoscaler=~"%(hpa_name)s"} * 0), "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ), "metric", "$1", "metric_name", "(.+)" From 609aa917faf035f8bf4d0bc250a21721ee9489b8 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 25 Sep 2024 19:54:17 +0200 Subject: [PATCH 3/5] Add CHANGELOG.md entry Signed-off-by: Dimitar Dimitrov --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index bff930218df..cbaf2d4aaf6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -151,6 +151,7 @@ * [BUGFIX] Alerts: do not fire `MimirRingMembersMismatch` during the migration to experimental ingest storage. #8727 * [BUGFIX] Dashboards: avoid over-counting of ingesters metrics when migrating to experimental ingest storage. #9170 * [BUGFIX] Dashboards: fix `job_prefix` not utilized in `jobSelector`. #9155 +* [BUGFIX] Dashboards: Fix autoscaling metrics joins when series churn. #9412 ### Jsonnet From 02920a5a78ce8dab843f03d9d7c08855b7706f97 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 25 Sep 2024 20:18:50 +0200 Subject: [PATCH 4/5] Also fix alert Signed-off-by: Dimitar Dimitrov --- CHANGELOG.md | 1 + .../mimir-mixin-compiled-baremetal/alerts.yaml | 13 ++++++++++--- operations/mimir-mixin-compiled/alerts.yaml | 13 ++++++++++--- operations/mimir-mixin/alerts/autoscaling.libsonnet | 13 ++++++++++--- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbaf2d4aaf6..8843291e871 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -152,6 +152,7 @@ * [BUGFIX] Dashboards: avoid over-counting of ingesters metrics when migrating to experimental ingest storage. #9170 * [BUGFIX] Dashboards: fix `job_prefix` not utilized in `jobSelector`. #9155 * [BUGFIX] Dashboards: Fix autoscaling metrics joins when series churn. #9412 +* [BUGFIX] Alerts: Fix autoscaling metrics joins in `MimirAutoscalerNotActive` when series churn. #9412 ### Jsonnet diff --git a/operations/mimir-mixin-compiled-baremetal/alerts.yaml b/operations/mimir-mixin-compiled-baremetal/alerts.yaml index ce4a92d556f..1fb8acf7cd4 100644 --- a/operations/mimir-mixin-compiled-baremetal/alerts.yaml +++ b/operations/mimir-mixin-compiled-baremetal/alerts.yaml @@ -964,7 +964,11 @@ groups: # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + + on(cluster, namespace, horizontalpodautoscaler) group_right + # Using `max by ()` so that series churn doesn't break the promQL join + max by (cluster, namespace, horizontalpodautoscaler) ( + label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + ) > 0), "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" ) @@ -972,8 +976,11 @@ groups: # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + and on (cluster, namespace, metric, scaledObject) ( + max by (cluster, namespace, metric, scaledObject) ( + label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0 + ) + ) for: 1h labels: severity: critical diff --git a/operations/mimir-mixin-compiled/alerts.yaml b/operations/mimir-mixin-compiled/alerts.yaml index 1a5ea6a2323..5b8551450ff 100644 --- a/operations/mimir-mixin-compiled/alerts.yaml +++ b/operations/mimir-mixin-compiled/alerts.yaml @@ -978,7 +978,11 @@ groups: # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + + on(cluster, namespace, horizontalpodautoscaler) group_right + # Using `max by ()` so that series churn doesn't break the promQL join + max by (cluster, namespace, horizontalpodautoscaler) ( + label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + ) > 0), "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" ) @@ -986,8 +990,11 @@ groups: # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + and on (cluster, namespace, metric, scaledObject) ( + max by (cluster, namespace, metric, scaledObject) ( + label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0 + ) + ) for: 1h labels: severity: critical diff --git a/operations/mimir-mixin/alerts/autoscaling.libsonnet b/operations/mimir-mixin/alerts/autoscaling.libsonnet index 28a4028afa8..eb04c86642d 100644 --- a/operations/mimir-mixin/alerts/autoscaling.libsonnet +++ b/operations/mimir-mixin/alerts/autoscaling.libsonnet @@ -13,7 +13,11 @@ # Match only Mimir namespaces. * on(%(aggregation_labels)s) group_left max by(%(aggregation_labels)s) (cortex_build_info) # Add "metric" label. - + on(%(aggregation_labels)s, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + + on(%(aggregation_labels)s, horizontalpodautoscaler) group_right + # Using `max by ()` so that series churn doesn't break the promQL join + max by (%(aggregation_labels)s, horizontalpodautoscaler) ( + label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + ) > 0), "scaledObject", "$1", "horizontalpodautoscaler", "%(hpa_prefix)s(.*)" ) @@ -21,8 +25,11 @@ # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported # by KEDA could not exist at all or being exposed with a value of 0. - and on (%(aggregation_labels)s, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + and on (%(aggregation_labels)s, metric, scaledObject) ( + max by (%(aggregation_labels)s, metric, scaledObject) ( + label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0 + ) + ) ||| % { hpa_prefix: $._config.autoscaling_hpa_prefix, aggregation_labels: $._config.alert_aggregation_labels, From 82487211ec76f38bb6227f230a5e51cc202c0345 Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 25 Sep 2024 20:20:47 +0200 Subject: [PATCH 5/5] Regenerate helm tests Signed-off-by: Dimitar Dimitrov --- .../metamonitoring/grafana-dashboards.yaml | 16 ++++++++-------- .../templates/metamonitoring/mixin-alerts.yaml | 13 ++++++++++--- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml index fa933cf45cc..4fb88c46f8e 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/grafana-dashboards.yaml @@ -20357,7 +20357,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null @@ -26151,7 +26151,7 @@ data: "span": 6, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null @@ -26212,7 +26212,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -26261,7 +26261,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -26310,7 +26310,7 @@ data: "span": 4, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*queries.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-ruler-querier\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -40428,7 +40428,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*cpu.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -40477,7 +40477,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by (scaler) (\n label_replace(\n keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"},\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", + "expr": "sum by (scaler) (\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric, scaler) keda_scaler_metrics_value{cluster=~\"$cluster\", exported_namespace=~\"$namespace\", scaler=~\".*memory.*\"},\n \"namespace\", \"$1\", \"exported_namespace\", \"(.*)\"\n )\n /\n on(cluster, namespace, scaledObject, metric) group_left label_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, scaledObject, metric_name) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"}),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n ),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n )\n)\n", "format": "time_series", "legendFormat": "{{ scaler }}", "legendLink": null @@ -40526,7 +40526,7 @@ data: "span": 3, "targets": [ { - "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0,\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", + "expr": "sum by(cluster, namespace, scaler, metric, scaledObject) (\n label_replace(\n rate(keda_scaler_errors[$__rate_interval]),\n \"namespace\", \"$1\", \"exported_namespace\", \"(.+)\"\n )\n) +\non(cluster, namespace, metric, scaledObject) group_left\nlabel_replace(\n label_replace(\n # Using `max by ()` so that series churn doesn't break the promQL join\n max by (cluster, namespace, metric, horizontalpodautoscaler) (kube_horizontalpodautoscaler_spec_target_metric{cluster=~\"$cluster\", namespace=~\"$namespace\", horizontalpodautoscaler=~\"keda-hpa-distributor\"} * 0),\n \"scaledObject\", \"$1\", \"horizontalpodautoscaler\", \"keda-hpa-(.*)\"\n ),\n \"metric\", \"$1\", \"metric_name\", \"(.+)\"\n)\n", "format": "time_series", "legendFormat": "{{scaler}} failures", "legendLink": null diff --git a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml index 7b405c7e2e3..4cb5a5f5f7f 100644 --- a/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml +++ b/operations/helm/tests/metamonitoring-values-generated/mimir-distributed/templates/metamonitoring/mixin-alerts.yaml @@ -990,7 +990,11 @@ spec: # Match only Mimir namespaces. * on(cluster, namespace) group_left max by(cluster, namespace) (cortex_build_info) # Add "metric" label. - + on(cluster, namespace, horizontalpodautoscaler) group_right label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + + on(cluster, namespace, horizontalpodautoscaler) group_right + # Using `max by ()` so that series churn doesn't break the promQL join + max by (cluster, namespace, horizontalpodautoscaler) ( + label_replace(kube_horizontalpodautoscaler_spec_target_metric*0, "metric", "$1", "metric_name", "(.+)") + ) > 0), "scaledObject", "$1", "horizontalpodautoscaler", "keda-hpa-(.*)" ) @@ -998,8 +1002,11 @@ spec: # Alert only if the scaling metric exists and is > 0. If the KEDA ScaledObject is configured to scale down 0, # then HPA ScalingActive may be false when expected to run 0 replicas. In this case, the scaling metric exported # by KEDA could not exist at all or being exposed with a value of 0. - and on (cluster, namespace, metric, scaledObject) - (label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0) + and on (cluster, namespace, metric, scaledObject) ( + max by (cluster, namespace, metric, scaledObject) ( + label_replace(keda_scaler_metrics_value, "namespace", "$0", "exported_namespace", ".+") > 0 + ) + ) for: 1h labels: severity: critical