From afbc08eca4624b23465fd0003ab1e23030fecaf2 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Thu, 11 Apr 2024 13:45:56 +0200 Subject: [PATCH] fix cluster_id label override in mixins Signed-off-by: QuentinBisson --- .../loki-mixin-compiled-ssd/alerts.yaml | 80 +++---- .../dashboards/loki-logs.json | 20 +- .../dashboards/loki-operational.json | 176 ++++++++-------- production/loki-mixin-compiled-ssd/rules.yaml | 90 ++++---- production/loki-mixin-compiled/alerts.yaml | 80 +++---- .../dashboards/loki-logs.json | 20 +- .../dashboards/loki-operational.json | 186 ++++++++--------- production/loki-mixin-compiled/rules.yaml | 90 ++++---- production/loki-mixin/alerts.libsonnet | 8 +- .../dashboards/dashboard-loki-logs.json | 22 +- .../dashboard-loki-operational.json | 196 +++++++++--------- .../loki-canary-dashboard.libsonnet | 48 ++--- .../loki-mixin/dashboards/loki-logs.libsonnet | 5 +- .../dashboards/loki-operational.libsonnet | 10 +- .../dashboards/recording-rules.libsonnet | 45 ++-- 15 files changed, 535 insertions(+), 541 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 77f285b99c060..af06880cf698d 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -1,41 +1,41 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - message: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-logs.json b/production/loki-mixin-compiled-ssd/dashboards/loki-logs.json index 90691632b6c29..ec085acbc5729 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-logs.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-logs.json @@ -63,7 +63,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", + "expr": "sum(go_goroutines{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", "refId": "A" } ], @@ -149,7 +149,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", + "expr": "sum(go_gc_duration_seconds{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", "legendFormat": "{{quantile}}", "refId": "A" } @@ -236,7 +236,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", "refId": "A" } ], @@ -322,7 +322,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", "refId": "A" } ], @@ -408,7 +408,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -494,7 +494,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -580,7 +580,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", + "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", "legendFormat": "{{reason}}", "refId": "A" } @@ -667,7 +667,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -771,7 +771,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -836,7 +836,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", "refId": "A" } ], diff --git a/production/loki-mixin-compiled-ssd/dashboards/loki-operational.json b/production/loki-mixin-compiled-ssd/dashboards/loki-operational.json index 5610b088dceb0..cafc7a5799976 100644 --- a/production/loki-mixin-compiled-ssd/dashboards/loki-operational.json +++ b/production/loki-mixin-compiled-ssd/dashboards/loki-operational.json @@ -87,7 +87,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -183,7 +183,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -278,7 +278,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -374,7 +374,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -468,7 +468,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", + "expr": "increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", "hide": false, "interval": "", "legendFormat": "{{container}}-{{pod}}", @@ -778,7 +778,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -984,7 +984,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1084,17 +1084,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.99", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.9", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.5", "refId": "C" } @@ -1190,17 +1190,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-read\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1296,7 +1296,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1395,17 +1395,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1501,7 +1501,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1602,7 +1602,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1727,7 +1727,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", "format": "table", "instant": true, "interval": "", @@ -1890,7 +1890,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(loki|enterprise-logs)-write.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"(loki|enterprise-logs)-write.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -1985,7 +1985,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -2048,7 +2048,7 @@ "panels": [ ], "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"} |= \"level=error\"", "refId": "A" } ], @@ -2099,7 +2099,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -2189,7 +2189,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2278,7 +2278,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2367,7 +2367,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2474,7 +2474,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}))", + "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=~\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -2565,7 +2565,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -2675,13 +2675,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -2759,7 +2759,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=~\"$cluster\",job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -2914,7 +2914,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=~\"$cluster\", job=~\"($namespace)/(loki|enterprise-logs)-write\"}[1m]))", "format": "heatmap", "instant": false, "interval": "", @@ -3097,7 +3097,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"(loki|enterprise-logs)-read.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"(loki|enterprise-logs)-read.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -3192,7 +3192,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -3255,7 +3255,7 @@ "panels": [ ], "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"} |= \"level=error\"", "refId": "A" } ], @@ -3306,7 +3306,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/(loki|enterprise-logs)-read\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -3415,19 +3415,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -3519,7 +3519,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -3627,19 +3627,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -3731,7 +3731,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -3837,17 +3837,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -3935,20 +3935,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4038,20 +4038,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4141,17 +4141,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -4239,7 +4239,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4329,7 +4329,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4419,7 +4419,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4509,7 +4509,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4617,19 +4617,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4721,7 +4721,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4825,7 +4825,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -4911,7 +4911,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -4997,7 +4997,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5083,7 +5083,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5169,17 +5169,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".5", "refId": "C" } @@ -5269,19 +5269,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5372,7 +5372,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5479,19 +5479,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5582,7 +5582,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5689,19 +5689,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5792,7 +5792,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5899,19 +5899,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6002,7 +6002,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" diff --git a/production/loki-mixin-compiled-ssd/rules.yaml b/production/loki-mixin-compiled-ssd/rules.yaml index 2a54ed4fb2e5b..5893770570f6e 100644 --- a/production/loki-mixin-compiled-ssd/rules.yaml +++ b/production/loki-mixin-compiled-ssd/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 77f285b99c060..af06880cf698d 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -1,41 +1,41 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - message: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled/dashboards/loki-logs.json b/production/loki-mixin-compiled/dashboards/loki-logs.json index 90691632b6c29..ec085acbc5729 100644 --- a/production/loki-mixin-compiled/dashboards/loki-logs.json +++ b/production/loki-mixin-compiled/dashboards/loki-logs.json @@ -63,7 +63,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", + "expr": "sum(go_goroutines{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", "refId": "A" } ], @@ -149,7 +149,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", + "expr": "sum(go_gc_duration_seconds{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", "legendFormat": "{{quantile}}", "refId": "A" } @@ -236,7 +236,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", "refId": "A" } ], @@ -322,7 +322,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", "refId": "A" } ], @@ -408,7 +408,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -494,7 +494,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -580,7 +580,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", + "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", "legendFormat": "{{reason}}", "refId": "A" } @@ -667,7 +667,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -771,7 +771,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -836,7 +836,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", "refId": "A" } ], diff --git a/production/loki-mixin-compiled/dashboards/loki-operational.json b/production/loki-mixin-compiled/dashboards/loki-operational.json index 133dbc27b51b5..5d82f7b638028 100644 --- a/production/loki-mixin-compiled/dashboards/loki-operational.json +++ b/production/loki-mixin-compiled/dashboards/loki-operational.json @@ -87,7 +87,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -183,7 +183,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/distributor\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -279,7 +279,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(5, sum by (name,level) (rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\"}[$__interval])) - \nsum by (name,level) (rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\"}[$__interval] offset 1h)))", + "expr": "topk(5, sum by (name,level) (rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\"}[$__interval])) - \nsum by (name,level) (rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\"}[$__interval] offset 1h)))", "legendFormat": "{{name}}-{{level}}", "refId": "A" } @@ -374,7 +374,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -470,7 +470,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -564,7 +564,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", + "expr": "increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", "hide": false, "interval": "", "legendFormat": "{{container}}-{{pod}}", @@ -874,7 +874,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1080,7 +1080,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1180,17 +1180,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.99", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.9", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.5", "refId": "C" } @@ -1286,17 +1286,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1392,7 +1392,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1491,17 +1491,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1597,7 +1597,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1698,7 +1698,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1823,7 +1823,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", "format": "table", "instant": true, "interval": "", @@ -1986,7 +1986,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"distributor.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"distributor.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -2081,7 +2081,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -2144,7 +2144,7 @@ "panels": [ ], "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"} |= \"level=error\"", "refId": "A" } ], @@ -2195,7 +2195,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/distributor\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -2285,7 +2285,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2374,7 +2374,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2463,7 +2463,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2659,7 +2659,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"ingester.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"ingester.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -2754,7 +2754,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -2817,7 +2817,7 @@ "panels": [ ], "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"} |= \"level=error\"", "refId": "A" } ], @@ -2868,7 +2868,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/ingester.*\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -2976,7 +2976,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}))", + "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\"}))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3067,7 +3067,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3177,13 +3177,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=~\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -3261,7 +3261,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=~\"$cluster\",job=~\"($namespace)/ingester.*\"}[1m])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -3416,7 +3416,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=~\"$cluster\", job=~\"($namespace)/ingester.*\"}[1m]))", "format": "heatmap", "instant": false, "interval": "", @@ -3599,7 +3599,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"querier.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"querier.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -3694,7 +3694,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -3757,7 +3757,7 @@ "panels": [ ], "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"} |= \"level=error\"", "refId": "A" } ], @@ -3808,7 +3808,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -3917,19 +3917,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -4021,7 +4021,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -4129,19 +4129,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4233,7 +4233,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4339,17 +4339,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -4437,20 +4437,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4540,20 +4540,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4643,17 +4643,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -4741,7 +4741,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4831,7 +4831,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4921,7 +4921,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -5011,7 +5011,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -5119,19 +5119,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5223,7 +5223,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5327,7 +5327,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5413,7 +5413,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5499,7 +5499,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5585,7 +5585,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5671,17 +5671,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".5", "refId": "C" } @@ -5771,19 +5771,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5874,7 +5874,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5981,19 +5981,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6084,7 +6084,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6191,19 +6191,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6294,7 +6294,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6401,19 +6401,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6504,7 +6504,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" diff --git a/production/loki-mixin-compiled/rules.yaml b/production/loki-mixin-compiled/rules.yaml index 2a54ed4fb2e5b..5893770570f6e 100644 --- a/production/loki-mixin-compiled/rules.yaml +++ b/production/loki-mixin-compiled/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin/alerts.libsonnet b/production/loki-mixin/alerts.libsonnet index 0045cc194ba3a..e38673d728dc1 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -54,16 +54,16 @@ { alert: 'LokiTooManyCompactorsRunning', expr: ||| - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - |||, + sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1 + ||| % $._config.per_cluster_label, 'for': '5m', labels: { severity: 'warning', }, annotations: { - message: ||| + message: std.strReplace(||| {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - |||, + |||, 'cluster', $._config.per_cluster_label), }, }, ], diff --git a/production/loki-mixin/dashboards/dashboard-loki-logs.json b/production/loki-mixin/dashboards/dashboard-loki-logs.json index bcb5737aab52c..e434321c1c8da 100644 --- a/production/loki-mixin/dashboards/dashboard-loki-logs.json +++ b/production/loki-mixin/dashboards/dashboard-loki-logs.json @@ -65,7 +65,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", + "expr": "sum(go_goroutines{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", "refId": "A" } ], @@ -151,7 +151,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", + "expr": "sum(go_gc_duration_seconds{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", "legendFormat": "{{quantile}}", "refId": "A" } @@ -238,7 +238,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", "refId": "A" } ], @@ -324,7 +324,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", "refId": "A" } ], @@ -410,7 +410,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -496,7 +496,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -582,7 +582,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", + "expr": "increase(kube_pod_container_status_last_terminated_reason{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[30m]) > 0", "legendFormat": "{{reason}}", "refId": "A" } @@ -669,7 +669,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", + "expr": "sum(rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -773,7 +773,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\" } |logfmt| level=\"$level\" |= \"$filter\" [5m])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -838,7 +838,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"} | logfmt | level=\"$level\" |= \"$filter\"", "refId": "A" } ], @@ -874,4 +874,4 @@ "title": "Logs", "uid": "a7e130cb82be229d6f3edbfd0a438001", "version": 9 -} \ No newline at end of file +} diff --git a/production/loki-mixin/dashboards/dashboard-loki-operational.json b/production/loki-mixin/dashboards/dashboard-loki-operational.json index 2dd944c202984..9a64301505a6a 100644 --- a/production/loki-mixin/dashboards/dashboard-loki-operational.json +++ b/production/loki-mixin/dashboards/dashboard-loki-operational.json @@ -13,7 +13,7 @@ { "datasource": "$loki_datasource", "enable": true, - "expr": "{cluster=\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"$namespace\"", + "expr": "{cluster=~\"$cluster\", container=\"kube-diff-logger\"} | json | namespace_extracted=\"$namespace\"", "hide": true, "iconColor": "rgba(255, 96, 96, 1)", "name": "deployments", @@ -90,7 +90,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_query|api_prom_label|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\")\n)", "legendFormat": "{{status}}", "refId": "A" } @@ -185,7 +185,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (\nlabel_replace(\n label_replace(\n rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"$namespace/cortex-gw(-internal)?\", route=~\"api_prom_push|loki_api_v1_push\"}[5m]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n\"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "legendFormat": "{{status}}", "refId": "A" } @@ -280,7 +280,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(5, sum by (name,level) (rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\"}[$__interval])) - \nsum by (name,level) (rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\"}[$__interval] offset 1h)))", + "expr": "topk(5, sum by (name,level) (rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\"}[$__interval])) - \nsum by (name,level) (rate(promtail_custom_bad_words_total{cluster=~\"$cluster\", exported_namespace=\"$namespace\"}[$__interval] offset 1h)))", "legendFormat": "{{name}}-{{level}}", "refId": "A" } @@ -374,7 +374,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", + "expr": "topk(10, sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant))", "legendFormat": "{{tenant}}", "refId": "A" } @@ -469,7 +469,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", + "expr": "topk(10, sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (tenant)) / 1024 / 1024", "legendFormat": "{{tenant}}", "refId": "A" } @@ -562,7 +562,7 @@ "steppedLine": false, "targets": [ { - "expr": "increase(kube_pod_container_status_restarts_total{cluster=\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", + "expr": "increase(kube_pod_container_status_restarts_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[10m]) > 0", "hide": false, "interval": "", "legendFormat": "{{container}}-{{pod}}", @@ -869,7 +869,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1073,7 +1073,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=\"/logproto.Pusher/Push\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1172,17 +1172,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.99", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.9", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"}))", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"}))", "legendFormat": "{{route}}-.5", "refId": "C" } @@ -1277,17 +1277,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/querier\", route=~\"api_prom_query|api_prom_labels|api_prom_label_name_values|loki_api_v1_query|loki_api_v1_query_range|loki_api_v1_label|loki_api_v1_label_name_values\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1382,7 +1382,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1480,17 +1480,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.99, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".99-{{route}}", "refId": "A" }, { - "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.9, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".9-{{route}}", "refId": "B" }, { - "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=\"$cluster\"})) * 1e3", + "expr": "histogram_quantile(0.5, sum by (le,route) (cluster_job_route:loki_request_duration_seconds_bucket:sum_rate{job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\", cluster=~\"$cluster\"})) * 1e3", "legendFormat": ".5-{{route}}", "refId": "C" } @@ -1585,7 +1585,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", route=~\"/logproto.Querier/Query|/logproto.Querier/Label|/logproto.Querier/Series|/logproto.Querier/QuerySample|/logproto.Querier/GetChunkIDs\"}[$__rate_interval])) by (route) > 0", "interval": "", "legendFormat": "{{route}}", "refId": "A" @@ -1685,7 +1685,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])))", + "expr": "topk(10,sum by (tenant, reason) (rate(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])))", "interval": "", "legendFormat": "{{ tenant }} - {{ reason }}", "refId": "A" @@ -1809,7 +1809,7 @@ ], "targets": [ { - "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", + "expr": "topk(10, sum by (tenant, reason) (sum_over_time(increase(loki_discarded_samples_total{cluster=~\"$cluster\",namespace=\"$namespace\"}[1m])[$__range:1m])))", "format": "table", "instant": true, "interval": "", @@ -1969,7 +1969,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"distributor.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"distributor.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -2063,7 +2063,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -2125,7 +2125,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"} |= \"level=error\"", "refId": "A" } ], @@ -2175,7 +2175,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/distributor\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -2264,7 +2264,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_ingester_append_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2352,7 +2352,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2440,7 +2440,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, "legendFormat": "{{pod}}", "refId": "A" @@ -2633,7 +2633,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"ingester.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"ingester.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -2727,7 +2727,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -2789,7 +2789,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"} |= \"level=error\"", "refId": "A" } ], @@ -2839,7 +2839,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/ingester\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -2945,7 +2945,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=\"$cluster\",job=\"$namespace/ingester\"}))", + "expr": "topk(10,sum by (tenant) (loki_ingester_memory_streams{cluster=~\"$cluster\",job=\"$namespace/ingester\"}))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3035,7 +3035,7 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m]) > 0))", + "expr": "topk(10, sum by (tenant) (rate(loki_ingester_streams_created_total{cluster=~\"$cluster\",job=\"$namespace/ingester\"}[1m]) > 0))", "interval": "", "legendFormat": "{{ tenant }}", "refId": "A" @@ -3143,13 +3143,13 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m]))", + "expr": "sum(rate(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\",job=\"$namespace/ingester\"}[1m]))", "interval": "", "legendFormat": "Chunks", "refId": "A" }, { - "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m])) < 1", + "expr": "sum(increase(loki_chunk_store_deduped_chunks_total{cluster=~\"$cluster\", job=\"$namespace/ingester\"}[1m]))/sum(increase(loki_ingester_chunks_flushed_total{cluster=~\"$cluster\", job=\"$namespace/ingester\"}[1m])) < 1", "interval": "", "legendFormat": "De-Dupe Ratio", "refId": "B" @@ -3226,7 +3226,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=\"$cluster\",job=\"$namespace/ingester\"}[1m])) by (le)", + "expr": "sum(rate(loki_ingester_chunk_size_bytes_bucket{cluster=~\"$cluster\",job=\"$namespace/ingester\"}[1m])) by (le)", "format": "heatmap", "instant": false, "interval": "", @@ -3379,7 +3379,7 @@ "reverseYBuckets": false, "targets": [ { - "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=\"$cluster\", job=\"$namespace/ingester\"}[1m]))", + "expr": "sum by (le) (rate(loki_ingester_chunk_utilization_bucket{cluster=~\"$cluster\", job=\"$namespace/ingester\"}[1m]))", "format": "heatmap", "instant": false, "interval": "", @@ -3559,7 +3559,7 @@ "steppedLine": false, "targets": [ { - "expr": "go_memstats_heap_inuse_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"querier.*\"}", + "expr": "go_memstats_heap_inuse_bytes{cluster=~\"$cluster\", namespace=\"$namespace\", pod=~\"querier.*\"}", "instant": false, "intervalFactor": 3, "legendFormat": "{{pod}}", @@ -3653,7 +3653,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} | logfmt | level=\"error\"[1m]))", + "expr": "sum(rate({cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} | logfmt | level=\"error\"[1m]))", "refId": "A" } ], @@ -3715,7 +3715,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} |= \"level=error\"", + "expr": "{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"} |= \"level=error\"", "refId": "A" } ], @@ -3765,7 +3765,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (route) > 0", + "expr": "sum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\", status_code!~\"5[0-9]{2}\"}[$__rate_interval])) by (route)\n/\nsum(rate(loki_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", job=\"$namespace/querier\"}[$__rate_interval])) by (route) > 0", "interval": "", "intervalFactor": 1, "legendFormat": "{{route}}", @@ -3872,19 +3872,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.99, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "intervalFactor": 1, "legendFormat": "{{container}}: .99-{{method}}-{{name}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.9, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .9-{{method}}-{{name}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", + "expr": "histogram_quantile(.5, sum(rate(loki_memcache_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (method, name, le, container))", "hide": false, "legendFormat": "{{container}}: .5-{{method}}-{{name}}", "refId": "C" @@ -3975,7 +3975,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", + "expr": "sum(rate(loki_memcache_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, method, name, container)", "intervalFactor": 1, "legendFormat": "{{container}}: {{status_code}}-{{method}}-{{name}}", "refId": "A" @@ -4081,19 +4081,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -4184,7 +4184,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", + "expr": "sum(rate(loki_consul_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, status_code, method)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -4288,17 +4288,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -4385,20 +4385,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4487,20 +4487,20 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "intervalFactor": 1, "legendFormat": "99%", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "90%", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (operation, le))", "interval": "", "legendFormat": "50%", "refId": "C" @@ -4589,17 +4589,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".9", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_bigtable_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (operation, le))", "refId": "C" } ], @@ -4686,7 +4686,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/MutateRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4775,7 +4775,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.v2.Bigtable/ReadRows\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4864,7 +4864,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/GetTable\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -4953,7 +4953,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", + "expr": "sum(rate(loki_bigtable_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\", operation=\"/google.bigtable.admin.v2.BigtableTableAdmin/ListTables\"}[5m])) by (status_code)", "intervalFactor": 1, "legendFormat": "{{status_code}}", "refId": "A" @@ -5059,19 +5059,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_gcs_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5162,7 +5162,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_gcs_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5264,7 +5264,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_failures_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_failures_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5349,7 +5349,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_consumed_capacity_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5434,7 +5434,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_throttled_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_throttled_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5519,7 +5519,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m]))", + "expr": "sum(rate(loki_dynamo_dropped_requests_total{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m]))", "refId": "A" } ], @@ -5604,17 +5604,17 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".99", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".9", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_query_pages_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])))", "legendFormat": ".5", "refId": "C" } @@ -5703,19 +5703,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_dynamo_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -5805,7 +5805,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_dynamo_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -5910,19 +5910,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_s3_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6012,7 +6012,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_s3_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6117,19 +6117,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_azure_blob_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6219,7 +6219,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_azure_blob_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6324,19 +6324,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_cassandra_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6426,7 +6426,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_cassandra_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_cassandra_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" @@ -6531,19 +6531,19 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.99, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "intervalFactor": 1, "legendFormat": ".99-{{operation}}", "refId": "A" }, { - "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.9, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".9-{{operation}}", "refId": "B" }, { - "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", + "expr": "histogram_quantile(.5, sum(rate(loki_boltdb_shipper_request_duration_seconds_bucket{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (operation, le))", "hide": false, "legendFormat": ".5-{{operation}}", "refId": "C" @@ -6633,7 +6633,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", + "expr": "sum(rate(loki_boltdb_shipper_request_duration_seconds_count{cluster=~\"$cluster\", namespace=\"$namespace\"}[5m])) by (status_code, operation)", "intervalFactor": 1, "legendFormat": "{{status_code}}-{{operation}}", "refId": "A" diff --git a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet index 6539a34d77e4c..7b2a65f3b9d40 100644 --- a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet +++ b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet @@ -24,8 +24,8 @@ local grafana = import 'grafonnet/grafana.libsonnet'; // This logic is inherited from mimir-mixin. dashboard.dashboard('Canary') // We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality. - .addTemplate('cluster', 'loki_build_info', 'cluster') - .addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace') + .addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label) + .addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace') + { // This dashboard uses the new grid system in order to place panels (using gridPos). // Because of this we can't use the mixin's addRow() and addPanel(). @@ -33,7 +33,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; rows: null, // ugly hack, copy pasta the tag/link // code from the loki-mixin - tags: ['loki'], + tags: $._config.tags, links: [ { asDropdown: true, @@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet'; panels: [ // grid row 1 dashboard.panel('Canary Entries Total') + - dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') + + dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 0 } }, dashboard.panel('Canary Logs Total') + - dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 0 } }, dashboard.panel('Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 0 } }, dashboard.panel('Spotcheck Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 9, y: 0 } }, // grid row 2 dashboard.panel('Spotcheck Total') + - dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 4 } }, dashboard.panel('Metric Test Error %') + - dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') + + dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') + { gridPos: { h: 4, w: 3, x: 3, y: 4 } }, dashboard.panel('Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 6, y: 4 } }, dashboard.panel('Spotcheck Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + + dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') + { gridPos: { h: 4, w: 3, x: 9, y: 4 } }, // grid row 3 dashboard.panel('Metric Test Expected') + - dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 0, y: 8 } }, dashboard.panel('Metric Test Actual') + - dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') + + dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') + { gridPos: { h: 4, w: 3, x: 3, y: 8 } }, dashboard.panel('Websocket Missing') + - dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + + dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') + { gridPos: { h: 4, w: 3, x: 6, y: 8 } }, dashboard.panel('Websocket Missing %') + - dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + + dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') + { gridPos: { h: 4, w: 3, x: 9, y: 8 } }, // end of grid dashboard.panel('Log Write to read Latency Percentiles') + dashboard.queryPanel([ - 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p95', 'p50']) + { gridPos: { h: 6, w: 12, x: 12, y: 0 } }, @@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet'; ).addTargets( [ grafana.prometheus.target( - 'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', + 'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)', legendFormat='{{le}}', format='heatmap', ), @@ -125,24 +125,24 @@ local grafana = import 'grafonnet/grafana.libsonnet'; dashboard.panel('Spot Check Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))', ], ['p99', 'p95']) + { gridPos: { h: 6, w: 12, x: 0, y: 14 } }, dashboard.panel('Metric Test Query') + dashboard.queryPanel([ - 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', - 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', + 'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))', ], ['p99', 'p95'],) + { gridPos: { h: 6, w: 12, x: 12, y: 14 } }, dashboard.panel('Spot Check Missing %') + - dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + + dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') + { gridPos: { h: 6, w: 12, x: 0, y: 20 } }, g.panel('Missing logs') + - g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') + + g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') + { gridPos: { h: 6, w: 12, x: 12, y: 20 } }, ], diff --git a/production/loki-mixin/dashboards/loki-logs.libsonnet b/production/loki-mixin/dashboards/loki-logs.libsonnet index 9fd6eee589502..611121b85f70a 100644 --- a/production/loki-mixin/dashboards/loki-logs.libsonnet +++ b/production/loki-mixin/dashboards/loki-logs.libsonnet @@ -48,8 +48,6 @@ local template = import 'grafonnet/template.libsonnet'; local cfg = self, showMultiCluster:: true, - clusterLabel:: $._config.per_cluster_label, - } + lokiLogs + $.dashboard('Loki / Logs', uid='logs') .addCluster() @@ -61,7 +59,8 @@ local template = import 'grafonnet/template.libsonnet'; p { targets: [ e { - expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr + expr: if dashboards['loki-logs.json'].showMultiCluster + then std.strReplace(super.expr, 'cluster="$cluster", ', $._config.per_cluster_label + '=~"$cluster", ') else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''), } for e in p.targets diff --git a/production/loki-mixin/dashboards/loki-operational.libsonnet b/production/loki-mixin/dashboards/loki-operational.libsonnet index e8f5d98248746..193796fa3ec29 100644 --- a/production/loki-mixin/dashboards/loki-operational.libsonnet +++ b/production/loki-mixin/dashboards/loki-operational.libsonnet @@ -62,7 +62,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; local replaceClusterMatchers(expr) = if dashboards['loki-operational.json'].showMultiCluster - then expr + then std.strReplace( + std.strReplace( + expr, + 'cluster="$cluster", ', + $._config.per_cluster_label + '=~"$cluster", ' + ), + 'cluster_job_route', + $._config.per_cluster_label + '_job_route' + ) else std.strReplace( std.strReplace( diff --git a/production/loki-mixin/dashboards/recording-rules.libsonnet b/production/loki-mixin/dashboards/recording-rules.libsonnet index 2d943807c6485..9355b408a0f0e 100644 --- a/production/loki-mixin/dashboards/recording-rules.libsonnet +++ b/production/loki-mixin/dashboards/recording-rules.libsonnet @@ -7,27 +7,42 @@ local template = import 'grafonnet/template.libsonnet'; template.new( 'tenant', '$datasource', - 'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{cluster="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', + 'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{' + $._config.per_cluster_label + '="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))', regex='/"([^"]+)"/', sort=1, includeAll=true, allValues='.+', ), - grafanaDashboards+: - { - 'loki-mixin-recording-rules.json': raw + - $.dashboard('Loki / Recording Rules', uid='recording-rules') - .addCluster() - .addNamespace() - .addLog() - .addTag() - + { - templating+: { - list+: [ - tenantTemplate, - ], - }, + grafanaDashboards+: { + local dashboards = self, + 'loki-mixin-recording-rules.json': { + showMultiCluster:: true, + } + raw + + $.dashboard('Loki / Recording Rules', uid='recording-rules') + .addCluster() + .addNamespace() + .addLog() + .addTag() + + { + panels: [ + p { + targets: [ + e { + expr: if dashboards['loki-mixin-recording-rules.json'].showMultiCluster + then std.strReplace(super.expr, 'cluster="$cluster", ', $._config.per_cluster_label + '=~"$cluster", ') + else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''), + } + for e in p.targets + ], + } + for p in super.panels + ], + templating+: { + list+: [ + tenantTemplate, + ], }, + }, }, }