From 138fe2e36d890d306e93d38cb85449b98440841e Mon Sep 17 00:00:00 2001 From: Naseem Ullah <24660299+naseemkullah@users.noreply.github.com> Date: Tue, 12 Apr 2022 17:40:46 -0400 Subject: [PATCH] Improve req handling dashboard (#8322) Display per-method/path combinations for various metrics, adjust titles, and sort tooltips in decreasing order Signed-off-by: Naseem Ullah <24660299+naseemkullah@users.noreply.github.com> --- .../request-handling-performance.json | 350 +++++++++--------- 1 file changed, 177 insertions(+), 173 deletions(-) diff --git a/deploy/grafana/dashboards/request-handling-performance.json b/deploy/grafana/dashboards/request-handling-performance.json index e231129bea..1422336aee 100644 --- a/deploy/grafana/dashboards/request-handling-performance.json +++ b/deploy/grafana/dashboards/request-handling-performance.json @@ -9,17 +9,18 @@ "pluginName": "Prometheus" } ], + "__elements": [], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "6.6.0" + "version": "8.3.4" }, { "type": "panel", "id": "graph", - "name": "Graph", + "name": "Graph (old)", "version": "" }, { @@ -38,25 +39,41 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 9614, "graphTooltip": 1, "id": null, - "iteration": 1582146566338, + "iteration": 1646929474557, "links": [], + "liveNow": false, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "description": "Total time taken for nginx and upstream servers to process a request and send a response", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total time for NGINX and upstream servers to process a request and send a response", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] +
}, "fill": 1, "fillGradient": 0, "gridPos": { @@ -80,9 +97,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -92,63 +110,52 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "legendFormat": ".5", "refId": "D" }, { - "expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "legendFormat": ".95", "refId": "B" }, { - "expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_request_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "legendFormat": ".99", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Total request handling time", + "title": "Request Latency Percentiles", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - 
"label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -156,8 +163,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, "description": "The time spent on receiving the response from the upstream server", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -181,9 +196,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -193,7 +209,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.5,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "instant": false, "interval": "", "intervalFactor": 1, @@ -201,57 +217,46 @@ "refId": "D" }, { - "expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.95,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "legendFormat": ".95", "refId": "B" }, { - "expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n 0.99,\n sum by (le)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "legendFormat": 
".99", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Upstream response time", + "title": "Upstream Response Latency Percentiles", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -259,7 +264,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -285,9 +298,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -297,52 +311,41 @@ "steppedLine": false, "targets": [ { - "expr": " sum by (path)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n", + "expr": " sum by (method, host, path)(\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n", "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }}", + "legendFormat": "{{ method }} {{ host }}{{path }}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Request volume by Path", + "title": "Request Rate by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": 
true, "values": [] }, "yaxes": [ { "format": "reqps", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -350,8 +353,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, "description": "For each path observed, its median upstream response time", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -377,9 +388,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -389,52 +401,41 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(\n .5,\n sum by (le, path)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[1m]\n )\n )\n)", + "expr": "histogram_quantile(\n .5,\n sum by (le, method, host, path)(\n rate(\n nginx_ingress_controller_response_duration_seconds_bucket{\n ingress =~ \"$ingress\"\n }[5m]\n )\n )\n)", "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }}", + "legendFormat": "{{ method }} {{ host }}{{path }}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Median upstream response time by Path", + "title": "Median Upstream Response Time by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": 
null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -442,8 +443,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, "description": "Percentage of 4xx and 5xx responses among all responses.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -469,9 +478,10 @@ "linewidth": 1, "nullPointMode": "null as zero", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -481,52 +491,41 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~ \"[4-5].*\"\n}[1m])) / sum by (path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n}[1m]))", + "expr": "sum by (method, host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~ \"[4-5].*\"\n}[5m])) / sum by (method, host, path) (rate(nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n}[5m]))", "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }}", + "legendFormat": "{{ method }} {{ host }}{{path }}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Response error rate by Path", + "title": "Response Error Rate by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": 
null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -534,8 +533,16 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, "description": "For each path observed, the sum of upstream request time", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -561,9 +568,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -573,52 +581,41 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (path) (rate(nginx_ingress_controller_response_duration_seconds_sum{ingress =~ \"$ingress\"}[1m]))", + "expr": "sum by (method, host, path) (rate(nginx_ingress_controller_response_duration_seconds_sum{ingress =~ \"$ingress\"}[5m]))", "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }}", + "legendFormat": "{{ method }} {{ host }}{{path }}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Upstream time consumed by Path", + "title": "Total Upstream Response Time by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -626,7 +623,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links":
[] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -652,9 +657,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -664,52 +670,41 @@ "steppedLine": false, "targets": [ { - "expr": " sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~\"[4-5].*\",\n }[1m]\n )\n ) by(path, status)\n", + "expr": " sum (\n rate(\n nginx_ingress_controller_request_duration_seconds_count{\n ingress =~ \"$ingress\",\n status =~\"[4-5].*\",\n }[5m]\n )\n ) by(method, host, path, status)\n", "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }} {{ status }}", + "legendFormat": "{{ method }} {{ host }}{{path }} {{ status }}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Response error volume by Path", + "title": "Response Error Volume by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "reqps", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -717,7 +712,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, "fillGradient": 0, "gridPos": { @@ -743,9 +746,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius":
2, "points": false, "renderer": "flot", @@ -755,60 +759,49 @@ "steppedLine": false, "targets": [ { - "expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n ingress =~ \"$ingress\",\n }[1m]\n )\n) by (path) / sum (\n rate(\n nginx_ingress_controller_response_size_count {\n ingress =~ \"$ingress\",\n }[1m]\n )\n) by (path)\n", + "expr": "sum (\n rate (\n nginx_ingress_controller_response_size_sum {\n ingress =~ \"$ingress\",\n }[5m]\n )\n) by (method, host, path) / sum (\n rate(\n nginx_ingress_controller_response_size_count {\n ingress =~ \"$ingress\",\n }[5m]\n )\n) by (method, host, path)\n", "hide": false, "instant": false, "interval": "", "intervalFactor": 1, - "legendFormat": "{{ path }}", + "legendFormat": "{{ method }} {{ host }}{{path }}", "refId": "D" }, { - "expr": " sum (rate(nginx_ingress_controller_response_size_bucket{\n ingress =~ \"$ingress\",\n }[1m])) by (le)\n", + "expr": " sum (rate(nginx_ingress_controller_response_size_bucket{\n ingress =~ \"$ingress\",\n }[5m])) by (le)\n", "hide": true, "legendFormat": "{{le}}", "refId": "A" } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Average response size by Path", + "title": "Average Response Size by Method and Path", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } }, { @@ -816,7 +809,15 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, "fill": 1, 
"fillGradient": 0, "gridPos": { @@ -840,9 +841,10 @@ "linewidth": 1, "nullPointMode": "null", "options": { - "dataLinks": [] + "alertThreshold": true }, "percentage": false, + "pluginVersion": "8.3.4", "pointradius": 2, "points": false, "renderer": "flot", @@ -852,7 +854,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_sum {\n ingress =~ \"$ingress\",\n }[1m]\n)) / sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_count {\n ingress =~ \"$ingress\",\n }[1m]\n )\n)\n", + "expr": "sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_sum {\n ingress =~ \"$ingress\",\n }[5m]\n)) / sum (\n rate(\n nginx_ingress_controller_ingress_upstream_latency_seconds_count {\n ingress =~ \"$ingress\",\n }[5m]\n )\n)\n", "hide": false, "instant": false, "interval": "", @@ -862,49 +864,38 @@ } ], "thresholds": [], - "timeFrom": null, "timeRegions": [], - "timeShift": null, - "title": "Upstream service latency", + "title": "Upstream Service Latency", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { - "buckets": null, "mode": "time", - "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true }, { "format": "short", - "label": null, "logBase": 1, - "max": null, - "min": null, "show": true } ], "yaxis": { - "align": false, - "alignLevel": null + "align": false } } ], "refresh": "30s", - "schemaVersion": 22, + "schemaVersion": 34, "style": "dark", "tags": [ "nginx" @@ -912,19 +903,29 @@ "templating": { "list": [ { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, "hide": 0, + "includeAll": false, "label": "datasource", + "multi": false, "name": "DS_PROMETHEUS", "options": [], "query": "prometheus", "refresh": 1, "regex": "", + "skipUrlSync": false, "type": "datasource" }, { 
"allValue": ".*", "current": {}, - "datasource": "${DS_PROMETHEUS}", + "datasource": { + "uid": "${DS_PROMETHEUS}" + }, "definition": "label_values(nginx_ingress_controller_requests, ingress) ", "hide": 0, "includeAll": true, @@ -932,13 +933,15 @@ "multi": false, "name": "ingress", "options": [], - "query": "label_values(nginx_ingress_controller_requests, ingress) ", + "query": { + "query": "label_values(nginx_ingress_controller_requests, ingress) ", + "refId": "Prometheus-ingress-Variable-Query" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -977,5 +980,6 @@ "timezone": "browser", "title": "Request Handling Performance", "uid": "4GFbkOsZk", - "version": 1 + "version": 1, + "weekStart": "" }