From 35258823ad504eed7ce2ca8ca00edcbadd589b43 Mon Sep 17 00:00:00 2001 From: Dave Storey Date: Fri, 31 Jan 2020 14:38:12 +0000 Subject: [PATCH 1/3] dashboard configmap added --- .../grafana-dashboard-configmap.yaml | 879 ++++++++++++++++++ docs/resources.md | 6 +- 2 files changed, 884 insertions(+), 1 deletion(-) create mode 100644 config/prometheus/grafana-dashboard-configmap.yaml diff --git a/config/prometheus/grafana-dashboard-configmap.yaml b/config/prometheus/grafana-dashboard-configmap.yaml new file mode 100644 index 0000000..0430588 --- /dev/null +++ b/config/prometheus/grafana-dashboard-configmap.yaml @@ -0,0 +1,879 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + labels: + grafana_dashboard: "1" + name: azure-databricks-operator-dashboard +data: + dash.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 26, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total number of reconciliations per controller\n\ncontroller_runtime_reconcile_total", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(controller_runtime_reconcile_total[1m])", + "legendFormat": "{{controller}} - {{result}}", + 
"refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Reconciliations Per Controller", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "hiddenSeries": false, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(databricks_request_duration_seconds_bucket[1m])) by (action, object_type, le)) * 1e3", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": " {{object_type}} - {{action}} Avg", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(databricks_request_duration_seconds_bucket[1m])) by (action, object_type, le)) * 1e3", + "legendFormat": " {{object_type}} - {{action}} 95%", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Databricks Request Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": 
"individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ms", + "label": "Ms", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "hiddenSeries": false, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.5, sum(rate(controller_runtime_reconcile_time_seconds_bucket[1m])) by (le, controller))", + "intervalFactor": 2, + "legendFormat": "{{controller}} - Avg", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(controller_runtime_reconcile_time_seconds_bucket[1m])) by (le, controller))", + "legendFormat": "{{controller}} - 95%", + "refId": "B" + }, + { + "expr": "sum(rate(controller_runtime_reconcile_time_seconds_sum[1m])) / sum(rate(controller_runtime_reconcile_time_seconds_count[1m])) ", + "legendFormat": "{{controller}} - Mean", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Controller Reconcile Time", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": 
null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "hiddenSeries": false, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": " increase(databricks_request_duration_seconds_bucket{outcome=\"success\"}[1m])", + "legendFormat": "{{object_type}} - {{action}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Databricks REST endpoint calls - Success", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "cps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Total number of adds handled by workqueue\n\nworkqueue_adds_total", + "fill": 1, + 
"fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(workqueue_adds_total{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}[1m])", + "legendFormat": "{{name}}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Workqueue Adds", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "Ops", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": " 
increase(databricks_request_duration_seconds_bucket{outcome=\"failure\"}[1m])", + "legendFormat": "{{object_type}} - {{action}} ", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Databricks REST endpoint calls - Failure", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "cps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "Current depth of workqueue\n\nworkqueue_depth", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "workqueue_depth{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}", + "legendFormat": "{{name}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Workqueue Depth", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How long in seconds processing an item from workqueue takes\n\n(workqueue_work_duration_seconds)", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": " histogram_quantile(0.5, sum(rate(workqueue_work_duration_seconds_bucket{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}[1m])) by (le, name))", + "hide": false, + "legendFormat": "{{name}} - Avg", + "refId": "A" + }, + { + "expr": " histogram_quantile(0.95, sum(rate(workqueue_work_duration_seconds_bucket{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}[1m])) by (le, name))", + "hide": false, + "legendFormat": "{{name}} - 95%", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Workqueue - Work Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + 
{ + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": null, + "description": "How long in seconds an item stays in workqueue before being requested (workqueue_queue_duration_seconds)", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 32 + }, + "hiddenSeries": false, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "dataLinks": [] + }, + "percentage": false, + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": " histogram_quantile(0.5, sum(rate(workqueue_queue_duration_seconds_bucket{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}[1m])) by (le, name))", + "legendFormat": "{{name}} - AVG", + "refId": "A" + }, + { + "expr": " histogram_quantile(0.95, sum(rate(workqueue_queue_duration_seconds_bucket{name=~\"run|djob|dcluster|dbfsblock|secretscope|workspaceitem\"}[1m])) by (le, name))", + "legendFormat": "{{name}} - 95%", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Workqueue - Queue Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + 
"align": false, + "alignLevel": null + } + } + ], + "refresh": "5s", + "schemaVersion": 21, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "", + "title": "Databricks Operator", + "uid": "RSrHyjaZk", + "version": 1 + } \ No newline at end of file diff --git a/docs/resources.md b/docs/resources.md index a5c8037..363fc40 100644 --- a/docs/resources.md +++ b/docs/resources.md @@ -43,7 +43,11 @@ More info: - If you don't want Prometheus-Operator configuration generated, it can be disabled by commenting out the line indicated in `config/default/kustomization.yaml` - *NOTE:* If you don't have the Prometheus-Operator installed, the ServiceMonitor CRD will not be available to you - Custom metrics exposed by the Operator can be found by searching for `databricks_` inside the Prometheus web ui -- Metrics follow the naming guidlines recommended by Prometheus +- Metrics follow the naming guidlines recommended by Prometheus +- A Grafana dashboard compatible `configmap` is provided for use via `config/prometheus/grafana-dashboard-configmap.yaml` + - The dashboard provides you metrics regarding the health of your operator (upstream databricks call success/failure rates and general health of the operator) + - If Prometheus-Operator is being used ensure the configmap is modified to be deployed in the same namespace + - If you are not using Grafana/Prometheus-Operator, then the json can be extracted and imported manually ### How to access the Prometheus instance - Have the operator installed and running locally. 
See [deploy.md](https://github.com/microsoft/azure-databricks-operator/blob/master/docs/deploy.md) From 5adce829117d9a7fd6d2c490f61be85f1e928572 Mon Sep 17 00:00:00 2001 From: Dave Storey Date: Fri, 31 Jan 2020 14:48:14 +0000 Subject: [PATCH 2/3] moved the Grafana docs to own section --- docs/resources.md | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/resources.md b/docs/resources.md index 363fc40..90f7514 100644 --- a/docs/resources.md +++ b/docs/resources.md @@ -44,10 +44,6 @@ More info: - *NOTE:* If you don't have the Prometheus-Operator installed, the ServiceMonitor CRD will not be available to you - Custom metrics exposed by the Operator can be found by searching for `databricks_` inside the Prometheus web ui - Metrics follow the naming guidlines recommended by Prometheus -- A Grafana dashboard compatible `configmap` is provided for use via `config/prometheus/grafana-dashboard-configmap.yaml` - - The dashboard provides you metrics regarding the health of your operator (upstream databricks call success/failure rates and general health of the operator) - - If Prometheus-Operator is being used ensure the configmap is modified to be deployed in the same namespace - - If you are not using Grafana/Prometheus-Operator, then the json can be extracted and imported manually ### How to access the Prometheus instance - Have the operator installed and running locally. 
See [deploy.md](https://github.com/microsoft/azure-databricks-operator/blob/master/docs/deploy.md) @@ -63,6 +59,12 @@ More info: - Port forward localhost:8080 to your pod: `kubectl port-forward -n azure-databricks-operator-system pod/azure-databricks-operator-controller-manager- 8080:8080` - Open another terminal and curl request the metric endpoint: `curl localhost:8080/metrics` +### How to access metrics via Grafana +- A Grafana dashboard compatible `configmap` is provided for use via `config/prometheus/grafana-dashboard-configmap.yaml` +- If Prometheus-Operator is being used ensure the configmap is modified to be deployed in the same namespace +- If you are not using Grafana/Prometheus-Operator, then the json can be extracted and imported manually +- The dashboard provides you general metrics regarding the health of your operator (upstream databricks call success/failure rates and general health of the operator) + ### Counter metrics In addition to the standard metrics that kubebuilder provides, the following custom metrics have been added. From 3a5308ae83459c575a0236f43a0f9645703688a4 Mon Sep 17 00:00:00 2001 From: Dave Storey Date: Fri, 31 Jan 2020 14:56:50 +0000 Subject: [PATCH 3/3] updated the docs for installing grafana dashboard --- docs/resources.md | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/resources.md b/docs/resources.md index 90f7514..a24230f 100644 --- a/docs/resources.md +++ b/docs/resources.md @@ -60,8 +60,17 @@ More info: - Open another terminal and curl request the metric endpoint: `curl localhost:8080/metrics` ### How to access metrics via Grafana -- A Grafana dashboard compatible `configmap` is provided for use via `config/prometheus/grafana-dashboard-configmap.yaml` -- If Prometheus-Operator is being used ensure the configmap is modified to be deployed in the same namespace +- Have the operator installed and running locally. 
See [deploy.md](https://github.com/microsoft/azure-databricks-operator/blob/master/docs/deploy.md) +- Determine the name of the Grafana service running in your cluster (by default this will be prom-azure-databricks-operator-grafana) +- Port forward localhost:8080 to your service: `kubectl port-forward service/prom-azure-databricks-operator-grafana 8080:80` + - If using VSCode and Dev Container, you may need to expose the internal port out to your host machine (Command Palette > Remote Containers Forward Port From Container) +- Using a browser navigate to `http://localhost:8080` to view the Grafana dashboard +- If you are using the default helm installation of the Prometheus-Operator (as provided) then you can find the [default login details here](https://github.com/helm/charts/tree/master/stable/grafana#configuration) + +This repo also includes a Grafana dashboard that can be installed: +- If Prometheus-Operator is being used, then by default a sidecar is available to automatically install dashboards via `configmap`: + - Update `config/prometheus/grafana-dashboard-configmap.yaml` to have a namespace matching your Grafana service + - Apply `configmap` into the same namespace as your Grafana service running the sidecar: `kubectl apply -f ./config/prometheus/grafana-dashboard-configmap.yaml` - If you are not using Grafana/Prometheus-Operator, then the json can be extracted and imported manually - The dashboard provides you general metrics regarding the health of your operator (upstream databricks call success/failure rates and general health of the operator)