diff --git a/grafana/scylla-cql-optimization.2019.1.template.json b/grafana/scylla-cql-optimization.2019.1.template.json new file mode 100644 index 0000000000..b60394237b --- /dev/null +++ b/grafana/scylla-cql-optimization.2019.1.template.json @@ -0,0 +1,440 @@ +{ + "dashboard": { + "class": "dashboard", + "uid": "cqlopt-2019.1", + "originalTitle": "Scylla CQL Optimization", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "panels": [ + { + "class": "single_stat_panel", + "targets": [ + { + "expr": "count(up{job=\"scylla\", cluster=~\"$cluster|$^\"})", + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "A", + "step": 1 + } + ], + "title": "Total Nodes" + }, + { + "class": "single_stat_panel_fail", + "targets": [ + { + "expr": "count(scrape_samples_scraped{job=\"scylla\", cluster=~\"$cluster|$^\"}==0) OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Unreachable", + "refId": "A", + "step": 1 + } + ], + "thresholds": "1,2", + "title": "Unreachable" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Starting or Joining", + "targets": [ + { + "expr": "count(scylla_node_operation_mode==1) + count(scylla_node_operation_mode==2)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Joining", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Joining" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Leaving, Decommissioned, Draining or Drained", + "targets": [ + { + "expr": "count(scylla_node_operation_mode>3)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Leaving", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Leaving" + }, + { + "class": "ops_panel", + "description": "Total requests graph is the baseline for comparison", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", 
dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) + sum(irate(scylla_thrift_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "Total Requests", + "refId": "A", + "step": 1 + } + ], + "title": "Total Requests" + }, + { + "class":"plain_text", + "content": "", + "span":2 + }, + { + "class": "dashlist", + "tags": [ + "2019.1" + ] + } + ], + "title": "Node status" + }, + { + "class": "row", + "panels": [ + + { + "class": "gauge_errors_panel", + "description": "All of the requests should be prepared\n\nPrepared statements remove the overhead of parsing the query every time and allow optimal routing of requests from client to server", + "targets": [ + { + "expr": "floor(100 *sum(irate(scylla_query_processor_statements_prepared{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) /(sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) + sum(irate(scylla_cql_inserts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) + sum(irate(scylla_cql_updates{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) + sum(irate(scylla_cql_deletes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + } + ], + "title": "CQL Non-Prepared Statements" + }, + { + "class": "ops_panel", + "description": "All of the requests should be prepared\n", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_query_processor_statements_prepared{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], 
+ "title": "CQL Non-Prepared Statements" + }, + + { + "class": "gauge_errors_panel", + "description": "All of the requests should be Token Aware\n\nNon Token Aware requests sources:\n* Non-Prepared Statements\n* Client not using a Token Aware load balancing policy\n\nToken Aware requests are sent to a Scylla node that is also a replica. Token Un-Aware requests require an extra hop and additional processing.", + "targets": [ + { + "expr": "100 - floor(100*(sum(irate(scylla_storage_proxy_coordinator_reads_local_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) +sum(irate(scylla_storage_proxy_coordinator_total_write_attempts_local_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])))/(sum(irate(scylla_cql_inserts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) + irate(scylla_cql_updates{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) + irate(scylla_cql_deletes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) + irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + } + ], + "title": "Non-Token Aware" + }, + { + "class": "ops_panel", + "description": "Requests that are not token aware are not routed to the right node, which requires an extra hop and additional processing", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) - sum(irate(scylla_storage_proxy_coordinator_reads_local_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) + 
sum(irate(scylla_cql_inserts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) + irate(scylla_cql_updates{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) + irate(scylla_cql_deletes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) - sum(irate(scylla_storage_proxy_coordinator_total_write_attempts_local_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])\n", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Non-Token Aware Queries" + }, + { + "class": "gauge_errors_panel", + "description": "All requests should be paged\n\nNon Paged request sources:\n- Client modifying the fetch size\n\nNon Paged requests require reading all the results and returning them in a single request.", + "targets": [ + { + "expr": "100 * (sum(irate(scylla_cql_unpaged_select_queries{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))/sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + } + ], + "title": "Non-Paged CQL Reads" + }, + { + "class": "ops_panel", + "span": 3, + "description": "Non-Paged requests require reading all the results and returning them in a single request", + "targets": [ + { + "expr": "sum(irate(scylla_cql_unpaged_select_queries{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Non-Paged CQL Reads" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "gauge_errors_panel", + "description": 
"Reversed CQL Reads entail additional processing on server side\n\nSources: CQL Read requests with ORDER BY that is different from the \"CLUSTERING ORDER BY\" of the table\nAlternatives:\n\n* Denormalize your data (use a Materialized View)", + "targets": [ + { + "expr": "100 * sum(irate(scylla_cql_reverse_queries{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) / sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + } + ], + "title": "Reversed CQL Reads" + }, + { + "class": "ops_panel", + "span": 3, + "description": "Reversed CQL Reads entail additional processing on server side and should be avoided", + "targets": [ + { + "expr": "sum(irate(scylla_cql_reverse_queries{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "Reversed CQL Reads" + }, + { + "class": "gauge_errors_panel", + "description": "ALLOW FILTERING CQL Reads, the percentage of read requests with 'ALLOW FILTERING'\n\nALLOW FILTERING CQL Reads entail additional processing on server side\n\nSources: CQL Read requests with \"ALLOW FILTERING\"\n\nALLOW FILTERING should be used when large parts of the filtered data is returned - check \n\"ALLOW FILTERING CQL Read Filtered Rows to check what percentage of the data is used\"\n\nAlternatives:\n- Use a Secondary Index\n- Denormalize your data (use a Materialized View)", + "targets": [ + { + "expr": "100 * sum(irate(scylla_cql_filtered_read_requests{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) / sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))", 
+ "format": "time_series", + "hide": true, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + }, + { + "expr": "100 * sum(irate(scylla_cql_filtered_read_requests{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))/sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "B" + } + ], + "title": "ALLOW FILTERING CQL Reads" + }, + { + "class": "rps_panel", + "span": 3, + "description": "Read requests with ALLOW FILTERING\n\nALLOW FILTERING CQL Reads entail additional processing on server side and should be avoided", + "targets": [ + { + "expr": "sum(irate(scylla_cql_filtered_read_requests{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "title": "ALLOW FILTERING CQL Reads" + }, + { + "class": "gauge_errors_panel", + "description": "ALLOW FILTERING Filtered rows, the percentage of rows that were read and then filtered.\n\nALLOW FILTERING CQL Reads entail additional processing on server side. 
\nReading a row and then filtering it out is a waste of resources.\n\nSources: CQL Read requests with \"ALLOW FILTERING\"\n\nALLOW FILTERING should be used when large parts of the filtered data are returned\n\nAlternatives:\n- Use a Secondary Index\n- Denormalize your data (use a Materialized View)", + "targets": [ + { + "expr": "100 * sum(irate(scylla_cql_filtered_rows_dropped_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) /sum(irate(scylla_cql_filtered_rows_read_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "aaa", + "refId": "A" + } + ], + "title": "ALLOW FILTERING Filtered Rows" + }, + { + "class": "rps_panel", + "span": 3, + "description": "CQL Queries with ALLOW FILTERING should be avoided.\nDropped rows are rows that were read but were filtered out by the server.\nWhen the number of dropped rows is relatively high, you should consider the alternatives", + "targets": [ + { + "expr": "sum(irate(scylla_cql_filtered_rows_read_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "rows read $node $shard", + "refId": "A" + }, + { + "expr": "sum(irate(scylla_cql_filtered_rows_matched_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "rows matched $node $shard", + "refId": "B" + }, + { + "expr": "sum(irate(scylla_cql_filtered_rows_dropped_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "rows dropped $node $shard", + "refId": "C" + } + ], + 
"title": "ALLOW FILTERING CQL Read Filtering" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "gauge_errors_panel", + "repeat": "dc", + "span": 2, + "description": "Cross DC traffic may cause additional latencies and network loads and in most cases, should be avoided.\n\nCross DC Read requests sources:\n- Consistency Level that is not LOCAL_XXX\n- Tables with read_repair_chance > 0\n\nNote:\n- If requests are supposed to be DC local - verify client is using a DCAware policy and a LOCAL_XX consistency level", + "targets": [ + { + "expr": "100*(sum(irate(scylla_storage_proxy_coordinator_reads_remote_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) - sum(irate(scylla_storage_proxy_coordinator_reads_remote_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", datacenter=~\"$dc\", shard=~\"[[shard]]\"}[30s])))/sum(irate(scylla_storage_proxy_coordinator_reads_remote_node{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]))", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Cross DC read requests $dc" + } + ] + } + ], + "tags": [ + "2019.1" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "Instance", + "value": "instance" + }, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": false, + "text": "DC", + "value": "dc" + }, + { + "selected": true, + "text": "Instance", + "value": "instance" + }, + { + "selected": false, + "text": "Shard", + "value": "shard" + } + ], + "query": "Cluster,DC,Instance,Shard", + "type": "custom" + }, + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(scylla_reactor_utilization, cluster)" + }, + { + "class": "template_variable_all", 
+ "label": "dc", + "name": "dc", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)" + }, + { + "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "class": "template_variable_all", + "label": "shard", + "name": "shard", + "query": "label_values(scylla_reactor_utilization,shard)", + "sort": 3 + }, + { + "class": "template_variable_custom", + "current": { + "text": "2019.1", + "value": "2019.1" + }, + "name": "dash_version", + "options": [ + { + "selected": true, + "text": "2019.1", + "value": "2019.1" + } + ], + "query": "2019.1" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "overwrite": true, + "title": "Scylla CQL Optimization" + } +} \ No newline at end of file diff --git a/grafana/scylla-dash-cpu-per-server.2019.1.template.json b/grafana/scylla-dash-cpu-per-server.2019.1.template.json new file mode 100644 index 0000000000..b5c658033c --- /dev/null +++ b/grafana/scylla-dash-cpu-per-server.2019.1.template.json @@ -0,0 +1,158 @@ +{ + "dashboard": { + "class": "dashboard", + "uid": "cpu-2019.1", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "panels": [ + { + "class": "percent_panel", + "targets": [ + { + "expr": "avg(scylla_reactor_utilization{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "CPU Utilization per [[by]]", + "description" : "the percentage of the time during which the CPU is utilized by Scylla. Note that because Scylla does busy polling for some time before going idle, CPU utilization as seen by the operating system may be much higher. 
Your system is not yet CPU-bottlenecked until this metric is high" + }, + { + "class": "text_panel", + "content": "## ", + "mode": "markdown", + "span": 3, + "style": {} + }, + { + "class": "dashlist", + "tags": [ + "2019.1" + ] + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "percent_panel", + "pointradius": 1, + "targets": [ + { + "expr": "avg(irate(scylla_scheduler_runtime_ms{group=\"main\",instance=~\"[[node]]\" ,cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])/10 + avg(irate(scylla_scheduler_runtime_ms{group=\"statement\",instance=~\"[[node]]\" ,cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])/10", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Foreground CPU Utilization by [[by]]", + "description": "Time spent handling foreground requests (like reads, writes, and some system tasks). The remaining time is either idle, or used by background load like compactions and repairs. Background load in Scylla is opportunistic: background requests will try to use all resources available to complete as fast as possible and rely on the schedulers to provide isolation. This graph is better understood in conjunction with the main CPU load graph. For example, if there are spikes in CPU load that are not present in this graph, that indicates that the foreground load itself is stable." 
+ }, + { + "class": "ms_panel", + "pointradius": 1, + "targets": [ + { + "expr": "sum(irate(scylla_scheduler_time_spent_on_task_quota_violations_ms{instance=~\"[[node]]\" ,cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Time spent in task quota violations by [[by]]", + "description": "Scylla employs an event-loop like reactor that alternates between the execution of different groups of tasks periodically. The maximum amount of time during which a task group can run is called the \"task quota\". Some task groups may disrespect that and run for longer. This may cause latency issues" + } + ], + "title": "New row" + } + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "Instance", + "value": "instance" + }, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": false, + "text": "DC", + "value": "dc" + }, + { + "selected": true, + "text": "Instance", + "value": "instance" + }, + { + "selected": false, + "text": "Shard", + "value": "shard" + } + ], + "query": "Cluster,DC,Instance,Shard", + "type": "custom" + }, + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(scylla_reactor_utilization, cluster)" + }, + { + "class": "template_variable_all", + "label": "dc", + "name": "dc", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)" + }, + { + "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "class": "template_variable_all", + "label": "shard", + "name": "shard", + "query": "label_values(scylla_reactor_utilization,shard)", + "sort": 3 
+ } + ] + }, + "tags": [ + "2019.1" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "title": "Scylla CPU Per Server Metrics", + "overwrite": true, + "version": 5 + } +} diff --git a/grafana/scylla-dash-io-per-server.2019.1.template.json b/grafana/scylla-dash-io-per-server.2019.1.template.json new file mode 100644 index 0000000000..2a89bf786c --- /dev/null +++ b/grafana/scylla-dash-io-per-server.2019.1.template.json @@ -0,0 +1,572 @@ +{ + "dashboard": { + "class": "dashboard", + "uid": "io-2019.1", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "height": "150px", + "panels": [ + { + "class": "single_stat_panel", + "targets": [ + { + "expr": "count(up{job=\"scylla\", cluster=~\"$cluster|$^\"})", + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "A", + "step": 240 + } + ], + "title": "Total Nodes" + }, + { + "class": "single_stat_panel_fail", + "targets": [ + { + "expr": "count(scrape_samples_scraped{job=\"scylla\", cluster=~\"$cluster|$^\"}==0) OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Unreachable", + "refId": "A", + "step": 120 + } + ], + "thresholds": "1,2", + "title": "Unreachable" + }, + { + "class": "text_panel", + "content": "## ", + "mode": "markdown", + "span": 3, + "style": {} + }, + { + "aliasColors": { + "{}": "#584477" + }, + "bars": true, + "class": "graph_panel", + "lines": false, + "seriesOverrides": [ + {} + ], + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{cluster=~\"$cluster|$^\"}[30s])) + sum(irate(scylla_thrift_served{cluster=~\"$cluster|$^\"}[30s]))", + "intervalFactor": 1, + "legendFormat": "Total Requests", + "refId": "A", + "step": 30 + } + ], + "title": "Total Requests", + "transparent": false + }, + { + "class": "dashlist", + "tags": [ + "2019.1" + ] + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "graph_panel", + "seriesOverrides": [ + {} + ], + "targets": [ + { + "expr": 
"avg(scylla_reactor_utilization{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 30 + } + ], + "title": "Load per [[by]]", + "transparent": false + }, + { + "class": "graph_panel", + "pointradius": 1, + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) + sum(irate(scylla_thrift_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Requests Served per [[by]]" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "plain_text", + "content": "

Disk Activity

" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "graph_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_disk_writes_completed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "title": "Disk Writes per Server" + }, + { + "class": "graph_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_disk_reads_completed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Disk Reads per Server" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "bps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_disk_bytes_written{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 20 + } + ], + "title": "Disk Writes Bps per Server" + }, + { + "class": "bps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_disk_bytes_read{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 20 + } + ], + "title": "Disk Read Bps per Server" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "plain_text", + "content": "

I/O Queue

" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_delay{class=\"compaction\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Compactions I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_bytes{class=\"compaction\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Compactions I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_operations{class=\"compaction\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Compactions I/O Queue IOPS by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_delay{class=\"query\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Query I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_bytes{class=\"query\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": 
"seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Query I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_operations{class=\"query\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Query I/O Queue IOPS by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_delay{class=\"commitlog\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Commitlog I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_bytes{class=\"commitlog\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Commitlog I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_total_operations{class=\"commitlog\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Commitlog I/O Queue IOPS by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_flush_delay{class=\"memtable\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + 
"intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Memtable Flush I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_flush_total_bytes{class=\"memtable\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Memtable Flush I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_flush_total_operations{class=\"memtable\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Memtable Flush I/O Queue IOPS by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_read_delay{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Reads I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_read_total_bytes{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Reads I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": 
"sum(irate(scylla_io_queue_read_total_operations{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Reads I/O Queue IOPS by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "1000000*max(scylla_io_queue_write_delay{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Writes I/O Queue delay by [[by]]" + }, + { + "class": "bps_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_write_total_bytes{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Writes I/O Queue bandwidth by [[by]]" + }, + { + "class": "iops_panel", + "span": 4, + "targets": [ + { + "expr": "sum(irate(scylla_io_queue_write_total_operations{class=\"streaming\", instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "seastar_io_queue_delay", + "refId": "A", + "step": 30 + } + ], + "title": "Streaming Writes I/O Queue IOPS by [[by]]" + } + ], + "title": "New row" + } + ], + "tags": [ + "2019.1" + ], + "templating": { + "list": [ + { + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "multi": false, + "name": "monitor_disk", + "options": [], + "query": "node_disk_bytes_read", + "refresh": 2, + "regex": "/.*device=\"([^\\\"]*)\".*/", + "type": "query" + }, + { + "allValue": 
null, + "current": { + "tags": [], + "text": "Instance", + "value": "instance" + }, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": false, + "text": "DC", + "value": "dc" + }, + { + "selected": true, + "text": "Instance", + "value": "instance" + }, + { + "selected": false, + "text": "Shard", + "value": "shard" + } + ], + "query": "Cluster,DC,Instance,Shard", + "type": "custom" + }, + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(scylla_reactor_utilization, cluster)" + }, + { + "class": "template_variable_all", + "label": "dc", + "name": "dc", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)" + }, + { + "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "class": "template_variable_all", + "label": "shard", + "name": "shard", + "query": "label_values(scylla_reactor_utilization,shard)", + "sort": 3 + } + ] + }, + "overwrite": true, + "title": "Scylla Per-Server Disk I/O" + } +} \ No newline at end of file diff --git a/grafana/scylla-dash-per-machine.2019.1.template.json b/grafana/scylla-dash-per-machine.2019.1.template.json new file mode 100644 index 0000000000..2e611a5cb4 --- /dev/null +++ b/grafana/scylla-dash-per-machine.2019.1.template.json @@ -0,0 +1,310 @@ +{ + "dashboard": { + "class": "dashboard", + "uid": "machine-2019.1", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "pie_chart_panel", + "repeat": "node", + "height": "250px", + "targets": [ + { + "expr": "sum(node_filesystem_avail{mountpoint=\"$mount_point\", instance=~\"$node\"})", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Free", + "metric": "", + "refId": 
"A", + "step": 7200 + }, + { + "expr": "(sum(node_filesystem_size{mountpoint=\"$mount_point\", instance=~\"$node\"})-sum(node_filesystem_avail{mountpoint=\"$mount_point\", instance=~\"$node\"}))", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "B", + "step": 7200 + } + ], + "title": "Total Storage $node" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

Disk $monitor_disk

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(node_disk_writes_completed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Disk Writes per Server" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(node_disk_reads_completed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Disk Reads per Server" + }, + { + "class": "bps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(node_disk_bytes_written{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Disk Writes Bps per Server" + }, + { + "class": "bps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(node_disk_bytes_read{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_disk\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Disk Read Bps per Server" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

Network $monitor_network_interface

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "pps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_network_receive_packets{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_network_interface\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Interface Rx Packets" + }, + { + "class": "pps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_network_transmit_packets{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_network_interface\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Interface Tx Packets" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "bps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_network_receive_bytes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_network_interface\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Interface Rx Bps" + }, + { + "class": "bps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(node_network_transmit_bytes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", device=\"$monitor_network_interface\"}[30s])) by (instance)", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Interface Tx Bps" + } + ], + "title": "New row" + } + ], + "templating": { + "list": [ + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(node_filesystem_avail, cluster)" + }, + { + "class": "template_variable_all", + "label": "dc", + "name": "dc", + "query": "label_values(node_filesystem_avail{cluster=~\"$cluster\"}, dc)" + }, + { 
+ "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(node_filesystem_avail{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "allValue": null, + "current": { + "isNone": true, + "text": "None", + "value": "" + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "monitor_disk", + "options": [], + "query": "node_disk_bytes_read", + "refresh": 2, + "regex": "/.*device=\"([^\\\"]*)\".*/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "isNone": true, + "text": "None", + "value": "" + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "monitor_network_interface", + "options": [], + "query": "node_network_receive_packets", + "refresh": 2, + "regex": "/.*device=\"([^\\\"]*)\".*/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "/var/lib/scylla", + "value": "/var/lib/scylla" + }, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Mount path", + "multi": false, + "name": "mount_point", + "options": [], + "query": "node_filesystem_avail", + "refresh": 2, + "regex": "/mountpoint=\"([^\"]*)\".*/", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + + + + ] + }, + "tags": [ + "2019.1" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "title": "Scylla Per Machine Metrics", + "overwrite": true, + "version": 5 + } +} diff --git a/grafana/scylla-dash-per-server.2019.1.template.json b/grafana/scylla-dash-per-server.2019.1.template.json new file mode 100644 index 0000000000..3a35510517 --- /dev/null +++ b/grafana/scylla-dash-per-server.2019.1.template.json @@ -0,0 +1,1014 @@ +{ + 
"dashboard": { + "class": "dashboard", + "uid": "detail-2019.1", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "panels": [ + { + "class": "single_stat_panel", + "targets": [ + { + "expr": "count(up{job=\"scylla\", cluster=~\"$cluster|$^\"})", + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "A", + "step": 1 + } + ], + "title": "Total Nodes" + }, + { + "class": "single_stat_panel_fail", + "targets": [ + { + "expr": "count(scrape_samples_scraped{job=\"scylla\", cluster=~\"$cluster|$^\"}==0) OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Unreachable", + "refId": "A", + "step": 1 + } + ], + "thresholds": "1,2", + "title": "Unreachable" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Starting or Joining", + "targets": [ + { + "expr": "count(scylla_node_operation_mode==1) + count(scylla_node_operation_mode==2)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Joining", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Joining" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Leaving, Decommissioned, Draining or Drained", + "targets": [ + { + "expr": "count(scylla_node_operation_mode>3)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Leaving", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Leaving" + }, + { + "class": "text_panel", + "content": "## ", + "mode": "markdown", + "span": 1, + "style": {} + }, + { + "class": "ops_panel", + "bars": true, + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{cluster=~\"$cluster|$^\"}[30s])) + sum(irate(scylla_thrift_served{cluster=~\"$cluster|$^\"}[30s]))", + "intervalFactor": 1, + "legendFormat": "Total Requests", + "refId": "A", + "step": 1 + } + ], + "title": "Total Requests" + }, + { + "class": "dashlist", + "tags": [ + "2019.1" + ] + } + ], + "title": "New row" + }, + { + 
"class": "row", + "panels": [ + { + "class": "percent_panel", + "targets": [ + { + "expr": "avg(scylla_reactor_utilization{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Load per [[by]]" + }, + { + "class": "ops_panel", + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) + sum(irate(scylla_thrift_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "description": "Amount of requests served as the coordinator. Imbalances here represent dispersion at the connection level, not your data model.", + "title": "Requests Served per [[by]]" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_header_panel", + "content": "

Reads and Writes

" + }, + { + "class": "text_header_panel", + "content": "

Timeouts and Errors

" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_foreground_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Foreground Writes per [[by]]" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_foreground_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Foreground Reads per [[by]]" + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_write_timeouts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Write Timeouts per [[by]]" + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_write_unavailable{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Write Unavailable per [[by]]" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_background_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Background Writes per [[by]]" + }, + { + 
"class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_background_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "title": "Background Reads per [[by]]" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_read_timeouts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Read Timeouts per [[by]]" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_read_unavailable{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "title": "Read Unavailable per [[by]]" + } + ], + "title": "New row" + }, + { + "class": "row", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "plain_text", + "content": "

Replica

" + } + ] + }, + { + "class": "row", + "height": "auto", + "panels": [ + { + "class": "rps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(scylla_database_total_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Reads" + }, + { + "class": "wps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(scylla_database_total_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_database_active_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Active sstable reads" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_database_queued_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Queued sstable reads" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_database_requests_blocked_memory_current{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes currently blocked on dirty" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_commitlog_pending_allocations{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + 
"refId": "A", + "step": 1 + } + ], + "title": "Writes currently blocked on commitlog" + }, + { + "class": "text_panel", + "content": "", + "mode": "markdown", + "span": 3 + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_database_total_reads_failed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Reads failed" + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_database_requests_blocked_memory{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes blocked on dirty" + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_commitlog_requests_blocked_memory{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes blocked on commitlog" + }, + { + "class": "text_panel", + "content": "", + "mode": "markdown", + "span": 3 + }, + { + "class": "text_panel", + "content": "", + "mode": "markdown", + "span": 3 + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_database_total_writes_failed{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Writes failed" + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_database_total_writes_timedout{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + 
"refId": "A", + "step": 1 + } + ], + "title": "Writes timed out" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

Cache

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "rps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(scylla_cache_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s]) - irate(scylla_cache_reads_with_misses{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Reads with no misses" + }, + { + "class": "rps_panel", + "span": 6, + "targets": [ + { + "expr": "sum(irate(scylla_cache_reads_with_misses{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Reads with misses" + } + ] + }, + { + "class" : "row", + "panels": [ + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_hits{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Row Hits" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_partition_hits{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Hits" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_misses{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Row Misses" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": 
"sum(irate(scylla_cache_partition_misses{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Misses" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_insertions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Row Insertions" + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_partition_insertions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Insertions" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_evictions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Row Evictions" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_partition_evictions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Evictions" + } + ] + }, + { + "class": "row", + "panels": [ + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_rows_merged_from_memtable{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + 
"step": 1 + } + ], + "title": "Row Merges" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_partition_merges{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Merges" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_removals{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Row Removals" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_partition_removals{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partition Removals" + } + ] + }, + { + "class": "row", + "panels": [ + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_cache_rows{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Rows" + }, + { + "class": "graph_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_cache_partitions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Partitions" + }, + { + "class": "bytes_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_cache_bytes_used{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Used 
Bytes" + }, + { + "class": "bytes_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_cache_bytes_total{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Total Bytes" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

Memory

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "bytes_panel", + "span": 6, + "targets": [ + { + "expr": "sum(scylla_lsa_total_space_bytes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "LSA total memory" + }, + { + "class": "bytes_panel", + "span": 6, + "targets": [ + { + "expr": "sum(scylla_lsa_non_lsa_used_space_bytes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Non-LSA used memory" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

Compaction

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "graph_panel", + "span": 12, + "targets": [ + { + "expr": "sum(scylla_compaction_manager_compactions{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "Running Compactions" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_panel", + "content": "

CQL

", + "style": {} + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cql_inserts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[300s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Insert" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cql_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[300s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Reads" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cql_deletes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[300s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Deletes" + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cql_updates{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[300s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "CQL Updates" + }, + { + "class": "graph_panel", + "span": 3, + "pointradius": 1, + "targets": [ + { + "expr": "sum(scylla_transport_current_connections{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 30 + } + ], + "title": "Client CQL connections by [[by]]", + "description": "amount of CQL connections currently established" + } + ], + "title": "New row" + } + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": 
"Instance", + "value": "instance" + }, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": false, + "text": "DC", + "value": "dc" + }, + { + "selected": true, + "text": "Instance", + "value": "instance" + }, + { + "selected": false, + "text": "Shard", + "value": "shard" + } + ], + "query": "Cluster,DC,Instance,Shard", + "type": "custom" + }, + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(scylla_reactor_utilization, cluster)" + }, + { + "class": "template_variable_all", + "label": "dc", + "name": "dc", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)" + }, + { + "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "class": "template_variable_all", + "label": "shard", + "name": "shard", + "query": "label_values(scylla_reactor_utilization,shard)", + "sort": 3 + } + ] + }, + "tags": [ + "2019.1" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "title": "Scylla Per Server Metrics", + "overwrite": true, + "version": 5 + } +} diff --git a/grafana/scylla-dash.2019.1.template.json b/grafana/scylla-dash.2019.1.template.json new file mode 100644 index 0000000000..120d6019dc --- /dev/null +++ b/grafana/scylla-dash.2019.1.template.json @@ -0,0 +1,601 @@ +{ + "dashboard": { + "class": "dashboard", + "uid": "overview-2019.1", + "originalTitle": "Scylla Cluster Metrics", + "rows": [ + { + "class": "logo_row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "single_stat_panel", + "targets": [ + { + "expr": "count(up{job=\"scylla\", cluster=~\"$cluster|$^\"})", + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "A", + "step": 40 + } + ], + "title": "Total Nodes" + }, + 
{ + "class": "single_stat_panel_fail", + "targets": [ + { + "expr": "count(scrape_samples_scraped{job=\"scylla\", cluster=~\"$cluster|$^\"}==0) OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Unreachable", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Unreachable" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Starting or Joining", + "targets": [ + { + "expr": "count(scylla_node_operation_mode==1) + count(scylla_node_operation_mode==2)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Joining", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Joining" + }, + { + "class": "single_stat_panel_fail", + "description": "Number of nodes that reported their status as Leaving, Decommissioned, Draining or Drained", + "targets": [ + { + "expr": "count(scylla_node_operation_mode>3)OR vector(0)", + "intervalFactor": 1, + "legendFormat": "Leaving", + "refId": "A", + "step": 20 + } + ], + "thresholds": "1,2", + "title": "Leaving" + }, + { + "class": "text_panel", + "content": "## ", + "mode": "markdown", + "span": 1, + "style": {} + }, + { + "class": "alert_table", + "title": "Active Alerts" + }, + { + "class": "dashlist", + "tags": [ + "2019.1" + ] + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "percent_panel", + "targets": [ + { + "expr": "avg(scylla_reactor_utilization{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"} ) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 4 + } + ], + "title": "Load", + "description": "The percentage of the time during which Scylla utilized the CPU. Note that because Scylla does busy polling for some time before going idle, CPU utilization as seen by the operating system may be much higher. Your system is not yet CPU-bottlenecked until this metric is high." 
+ }, + { + "class": "ops_panel", + "targets": [ + { + "expr": "sum(irate(scylla_transport_requests_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]]) + sum(irate(scylla_thrift_served{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "title": "Requests Served", + "description": "Amount of requests served as the coordinator. Imbalances here represent dispersion at the client-side level or connection balancing level, not your data model." + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "sum(rate(scylla_storage_proxy_coordinator_write_latency_sum{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]])/(sum(rate(scylla_storage_proxy_coordinator_write_latency_count{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]]) + 1)", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Average write latency by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]], le))", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "95th percentile write latency by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_write_latency_bucket{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]], le))", + "intervalFactor": 1, + 
"legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "99th percentile write latency by [[by]]" + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "sum(rate(scylla_storage_proxy_coordinator_read_latency_sum{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]])/(sum(rate(scylla_storage_proxy_coordinator_read_latency_count{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]]) + 1)", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 1 + } + ], + "title": "Average read latency by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]], le))", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "95th percentile read latency by [[by]]" + }, + { + "class": "us_panel", + "span": 4, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scylla_storage_proxy_coordinator_read_latency_bucket{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]], le))", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 1 + } + ], + "title": "99th percentile read latency by [[by]]" + } + ], + "title": "New row" + }, + + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "class": "text_header_panel", + "content": "

Reads and Writes

" + }, + { + "class": "text_header_panel", + "content": "

Timeouts and Errors

" + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "queue_lenght_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_foreground_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Foreground Writes", + "description": "Foreground writes are writes that weren't acknowledged yet to the application. For instance, if a single replica responded and two are needed due to the consistency level. This metric represents a queue size, not a rate. High values here correlate with increased write latencies." + }, + { + "class": "queue_lenght_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_foreground_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "metric": "", + "refId": "A", + "step": 10 + } + ], + "title": "Foreground Reads", + "description": "Foreground reads are reads that weren't acknowledged yet to the application. For instance, if a single replica responded and two are needed due to the consistency level. This metric represents a queue size, not a rate. High values here correlate with increased read latencies." + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_write_timeouts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Write Timeouts", + "description": "Requests that Scylla tried to write but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas." 
+ }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_write_unavailable{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Write Unavailable", + "description": "Requests that Scylla did not even try to write because replicas that were needed to execute this write were unavailable. Unavailable writes are counted in the node that received the request (the coordinator), not at the replicas." + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "200px", + "panels": [ + { + "class": "queue_lenght_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_background_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Background Writes", + "description": "Background writes are writes that are already acknowledged to the application but have additional work to be done. For instance, if a replica responded and only one is needed, this request is still listed as a background request until all replicas respond. This metric represents a queue size, not a rate. High values here correlate with increased write latencies." + }, + { + "class": "queue_lenght_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_background_reads{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Background Reads", + "description": "Background reads are reads that are already acknowledged to the application but have additional work to be done. 
For instance, if a replica responded and only one is needed, this request is still listed as a background request until all replicas respond. This metric represents a queue size, not a rate. High values here correlate with increased read latencies." + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_read_timeouts{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Read Timeouts", + "description": "Requests that Scylla tried to read but timed out. Timeouts are counted in the node that received the request (the coordinator), not at the replicas." + }, + { + "class": "rps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_storage_proxy_coordinator_read_unavailable{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Read Unavailable", + "description": "Requests that Scylla did not even try to read because replicas that were needed to execute this read were unavailable. Unavailable reads are counted in the node that received the request (the coordinator), not at the replicas." + } + ], + "title": "New row" + }, + { + "class": "row", + "height": "25px", + "gridPos": {"h": 2}, + "panels": [ + { + "content": "

Cache

", + "editable": true, + "error": false, + "id": "auto", + "isNew": true, + "links": [], + "mode": "html", + "span": 6, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + }, + { + "content": "

Materialized Views

", + "editable": true, + "error": false, + "id": "auto", + "isNew": true, + "links": [], + "mode": "html", + "span": 6, + "style": {}, + "title": "", + "transparent": true, + "type": "text" + } + + ], + "title": "New row" + }, + { + "class": "row", + "panels": [ + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_hits{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Cache Hits", + "description" : "Number of rows that were read from the cache, without needing to be fetched from storage." + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_cache_row_misses{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Cache Misses", + "description" : "Number of rows that were not present in the cache, and had to be fetched from storage." + }, + { + "class": "bytes_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_database_view_update_backlog{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "View Update Backlog", + "description" : "Size in bytes of the view update backlog at each base replica." + }, + { + "class": "ops_panel", + "span": 3, + "targets": [ + { + "expr": "sum(irate(scylla_database_dropped_view_updates{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}[30s])) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Dropped View Updates", + "description" : "Number of dropped view updates due to an excessive view update backlog." 
+ }, + { + "class": "text_panel", + "content": "", + "mode": "markdown", + "span": 9 + }, + { + "class": "wps_panel", + "span": 3, + "targets": [ + { + "expr": "sum(scylla_storage_proxy_coordinator_current_throttled_base_writes{instance=~\"[[node]]\",cluster=~\"$cluster|$^\", dc=~\"$dc\", shard=~\"[[shard]]|$^\"}) by ([[by]])", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A", + "step": 10 + } + ], + "title": "Throttled Base Writes", + "description" : "Currently throttled base writes, as a consequence of the respective view update backlog." + } + ], + "title": "New row" + }, + { + "class": "row", + "panels": [], + "title": "New row" + } + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "tags": [], + "text": "Instance", + "value": "instance" + }, + "hide": 0, + "includeAll": false, + "label": "by", + "multi": false, + "name": "by", + "options": [ + { + "selected": false, + "text": "Cluster", + "value": "cluster" + }, + { + "selected": false, + "text": "DC", + "value": "dc" + }, + { + "selected": true, + "text": "Instance", + "value": "instance" + }, + { + "selected": false, + "text": "Shard", + "value": "shard" + } + ], + "query": "Cluster,DC,Instance,Shard", + "type": "custom" + }, + { + "class": "template_variable_single", + "label": "cluster", + "name": "cluster", + "query": "label_values(scylla_reactor_utilization, cluster)" + }, + { + "class": "template_variable_all", + "label": "dc", + "name": "dc", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster\"}, dc)" + }, + { + "class": "template_variable_all", + "label": "node", + "name": "node", + "query": "label_values(scylla_reactor_utilization{cluster=~\"$cluster|$^\", dc=~\"$dc\"}, instance)" + }, + { + "class": "template_variable_all", + "label": "shard", + "name": "shard", + "query": "label_values(scylla_reactor_utilization,shard)", + "sort": 3 + }, + { + "class": "template_variable_custom", + "name": "dash_version", + "options": [ + { + "selected": 
true, + "text": "2019.1", + "value": "2019.1" + } + ], + "query": "2019.1", + "current": { + "text": "2019.1", + "value": "2019.1" + } + } + ] + }, + "tags": [ + "2019.1" + ], + "time": { + "from": "now-30m", + "to": "now" + }, + "title": "Scylla Overview Metrics", + "overwrite": true, + "version": 3 + } +}