Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove tempo serverless #4599

Merged
merged 11 commits into from
Jan 24, 2025
Prev Previous commit
Next Next commit
clean up serverless from tempo-mixin and playbook
  • Loading branch information
electron0zero committed Jan 23, 2025
commit d2af019a655e74daee73785a5ca0751b4a8fb31a
220 changes: 5 additions & 215 deletions operations/tempo-mixin-compiled/dashboards/tempo-reads.json
Original file line number Diff line number Diff line change
Expand Up @@ -697,216 +697,6 @@
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
"stack": true,
"steppedLine": false,
"targets": [
{
"expr": "sum by (status) (\n label_replace(label_replace(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval]),\n \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"),\n \"status\", \"${1}\", \"status_code\", \"([a-zA-Z]+)\"))\n",
"format": "time_series",
"interval": "1m",
"legendFormat": "{{status}}",
"refId": "A"
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "QPS",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
},
{
"aliasColors": {

},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 8,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{route}} 99th",
"refId": "A",
"step": 10
},
{
"expr": "histogram_quantile(0.50, sum(rate(tempo_querier_external_endpoint_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (le,endpoint)) * 1e3",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{route}} 50th",
"refId": "B",
"step": 10
},
{
"expr": "sum(rate(tempo_querier_external_endpoint_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint) * 1e3 / sum(rate(tempo_querier_external_endpoint_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\"}[$__rate_interval])) by (endpoint)",
"format": "time_series",
"interval": "1m",
"intervalFactor": 2,
"legendFormat": "{{route}} Average",
"refId": "C",
"step": 10
}
],
"thresholds": [

],
"timeFrom": null,
"timeShift": null,
"title": "Latency",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": [

]
},
"yaxes": [
{
"format": "ms",
"label": null,
"logBase": 1,
"max": null,
"min": 0,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
]
}
],
"repeat": null,
"repeatIteration": null,
"repeatRowId": null,
"showTitle": true,
"title": "Querier External Endpoint",
"titleSize": "h6"
},
{
"collapse": false,
"height": "250px",
"panels": [
{
"aliasColors": {
"1xx": "#EAB839",
"2xx": "#7EB26D",
"3xx": "#6ED0E0",
"4xx": "#EF843C",
"5xx": "#E24D42",
"OK": "#7EB26D",
"cancel": "#A9A9A9",
"error": "#E24D42",
"success": "#7EB26D"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 9,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 0,
"links": [

],
"nullPointMode": "null as zero",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [

],
"spaceLength": 10,
"span": 6,
Expand Down Expand Up @@ -970,7 +760,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 10,
"id": 8,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1095,7 +885,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 11,
"id": 9,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1180,7 +970,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 12,
"id": 10,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1305,7 +1095,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 10,
"id": 13,
"id": 11,
"legend": {
"avg": false,
"current": false,
Expand Down Expand Up @@ -1390,7 +1180,7 @@
"dashes": false,
"datasource": "$datasource",
"fill": 1,
"id": 14,
"id": 12,
"legend": {
"avg": false,
"current": false,
Expand Down
11 changes: 0 additions & 11 deletions operations/tempo-mixin/dashboards/tempo-reads.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,6 @@ dashboard_utils {
$.latencyPanel('tempo_request_duration_seconds', '{%s,route=~"querier_%sapi_.*"}' % [$.jobMatcher($._config.jobs.querier), $._config.http_api_prefix], additional_grouping='route')
)
)
.addRow(
g.row('Querier External Endpoint')
.addPanel(
$.panel('QPS') +
$.qpsPanel('tempo_querier_external_endpoint_duration_seconds_count{%s}' % [$.jobMatcher($._config.jobs.querier)])
)
.addPanel(
$.panel('Latency') +
$.latencyPanel('tempo_querier_external_endpoint_duration_seconds', '{%s}' % [$.jobMatcher($._config.jobs.querier)], additional_grouping='endpoint')
)
)
.addRow(
g.row('Ingester')
.addPanel(
Expand Down
21 changes: 3 additions & 18 deletions operations/tempo-mixin/runbook.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@ Another way to increase parallelism is by increasing the size of the worker pool
A theoretically ideal value for this config to avoid _any_ queueing would be (Size of blocklist / Max Concurrent Queries).
But also factor in the resources provided to the querier.

Our [documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
includes [a solid guide](https://grafana.com/docs/tempo/latest/operations/backend_search/#guidelines-on-key-configuration-parameters) on the various parameters with suggestions.

### Trace Lookup Failures

If trace lookups fail with the error: `error querying store in Querier.FindTraceByID: queue doesn't have room for <xyz> jobs`, this
Expand All @@ -70,24 +73,6 @@ Consider the following resolutions:
- Increase the queue_depth size to do more work per querier
- Adjust compaction settings to reduce the number of blocks

### Serverless/External Endpoints

If the request latency issues are due to backend searches with serverless/external endpoints, there may be additional configuration
options that will help decrease latency. Before you get started, know that serverless functionality only impacts the `/api/search` endpoint
if the `start` and `end` parameters are passed and `external_endpoints` are configured on the querier. One way to determine if external
endpoints are getting hit is to check the Reads dashboard and look for the "Querier External Endpoint" row.

Tuning serverless search can be difficult. Our [public documentation](https://grafana.com/docs/tempo/latest/operations/backend_search/#query-frontend)
includes a solid guide on the various parameters with suggestions. The linked documentation is augmented with some suggestions here:

- Consider provisioning more serverless functions and adding them to the `querier.search.external_endpoints` array. This will increase your
baseline latency and your total throughput.
- Decreasing `querier.search.hedge_requests_at` and increasing `querier.search.hedge_requests_up_to` will put more pressure on the serverless endpoints but will
result in lower latency.
- Increasing `querier.search.prefer_self` and scaling up the queriers will cause more work to be performed by the queriers which will lower latencies.
- Increasing `query_frontend.max_outstanding_per_tenant` and `query_frontend.search.concurrent_jobs` will increase the rate at which the
query_frontend tries to feed jobs to the queriers and can decrease latency.

## TempoCompactorUnhealthy

If this occurs, access the [ring page](https://grafana.com/docs/tempo/latest/operations/consistent_hash_ring/) at `/compactor/ring`.
Expand Down