Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Avoid returning 500 for TRACE_TOO_LARGE #4160

Merged
merged 10 commits into from
Oct 18, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
* [CHANGE] No longer send the final diff in GRPC streaming. Instead we rely on the streamed intermediate results. [#4062](https://github.com/grafana/tempo/pull/4062) (@joe-elliott)
* [CHANGE] Update Go to 1.23.1 [#4146](https://github.com/grafana/tempo/pull/4146) [#4147](https://github.com/grafana/tempo/pull/4147) (@javiermolinar)
* [CHANGE] TraceQL: Add range condition for byte predicates [#4198](https://github.com/grafana/tempo/pull/4198) (@ie-pham)
* [CHANGE] Return 422 for TRACE_TOO_LARGE queries [#4160](https://github.com/grafana/tempo/pull/4160) (@zalegrala)
* [FEATURE] Discarded span logging `log_discarded_spans` [#3957](https://github.com/grafana/tempo/issues/3957) (@dastrobu)
* [FEATURE] TraceQL support for instrumentation scope [#3967](https://github.com/grafana/tempo/pull/3967) (@ie-pham)
* [ENHANCEMENT] TraceQL: Attribute iterators collect matched array values [#3867](https://github.com/grafana/tempo/pull/3867) (@electron0zero, @stoewer)
Expand Down
17 changes: 9 additions & 8 deletions integration/e2e/limits_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import (

"github.com/grafana/tempo/integration/util"
"github.com/grafana/tempo/pkg/httpclient"
"github.com/grafana/tempo/pkg/model/trace"
"github.com/grafana/tempo/pkg/tempopb"
tempoUtil "github.com/grafana/tempo/pkg/util"
"github.com/grafana/tempo/pkg/util/test"
Expand Down Expand Up @@ -233,22 +234,22 @@ func TestQueryLimits(t *testing.T) {
querierClient := httpclient.New("http://"+tempo.Endpoint(3200)+"/querier", tempoUtil.FakeTenantID)

_, err = client.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // confirm frontend returns 500
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm frontend returns 422

_, err = querierClient.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // todo: this should return 400 ideally so the frontend does not retry, but does not currently
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422")

// complete block timeout is 10 seconds
time.Sleep(15 * time.Second)
_, err = client.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 500") // confirm frontend returns 500
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm frontend returns 422

_, err = querierClient.QueryTrace(tempoUtil.TraceIDToHexString(traceID[:]))
require.ErrorContains(t, err, "trace exceeds max size")
require.ErrorContains(t, err, "failed with response: 400") // confirm querier returns 400
require.ErrorContains(t, err, trace.ErrTraceTooLarge.Error())
require.ErrorContains(t, err, "failed with response: 422") // confirm querier returns 422
}

func TestLimitsPartialSuccess(t *testing.T) {
Expand Down
13 changes: 13 additions & 0 deletions modules/frontend/combiner/trace_by_id.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package combiner

import (
"bytes"
"errors"
"fmt"
"io"
"net/http"
Expand Down Expand Up @@ -83,6 +84,13 @@ func (c *traceByIDCombiner) AddResponse(r PipelineResponse) error {

// Consume the trace
_, err = c.c.Consume(resp.Trace)

if errors.Is(err, trace.ErrTraceTooLarge) {
c.code = http.StatusUnprocessableEntity
c.statusMessage = trace.ErrTraceTooLarge.Error()
return nil
}

return err
}

Expand Down Expand Up @@ -156,6 +164,11 @@ func (c *traceByIDCombiner) shouldQuit() bool {
return false
}

// test special case for 422
if c.code == http.StatusUnprocessableEntity {
return true
}

// bail on other 400s
if c.code/100 == 4 {
return true
Expand Down
6 changes: 3 additions & 3 deletions modules/frontend/combiner/trace_by_id_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,15 @@ func TestTraceByIDShouldQuit(t *testing.T) {
should = c.ShouldQuit()
require.False(t, should)

// trace too large, should not quit but should return an error
// trace too large, should quit and should not return an error
c = NewTraceByID(1, api.HeaderAcceptJSON)
err = c.AddResponse(toHTTPProtoResponse(t, &tempopb.TraceByIDResponse{
Trace: test.MakeTrace(1, nil),
Metrics: &tempopb.TraceByIDMetrics{},
}, 200))
require.Error(t, err)
require.NoError(t, err)
should = c.ShouldQuit()
require.False(t, should)
require.True(t, should)
}

func TestTraceByIDHonorsContentType(t *testing.T) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,9 @@ func (c statusCodeAdjustWare) RoundTrip(req Request) (*http.Response, error) {
// if the frontend issues a bad request then externally we need to represent that as an
// internal error
// exceptions
// 422 - unprocessable entity
// 429 - too many requests
if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 {
if resp.StatusCode >= 400 && resp.StatusCode < 500 && resp.StatusCode != 429 && resp.StatusCode != 422 {
resp.StatusCode = http.StatusInternalServerError
resp.Status = http.StatusText(http.StatusInternalServerError)
// leave the body alone. it will preserve the original error message
Expand Down
8 changes: 5 additions & 3 deletions modules/querier/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"errors"
"fmt"
"net/http"
"strings"
"time"

"github.com/golang/protobuf/jsonpb" //nolint:all //deprecated
Expand Down Expand Up @@ -425,9 +426,10 @@ func handleError(w http.ResponseWriter, err error) {
return
}

// todo: better understand all errors returned from queriers and categorize more as 4XX
if errors.Is(err, trace.ErrTraceTooLarge) {
http.Error(w, err.Error(), http.StatusBadRequest)
// TODO: better understand all errors returned from queriers and categorize more as 4XX
// NOTE: we receive a GRPC error from the ingesters, and so we need to check the string content of error as well.
if errors.Is(err, trace.ErrTraceTooLarge) || strings.Contains(err.Error(), trace.ErrTraceTooLarge.Error()) {
http.Error(w, err.Error(), http.StatusUnprocessableEntity)
return
}

Expand Down
Loading