Skip to content

Commit 7232795

Browse files
authored
fix: add a retry middleware to all the stats handlers (#13584)
Signed-off-by: Edward Welch <[email protected]>
1 parent 3a1a3a2 commit 7232795

File tree

5 files changed

+95

-25

lines changed

pkg/querier/queryrange/queryrangebase/retry.go

+24
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package queryrangebase
22

33
import (
44
"context"
5+
"reflect"
56
"time"
67

78
"github.com/go-kit/log"
@@ -81,20 +82,28 @@ func (r retry) Do(ctx context.Context, req Request) (Response, error) {
8182
query := req.GetQuery()
8283

8384
for ; tries < r.maxRetries; tries++ {
85+
// Make sure the context isn't done before sending the request
8486
if ctx.Err() != nil {
8587
return nil, ctx.Err()
8688
}
89+
8790
resp, err := r.next.Do(ctx, req)
8891
if err == nil {
8992
return resp, nil
9093
}
9194

95+
// Make sure the context isn't done before retrying the request
96+
if ctx.Err() != nil {
97+
return nil, ctx.Err()
98+
}
99+
92100
// Retry if we get a HTTP 500 or an unknown error.
93101
if code := grpcutil.ErrorToStatusCode(err); code == codes.Unknown || code/100 == 5 {
94102
lastErr = err
95103
level.Error(util_log.WithContext(ctx, r.log)).Log(
96104
"msg", "error processing request",
97105
"try", tries,
106+
"type", logImplementingType(req),
98107
"query", query,
99108
"query_hash", util.HashedQuery(query),
100109
"start", start.Format(time.RFC3339Nano),
@@ -113,3 +122,18 @@ func (r retry) Do(ctx context.Context, req Request) (Response, error) {
113122
}
114123
return nil, lastErr
115124
}
125+
126+
func logImplementingType(i Request) string {
127+
if i == nil {
128+
return "nil"
129+
}
130+
131+
t := reflect.TypeOf(i)
132+
133+
// Check if it's a pointer and get the underlying type if so
134+
if t.Kind() == reflect.Ptr {
135+
t = t.Elem()
136+
}
137+
138+
return t.String()
139+
}

pkg/querier/queryrange/querysharding.go

+14-9
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ func NewQueryShardMiddleware(
4242
limits Limits,
4343
maxShards int,
4444
statsHandler queryrangebase.Handler,
45+
retryNextHandler queryrangebase.Handler,
4546
shardAggregation []string,
4647
) queryrangebase.Middleware {
4748
noshards := !hasShards(confs)
@@ -56,7 +57,7 @@ func NewQueryShardMiddleware(
5657
}
5758

5859
mapperware := queryrangebase.MiddlewareFunc(func(next queryrangebase.Handler) queryrangebase.Handler {
59-
return newASTMapperware(confs, engineOpts, next, statsHandler, logger, shardingMetrics, limits, maxShards, shardAggregation)
60+
return newASTMapperware(confs, engineOpts, next, retryNextHandler, statsHandler, logger, shardingMetrics, limits, maxShards, shardAggregation)
6061
})
6162

6263
return queryrangebase.MiddlewareFunc(func(next queryrangebase.Handler) queryrangebase.Handler {
@@ -76,6 +77,7 @@ func newASTMapperware(
7677
confs ShardingConfigs,
7778
engineOpts logql.EngineOpts,
7879
next queryrangebase.Handler,
80+
retryNextHandler queryrangebase.Handler,
7981
statsHandler queryrangebase.Handler,
8082
logger log.Logger,
8183
metrics *logql.MapperMetrics,
@@ -88,6 +90,7 @@ func newASTMapperware(
8890
logger: log.With(logger, "middleware", "QueryShard.astMapperware"),
8991
limits: limits,
9092
next: next,
93+
retryNextHandler: retryNextHandler,
9194
statsHandler: next,
9295
ng: logql.NewDownstreamEngine(engineOpts, DownstreamHandler{next: next, limits: limits}, limits, logger),
9396
metrics: metrics,
@@ -103,14 +106,15 @@ func newASTMapperware(
103106
}
104107

105108
type astMapperware struct {
106-
confs ShardingConfigs
107-
logger log.Logger
108-
limits Limits
109-
next queryrangebase.Handler
110-
statsHandler queryrangebase.Handler
111-
ng *logql.DownstreamEngine
112-
metrics *logql.MapperMetrics
113-
maxShards int
109+
confs ShardingConfigs
110+
logger log.Logger
111+
limits Limits
112+
next queryrangebase.Handler
113+
retryNextHandler queryrangebase.Handler
114+
statsHandler queryrangebase.Handler
115+
ng *logql.DownstreamEngine
116+
metrics *logql.MapperMetrics
117+
maxShards int
114118

115119
// Feature flag for sharding range and vector aggregations such as
116120
// quantile_ver_time with probabilistic data structures.
@@ -191,6 +195,7 @@ func (ast *astMapperware) Do(ctx context.Context, r queryrangebase.Request) (que
191195
ast.maxShards,
192196
r,
193197
ast.statsHandler,
198+
ast.retryNextHandler,
194199
ast.next,
195200
ast.limits,
196201
)

pkg/querier/queryrange/querysharding_test.go

+6
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ func Test_astMapper(t *testing.T) {
168168
},
169169
testEngineOpts,
170170
handler,
171+
handler,
171172
nil,
172173
log.NewNopLogger(),
173174
nilShardingMetrics,
@@ -307,6 +308,7 @@ func Test_astMapper_QuerySizeLimits(t *testing.T) {
307308
},
308309
testEngineOpts,
309310
handler,
311+
handler,
310312
nil,
311313
log.NewNopLogger(),
312314
nilShardingMetrics,
@@ -352,6 +354,7 @@ func Test_ShardingByPass(t *testing.T) {
352354
},
353355
testEngineOpts,
354356
handler,
357+
handler,
355358
nil,
356359
log.NewNopLogger(),
357360
nilShardingMetrics,
@@ -439,6 +442,7 @@ func Test_InstantSharding(t *testing.T) {
439442
},
440443
0,
441444
nil,
445+
nil,
442446
[]string{},
443447
)
444448
response, err := sharding.Wrap(queryrangebase.HandlerFunc(func(c context.Context, r queryrangebase.Request) (queryrangebase.Response, error) {
@@ -718,6 +722,7 @@ func TestShardingAcrossConfigs_ASTMapper(t *testing.T) {
718722
confs,
719723
testEngineOpts,
720724
handler,
725+
handler,
721726
nil,
722727
log.NewNopLogger(),
723728
nilShardingMetrics,
@@ -853,6 +858,7 @@ func Test_ASTMapper_MaxLookBackPeriod(t *testing.T) {
853858
testSchemasTSDB,
854859
engineOpts,
855860
queryHandler,
861+
queryHandler,
856862
statsHandler,
857863
log.NewNopLogger(),
858864
nilShardingMetrics,

pkg/querier/queryrange/roundtrip.go

+32
Original file line numberDiff line numberDiff line change
@@ -285,6 +285,10 @@ func NewMiddleware(
285285
func NewDetectedLabelsTripperware(cfg Config, opts logql.EngineOpts, logger log.Logger, l Limits, schema config.SchemaConfig, metrics *Metrics, mw base.Middleware, namespace string, merger base.Merger, limits Limits, iqo util.IngesterQueryOptions) (base.Middleware, error) {
286286
return base.MiddlewareFunc(func(next base.Handler) base.Handler {
287287
statsHandler := mw.Wrap(next)
288+
if cfg.MaxRetries > 0 {
289+
rm := base.NewRetryMiddleware(logger, cfg.MaxRetries, metrics.RetryMiddlewareMetrics, namespace)
290+
statsHandler = rm.Wrap(statsHandler)
291+
}
288292
splitter := newDefaultSplitter(limits, iqo)
289293

290294
queryRangeMiddleware := []base.Middleware{
@@ -553,6 +557,12 @@ func getOperation(path string) string {
553557
func NewLogFilterTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, merger base.Merger, iqo util.IngesterQueryOptions, c cache.Cache, metrics *Metrics, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) {
554558
return base.MiddlewareFunc(func(next base.Handler) base.Handler {
555559
statsHandler := indexStatsTripperware.Wrap(next)
560+
retryNextHandler := next
561+
if cfg.MaxRetries > 0 {
562+
rm := base.NewRetryMiddleware(log, cfg.MaxRetries, metrics.RetryMiddlewareMetrics, metricsNamespace)
563+
statsHandler = rm.Wrap(statsHandler)
564+
retryNextHandler = rm.Wrap(next)
565+
}
556566

557567
queryRangeMiddleware := []base.Middleware{
558568
QueryMetricsMiddleware(metrics.QueryMetrics),
@@ -592,6 +602,7 @@ func NewLogFilterTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Lo
592602
limits,
593603
0, // 0 is unlimited shards
594604
statsHandler,
605+
retryNextHandler,
595606
cfg.ShardAggregations,
596607
),
597608
)
@@ -618,6 +629,12 @@ func NewLogFilterTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Lo
618629
func NewLimitedTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logger, limits Limits, schema config.SchemaConfig, metrics *Metrics, merger base.Merger, iqo util.IngesterQueryOptions, indexStatsTripperware base.Middleware, metricsNamespace string) (base.Middleware, error) {
619630
return base.MiddlewareFunc(func(next base.Handler) base.Handler {
620631
statsHandler := indexStatsTripperware.Wrap(next)
632+
retryNextHandler := next
633+
if cfg.MaxRetries > 0 {
634+
rm := base.NewRetryMiddleware(log, cfg.MaxRetries, metrics.RetryMiddlewareMetrics, metricsNamespace)
635+
statsHandler = rm.Wrap(statsHandler)
636+
retryNextHandler = rm.Wrap(next)
637+
}
621638

622639
queryRangeMiddleware := []base.Middleware{
623640
StatsCollectorMiddleware(),
@@ -639,6 +656,7 @@ func NewLimitedTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logg
639656
// and overwhelming the frontend, therefore we fix the number of shards to prevent this.
640657
32, // 0 is unlimited shards
641658
statsHandler,
659+
retryNextHandler,
642660
cfg.ShardAggregations,
643661
),
644662
)
@@ -854,6 +872,12 @@ func NewMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logge
854872

855873
return base.MiddlewareFunc(func(next base.Handler) base.Handler {
856874
statsHandler := indexStatsTripperware.Wrap(next)
875+
retryNextHandler := next
876+
if cfg.MaxRetries > 0 {
877+
rm := base.NewRetryMiddleware(log, cfg.MaxRetries, metrics.RetryMiddlewareMetrics, metricsNamespace)
878+
statsHandler = rm.Wrap(statsHandler)
879+
retryNextHandler = rm.Wrap(next)
880+
}
857881

858882
queryRangeMiddleware := []base.Middleware{
859883
QueryMetricsMiddleware(metrics.QueryMetrics),
@@ -895,6 +919,7 @@ func NewMetricTripperware(cfg Config, engineOpts logql.EngineOpts, log log.Logge
895919
limits,
896920
0, // 0 is unlimited shards
897921
statsHandler,
922+
retryNextHandler,
898923
cfg.ShardAggregations,
899924
),
900925
)
@@ -976,6 +1001,12 @@ func NewInstantMetricTripperware(
9761001

9771002
return base.MiddlewareFunc(func(next base.Handler) base.Handler {
9781003
statsHandler := indexStatsTripperware.Wrap(next)
1004+
retryNextHandler := next
1005+
if cfg.MaxRetries > 0 {
1006+
rm := base.NewRetryMiddleware(log, cfg.MaxRetries, metrics.RetryMiddlewareMetrics, metricsNamespace)
1007+
statsHandler = rm.Wrap(statsHandler)
1008+
retryNextHandler = rm.Wrap(next)
1009+
}
9791010

9801011
queryRangeMiddleware := []base.Middleware{
9811012
StatsCollectorMiddleware(),
@@ -1003,6 +1034,7 @@ func NewInstantMetricTripperware(
10031034
limits,
10041035
0, // 0 is unlimited shards
10051036
statsHandler,
1037+
retryNextHandler,
10061038
cfg.ShardAggregations,
10071039
),
10081040
)

pkg/querier/queryrange/shard_resolver.go

+19-16
Original file line numberDiff line numberDiff line change
@@ -37,21 +37,22 @@ func shardResolverForConf(
3737
maxParallelism int,
3838
maxShards int,
3939
r queryrangebase.Request,
40-
statsHandler, next queryrangebase.Handler,
40+
statsHandler, next, retryNext queryrangebase.Handler,
4141
limits Limits,
4242
) (logql.ShardResolver, bool) {
4343
if conf.IndexType == types.TSDBType {
4444
return &dynamicShardResolver{
45-
ctx: ctx,
46-
logger: logger,
47-
statsHandler: statsHandler,
48-
next: next,
49-
limits: limits,
50-
from: model.Time(r.GetStart().UnixMilli()),
51-
through: model.Time(r.GetEnd().UnixMilli()),
52-
maxParallelism: maxParallelism,
53-
maxShards: maxShards,
54-
defaultLookback: defaultLookback,
45+
ctx: ctx,
46+
logger: logger,
47+
statsHandler: statsHandler,
48+
retryNextHandler: retryNext,
49+
next: next,
50+
limits: limits,
51+
from: model.Time(r.GetStart().UnixMilli()),
52+
through: model.Time(r.GetEnd().UnixMilli()),
53+
maxParallelism: maxParallelism,
54+
maxShards: maxShards,
55+
defaultLookback: defaultLookback,
5556
}, true
5657
}
5758
if conf.RowShards < 2 {
@@ -64,10 +65,11 @@ type dynamicShardResolver struct {
6465
ctx context.Context
6566
// TODO(owen-d): shouldn't have to fork handlers here -- one should just transparently handle the right logic
6667
// depending on the underlying type?
67-
statsHandler queryrangebase.Handler // index stats handler (hooked up to results cache, etc)
68-
next queryrangebase.Handler // next handler in the chain (used for non-stats reqs)
69-
logger log.Logger
70-
limits Limits
68+
statsHandler queryrangebase.Handler // index stats handler (hooked up to results cache, etc)
69+
retryNextHandler queryrangebase.Handler // next handler wrapped with retries
70+
next queryrangebase.Handler // next handler in the chain (used for non-stats reqs)
71+
logger log.Logger
72+
limits Limits
7173

7274
from, through model.Time
7375
maxParallelism int
@@ -251,7 +253,8 @@ func (r *dynamicShardResolver) ShardingRanges(expr syntax.Expr, targetBytesPerSh
251253
exprStr := expr.String()
252254
// try to get shards for the given expression
253255
// if it fails, fallback to linearshards based on stats
254-
resp, err := r.next.Do(r.ctx, &logproto.ShardsRequest{
256+
// use the retry handler here to retry transient errors
257+
resp, err := r.retryNextHandler.Do(r.ctx, &logproto.ShardsRequest{
255258
From: adjustedFrom,
256259
Through: r.through,
257260
Query: expr.String(),

0 commit comments

Comments (0)