From 96b23311321dcce33de86e3650d02fd42e30d7b8 Mon Sep 17 00:00:00 2001
From: Dimitar Dimitrov
Date: Thu, 9 Jan 2025 15:12:40 +0100
Subject: [PATCH] ruler: cap the number of remote eval retries (#10375)

* ruler: cap the number of remote eval retries

The retries happen more aggressively than actual evaluations. With the
current setup an error spike results in 3x the query rate - initial
query, and two retries fairly quickly 100ms & 200ms after that.

This PR changes that so that the whole process doesn't retry more than
a fixed number of queries/sec. I chose 170 because at GL the average
evals/sec is 340 per ruler. This would retry about half of the rules
on average. _On average_ that should increase query load by 50%.

Signed-off-by: Dimitar Dimitrov

* Add CHANGELOG.md entry

Signed-off-by: Dimitar Dimitrov

* Fix a totally arbitrary stupid linter rule

Signed-off-by: Dimitar Dimitrov

* Use a CB instead of a rate limtier

Signed-off-by: Dimitar Dimitrov

* Revert "Use a CB instead of a rate limtier"

This reverts commit b07366f25a995312c637f284780277d800ca6a1b.

* Don't abort retries if we're over the rate limit

Signed-off-by: Dimitar Dimitrov

* Cancel reservation when context expires

Signed-off-by: Dimitar Dimitrov

---------

Signed-off-by: Dimitar Dimitrov
(cherry picked from commit ffee57de406dd651dccf104db25ee93ec46a3c0e)
---
 CHANGELOG.md                          |  1 +
 cmd/mimir/config-descriptor.json      | 10 +++++++
 cmd/mimir/help-all.txt.tmpl           |  2 ++
 cmd/mimir/help.txt.tmpl               |  2 ++
 .../configuration-parameters/index.md |  4 +++
 pkg/mimir/modules.go                  |  2 +-
 pkg/ruler/remotequerier.go            | 27 +++++++++++++----
 pkg/ruler/remotequerier_test.go       | 30 +++++++++----------
 8 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 40b1f27baf1..5dd9eef32be 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@
   * `cortex_alertmanager_silences`
 * [CHANGE] Distributor: Drop experimental `-distributor.direct-otlp-translation-enabled` flag, since direct OTLP translation is well tested at this point. #9647
 * [CHANGE] Ingester: Change `-initial-delay` for circuit breakers to begin when the first request is received, rather than at breaker activation. #9842
+* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375
 * [FEATURE] Distributor: Add support for `lz4` OTLP compression. #9763
 * [FEATURE] Querier: add experimental streaming PromQL engine, enabled with `-querier.query-engine=mimir`. #9367 #9368 #9398 #9399 #9403 #9417 #9418 #9419 #9420 #9482 #9504 #9505 #9507 #9518 #9531 #9532 #9533 #9553 #9558 #9588 #9589 #9639 #9641 #9642 #9651 #9664 #9681 #9717 #9874 #9998
 * [FEATURE] Query-frontend: added experimental configuration options `query-frontend.cache-errors` and `query-frontend.results-cache-ttl-for-errors` to allow non-transient responses to be cached. When set to `true` error responses from hitting limits or bad data are cached for a short TTL. #9028
diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json
index 515e61079c3..9c0b974cf69 100644
--- a/cmd/mimir/config-descriptor.json
+++ b/cmd/mimir/config-descriptor.json
@@ -13121,6 +13121,16 @@
           "fieldDefaultValue": "protobuf",
           "fieldFlag": "ruler.query-frontend.query-result-response-format",
           "fieldType": "string"
+        },
+        {
+          "kind": "field",
+          "name": "max_retries_rate",
+          "required": false,
+          "desc": "Maximum number of retries for failed queries per second.",
+          "fieldValue": null,
+          "fieldDefaultValue": 170,
+          "fieldFlag": "ruler.query-frontend.max-retries-rate",
+          "fieldType": "float"
         }
       ],
       "fieldValue": null,
diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl
index 3753384e200..1fac12bffc1 100644
--- a/cmd/mimir/help-all.txt.tmpl
+++ b/cmd/mimir/help-all.txt.tmpl
@@ -2971,6 +2971,8 @@ Usage of ./cmd/mimir/mimir:
     	Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13
   -ruler.query-frontend.grpc-client-config.tls-server-name string
     	Override the expected name on the server certificate.
+  -ruler.query-frontend.max-retries-rate float
+    	Maximum number of retries for failed queries per second. (default 170)
   -ruler.query-frontend.query-result-response-format string
     	Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf")
   -ruler.query-stats-enabled
diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl
index cf7e4e4b7a1..7259716be05 100644
--- a/cmd/mimir/help.txt.tmpl
+++ b/cmd/mimir/help.txt.tmpl
@@ -731,6 +731,8 @@ Usage of ./cmd/mimir/mimir:
     	Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {})
   -ruler.query-frontend.address string
     	GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing.
+  -ruler.query-frontend.max-retries-rate float
+    	Maximum number of retries for failed queries per second. (default 170)
   -ruler.query-frontend.query-result-response-format string
     	Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf")
   -ruler.recording-rules-evaluation-enabled
diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md
index 1a22d498be6..b34049522b0 100644
--- a/docs/sources/mimir/configure/configuration-parameters/index.md
+++ b/docs/sources/mimir/configure/configuration-parameters/index.md
@@ -2078,6 +2078,10 @@ query_frontend:
   # CLI flag: -ruler.query-frontend.query-result-response-format
   [query_result_response_format: <string> | default = "protobuf"]
 
+  # Maximum number of retries for failed queries per second.
+  # CLI flag: -ruler.query-frontend.max-retries-rate
+  [max_retries_rate: <float> | default = 170]
+
 tenant_federation:
   # Enable rule groups to query against multiple tenants. The tenant IDs
   # involved need to be in the rule group's 'source_tenants' field. If this flag
diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go
index d95f21c1c5f..e830870c643 100644
--- a/pkg/mimir/modules.go
+++ b/pkg/mimir/modules.go
@@ -852,7 +852,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) {
 	if err != nil {
 		return nil, err
 	}
-	remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware)
+	remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.MaxRetriesRate, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware)
 
 	embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient(
 		remoteQuerier,
diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 3a25145c960..f02eb4d8669 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -31,6 +31,7 @@ import (
 	"github.com/prometheus/prometheus/storage"
 	"github.com/prometheus/prometheus/storage/remote"
 	"golang.org/x/exp/slices"
+	"golang.org/x/time/rate"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/codes"
 
@@ -67,6 +68,8 @@ type QueryFrontendConfig struct {
 	GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."`
 
 	QueryResultResponseFormat string `yaml:"query_result_response_format"`
+
+	MaxRetriesRate float64 `yaml:"max_retries_rate"`
 }
 
 func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) {
@@ -80,6 +83,7 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) {
 	c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f)
 
 	f.StringVar(&c.QueryResultResponseFormat, "ruler.query-frontend.query-result-response-format", formatProtobuf, fmt.Sprintf("Format to use when retrieving query results from query-frontends. Supported values: %s", strings.Join(allFormats, ", ")))
+	f.Float64Var(&c.MaxRetriesRate, "ruler.query-frontend.max-retries-rate", 170, "Maximum number of retries for failed queries per second.")
 }
 
 func (c *QueryFrontendConfig) Validate() error {
@@ -115,6 +119,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error
 // RemoteQuerier executes read operations against a httpgrpc.HTTPClient.
 type RemoteQuerier struct {
 	client         httpgrpc.HTTPClient
+	retryLimiter   *rate.Limiter
 	timeout        time.Duration
 	middlewares    []Middleware
 	promHTTPPrefix string
@@ -130,6 +135,7 @@ var protobufDecoderInstance = protobufDecoder{}
 func NewRemoteQuerier(
 	client httpgrpc.HTTPClient,
 	timeout time.Duration,
+	maxRetryRate float64, // maxRetryRate is the maximum number of retries for failed queries per second.
 	preferredQueryResultResponseFormat string,
 	prometheusHTTPPrefix string,
 	logger log.Logger,
@@ -138,6 +144,7 @@ func NewRemoteQuerier(
 	return &RemoteQuerier{
 		client:         client,
 		timeout:        timeout,
+		retryLimiter:   rate.NewLimiter(rate.Limit(maxRetryRate), 1),
 		middlewares:    middlewares,
 		promHTTPPrefix: prometheusHTTPPrefix,
 		logger:         logger,
@@ -349,12 +356,22 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		if !retry.Ongoing() {
 			return nil, err
 		}
-		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err)
-		retry.Wait()
-
-		// Avoid masking last known error if context was cancelled while waiting.
-		if ctx.Err() != nil {
-			return nil, fmt.Errorf("%s while retrying request, last err was: %w", ctx.Err(), err)
+
+		retryReservation := q.retryLimiter.Reserve()
+		if !retryReservation.OK() {
+			// This should only happen if we've misconfigured the limiter.
+			return nil, fmt.Errorf("couldn't reserve a retry token")
+		}
+		// We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
+		// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
+		retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
+		select {
+		case <-time.After(retryDelay):
+		case <-ctx.Done():
+			retryReservation.Cancel()
+			// Avoid masking last known error if context was cancelled while waiting.
+			return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err)
 		}
 	}
 }
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go
index 1105bb77093..39b7df17755 100644
--- a/pkg/ruler/remotequerier_test.go
+++ b/pkg/ruler/remotequerier_test.go
@@ -64,7 +64,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should issue a remote read request", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
 
@@ -76,7 +76,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
 
@@ -86,7 +86,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Read(ctx, &prompb.Query{}, false)
@@ -101,7 +101,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	_, err := q.Read(context.Background(), &prompb.Query{}, false)
 	require.Error(t, err)
@@ -139,7 +139,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 		t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) {
 			client, inReq := setup()
 
-			q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(client, time.Minute, 1, format, "/prometheus", log.NewNopLogger())
 			_, err := q.Query(context.Background(), "qs", tm)
 			require.NoError(t, err)
 
@@ -165,7 +165,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 		_, err := q.Query(context.Background(), "qs", tm)
 		require.NoError(t, err)
 
@@ -175,7 +175,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Query(ctx, "qs", tm)
@@ -276,7 +276,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 				}
 				return testCase.response, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 			require.Equal(t, int64(0), count.Load())
 			_, err := q.Query(ctx, "qs", time.Now())
 			if testCase.err == nil {
@@ -284,12 +284,12 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 					require.NoError(t, err)
 				} else {
 					require.Error(t, err)
-					require.EqualError(t, err, testCase.expectedError.Error())
+					require.ErrorContains(t, err, testCase.expectedError.Error())
 				}
 				require.Equal(t, int64(1), count.Load())
 			} else {
 				require.Error(t, err)
-				require.EqualError(t, err, testCase.expectedError.Error())
+				require.ErrorContains(t, err, testCase.expectedError.Error())
 				if testCase.expectedRetries {
 					require.Greater(t, count.Load(), int64(1))
 				} else {
@@ -405,7 +405,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
 					Body: []byte(scenario.body),
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -678,7 +678,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) {
 					Body: b,
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatProtobuf, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -701,7 +701,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) {
 			Body: []byte("some body content"),
 		}, nil
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -713,7 +713,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -771,7 +771,7 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
 				return testCase.resp, testCase.err
 			}
 			logger := newLoggerWithCounter()
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger)
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", logger)
 
 			tm := time.Unix(1649092025, 515834)