From cf288e4dae1abbf59668435231be6457246ce5fb Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 8 Jan 2025 14:57:40 +0200 Subject: [PATCH 1/7] ruler: cap the number of remote eval retries The retries happen more aggressively than actual evaluations. With the current setup an error spike results in 3x the query rate - initial query, and two retries fairly quickly 100ms & 200ms after that. This PR changes that so that the whole process doesn't retry more than a fixed number of queries/sec. I chose 170 because at GL the average evals/sec is 340 per ruler. This would retry about half of the rules on average. _On average_ that should increase query load by 50%. Signed-off-by: Dimitar Dimitrov --- cmd/mimir/config-descriptor.json | 10 ++++ cmd/mimir/help-all.txt.tmpl | 2 + cmd/mimir/help.txt.tmpl | 2 + .../configuration-parameters/index.md | 4 ++ pkg/mimir/modules.go | 2 +- pkg/ruler/remotequerier.go | 18 +++++- pkg/ruler/remotequerier_test.go | 59 ++++++++++++++----- 7 files changed, 79 insertions(+), 18 deletions(-) diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index dcc0ebfb925..fd8a7db6ee1 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -13216,6 +13216,16 @@ "fieldDefaultValue": "protobuf", "fieldFlag": "ruler.query-frontend.query-result-response-format", "fieldType": "string" + }, + { + "kind": "field", + "name": "max_retries_rate", + "required": false, + "desc": "Maximum number of retries for failed queries per second.", + "fieldValue": null, + "fieldDefaultValue": 170, + "fieldFlag": "ruler.query-frontend.max-retries-rate", + "fieldType": "float" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index bd0256e3f6c..4be2bc83a00 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2985,6 +2985,8 @@ Usage of ./cmd/mimir/mimir: Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 -ruler.query-frontend.grpc-client-config.tls-server-name string Override the expected name on the server certificate. + -ruler.query-frontend.max-retries-rate float + Maximum number of retries for failed queries per second. (default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.query-stats-enabled diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index f25ae81834b..d44ff1b3871 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -737,6 +737,8 @@ Usage of ./cmd/mimir/mimir: Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {}) -ruler.query-frontend.address string GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing. + -ruler.query-frontend.max-retries-rate float + Maximum number of retries for failed queries per second. (default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. 
Supported values: json, protobuf (default "protobuf") -ruler.recording-rules-evaluation-enabled diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index dfd84fbca61..447c0b4f11c 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2103,6 +2103,10 @@ query_frontend: # CLI flag: -ruler.query-frontend.query-result-response-format [query_result_response_format: | default = "protobuf"] + # Maximum number of retries for failed queries per second. + # CLI flag: -ruler.query-frontend.max-retries-rate + [max_retries_rate: | default = 170] + tenant_federation: # Enable rule groups to query against multiple tenants. The tenant IDs # involved need to be in the rule group's 'source_tenants' field. If this flag diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index a2839280bb9..3420c206527 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -859,7 +859,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { if err != nil { return nil, err } - remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) + remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, 1, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient( remoteQuerier, diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go index 57a13f7ef6e..f4df70664e2 100644 --- a/pkg/ruler/remotequerier.go +++ b/pkg/ruler/remotequerier.go @@ -31,6 +31,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/storage/remote" + "golang.org/x/time/rate" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -67,6 +68,8 @@ type QueryFrontendConfig struct { GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."` QueryResultResponseFormat string `yaml:"query_result_response_format"` + + MaxRetriesRate float64 `yaml:"max_retries_rate"` } func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { @@ -80,6 +83,7 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f) f.StringVar(&c.QueryResultResponseFormat, "ruler.query-frontend.query-result-response-format", formatProtobuf, fmt.Sprintf("Format to use when retrieving query results from query-frontends. Supported values: %s", strings.Join(allFormats, ", "))) + f.Float64Var(&c.MaxRetriesRate, "ruler.query-frontend.max-retries-rate", 170, "Maximum number of retries for failed queries per second.") } func (c *QueryFrontendConfig) Validate() error { @@ -115,6 +119,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error // RemoteQuerier executes read operations against a httpgrpc.HTTPClient. 
type RemoteQuerier struct { client httpgrpc.HTTPClient + retryLimiter *rate.Limiter timeout time.Duration middlewares []Middleware promHTTPPrefix string @@ -130,6 +135,7 @@ var protobufDecoderInstance = protobufDecoder{} func NewRemoteQuerier( client httpgrpc.HTTPClient, timeout time.Duration, + maxRetryRate float64, // maxRetryRate is the maximum number of retries for failed queries per second. preferredQueryResultResponseFormat string, prometheusHTTPPrefix string, logger log.Logger, @@ -138,6 +144,7 @@ func NewRemoteQuerier( return &RemoteQuerier{ client: client, timeout: timeout, + retryLimiter: rate.NewLimiter(rate.Limit(maxRetryRate), 1), middlewares: middlewares, promHTTPPrefix: prometheusHTTPPrefix, logger: logger, @@ -349,12 +356,19 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque if !retry.Ongoing() { return nil, err } - level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err) retry.Wait() + // The backoff wait spreads the retries. We check the rate limiter only after that spread. + // That should give queries a bigger chance at passing through the rate limiter + // in cases of short error bursts and the rate of failed queries being larger than the rate limit. + if !q.retryLimiter.Allow() { + return nil, fmt.Errorf("exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate); last error was: %w", err) + } + level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err) + // Avoid masking last known error if context was cancelled while waiting. if ctx.Err() != nil { - return nil, fmt.Errorf("%s while retrying request, last err was: %w", ctx.Err(), err) + return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err) } } } diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go index 1105bb77093..4038212f0c8 100644 --- a/pkg/ruler/remotequerier_test.go +++ b/pkg/ruler/remotequerier_test.go @@ -21,6 +21,7 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/prompb" "github.com/prometheus/prometheus/promql" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" "google.golang.org/grpc" @@ -64,7 +65,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should issue a remote read request", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -76,7 +77,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -86,7 +87,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, 
"/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Read(ctx, &prompb.Query{}, false) @@ -101,7 +102,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.Error(t, err) @@ -139,7 +140,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, format, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -165,7 +166,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -175,7 +176,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Query(ctx, "qs", tm) @@ -276,7 +277,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { } return testCase.response, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) require.Equal(t, int64(0), count.Load()) _, err := q.Query(ctx, "qs", time.Now()) if testCase.err == nil { @@ -284,12 +285,12 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { require.NoError(t, err) } else { require.Error(t, err) - require.EqualError(t, err, testCase.expectedError.Error()) + require.ErrorContains(t, err, testCase.expectedError.Error()) } require.Equal(t, int64(1), count.Load()) } else { require.Error(t, err) - require.EqualError(t, err, testCase.expectedError.Error()) + require.ErrorContains(t, err, testCase.expectedError.Error()) if testCase.expectedRetries { require.Greater(t, count.Load(), int64(1)) } else { @@ -299,6 +300,34 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { }) } } +func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) { + ctx := context.Background() + attempts := &atomic.Int64{} + + // Mock client that always returns a 500 error to trigger retries + mockClientFn := func(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) { + attempts.Add(1) + return nil, httpgrpc.Errorf(http.StatusInternalServerError, assert.AnError.Error()) + } + + // Create querier with very low retry rate to trigger 
budget exhaustion quickly + q := NewRemoteQuerier( + mockHTTPGRPCClient(mockClientFn), + time.Minute, // timeout + 0.0001, // retry rate + formatJSON, + "/prometheus", + log.NewNopLogger(), + ) + + _, err := q.Query(ctx, "test_query", time.Now()) + + require.Error(t, err) + require.ErrorContains(t, err, "exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate)") + + // Should have attempted at least one retry before exhausting budget + require.GreaterOrEqual(t, attempts.Load(), int64(2)) +} func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) { scenarios := map[string]struct { @@ -405,7 +434,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) { Body: []byte(scenario.body), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -678,7 +707,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) { Body: b, }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatProtobuf, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -701,7 +730,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) { Body: []byte("some body content"), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -713,7 +742,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -771,7 +800,7 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) { return testCase.resp, testCase.err } logger := newLoggerWithCounter() - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", logger) tm := time.Unix(1649092025, 515834) From 59e2c496b79482ba4c48ebaaee60a40e6739b56f Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 8 Jan 2025 15:02:39 +0200 Subject: [PATCH 2/7] Add CHANGELOG.md entry Signed-off-by: Dimitar Dimitrov --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6d542b8ee0..88065da5c61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,12 +2,12 @@ ## main / unreleased -* [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. 
#10220 - ### Grafana Mimir * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 +* [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 +* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 From b020a8fa8942be32d07a53947d4a0ca01fcf3e0a Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Wed, 8 Jan 2025 15:03:35 +0200 Subject: [PATCH 3/7] Fix a totally arbitrary stupid linter rule Signed-off-by: Dimitar Dimitrov --- pkg/ruler/remotequerier_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go index 4038212f0c8..33f78dced3e 100644 --- a/pkg/ruler/remotequerier_test.go +++ b/pkg/ruler/remotequerier_test.go @@ -21,7 +21,6 @@ import ( "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/prompb" "github.com/prometheus/prometheus/promql" - "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "go.uber.org/atomic" "google.golang.org/grpc" @@ -307,7 +306,7 @@ func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) { // Mock client that always returns a 500 error to trigger retries mockClientFn := func(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) { attempts.Add(1) - return nil, httpgrpc.Errorf(http.StatusInternalServerError, assert.AnError.Error()) + return nil, httpgrpc.Errorf(http.StatusInternalServerError, "some error") } // Create querier with very low retry rate to trigger budget exhaustion quickly From b07366f25a995312c637f284780277d800ca6a1b Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Thu, 9 Jan 2025 12:19:59 +0200 Subject: [PATCH 4/7] Use a CB instead of a rate limiter Signed-off-by: Dimitar Dimitrov --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 10 -- cmd/mimir/help-all.txt.tmpl | 2 - cmd/mimir/help.txt.tmpl | 2 - .../configuration-parameters/index.md | 4 - pkg/mimir/modules.go | 2 +- pkg/ruler/remotequerier.go | 101 +++++++++++------- pkg/ruler/remotequerier_test.go | 44 ++++---- 8 files changed, 90 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 88065da5c61..446a9b9bc33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook.
#10256 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 -* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375 +* [CHANGE] Ruler: do not retry remote ruler evaluations when more than 50% of evaluations have failed in the last 5 seconds. #10375 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index fd8a7db6ee1..dcc0ebfb925 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -13216,16 +13216,6 @@ "fieldDefaultValue": "protobuf", "fieldFlag": "ruler.query-frontend.query-result-response-format", "fieldType": "string" - }, - { - "kind": "field", - "name": "max_retries_rate", - "required": false, - "desc": "Maximum number of retries for failed queries per second.", - "fieldValue": null, - "fieldDefaultValue": 170, - "fieldFlag": "ruler.query-frontend.max-retries-rate", - "fieldType": "float" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index 4be2bc83a00..bd0256e3f6c 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2985,8 +2985,6 @@ Usage of ./cmd/mimir/mimir: Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 -ruler.query-frontend.grpc-client-config.tls-server-name string Override the expected name on the server certificate. - -ruler.query-frontend.max-retries-rate float - Maximum number of retries for failed queries per second. (default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.query-stats-enabled diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index d44ff1b3871..f25ae81834b 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -737,8 +737,6 @@ Usage of ./cmd/mimir/mimir: Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {}) -ruler.query-frontend.address string GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing. - -ruler.query-frontend.max-retries-rate float - Maximum number of retries for failed queries per second. 
(default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.recording-rules-evaluation-enabled diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index 447c0b4f11c..dfd84fbca61 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2103,10 +2103,6 @@ query_frontend: # CLI flag: -ruler.query-frontend.query-result-response-format [query_result_response_format: | default = "protobuf"] - # Maximum number of retries for failed queries per second. - # CLI flag: -ruler.query-frontend.max-retries-rate - [max_retries_rate: | default = 170] - tenant_federation: # Enable rule groups to query against multiple tenants. The tenant IDs # involved need to be in the rule group's 'source_tenants' field. If this flag diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index 3420c206527..a2839280bb9 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -859,7 +859,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { if err != nil { return nil, err } - remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, 1, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) + remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient( remoteQuerier, diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go index f4df70664e2..0ba357b0000 100644 --- a/pkg/ruler/remotequerier.go +++ b/pkg/ruler/remotequerier.go @@ -14,6 +14,7 @@ import ( "strings" "time" + "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -31,7 +32,6 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/storage/remote" - "golang.org/x/time/rate" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -55,6 +55,10 @@ const ( formatJSON = "json" formatProtobuf = "protobuf" + + cbRetryFailurePercentage = 50 // 50% + cbRetryFailureThreshold = 10 // 10 failed queries + cbPeriod = 5 * time.Second ) var allFormats = []string{formatJSON, formatProtobuf} @@ -68,8 +72,6 @@ type QueryFrontendConfig struct { GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."` QueryResultResponseFormat string `yaml:"query_result_response_format"` - - MaxRetriesRate float64 `yaml:"max_retries_rate"` } func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { @@ -83,7 +85,6 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f) f.StringVar(&c.QueryResultResponseFormat, "ruler.query-frontend.query-result-response-format", formatProtobuf, fmt.Sprintf("Format to use when retrieving query results from query-frontends. 
Supported values: %s", strings.Join(allFormats, ", "))) - f.Float64Var(&c.MaxRetriesRate, "ruler.query-frontend.max-retries-rate", 170, "Maximum number of retries for failed queries per second.") } func (c *QueryFrontendConfig) Validate() error { @@ -119,7 +120,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error // RemoteQuerier executes read operations against a httpgrpc.HTTPClient. type RemoteQuerier struct { client httpgrpc.HTTPClient - retryLimiter *rate.Limiter + retryCB circuitbreaker.CircuitBreaker[struct{}] timeout time.Duration middlewares []Middleware promHTTPPrefix string @@ -135,16 +136,19 @@ var protobufDecoderInstance = protobufDecoder{} func NewRemoteQuerier( client httpgrpc.HTTPClient, timeout time.Duration, - maxRetryRate float64, // maxRetryRate is the maximum number of retries for failed queries per second. preferredQueryResultResponseFormat string, prometheusHTTPPrefix string, logger log.Logger, middlewares ...Middleware, ) *RemoteQuerier { + retryCB := circuitbreaker.Builder[struct{}](). + WithDelay(cbPeriod). // disable retries for 5s when the CB opens + WithFailureRateThreshold(cbRetryFailurePercentage, cbRetryFailureThreshold, cbPeriod). + Build() return &RemoteQuerier{ client: client, timeout: timeout, - retryLimiter: rate.NewLimiter(rate.Limit(maxRetryRate), 1), + retryCB: retryCB, middlewares: middlewares, promHTTPPrefix: prometheusHTTPPrefix, logger: logger, @@ -313,6 +317,13 @@ func (q *RemoteQuerier) createRequest(ctx context.Context, query string, ts time return req, nil } +// sendRequest sends the request and retries if possible. It attempts each query 3 times. Only server errors are retried. +// It also respects a circuit breaker for retries. All evaluations are attempted at least once, but retries aren't guaranteed. +// Retries are disabled when the CB is open. The CB opens and closes based on the outcomes of all requests, not only retries. +// The goal is to +// - have a more predictable load on the remote query-frontend +// - not shed query evaluations unnecessarily +// - limit the added load of retries when most queries are failing already. func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPRequest, logger log.Logger) (*httpgrpc.HTTPResponse, error) { // Ongoing request may be cancelled during evaluation due to some transient error or server shutdown, // so we'll keep retrying until we get a successful response or backoff is terminated. @@ -325,44 +336,25 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque for { resp, err := q.client.Handle(ctx, req) - if err == nil { - // Responses with status codes 4xx should always be considered erroneous. - // These errors shouldn't be retried because it is expected that - // running the same query gives rise to the same 4xx error. - if resp.Code/100 == 4 { - return nil, httpgrpc.ErrorFromHTTPResponse(resp) - } - return resp, nil - } - - // Bail out if the error is known to be not retriable. - switch code := grpcutil.ErrorToStatusCode(err); code { - case codes.ResourceExhausted: - // In case the server is configured with "grpc-max-send-msg-size-bytes", - // and the response exceeds this limit, there is no point retrying the request. - // This is a special case, refer to grafana/mimir#7216. 
- if strings.Contains(err.Error(), "message larger than max") { - return nil, err - } - default: - // In case the error was a wrapped HTTPResponse, its code represents HTTP status; - // 4xx errors shouldn't be retried because it is expected that - // running the same query gives rise to the same 4xx error. - if code/100 == 4 { - return nil, err - } + canRetry, err := parseRemoteEvalError(err, resp) + if err == nil || !canRetry { + // An error we can't retry is the same as a successful request. For example, invalid promQL queries. + q.retryCB.RecordSuccess() + return resp, err } + q.retryCB.RecordFailure() if !retry.Ongoing() { return nil, err } retry.Wait() - // The backoff wait spreads the retries. We check the rate limiter only after that spread. - // That should give queries a bigger chance at passing through the rate limiter - // in cases of short error bursts and the rate of failed queries being larger than the rate limit. - if !q.retryLimiter.Allow() { - return nil, fmt.Errorf("exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate); last error was: %w", err) + // The backoff wait spreads the retries. We check the circuit breaker (CB) only after that spread. + // That should give queries a bigger chance at passing through the CB + // in cases when the CB closed of half-closed while we were waiting. + if !q.retryCB.TryAcquirePermit() { + return nil, fmt.Errorf("remote evaluation retries have been disabled because there were too many failed evaluations recently (more than %d%% in the last %s); last error was: %w", + cbRetryFailurePercentage, cbPeriod, err) } level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err) @@ -373,6 +365,39 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque } } +// parseRemoteEvalError returns the error to be propagated and a boolean indicating whether the error is retriable. +// If the error is nil, then it is not considered retriable. +func parseRemoteEvalError(err error, resp *httpgrpc.HTTPResponse) (bool, error) { + if err == nil { + // Responses with status codes 4xx should always be considered erroneous. + // These errors shouldn't be retried because it is expected that + // running the same query gives rise to the same 4xx error. + if resp.Code/100 == 4 { + return false, httpgrpc.ErrorFromHTTPResponse(resp) + } + return false, nil + } + + // Bail out if the error is known to be not retriable. + switch code := grpcutil.ErrorToStatusCode(err); code { + case codes.ResourceExhausted: + // In case the server is configured with "grpc-max-send-msg-size-bytes", + // and the response exceeds this limit, there is no point retrying the request. + // This is a special case, refer to grafana/mimir#7216. + if strings.Contains(err.Error(), "message larger than max") { + return false, err + } + default: + // In case the error was a wrapped HTTPResponse, its code represents HTTP status; + // 4xx errors shouldn't be retried because it is expected that + // running the same query gives rise to the same 4xx error. + if code/100 == 4 { + return false, err + } + } + return true, err +} + // WithOrgIDMiddleware attaches 'X-Scope-OrgID' header value to the outgoing request by inspecting the passed context. // In case the expression to evaluate corresponds to a federated rule, the ExtractTenantIDs function will take care // of normalizing and concatenating source tenants by separating them with a '|' character. 
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go index 33f78dced3e..6ee3de1b489 100644 --- a/pkg/ruler/remotequerier_test.go +++ b/pkg/ruler/remotequerier_test.go @@ -64,7 +64,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should issue a remote read request", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -76,7 +76,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.NoError(t, err) @@ -86,7 +86,7 @@ func TestRemoteQuerier_Read(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Read(ctx, &prompb.Query{}, false) @@ -101,7 +101,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Read(context.Background(), &prompb.Query{}, false) require.Error(t, err) @@ -139,7 +139,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, format, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -165,7 +165,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) _, err := q.Query(context.Background(), "qs", tm) require.NoError(t, err) @@ -175,7 +175,7 @@ func TestRemoteQuerier_Query(t *testing.T) { t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) { client, inReq := setup() - q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong) _, err := q.Query(ctx, "qs", tm) @@ -276,7 +276,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) { } return testCase.response, nil } - q := 
NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) require.Equal(t, int64(0), count.Load()) _, err := q.Query(ctx, "qs", time.Now()) if testCase.err == nil { @@ -313,19 +313,25 @@ func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) { q := NewRemoteQuerier( mockHTTPGRPCClient(mockClientFn), time.Minute, // timeout - 0.0001, // retry rate formatJSON, "/prometheus", log.NewNopLogger(), ) - _, err := q.Query(ctx, "test_query", time.Now()) + // Run a few queries very quickly + queriesStart := time.Now() + for i := 0; i < cbRetryFailureThreshold+1; i++ { + _, err := q.Query(ctx, "test_query", time.Now()) + require.Error(t, err) + } - require.Error(t, err) - require.ErrorContains(t, err, "exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate)") + attemptsAfterBurstQueries := attempts.Load() + // One last evaluation should lead to a failure, but also shouldn't trigger a retry. + _, err := q.Query(ctx, "test_query", time.Now()) - // Should have attempted at least one retry before exhausting budget - require.GreaterOrEqual(t, attempts.Load(), int64(2)) + require.Less(t, time.Since(queriesStart), cbPeriod, "running queries took longer thant he CB period, the test will be flaky") + require.ErrorContains(t, err, "remote evaluation retries have been disabled because there were too many failed evaluations recently") + require.Equal(t, attemptsAfterBurstQueries+1, attempts.Load(), "expected only a single additional query without retries") } func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) { @@ -433,7 +439,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) { Body: []byte(scenario.body), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -706,7 +712,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) { Body: b, }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatProtobuf, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) actual, err := q.Query(context.Background(), "qs", tm) @@ -729,7 +735,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) { Body: []byte("some body content"), }, nil } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -741,7 +747,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) { <-ctx.Done() return nil, ctx.Err() } - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger()) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger()) tm := time.Unix(1649092025, 515834) _, err := q.Query(context.Background(), "qs", tm) @@ -799,7 +805,7 @@ func 
TestRemoteQuerier_StatusErrorResponses(t *testing.T) { return testCase.resp, testCase.err } logger := newLoggerWithCounter() - q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", logger) + q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger) tm := time.Unix(1649092025, 515834) From f19496b75ddc5aee7f1995dc93ecbd169d577dda Mon Sep 17 00:00:00 2001 From: Dimitar Dimitrov Date: Thu, 9 Jan 2025 12:56:24 +0200 Subject: [PATCH 5/7] Revert "Use a CB instead of a rate limiter" This reverts commit b07366f25a995312c637f284780277d800ca6a1b. --- CHANGELOG.md | 2 +- cmd/mimir/config-descriptor.json | 10 ++ cmd/mimir/help-all.txt.tmpl | 2 + cmd/mimir/help.txt.tmpl | 2 + .../configuration-parameters/index.md | 4 + pkg/mimir/modules.go | 2 +- pkg/ruler/remotequerier.go | 101 +++++++----------- pkg/ruler/remotequerier_test.go | 44 ++++---- 8 files changed, 77 insertions(+), 90 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 446a9b9bc33..88065da5c61 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ * [CHANGE] Distributor: OTLP and push handler replace all non-UTF8 characters with the unicode replacement character `\uFFFD` in error messages before propagating them. #10236 * [CHANGE] Querier: pass query matchers to queryable `IsApplicable` hook. #10256 * [CHANGE] Query-frontend: Add `topic` label to `cortex_ingest_storage_strong_consistency_requests_total`, `cortex_ingest_storage_strong_consistency_failures_total`, and `cortex_ingest_storage_strong_consistency_wait_duration_seconds` metrics. #10220 -* [CHANGE] Ruler: do not retry remote ruler evaluations when more than 50% of evaluations have failed in the last 5 seconds. #10375 +* [CHANGE] Ruler: cap the rate of retries for remote query evaluation to 170/sec. This is configurable via `-ruler.query-frontend.max-retries-rate`. #10375 * [ENHANCEMENT] Query Frontend: Return server-side `samples_processed` statistics. #10103 * [ENHANCEMENT] Distributor: OTLP receiver now converts also metric metadata. See also https://github.com/prometheus/prometheus/pull/15416. #10168 * [ENHANCEMENT] Distributor: discard float and histogram samples with duplicated timestamps from each timeseries in a request before the request is forwarded to ingesters. Discarded samples are tracked by the `cortex_discarded_samples_total` metrics with reason `sample_duplicate_timestamp`. #10145 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index dcc0ebfb925..fd8a7db6ee1 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -13216,6 +13216,16 @@ "fieldDefaultValue": "protobuf", "fieldFlag": "ruler.query-frontend.query-result-response-format", "fieldType": "string" + }, + { + "kind": "field", + "name": "max_retries_rate", + "required": false, + "desc": "Maximum number of retries for failed queries per second.", + "fieldValue": null, + "fieldDefaultValue": 170, + "fieldFlag": "ruler.query-frontend.max-retries-rate", + "fieldType": "float" } ], "fieldValue": null, diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index bd0256e3f6c..4be2bc83a00 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2985,6 +2985,8 @@ Usage of ./cmd/mimir/mimir: Override the default minimum TLS version. Allowed values: VersionTLS10, VersionTLS11, VersionTLS12, VersionTLS13 -ruler.query-frontend.grpc-client-config.tls-server-name string Override the expected name on the server certificate.
+ -ruler.query-frontend.max-retries-rate float + Maximum number of retries for failed queries per second. (default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.query-stats-enabled diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index f25ae81834b..d44ff1b3871 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -737,6 +737,8 @@ Usage of ./cmd/mimir/mimir: Maximum number of rules per rule group by namespace. Value is a map, where each key is the namespace and value is the number of rules allowed in the namespace (int). On the command line, this map is given in a JSON format. The number of rules specified has the same meaning as -ruler.max-rules-per-rule-group, but only applies for the specific namespace. If specified, it supersedes -ruler.max-rules-per-rule-group. (default {}) -ruler.query-frontend.address string GRPC listen address of the query-frontend(s). Must be a DNS address (prefixed with dns:///) to enable client side load balancing. + -ruler.query-frontend.max-retries-rate float + Maximum number of retries for failed queries per second. (default 170) -ruler.query-frontend.query-result-response-format string Format to use when retrieving query results from query-frontends. Supported values: json, protobuf (default "protobuf") -ruler.recording-rules-evaluation-enabled diff --git a/docs/sources/mimir/configure/configuration-parameters/index.md b/docs/sources/mimir/configure/configuration-parameters/index.md index dfd84fbca61..447c0b4f11c 100644 --- a/docs/sources/mimir/configure/configuration-parameters/index.md +++ b/docs/sources/mimir/configure/configuration-parameters/index.md @@ -2103,6 +2103,10 @@ query_frontend: # CLI flag: -ruler.query-frontend.query-result-response-format [query_result_response_format: | default = "protobuf"] + # Maximum number of retries for failed queries per second. + # CLI flag: -ruler.query-frontend.max-retries-rate + [max_retries_rate: | default = 170] + tenant_federation: # Enable rule groups to query against multiple tenants. The tenant IDs # involved need to be in the rule group's 'source_tenants' field. 
If this flag diff --git a/pkg/mimir/modules.go b/pkg/mimir/modules.go index a2839280bb9..3420c206527 100644 --- a/pkg/mimir/modules.go +++ b/pkg/mimir/modules.go @@ -859,7 +859,7 @@ func (t *Mimir) initRuler() (serv services.Service, err error) { if err != nil { return nil, err } - remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) + remoteQuerier := ruler.NewRemoteQuerier(queryFrontendClient, t.Cfg.Querier.EngineConfig.Timeout, 1, t.Cfg.Ruler.QueryFrontend.QueryResultResponseFormat, t.Cfg.API.PrometheusHTTPPrefix, util_log.Logger, ruler.WithOrgIDMiddleware) embeddedQueryable = prom_remote.NewSampleAndChunkQueryableClient( remoteQuerier, diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go index 0ba357b0000..f4df70664e2 100644 --- a/pkg/ruler/remotequerier.go +++ b/pkg/ruler/remotequerier.go @@ -14,7 +14,6 @@ import ( "strings" "time" - "github.com/failsafe-go/failsafe-go/circuitbreaker" "github.com/go-kit/log" "github.com/go-kit/log/level" "github.com/gogo/protobuf/proto" @@ -32,6 +31,7 @@ import ( "github.com/prometheus/prometheus/promql" "github.com/prometheus/prometheus/storage" "github.com/prometheus/prometheus/storage/remote" + "golang.org/x/time/rate" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -55,10 +55,6 @@ const ( formatJSON = "json" formatProtobuf = "protobuf" - - cbRetryFailurePercentage = 50 // 50% - cbRetryFailureThreshold = 10 // 10 failed queries - cbPeriod = 5 * time.Second ) var allFormats = []string{formatJSON, formatProtobuf} @@ -72,6 +68,8 @@ type QueryFrontendConfig struct { GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate between the rulers and query-frontends."` QueryResultResponseFormat string `yaml:"query_result_response_format"` + + MaxRetriesRate float64 `yaml:"max_retries_rate"` } func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { @@ -85,6 +83,7 @@ func (c *QueryFrontendConfig) RegisterFlags(f *flag.FlagSet) { c.GRPCClientConfig.RegisterFlagsWithPrefix("ruler.query-frontend.grpc-client-config", f) f.StringVar(&c.QueryResultResponseFormat, "ruler.query-frontend.query-result-response-format", formatProtobuf, fmt.Sprintf("Format to use when retrieving query results from query-frontends. Supported values: %s", strings.Join(allFormats, ", "))) + f.Float64Var(&c.MaxRetriesRate, "ruler.query-frontend.max-retries-rate", 170, "Maximum number of retries for failed queries per second.") } func (c *QueryFrontendConfig) Validate() error { @@ -120,7 +119,7 @@ type Middleware func(ctx context.Context, req *httpgrpc.HTTPRequest) error // RemoteQuerier executes read operations against a httpgrpc.HTTPClient. type RemoteQuerier struct { client httpgrpc.HTTPClient - retryCB circuitbreaker.CircuitBreaker[struct{}] + retryLimiter *rate.Limiter timeout time.Duration middlewares []Middleware promHTTPPrefix string @@ -136,19 +135,16 @@ var protobufDecoderInstance = protobufDecoder{} func NewRemoteQuerier( client httpgrpc.HTTPClient, timeout time.Duration, + maxRetryRate float64, // maxRetryRate is the maximum number of retries for failed queries per second. preferredQueryResultResponseFormat string, prometheusHTTPPrefix string, logger log.Logger, middlewares ...Middleware, ) *RemoteQuerier { - retryCB := circuitbreaker.Builder[struct{}](). - WithDelay(cbPeriod). 
// disable retries for 5s when the CB opens - WithFailureRateThreshold(cbRetryFailurePercentage, cbRetryFailureThreshold, cbPeriod). - Build() return &RemoteQuerier{ client: client, timeout: timeout, - retryCB: retryCB, + retryLimiter: rate.NewLimiter(rate.Limit(maxRetryRate), 1), middlewares: middlewares, promHTTPPrefix: prometheusHTTPPrefix, logger: logger, @@ -317,13 +313,6 @@ func (q *RemoteQuerier) createRequest(ctx context.Context, query string, ts time return req, nil } -// sendRequest sends the request and retries if possible. It attempts each query 3 times. Only server errors are retried. -// It also respects a circuit breaker for retries. All evaluations are attempted at least once, but retries aren't guaranteed. -// Retries are disabled when the CB is open. The CB opens and closes based on the outcomes of all requests, not only retries. -// The goal is to -// - have a more predictable load on the remote query-frontend -// - not shed query evaluations unnecessarily -// - limit the added load of retries when most queries are failing already. func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPRequest, logger log.Logger) (*httpgrpc.HTTPResponse, error) { // Ongoing request may be cancelled during evaluation due to some transient error or server shutdown, // so we'll keep retrying until we get a successful response or backoff is terminated. @@ -336,25 +325,44 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque for { resp, err := q.client.Handle(ctx, req) - canRetry, err := parseRemoteEvalError(err, resp) - if err == nil || !canRetry { - // An error we can't retry is the same as a successful request. For example, invalid promQL queries. - q.retryCB.RecordSuccess() - return resp, err + if err == nil { + // Responses with status codes 4xx should always be considered erroneous. + // These errors shouldn't be retried because it is expected that + // running the same query gives rise to the same 4xx error. + if resp.Code/100 == 4 { + return nil, httpgrpc.ErrorFromHTTPResponse(resp) + } + return resp, nil + } + + // Bail out if the error is known to be not retriable. + switch code := grpcutil.ErrorToStatusCode(err); code { + case codes.ResourceExhausted: + // In case the server is configured with "grpc-max-send-msg-size-bytes", + // and the response exceeds this limit, there is no point retrying the request. + // This is a special case, refer to grafana/mimir#7216. + if strings.Contains(err.Error(), "message larger than max") { + return nil, err + } + default: + // In case the error was a wrapped HTTPResponse, its code represents HTTP status; + // 4xx errors shouldn't be retried because it is expected that + // running the same query gives rise to the same 4xx error. + if code/100 == 4 { + return nil, err + } } - q.retryCB.RecordFailure() if !retry.Ongoing() { return nil, err } retry.Wait() - // The backoff wait spreads the retries. We check the circuit breaker (CB) only after that spread. - // That should give queries a bigger chance at passing through the CB - // in cases when the CB closed of half-closed while we were waiting. - if !q.retryCB.TryAcquirePermit() { - return nil, fmt.Errorf("remote evaluation retries have been disabled because there were too many failed evaluations recently (more than %d%% in the last %s); last error was: %w", - cbRetryFailurePercentage, cbPeriod, err) + // The backoff wait spreads the retries. We check the rate limiter only after that spread. 
+ // That should give queries a bigger chance at passing through the rate limiter + // in cases of short error bursts and the rate of failed queries being larger than the rate limit. + if !q.retryLimiter.Allow() { + return nil, fmt.Errorf("exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate); last error was: %w", err) } level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err) @@ -365,39 +373,6 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque } } -// parseRemoteEvalError returns the error to be propagated and a boolean indicating whether the error is retriable. -// If the error is nil, then it is not considered retriable. -func parseRemoteEvalError(err error, resp *httpgrpc.HTTPResponse) (bool, error) { - if err == nil { - // Responses with status codes 4xx should always be considered erroneous. - // These errors shouldn't be retried because it is expected that - // running the same query gives rise to the same 4xx error. - if resp.Code/100 == 4 { - return false, httpgrpc.ErrorFromHTTPResponse(resp) - } - return false, nil - } - - // Bail out if the error is known to be not retriable. - switch code := grpcutil.ErrorToStatusCode(err); code { - case codes.ResourceExhausted: - // In case the server is configured with "grpc-max-send-msg-size-bytes", - // and the response exceeds this limit, there is no point retrying the request. - // This is a special case, refer to grafana/mimir#7216. - if strings.Contains(err.Error(), "message larger than max") { - return false, err - } - default: - // In case the error was a wrapped HTTPResponse, its code represents HTTP status; - // 4xx errors shouldn't be retried because it is expected that - // running the same query gives rise to the same 4xx error. - if code/100 == 4 { - return false, err - } - } - return true, err -} - // WithOrgIDMiddleware attaches 'X-Scope-OrgID' header value to the outgoing request by inspecting the passed context. // In case the expression to evaluate corresponds to a federated rule, the ExtractTenantIDs function will take care // of normalizing and concatenating source tenants by separating them with a '|' character. 
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go
index 6ee3de1b489..33f78dced3e 100644
--- a/pkg/ruler/remotequerier_test.go
+++ b/pkg/ruler/remotequerier_test.go
@@ -64,7 +64,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should issue a remote read request", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
@@ -76,7 +76,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Read(context.Background(), &prompb.Query{}, false)
 		require.NoError(t, err)
@@ -86,7 +86,7 @@ func TestRemoteQuerier_Read(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Read(ctx, &prompb.Query{}, false)
@@ -101,7 +101,7 @@ func TestRemoteQuerier_ReadReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	_, err := q.Read(context.Background(), &prompb.Query{}, false)
 	require.Error(t, err)
@@ -139,7 +139,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 		t.Run(fmt.Sprintf("format = %s", format), func(t *testing.T) {
 			client, inReq := setup()
 
-			q := NewRemoteQuerier(client, time.Minute, format, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(client, time.Minute, 1, format, "/prometheus", log.NewNopLogger())
 
 			_, err := q.Query(context.Background(), "qs", tm)
 			require.NoError(t, err)
@@ -165,7 +165,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should not inject the read consistency header if none is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		_, err := q.Query(context.Background(), "qs", tm)
 		require.NoError(t, err)
@@ -175,7 +175,7 @@ func TestRemoteQuerier_Query(t *testing.T) {
 	t.Run("should inject the read consistency header if it is defined in the context", func(t *testing.T) {
 		client, inReq := setup()
 
-		q := NewRemoteQuerier(client, time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+		q := NewRemoteQuerier(client, time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 		ctx := api.ContextWithReadConsistencyLevel(context.Background(), api.ReadConsistencyStrong)
 		_, err := q.Query(ctx, "qs", tm)
@@ -276,7 +276,7 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 				}
 				return testCase.response, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 			require.Equal(t, int64(0), count.Load())
 			_, err := q.Query(ctx, "qs", time.Now())
 			if testCase.err == nil {
@@ -313,25 +313,19 @@ func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) {
 	q := NewRemoteQuerier(
 		mockHTTPGRPCClient(mockClientFn),
 		time.Minute, // timeout
+		0.0001,      // retry rate
 		formatJSON,
 		"/prometheus",
 		log.NewNopLogger(),
 	)
 
-	// Run a few queries very quickly
-	queriesStart := time.Now()
-	for i := 0; i < cbRetryFailureThreshold+1; i++ {
-		_, err := q.Query(ctx, "test_query", time.Now())
-		require.Error(t, err)
-	}
-
-	attemptsAfterBurstQueries := attempts.Load()
-
-	// One last evaluation should lead to a failure, but also shouldn't trigger a retry.
 	_, err := q.Query(ctx, "test_query", time.Now())
-	require.Less(t, time.Since(queriesStart), cbPeriod, "running queries took longer thant he CB period, the test will be flaky")
-	require.ErrorContains(t, err, "remote evaluation retries have been disabled because there were too many failed evaluations recently")
-	require.Equal(t, attemptsAfterBurstQueries+1, attempts.Load(), "expected only a single additional query without retries")
+	require.Error(t, err)
+	require.ErrorContains(t, err, "exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate)")
+
+	// Should have attempted at least one retry before exhausting budget
+	require.GreaterOrEqual(t, attempts.Load(), int64(2))
 }
 
 func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
 	scenarios := map[string]struct {
@@ -439,7 +433,7 @@ func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
 					Body: []byte(scenario.body),
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -712,7 +706,7 @@ func TestRemoteQuerier_QueryProtobufDecoding(t *testing.T) {
 					Body: b,
 				}, nil
 			}
-			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatProtobuf, "/prometheus", log.NewNopLogger())
+			q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatProtobuf, "/prometheus", log.NewNopLogger())
 
 			tm := time.Unix(1649092025, 515834)
 			actual, err := q.Query(context.Background(), "qs", tm)
@@ -735,7 +729,7 @@ func TestRemoteQuerier_QueryUnknownResponseContentType(t *testing.T) {
 			Body: []byte("some body content"),
 		}, nil
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -747,7 +741,7 @@ func TestRemoteQuerier_QueryReqTimeout(t *testing.T) {
 		<-ctx.Done()
 		return nil, ctx.Err()
 	}
-	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, formatJSON, "/prometheus", log.NewNopLogger())
+	q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Second, 1, formatJSON, "/prometheus", log.NewNopLogger())
 
 	tm := time.Unix(1649092025, 515834)
 	_, err := q.Query(context.Background(), "qs", tm)
@@ -805,7 +799,7 @@ func TestRemoteQuerier_StatusErrorResponses(t *testing.T) {
 			return testCase.resp, testCase.err
 		}
 		logger := newLoggerWithCounter()
-		q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, formatJSON, "/prometheus", logger)
+		q := NewRemoteQuerier(mockHTTPGRPCClient(mockClientFn), time.Minute, 1, formatJSON, "/prometheus", logger)
 
 		tm := time.Unix(1649092025, 515834)

From 93cf6a1e237088d29f663dde9891c193605e35ce Mon Sep 17 00:00:00 2001
From: Dimitar Dimitrov
Date: Thu, 9 Jan 2025 13:17:32 +0200
Subject: [PATCH 6/7] Don't abort retries if we're over the rate limit

Signed-off-by: Dimitar Dimitrov
---
 pkg/ruler/remotequerier.go      | 23 +++++++++++++----------
 pkg/ruler/remotequerier_test.go | 28 ----------------------------
 2 files changed, 13 insertions(+), 38 deletions(-)

diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index f4df70664e2..8d7624b79a9 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -356,18 +356,21 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		if !retry.Ongoing() {
 			return nil, err
 		}
-		retry.Wait()
-		// The backoff wait spreads the retries. We check the rate limiter only after that spread.
-		// That should give queries a bigger chance at passing through the rate limiter
-		// in cases of short error bursts and the rate of failed queries being larger than the rate limit.
-		if !q.retryLimiter.Allow() {
-			return nil, fmt.Errorf("exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate); last error was: %w", err)
+		retryReservation := q.retryLimiter.Reserve()
+		if !retryReservation.OK() {
+			// This should only happen if we've misconfigured the limiter.
+			return nil, fmt.Errorf("couldn't reserve a retry token")
 		}
-		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err)
-
-		// Avoid masking last known error if context was cancelled while waiting.
-		if ctx.Err() != nil {
+		// We want to wait at least the time for the backoff, but also don't want to exceed the rate limit.
+		// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
+		retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
+		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
+
+		select {
+		case <-time.After(retryDelay):
+		case <-ctx.Done():
+			// Avoid masking last known error if context was cancelled while waiting.
 			return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err)
 		}
 	}
diff --git a/pkg/ruler/remotequerier_test.go b/pkg/ruler/remotequerier_test.go
index 33f78dced3e..39b7df17755 100644
--- a/pkg/ruler/remotequerier_test.go
+++ b/pkg/ruler/remotequerier_test.go
@@ -299,34 +299,6 @@ func TestRemoteQuerier_QueryRetryOnFailure(t *testing.T) {
 		})
 	}
 }
-func TestRemoteQuerier_QueryRetryBudgetExhaustion(t *testing.T) {
-	ctx := context.Background()
-	attempts := &atomic.Int64{}
-
-	// Mock client that always returns a 500 error to trigger retries
-	mockClientFn := func(context.Context, *httpgrpc.HTTPRequest, ...grpc.CallOption) (*httpgrpc.HTTPResponse, error) {
-		attempts.Add(1)
-		return nil, httpgrpc.Errorf(http.StatusInternalServerError, "some error")
-	}
-
-	// Create querier with very low retry rate to trigger budget exhaustion quickly
-	q := NewRemoteQuerier(
-		mockHTTPGRPCClient(mockClientFn),
-		time.Minute, // timeout
-		0.0001,      // retry rate
-		formatJSON,
-		"/prometheus",
-		log.NewNopLogger(),
-	)
-
-	_, err := q.Query(ctx, "test_query", time.Now())
-
-	require.Error(t, err)
-	require.ErrorContains(t, err, "exhausted global retry budget for remote ruler execution (ruler.query-frontend.max-retries-rate)")
-
-	// Should have attempted at least one retry before exhausting budget
-	require.GreaterOrEqual(t, attempts.Load(), int64(2))
-}
 
 func TestRemoteQuerier_QueryJSONDecoding(t *testing.T) {
 	scenarios := map[string]struct {

From 2064f523890c5c235371fea854dc3d10c389da8c Mon Sep 17 00:00:00 2001
From: Dimitar Dimitrov
Date: Thu, 9 Jan 2025 14:44:35 +0200
Subject: [PATCH 7/7] Cancel reservation when context expires

Signed-off-by: Dimitar Dimitrov
---
 pkg/ruler/remotequerier.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pkg/ruler/remotequerier.go b/pkg/ruler/remotequerier.go
index 8d7624b79a9..4b0a4d7b1d7 100644
--- a/pkg/ruler/remotequerier.go
+++ b/pkg/ruler/remotequerier.go
@@ -366,10 +366,10 @@ func (q *RemoteQuerier) sendRequest(ctx context.Context, req *httpgrpc.HTTPReque
 		// All of this is capped to the max backoff, so that we are less likely to overrun into the next evaluation.
 		retryDelay := max(retry.NextDelay(), min(retryConfig.MaxBackoff, retryReservation.Delay()))
 		level.Warn(logger).Log("msg", "failed to remotely evaluate query expression, will retry", "err", err, "retry_delay", retryDelay)
-
 		select {
 		case <-time.After(retryDelay):
 		case <-ctx.Done():
+			retryReservation.Cancel()
 			// Avoid masking last known error if context was cancelled while waiting.
 			return nil, fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), err)
 		}
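Together, the last two commits turn the non-blocking Allow() check into a reservation: a failed evaluation is still retried, but only after waiting for the longer of the backoff delay and the limiter's reservation delay (capped at the maximum backoff), and a retry abandoned because the context expired hands its token back via Cancel(). Below is a small, self-contained sketch of that wait logic; the waitForRetry helper, the burst value, and the concrete delays are illustrative and not part of the patch:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"

	"golang.org/x/time/rate"
)

// waitForRetry mirrors the final shape of the retry loop: reserve a token up
// front, then sleep for whichever is longer, the backoff delay or the
// reservation delay (capped at maxBackoff), and give the token back if the
// context is cancelled while waiting.
func waitForRetry(ctx context.Context, limiter *rate.Limiter, backoffDelay, maxBackoff time.Duration, lastErr error) error {
	res := limiter.Reserve()
	if !res.OK() {
		// Only possible if the limiter is misconfigured (e.g. a zero burst).
		return errors.New("couldn't reserve a retry token")
	}
	retryDelay := max(backoffDelay, min(maxBackoff, res.Delay()))
	select {
	case <-time.After(retryDelay):
		return nil // safe to retry now
	case <-ctx.Done():
		res.Cancel() // return the unused token to the bucket
		return fmt.Errorf("%s while retrying request, last error was: %w", ctx.Err(), lastErr)
	}
}

func main() {
	limiter := rate.NewLimiter(rate.Limit(170), 1) // burst value is illustrative
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	err := waitForRetry(ctx, limiter, 100*time.Millisecond, 2*time.Second, errors.New("HTTP 500"))
	fmt.Println(err)
}
```

Cancelling the reservation matters because an abandoned retry would otherwise keep consuming budget that concurrent rule evaluations in the same process could have used for their own retries.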