Skip to content

Commit

Permalink
Report gRPC status codes as labels in request duration metrics (#7144)
Browse files Browse the repository at this point in the history
* Report gRPC status codes as labels in request duration metrics

Signed-off-by: Yuri Nikolic <[email protected]>

* Fixing review findings

Signed-off-by: Yuri Nikolic <[email protected]>

---------

Signed-off-by: Yuri Nikolic <[email protected]>
  • Loading branch information
duricanikolic authored Jan 17, 2024
1 parent 5c37e4e commit a0f2856
Show file tree
Hide file tree
Showing 8 changed files with 44 additions and 45 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
* `prometheus_sd_refresh_duration_seconds` renamed to `cortex_prometheus_sd_refresh_duration_seconds`
* [CHANGE] Query-frontend: the default value for `-query-frontend.not-running-timeout` has been changed from 0 (disabled) to 2s. The configuration option has also been moved from "experimental" to "advanced". #7126
* [CHANGE] Store-gateway: to reduce disk contention on HDDs the default value for `blocks-storage.bucket-store.tenant-sync-concurrency` has been changed from `10` to `1` and the default value for `blocks-storage.bucket-store.block-sync-concurrency` has been changed from `20` to `4`. #7136
* [CHANGE] All: set `-server.report-grpc-codes-in-instrumentation-label-enabled` to `true` by default, which enables reporting gRPC status codes as `status_code` labels in the `cortex_request_duration_seconds` metric. #7144
* [CHANGE] Distributor: report gRPC status codes as `status_code` labels in the `cortex_ingester_client_request_duration_seconds` metric by default. #7144
* [CHANGE] Distributor: CLI flag `-ingester.client.report-grpc-codes-in-instrumentation-label-enabled` has been deprecated, and its default value is set to `true`. #7144
* [FEATURE] Introduce `-tenant-federation.max-tenants` option to limit the max number of tenants allowed for requests when federation is enabled. #6959
* [FEATURE] Cardinality API: added a new `count_method` parameter which enables counting active label values. #7085
* [FEATURE] Querier / query-frontend: added `-querier.promql-experimental-functions-enabled` CLI flag (and respective YAML config option) to enable experimental PromQL functions. The experimental functions introduced are: `mad_over_time()`, `sort_by_label()` and `sort_by_label_desc()`. #7057
Expand Down
6 changes: 3 additions & 3 deletions cmd/mimir/config-descriptor.json
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@
"required": false,
"desc": "If set to true, gRPC statuses will be reported in instrumentation labels with their string representations. Otherwise, they will be reported as \"error\".",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldDefaultValue": true,
"fieldFlag": "server.report-grpc-codes-in-instrumentation-label-enabled",
"fieldType": "boolean"
},
Expand Down Expand Up @@ -2272,10 +2272,10 @@
"required": false,
"desc": "If set to true, gRPC status codes will be reported in \"status_code\" label of \"cortex_ingester_client_request_duration_seconds\" metric. Otherwise, they will be reported as \"error\"",
"fieldValue": null,
"fieldDefaultValue": false,
"fieldDefaultValue": true,
"fieldFlag": "ingester.client.report-grpc-codes-in-instrumentation-label-enabled",
"fieldType": "boolean",
"fieldCategory": "advanced"
"fieldCategory": "deprecated"
}
],
"fieldValue": null,
Expand Down
4 changes: 2 additions & 2 deletions cmd/mimir/help-all.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -1300,7 +1300,7 @@ Usage of ./cmd/mimir/mimir:
-ingester.client.initial-stream-window-size value
[experimental] Initial stream window size. Values less than the default are not supported and are ignored. Setting this to a value other than the default disables the BDP estimator. (default 63KiB1023B)
-ingester.client.report-grpc-codes-in-instrumentation-label-enabled
If set to true, gRPC status codes will be reported in "status_code" label of "cortex_ingester_client_request_duration_seconds" metric. Otherwise, they will be reported as "error"
[deprecated] If set to true, gRPC status codes will be reported in "status_code" label of "cortex_ingester_client_request_duration_seconds" metric. Otherwise, they will be reported as "error" (default true)
-ingester.client.tls-ca-path string
Path to the CA certificates to validate server certificate against. If not set, the host's root CA certificates are used.
-ingester.client.tls-cert-path string
Expand Down Expand Up @@ -2646,7 +2646,7 @@ Usage of ./cmd/mimir/mimir:
-server.register-instrumentation
Register the intrumentation handlers (/metrics etc). (default true)
-server.report-grpc-codes-in-instrumentation-label-enabled
If set to true, gRPC statuses will be reported in instrumentation labels with their string representations. Otherwise, they will be reported as "error".
If set to true, gRPC statuses will be reported in instrumentation labels with their string representations. Otherwise, they will be reported as "error". (default true)
-server.tls-cipher-suites string
Comma-separated list of cipher suites to use. If blank, the default Go cipher suites is used.
-server.tls-min-version string
Expand Down
2 changes: 1 addition & 1 deletion cmd/mimir/help.txt.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -682,7 +682,7 @@ Usage of ./cmd/mimir/mimir:
-server.log-request-headers-exclude-list string
Comma separated list of headers to exclude from loggin. Only used if server.log-request-headers is true.
-server.report-grpc-codes-in-instrumentation-label-enabled
If set to true, gRPC statuses will be reported in instrumentation labels with their string representations. Otherwise, they will be reported as "error".
If set to true, gRPC statuses will be reported in instrumentation labels with their string representations. Otherwise, they will be reported as "error". (default true)
-server.tls-cipher-suites string
Comma-separated list of cipher suites to use. If blank, the default Go cipher suites is used.
-server.tls-min-version string
Expand Down
10 changes: 5 additions & 5 deletions docs/sources/mimir/configure/configuration-parameters/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -595,7 +595,7 @@ grpc_tls_config:
# If set to true, gRPC statuses will be reported in instrumentation labels with
# their string representations. Otherwise, they will be reported as "error".
# CLI flag: -server.report-grpc-codes-in-instrumentation-label-enabled
[report_grpc_codes_in_instrumentation_label_enabled: <boolean> | default = false]
[report_grpc_codes_in_instrumentation_label_enabled: <boolean> | default = true]
# (advanced) Timeout for graceful shutdowns
# CLI flag: -server.graceful-shutdown-timeout
Expand Down Expand Up @@ -2341,11 +2341,11 @@ circuit_breaker:
# CLI flag: -ingester.client.circuit-breaker.cooldown-period
[cooldown_period: <duration> | default = 1m]
# (advanced) If set to true, gRPC status codes will be reported in "status_code"
# label of "cortex_ingester_client_request_duration_seconds" metric. Otherwise,
# they will be reported as "error"
# (deprecated) If set to true, gRPC status codes will be reported in
# "status_code" label of "cortex_ingester_client_request_duration_seconds"
# metric. Otherwise, they will be reported as "error"
# CLI flag: -ingester.client.report-grpc-codes-in-instrumentation-label-enabled
[report_grpc_codes_in_instrumentation_label_enabled: <boolean> | default = false]
[report_grpc_codes_in_instrumentation_label_enabled: <boolean> | default = true]
```

### grpc_client
Expand Down
24 changes: 4 additions & 20 deletions integration/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,7 +569,7 @@ func TestIngesterQuerying(t *testing.T) {
return counts[0], nil
}

successfulQueryRequests, err := queryRequestCount("2xx")
successfulQueryRequests, err := queryRequestCount("OK")
require.NoError(t, err)

cancelledQueryRequests, err := queryRequestCount("cancel")
Expand Down Expand Up @@ -662,7 +662,7 @@ func TestIngesterQueryingWithRequestMinimization(t *testing.T) {
[]string{"cortex_request_duration_seconds"},
e2e.WithLabelMatchers(
labels.MustNewMatcher(labels.MatchEqual, "route", "/cortex.Ingester/QueryStream"),
labels.MustNewMatcher(labels.MatchEqual, "status_code", "success"),
labels.MustNewMatcher(labels.MatchEqual, "status_code", "OK"),
),
e2e.SkipMissingMetrics,
e2e.WithMetricCount,
Expand All @@ -686,31 +686,16 @@ func TestIngesterReportGRPCStatusCodes(t *testing.T) {
queryStep := 10 * time.Minute

testCases := map[string]struct {
serverReportGRPCStatusCodes bool
ingesterClientReportGRPCStatusCodes bool
expectedPushStatusCode string
expectedQueryStatusCode string
}{
"when server and ingester client do not report grpc codes, successful push and query give success and 2xx": {
serverReportGRPCStatusCodes: false,
ingesterClientReportGRPCStatusCodes: false,
expectedPushStatusCode: "success",
expectedQueryStatusCode: "2xx",
},
"when server does not report and ingester client reports grpc codes, successful push and query give success and OK": {
serverReportGRPCStatusCodes: false,
ingesterClientReportGRPCStatusCodes: true,
expectedPushStatusCode: "success",
expectedQueryStatusCode: "OK",
},
"when server reports and ingester client does not report grpc codes, successful push and query give OK and 2xx": {
serverReportGRPCStatusCodes: true,
"when ingester client does not report grpc codes, successful push and query give OK and 2xx": {
ingesterClientReportGRPCStatusCodes: false,
expectedPushStatusCode: "OK",
expectedQueryStatusCode: "2xx",
},
"when server and ingester client report grpc codes, successful push and query give OK and OK": {
serverReportGRPCStatusCodes: true,
"when ingester client report grpc codes, successful push and query give OK and OK": {
ingesterClientReportGRPCStatusCodes: true,
expectedPushStatusCode: "OK",
expectedQueryStatusCode: "OK",
Expand Down Expand Up @@ -745,7 +730,6 @@ func TestIngesterReportGRPCStatusCodes(t *testing.T) {
"-distributor.ingestion-tenant-shard-size": "0",
"-ingester.ring.heartbeat-period": "1s",
"-ingester.client.report-grpc-codes-in-instrumentation-label-enabled": strconv.FormatBool(testData.ingesterClientReportGRPCStatusCodes),
"-server.report-grpc-codes-in-instrumentation-label-enabled": strconv.FormatBool(testData.serverReportGRPCStatusCodes),
}

flags := mergeFlags(
Expand Down
23 changes: 17 additions & 6 deletions pkg/ingester/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ import (

"github.com/grafana/mimir/pkg/mimirpb"
querierapi "github.com/grafana/mimir/pkg/querier/api"
"github.com/grafana/mimir/pkg/util"
)

const deprecatedReportGRPCStatusCodesFlag = "ingester.client.report-grpc-codes-in-instrumentation-label-enabled" // Deprecated. TODO: Remove in Mimir 2.14.

// HealthAndIngesterClient is the union of IngesterClient and grpc_health_v1.HealthClient.
type HealthAndIngesterClient interface {
IngesterClient
Expand All @@ -37,7 +40,7 @@ type closableHealthAndIngesterClient struct {
func MakeIngesterClient(inst ring.InstanceDesc, cfg Config, metrics *Metrics, logger log.Logger) (HealthAndIngesterClient, error) {
logger = log.With(logger, "component", "ingester-client")
var reportGRPCStatusesOptions []middleware.InstrumentationOption
if cfg.ReportGRPCStatusCodes {
if cfg.DeprecatedReportGRPCStatusCodes {
reportGRPCStatusesOptions = []middleware.InstrumentationOption{middleware.ReportGRPCStatusOption}
}
unary, stream := grpcclient.Instrument(metrics.requestDuration, reportGRPCStatusesOptions...)
Expand Down Expand Up @@ -72,23 +75,31 @@ func (c *closableHealthAndIngesterClient) Close() error {

// Config is the configuration struct for the ingester client
type Config struct {
GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate with ingesters from distributors, queriers and rulers."`
CircuitBreaker CircuitBreakerConfig `yaml:"circuit_breaker"`
ReportGRPCStatusCodes bool `yaml:"report_grpc_codes_in_instrumentation_label_enabled" category:"advanced"`
GRPCClientConfig grpcclient.Config `yaml:"grpc_client_config" doc:"description=Configures the gRPC client used to communicate with ingesters from distributors, queriers and rulers."`
CircuitBreaker CircuitBreakerConfig `yaml:"circuit_breaker"`
DeprecatedReportGRPCStatusCodes bool `yaml:"report_grpc_codes_in_instrumentation_label_enabled" category:"deprecated"` // Deprecated: Deprecated in Mimir 2.12, remove in Mimir 2.14 (https://github.com/grafana/mimir/issues/6008#issuecomment-1854320098)
}

// RegisterFlags registers configuration settings used by the ingester client config.
func (cfg *Config) RegisterFlags(f *flag.FlagSet) {
cfg.GRPCClientConfig.RegisterFlagsWithPrefix("ingester.client", f)
cfg.CircuitBreaker.RegisterFlagsWithPrefix("ingester.client", f)
f.BoolVar(&cfg.ReportGRPCStatusCodes, "ingester.client.report-grpc-codes-in-instrumentation-label-enabled", false, "If set to true, gRPC status codes will be reported in \"status_code\" label of \"cortex_ingester_client_request_duration_seconds\" metric. Otherwise, they will be reported as \"error\"")
// The ingester.client.report-grpc-codes-in-instrumentation-label-enabled flag has been deprecated.
// According to the migration plan (https://github.com/grafana/mimir/issues/6008#issuecomment-1854320098)
// the default behaviour of Mimir should be as this flag were set to true.
// TODO: Remove in Mimir 2.14.0
f.BoolVar(&cfg.DeprecatedReportGRPCStatusCodes, deprecatedReportGRPCStatusCodesFlag, true, "If set to true, gRPC status codes will be reported in \"status_code\" label of \"cortex_ingester_client_request_duration_seconds\" metric. Otherwise, they will be reported as \"error\"")
}

func (cfg *Config) Validate() error {
func (cfg *Config) Validate(logger log.Logger) error {
if err := cfg.GRPCClientConfig.Validate(); err != nil {
return err
}

if !cfg.DeprecatedReportGRPCStatusCodes {
util.WarnDeprecatedConfig(deprecatedReportGRPCStatusCodesFlag, logger)
}

return cfg.CircuitBreaker.Validate()
}

Expand Down
17 changes: 9 additions & 8 deletions pkg/mimir/mimir.go
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,7 @@ func (c *Config) Validate(log log.Logger) error {
return fmt.Errorf("querier timeout (%s) must be lower than or equal to HTTP server write timeout (%s)",
c.Querier.EngineConfig.Timeout, c.Server.HTTPServerWriteTimeout)
}
if err := c.IngesterClient.Validate(); err != nil {
if err := c.IngesterClient.Validate(log); err != nil {
return errors.Wrap(err, "invalid ingester_client config")
}
if err := c.Ingester.Validate(); err != nil {
Expand Down Expand Up @@ -537,13 +537,14 @@ func (c *Config) registerServerFlagsWithChangedDefaultValues(fs *flag.FlagSet) {
c.Server.RegisterFlags(throwaway)

defaultsOverrides := map[string]string{
"server.grpc-max-recv-msg-size-bytes": strconv.Itoa(100 * 1024 * 1024),
"server.grpc-max-send-msg-size-bytes": strconv.Itoa(100 * 1024 * 1024),
"server.grpc.keepalive.min-time-between-pings": "10s",
"server.grpc.keepalive.ping-without-stream-allowed": "true",
"server.grpc.num-workers": "100",
"server.http-listen-port": "8080",
"server.http-write-timeout": "2m",
"server.grpc-max-recv-msg-size-bytes": strconv.Itoa(100 * 1024 * 1024),
"server.grpc-max-send-msg-size-bytes": strconv.Itoa(100 * 1024 * 1024),
"server.grpc.keepalive.min-time-between-pings": "10s",
"server.grpc.keepalive.ping-without-stream-allowed": "true",
"server.grpc.num-workers": "100",
"server.http-listen-port": "8080",
"server.http-write-timeout": "2m",
"server.report-grpc-codes-in-instrumentation-label-enabled": "true",
}

throwaway.VisitAll(func(f *flag.Flag) {
Expand Down

0 comments on commit a0f2856

Please sign in to comment.