diff --git a/CHANGELOG.md b/CHANGELOG.md index 8bb35c8d444..a9b8c926ece 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -62,6 +62,7 @@ * [ENHANCEMENT] Query-frontend: added "queue_time_seconds" field to "query stats" log. This is total time that query and subqueries spent in the queue, before queriers picked it up. #6537 * [ENHANCEMENT] Server: Add `-server.report-grpc-codes-in-instrumentation-label-enabled` CLI flag to specify whether gRPC status codes should be used in `status_code` label of `cortex_request_duration_seconds` metric. It defaults to false, meaning that successful and erroneous gRPC status codes are represented with `success` and `error` respectively. #6562 * [ENHANCEMENT] Server: Add `-ingester.client.report-grpc-codes-in-instrumentation-label-enabled` CLI flag to specify whether gRPC status codes should be used in `status_code` label of `cortex_ingester_client_request_duration_seconds` metric. It defaults to false, meaning that successful and erroneous gRPC status codes are represented with `2xx` and `error` respectively. #6562 +* [ENHANCEMENT] Server: Add `-server.http-log-closed-connections-without-response-enabled` option to log details about connections to HTTP server that were closed before any data was sent back. This can happen if client doesn't manage to send complete HTTP headers before timeout. #6612 * [BUGFIX] Ring: Ensure network addresses used for component hash rings are formatted correctly when using IPv6. #6068 * [BUGFIX] Query-scheduler: don't retain connections from queriers that have shut down, leading to gradually increasing enqueue latency over time. #6100 #6145 * [BUGFIX] Ingester: prevent query logic from continuing to execute after queries are canceled. #6085 diff --git a/cmd/mimir/config-descriptor.json b/cmd/mimir/config-descriptor.json index fada4570e2e..b6b90a5b35a 100644 --- a/cmd/mimir/config-descriptor.json +++ b/cmd/mimir/config-descriptor.json @@ -469,6 +469,16 @@ "fieldType": "duration", "fieldCategory": "advanced" }, + { + "kind": "field", + "name": "http_log_closed_connections_without_response_enabled", + "required": false, + "desc": "Log closed connections that did not receive any response, most likely because client didn't send any request within timeout.", + "fieldValue": null, + "fieldDefaultValue": false, + "fieldFlag": "server.http-log-closed-connections-without-response-enabled", + "fieldType": "boolean" + }, { "kind": "field", "name": "grpc_server_max_recv_msg_size", diff --git a/cmd/mimir/help-all.txt.tmpl b/cmd/mimir/help-all.txt.tmpl index c000fecd9c3..595b428a175 100644 --- a/cmd/mimir/help-all.txt.tmpl +++ b/cmd/mimir/help-all.txt.tmpl @@ -2569,6 +2569,8 @@ Usage of ./cmd/mimir/mimir: HTTP server listen network, default tcp (default "tcp") -server.http-listen-port int HTTP server listen port. (default 8080) + -server.http-log-closed-connections-without-response-enabled + Log closed connections that did not receive any response, most likely because client didn't send any request within timeout. -server.http-read-header-timeout duration Read timeout for HTTP request headers. If set to 0, value of -server.http-read-timeout is used. -server.http-read-timeout duration diff --git a/cmd/mimir/help.txt.tmpl b/cmd/mimir/help.txt.tmpl index 0159e0d252a..e77c35f4d52 100644 --- a/cmd/mimir/help.txt.tmpl +++ b/cmd/mimir/help.txt.tmpl @@ -665,6 +665,8 @@ Usage of ./cmd/mimir/mimir: HTTP server listen address. -server.http-listen-port int HTTP server listen port. (default 8080) + -server.http-log-closed-connections-without-response-enabled + Log closed connections that did not receive any response, most likely because client didn't send any request within timeout. -server.http-read-header-timeout duration Read timeout for HTTP request headers. If set to 0, value of -server.http-read-timeout is used. -server.log-request-headers diff --git a/docs/sources/mimir/configure/about-versioning.md b/docs/sources/mimir/configure/about-versioning.md index b1bbc785556..8b8b4ea3b90 100644 --- a/docs/sources/mimir/configure/about-versioning.md +++ b/docs/sources/mimir/configure/about-versioning.md @@ -150,6 +150,7 @@ The following features are currently experimental: - Limiting inflight requests to Distributor and Ingester via gRPC limiter: - `-distributor.limit-inflight-requests-using-grpc-method-limiter` - `-ingester.limit-inflight-requests-using-grpc-method-limiter` +- Logging of requests that did not send any HTTP request: `-server.http-log-closed-connections-without-response-enabled`. ## Deprecated features diff --git a/docs/sources/mimir/references/configuration-parameters/index.md b/docs/sources/mimir/references/configuration-parameters/index.md index ee062c1174f..4dd5a3c9d04 100644 --- a/docs/sources/mimir/references/configuration-parameters/index.md +++ b/docs/sources/mimir/references/configuration-parameters/index.md @@ -612,6 +612,11 @@ grpc_tls_config: # CLI flag: -server.http-idle-timeout [http_server_idle_timeout: | default = 2m] +# Log closed connections that did not receive any response, most likely because +# client didn't send any request within timeout. +# CLI flag: -server.http-log-closed-connections-without-response-enabled +[http_log_closed_connections_without_response_enabled: | default = false] + # (advanced) Limit on the size of a gRPC message this server can receive # (bytes). # CLI flag: -server.grpc-max-recv-msg-size-bytes diff --git a/go.mod b/go.mod index d1f48cc6e67..3181d46ee3c 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/golang/snappy v0.0.4 github.com/google/gopacket v1.1.19 github.com/gorilla/mux v1.8.1 - github.com/grafana/dskit v0.0.0-20231106102023-e59db7bb4403 + github.com/grafana/dskit v0.0.0-20231110082806-620b5f187e90 github.com/grafana/e2e v0.1.1 github.com/hashicorp/golang-lru v1.0.2 // indirect github.com/json-iterator/go v1.1.12 diff --git a/go.sum b/go.sum index 449c8820185..5dd412a3b7d 100644 --- a/go.sum +++ b/go.sum @@ -538,8 +538,8 @@ github.com/gosimple/slug v1.1.1 h1:fRu/digW+NMwBIP+RmviTK97Ho/bEj/C9swrCspN3D4= github.com/gosimple/slug v1.1.1/go.mod h1:ER78kgg1Mv0NQGlXiDe57DpCyfbNywXXZ9mIorhxAf0= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc h1:PXZQA2WCxe85Tnn+WEvr8fDpfwibmEPgfgFEaC87G24= github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc/go.mod h1:AHHlOEv1+GGQ3ktHMlhuTUwo3zljV3QJbC0+8o2kn+4= -github.com/grafana/dskit v0.0.0-20231106102023-e59db7bb4403 h1:CGq4R5z3RJuNVXJqsZuJXd1cKC1quLGKQic505VWtmY= -github.com/grafana/dskit v0.0.0-20231106102023-e59db7bb4403/go.mod h1:8dsy5tQOkeNQyjXpm5mQsbCu3H5uzeBD35MzRQFznKU= +github.com/grafana/dskit v0.0.0-20231110082806-620b5f187e90 h1:+YsdVHl/VdXqCQDuVefswsTfHTT25mbergE9xPwvDlo= +github.com/grafana/dskit v0.0.0-20231110082806-620b5f187e90/go.mod h1:8dsy5tQOkeNQyjXpm5mQsbCu3H5uzeBD35MzRQFznKU= github.com/grafana/e2e v0.1.1 h1:/b6xcv5BtoBnx8cZnCiey9DbjEc8z7gXHO5edoeRYxc= github.com/grafana/e2e v0.1.1/go.mod h1:RpNLgae5VT+BUHvPE+/zSypmOXKwEu4t+tnEMS1ATaE= github.com/grafana/goautoneg v0.0.0-20231010094147-47ce5e72a9ae h1:Yxbw9jKGJVC6qAK5Ubzzb/qZwM6rRMMqaDc/d4Vp3pM= diff --git a/vendor/github.com/grafana/dskit/concurrency/buffer.go b/vendor/github.com/grafana/dskit/concurrency/buffer.go index 623b9a70761..b8da4423f10 100644 --- a/vendor/github.com/grafana/dskit/concurrency/buffer.go +++ b/vendor/github.com/grafana/dskit/concurrency/buffer.go @@ -24,3 +24,10 @@ func (sb *SyncBuffer) String() string { return sb.buf.String() } + +func (sb *SyncBuffer) Reset() { + sb.mu.Lock() + defer sb.mu.Unlock() + + sb.buf.Reset() +} diff --git a/vendor/github.com/grafana/dskit/middleware/zero_response.go b/vendor/github.com/grafana/dskit/middleware/zero_response.go new file mode 100644 index 00000000000..1bb4ecc8d1f --- /dev/null +++ b/vendor/github.com/grafana/dskit/middleware/zero_response.go @@ -0,0 +1,132 @@ +package middleware + +import ( + "errors" + "net" + "os" + "regexp" + "strconv" + "sync" + + "github.com/go-kit/log" + "go.uber.org/atomic" +) + +// NewZeroResponseListener returns a Listener that logs all connections that encountered io timeout on reads, and were closed before sending any response. +func NewZeroResponseListener(list net.Listener, log log.Logger) net.Listener { + return &zeroResponseListener{ + Listener: list, + log: log, + bufPool: sync.Pool{ + New: func() interface{} { return &bufHolder{buf: make([]byte, 0, requestBufSize)} }, + }, + } +} + +// Wrap a slice in a struct, so we can store a pointer in sync.Pool +type bufHolder struct { + buf []byte +} + +// Size of buffer for read data. We log this eventually. +const requestBufSize = 512 + +type zeroResponseListener struct { + net.Listener + log log.Logger + bufPool sync.Pool // pool of &bufHolder. +} + +func (zl *zeroResponseListener) Accept() (net.Conn, error) { + conn, err := zl.Listener.Accept() + if err != nil { + return nil, err + } + bh := zl.bufPool.Get().(*bufHolder) + bh.buf = bh.buf[:0] + return &zeroResponseConn{Conn: conn, log: zl.log, bufHolder: bh, returnPool: &zl.bufPool}, nil +} + +type zeroResponseConn struct { + net.Conn + + log log.Logger + once sync.Once + returnPool *sync.Pool + + bufHolderMux sync.Mutex + bufHolder *bufHolder // Buffer with first requestBufSize bytes from connection. Set to nil as soon as data is written to the connection. + + lastReadErrIsDeadlineExceeded atomic.Bool +} + +func (zc *zeroResponseConn) Read(b []byte) (n int, err error) { + n, err = zc.Conn.Read(b) + if err != nil && errors.Is(err, os.ErrDeadlineExceeded) { + zc.lastReadErrIsDeadlineExceeded.Store(true) + } else { + zc.lastReadErrIsDeadlineExceeded.Store(false) + } + + // Store first requestBufSize read bytes on connection into the buffer for logging. + if n > 0 { + zc.bufHolderMux.Lock() + defer zc.bufHolderMux.Unlock() + + if zc.bufHolder != nil { + rem := requestBufSize - len(zc.bufHolder.buf) // how much space is in our buffer. + if rem > n { + rem = n + } + if rem > 0 { + zc.bufHolder.buf = append(zc.bufHolder.buf, b[:rem]...) + } + } + } + return +} + +func (zc *zeroResponseConn) Write(b []byte) (n int, err error) { + n, err = zc.Conn.Write(b) + if n > 0 { + zc.bufHolderMux.Lock() + if zc.bufHolder != nil { + zc.returnPool.Put(zc.bufHolder) + zc.bufHolder = nil + } + zc.bufHolderMux.Unlock() + } + return +} + +var authRegexp = regexp.MustCompile(`((?i)\r\nauthorization:\s+)(\S+\s+)(\S+)`) + +func (zc *zeroResponseConn) Close() error { + err := zc.Conn.Close() + + zc.once.Do(func() { + zc.bufHolderMux.Lock() + defer zc.bufHolderMux.Unlock() + + // If buffer was already returned, it means there was some data written on the connection, nothing to do. + if zc.bufHolder == nil { + return + } + + // If we didn't write anything to this connection, and we've got timeout while reading data, it looks like + // slow a slow client failing to send a request to us. + if !zc.lastReadErrIsDeadlineExceeded.Load() { + return + } + + b := zc.bufHolder.buf + b = authRegexp.ReplaceAll(b, []byte("${1}${2}***")) // Replace value in Authorization header with ***. + + _ = zc.log.Log("msg", "read timeout, connection closed with no response", "read", strconv.Quote(string(b)), "remote", zc.RemoteAddr().String()) + + zc.returnPool.Put(zc.bufHolder) + zc.bufHolder = nil + }) + + return err +} diff --git a/vendor/github.com/grafana/dskit/server/server.go b/vendor/github.com/grafana/dskit/server/server.go index b2947d34a5f..2b54283df7f 100644 --- a/vendor/github.com/grafana/dskit/server/server.go +++ b/vendor/github.com/grafana/dskit/server/server.go @@ -103,6 +103,8 @@ type Config struct { HTTPServerWriteTimeout time.Duration `yaml:"http_server_write_timeout"` HTTPServerIdleTimeout time.Duration `yaml:"http_server_idle_timeout"` + HTTPLogClosedConnectionsWithoutResponse bool `yaml:"http_log_closed_connections_without_response_enabled"` + GRPCOptions []grpc.ServerOption `yaml:"-"` GRPCMiddleware []grpc.UnaryServerInterceptor `yaml:"-"` GRPCStreamMiddleware []grpc.StreamServerInterceptor `yaml:"-"` @@ -175,6 +177,7 @@ func (cfg *Config) RegisterFlags(f *flag.FlagSet) { f.DurationVar(&cfg.HTTPServerReadHeaderTimeout, "server.http-read-header-timeout", 0, "Read timeout for HTTP request headers. If set to 0, value of -server.http-read-timeout is used.") f.DurationVar(&cfg.HTTPServerWriteTimeout, "server.http-write-timeout", 30*time.Second, "Write timeout for HTTP server") f.DurationVar(&cfg.HTTPServerIdleTimeout, "server.http-idle-timeout", 120*time.Second, "Idle timeout for HTTP server") + f.BoolVar(&cfg.HTTPLogClosedConnectionsWithoutResponse, "server.http-log-closed-connections-without-response-enabled", false, "Log closed connections that did not receive any response, most likely because client didn't send any request within timeout.") f.IntVar(&cfg.GRPCServerMaxRecvMsgSize, "server.grpc-max-recv-msg-size-bytes", 4*1024*1024, "Limit on the size of a gRPC message this server can receive (bytes).") f.IntVar(&cfg.GRPCServerMaxSendMsgSize, "server.grpc-max-send-msg-size-bytes", 4*1024*1024, "Limit on the size of a gRPC message this server can send (bytes).") f.UintVar(&cfg.GRPCServerMaxConcurrentStreams, "server.grpc-max-concurrent-streams", 100, "Limit on the number of concurrent streams for gRPC calls per client connection (0 = unlimited)") @@ -263,6 +266,9 @@ func newServer(cfg Config, metrics *Metrics) (*Server, error) { return nil, err } httpListener = middleware.CountingListener(httpListener, metrics.TCPConnections.WithLabelValues("http")) + if cfg.HTTPLogClosedConnectionsWithoutResponse { + httpListener = middleware.NewZeroResponseListener(httpListener, level.Warn(logger)) + } metrics.TCPConnectionsLimit.WithLabelValues("http").Set(float64(cfg.HTTPConnLimit)) if cfg.HTTPConnLimit > 0 { diff --git a/vendor/modules.txt b/vendor/modules.txt index 3cbb3d4718a..cbbca09b8ac 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -534,7 +534,7 @@ github.com/gosimple/slug # github.com/grafana-tools/sdk v0.0.0-20220919052116-6562121319fc ## explicit; go 1.13 github.com/grafana-tools/sdk -# github.com/grafana/dskit v0.0.0-20231106102023-e59db7bb4403 +# github.com/grafana/dskit v0.0.0-20231110082806-620b5f187e90 ## explicit; go 1.20 github.com/grafana/dskit/backoff github.com/grafana/dskit/cache