From 1ca1058d63212ea680e2293e5e40f9ece62b9abe Mon Sep 17 00:00:00 2001 From: Lonng Date: Mon, 9 Sep 2019 11:31:54 +0800 Subject: [PATCH] tikvclient: add metrics for gRPC connection transient failure Signed-off-by: Lonng --- metrics/gprc.go | 27 +++++++++++++++++++++++++++ metrics/metrics.go | 1 + metrics/session.go | 2 ++ metrics/tikvclient.go | 2 +- store/tikv/client.go | 10 ++++++++-- 5 files changed, 39 insertions(+), 3 deletions(-) create mode 100644 metrics/gprc.go diff --git a/metrics/gprc.go b/metrics/gprc.go new file mode 100644 index 0000000000000..33875054b64a1 --- /dev/null +++ b/metrics/gprc.go @@ -0,0 +1,27 @@ +// Copyright 2019 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// See the License for the specific language governing permissions and +// limitations under the License. + +package metrics + +import "github.com/prometheus/client_golang/prometheus" + +// Metrics to monitor gRPC service +var ( + GRPCConnTransientFailureCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "tidb", + Subsystem: "grpc", + Name: "connection_transient_failure_count", + Help: "Counter of gRPC connection transient failure", + }, []string{LblAddress, LblStore}) +) diff --git a/metrics/metrics.go b/metrics/metrics.go index fd7d65ff571ec..5cf292acfbe8f 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -153,4 +153,5 @@ func RegisterMetrics() { prometheus.MustRegister(TiKVRangeTaskStats) prometheus.MustRegister(TiKVRangeTaskPushDuration) prometheus.MustRegister(TiKVTokenWaitDuration) + prometheus.MustRegister(GRPCConnTransientFailureCounter) } diff --git a/metrics/session.go b/metrics/session.go index 0ea3548547986..4e7559400bc60 100644 --- a/metrics/session.go +++ b/metrics/session.go @@ -112,4 +112,6 @@ const ( LblSQLType = "sql_type" LblGeneral = "general" LblInternal = "internal" + LblStore = "store" + LblAddress = "address" ) diff --git a/metrics/tikvclient.go b/metrics/tikvclient.go index 704647ab5a0bd..4e9390c8dc276 100644 --- a/metrics/tikvclient.go +++ b/metrics/tikvclient.go @@ -74,7 +74,7 @@ var ( Name: "request_seconds", Help: "Bucketed histogram of sending request duration.", Buckets: prometheus.ExponentialBuckets(0.0005, 2, 20), // 0.5ms ~ 524s - }, []string{LblType, "store"}) + }, []string{LblType, LblStore}) TiKVCoprocessorHistogram = prometheus.NewHistogram( prometheus.HistogramOpts{ diff --git a/store/tikv/client.go b/store/tikv/client.go index 08d75ca9eda66..4061ef269b1b3 100644 --- a/store/tikv/client.go +++ b/store/tikv/client.go @@ -35,6 +35,7 @@ import ( "github.com/pingcap/tidb/store/tikv/tikvrpc" "github.com/pingcap/tidb/util/logutil" "google.golang.org/grpc" + "google.golang.org/grpc/connectivity" "google.golang.org/grpc/credentials" "google.golang.org/grpc/keepalive" ) @@ -299,14 +300,19 @@ func (c *rpcClient) SendRequest(ctx context.Context, addr string, req *tikvrpc.R } } + clientConn := connArray.Get() + if state := clientConn.GetState(); state == connectivity.TransientFailure { + metrics.GRPCConnTransientFailureCounter.WithLabelValues(addr, storeID).Inc() + } + if req.IsDebugReq() { - client := debugpb.NewDebugClient(connArray.Get()) + client := debugpb.NewDebugClient(clientConn) ctx1, cancel := context.WithTimeout(ctx, timeout) defer cancel() return tikvrpc.CallDebugRPC(ctx1, client, req) } - client := tikvpb.NewTikvClient(connArray.Get()) + client := tikvpb.NewTikvClient(clientConn) if req.Type != tikvrpc.CmdCopStream { ctx1, cancel := context.WithTimeout(ctx, timeout)