From b138371ba58cb1566f9ea2ad1d0ad8217e962801 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Thu, 22 Apr 2021 09:31:19 -0700 Subject: [PATCH] add various metrics --- docs/pages/metrics-logs-reference.mdx | 7 +++++++ lib/auth/auth.go | 18 +++++++++++++++++- lib/auth/grpcserver.go | 19 +++++++++++++++++-- lib/auth/methods.go | 1 + lib/auth/sessions.go | 2 +- lib/cache/cache.go | 17 +++++++++++++++++ lib/services/server.go | 10 ++++++++++ lib/srv/authhandlers.go | 24 ++++++++++++++++++++++++ lib/srv/regular/proxy.go | 15 +++++++++++++++ lib/srv/regular/sshserver.go | 15 +++++++++++++++ lib/sshutils/server.go | 17 +++++++++++++++++ metrics.go | 27 +++++++++++++++++++++++++++ 12 files changed, 168 insertions(+), 4 deletions(-) diff --git a/docs/pages/metrics-logs-reference.mdx b/docs/pages/metrics-logs-reference.mdx index 2bec15ec08728..aa66cc156a567 100644 --- a/docs/pages/metrics-logs-reference.mdx +++ b/docs/pages/metrics-logs-reference.mdx @@ -106,3 +106,10 @@ Now you can see the monitoring information by visiting several endpoints: | `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys | | `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys | | `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests | +| `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster name was not found | +| `heartbeat_connections_received_total` | counter | Teleport Auth | Number of times auth received a heartbeat connection | +| `user_login_total` | counter | Teleport Auth | Number of user logins | +| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed to connect to a node | +| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit | +| `certificate_mismatch_total` | counter | Teleport 
Proxy | Number of times there was a certificate mismatch | +| `user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit | diff --git a/lib/auth/auth.go b/lib/auth/auth.go index 1fbe0f8219c8d..167a850facca8 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -201,6 +201,20 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 16), }, ) + // UserLoginCount counts user logins + UserLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserLoginCount, + Help: "Number of times there was a user login", + }, + ) + + heartbeatsMissedByAuth = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: teleport.MetricHeartbeatsMissed, + Help: "Number of hearbeats missed by auth server", + }, + ) ) // Server keeps the cluster together. It acts as a certificate authority (CA) for @@ -1662,7 +1676,7 @@ func (a *Server) NewWebSession(req types.NewWebSessionRequest) (services.WebSess BearerTokenExpires: startTime.UTC().Add(bearerTokenTTL), LoginTime: req.LoginTime, } - + UserLoginCount.Inc() return services.NewWebSession(token, services.KindWebSession, services.KindWebSession, sessionSpec), nil } @@ -2537,4 +2551,6 @@ func init() { prometheus.MustRegister(generateThrottledRequestsCount) prometheus.MustRegister(generateRequestsCurrent) prometheus.MustRegister(generateRequestsLatencies) + prometheus.MustRegister(UserLoginCount) + prometheus.MustRegister(heartbeatsMissedByAuth) } diff --git a/lib/auth/grpcserver.go b/lib/auth/grpcserver.go index f46e83bb02478..68cd29099e59b 100644 --- a/lib/auth/grpcserver.go +++ b/lib/auth/grpcserver.go @@ -35,10 +35,11 @@ import ( "github.com/gravitational/teleport/lib/services" "github.com/gravitational/teleport/lib/session" "github.com/gravitational/teleport/lib/utils" - - "github.com/golang/protobuf/ptypes/empty" "github.com/gravitational/trace" "github.com/gravitational/trace/trail" + + "github.com/golang/protobuf/ptypes/empty" + 
"github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -66,6 +67,19 @@ func (g *GRPCServer) GetServer() (*grpc.Server, error) { return g.server, nil } +var ( + heartbeatConnectionsReceived = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricHeartbeatConnectionsReceived, + Help: "Number of auth received a heartbeat", + }, + ) +) + +func init() { + prometheus.MustRegister(heartbeatConnectionsReceived) +} + // EmitAuditEvent emits audit event func (g *GRPCServer) EmitAuditEvent(ctx context.Context, req *events.OneOf) (*empty.Empty, error) { auth, err := g.authenticate(ctx) @@ -91,6 +105,7 @@ func (g *GRPCServer) SendKeepAlives(stream proto.AuthService_SendKeepAlivesServe return trail.ToGRPC(err) } g.Debugf("Got heartbeat connection from %v.", auth.User.GetName()) + heartbeatConnectionsReceived.Inc() for { keepAlive, err := stream.Recv() if err == io.EOF { diff --git a/lib/auth/methods.go b/lib/auth/methods.go index e810bb5295579..3b69df01efddd 100644 --- a/lib/auth/methods.go +++ b/lib/auth/methods.go @@ -393,6 +393,7 @@ func (s *Server) AuthenticateSSHUser(req AuthenticateSSHRequest) (*SSHLoginRespo if err != nil { return nil, trace.Wrap(err) } + UserLoginCount.Inc() return &SSHLoginResponse{ Username: req.Username, Cert: certs.ssh, diff --git a/lib/auth/sessions.go b/lib/auth/sessions.go index 4c19130aace59..4667c8f2edba7 100644 --- a/lib/auth/sessions.go +++ b/lib/auth/sessions.go @@ -98,7 +98,7 @@ func (s *Server) CreateAppSession(ctx context.Context, req services.CreateAppSes return nil, trace.Wrap(err) } log.Debugf("Generated application web session for %v with TTL %v.", req.Username, ttl) - + UserLoginCount.Inc() return session, nil } diff --git a/lib/cache/cache.go b/lib/cache/cache.go index 9cd94bf9b8ad7..21b3c95c17816 100644 --- a/lib/cache/cache.go +++ b/lib/cache/cache.go @@ -31,6 +31,7 @@ import ( "github.com/gravitational/trace" 
"github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "go.uber.org/atomic" ) @@ -1014,10 +1015,26 @@ func (c *Cache) GetClusterConfig(opts ...services.MarshalOption) (services.Clust return rg.clusterConfig.GetClusterConfig(services.AddOptions(opts, services.SkipValidation())...) } +var ( + clusterNameNotFound = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricClusterNameNotFound, + Help: "Number of times a cluster name was not found", + }, + ) +) + +func init() { + prometheus.MustRegister(clusterNameNotFound) +} + // GetClusterName gets the name of the cluster from the backend. func (c *Cache) GetClusterName(opts ...services.MarshalOption) (services.ClusterName, error) { rg, err := c.read() if err != nil { + if trace.IsNotFound(err) { + clusterNameNotFound.Inc() + } return nil, trace.Wrap(err) } defer rg.Release() diff --git a/lib/services/server.go b/lib/services/server.go index 279b041f296b0..0813ed052aeb5 100644 --- a/lib/services/server.go +++ b/lib/services/server.go @@ -19,6 +19,7 @@ package services import ( "encoding/json" "fmt" + "time" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/lib/defaults" @@ -498,3 +499,12 @@ func MarshalServers(s []Server) ([]byte, error) { return bytes, nil } + +// NodeHasMissedKeepAlives checks if node has missed its keep alive +func NodeHasMissedKeepAlives(s Server) bool { + serverExpiry := s.Expiry() + if serverExpiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2))) { + return true + } + return false +} diff --git a/lib/srv/authhandlers.go b/lib/srv/authhandlers.go index f94826b2231cb..cad4d0f2a366e 100644 --- a/lib/srv/authhandlers.go +++ b/lib/srv/authhandlers.go @@ -34,6 +34,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -150,6 +151,27 @@ func (h 
*AuthHandlers) CheckPortForward(addr string, ctx *ServerContext) error { return nil } +var ( + failedLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedLoginAttempts, + Help: "Number of times there was a failed login", + }, + ) + + certificateMismatchCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricCertificateMistmatch, + Help: "Number of times there was a certificate mismatch", + }, + ) +) + +func init() { + prometheus.MustRegister(failedLoginCount) + prometheus.MustRegister(certificateMismatchCount) +} + // UserKeyAuth implements SSH client authentication using public keys and is // called by the server every time the client connects. func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) { @@ -184,6 +206,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s // only failed attempts are logged right now recordFailedLogin := func(err error) { + failedLoginCount.Inc() if err := h.Emitter.EmitAuditEvent(h.Server.Context(), &events.AuthAttempt{ Metadata: events.Metadata{ Type: events.AuthAttemptEvent, @@ -222,6 +245,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s } permissions, err := certChecker.Authenticate(conn, key) if err != nil { + certificateMismatchCount.Inc() recordFailedLogin(err) return nil, trace.Wrap(err) } diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index ae25898588011..625d2e5405d2c 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -285,6 +285,20 @@ func (t *proxySubsys) proxyToSite( return nil } +var ( + // failedConnectingToNode counts failed attempts to connect to a node + failedConnectingToNode = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedConnectToNodeAttempts, + Help: "Number of times client failed to connect to a node", + }, + ) +) + +func init() { + 
prometheus.MustRegister(failedConnectingToNode) +} + // proxyToHost establishes a proxy connection from the connected SSH client to the // requested remote node (t.host:t.port) via the given site func (t *proxySubsys) proxyToHost( @@ -434,6 +448,7 @@ func (t *proxySubsys) proxyToHost( ConnType: services.NodeTunnel, }) if err != nil { + failedConnectingToNode.Inc() return trace.Wrap(err) } diff --git a/lib/srv/regular/sshserver.go b/lib/srv/regular/sshserver.go index dd9dfffd0dc11..bc75228c0840b 100644 --- a/lib/srv/regular/sshserver.go +++ b/lib/srv/regular/sshserver.go @@ -52,6 +52,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) @@ -815,6 +816,19 @@ func (s *Server) serveAgent(ctx *srv.ServerContext) error { return nil } +var ( + userSessionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserMaxConcurrentSessionsHit, + Help: "Number of times the user exceeded their max concurrent ssh connections", + }, + ) +) + +func init() { + prometheus.MustRegister(userSessionLimitHitCount) +} + // HandleRequest processes global out-of-band requests. Global out-of-band // requests are processed in order (this way the originator knows which // request we are responding to). If Teleport does not support the request @@ -880,6 +894,7 @@ func (s *Server) HandleNewConn(ctx context.Context, ccx *sshutils.ConnectionCont if err != nil { if strings.Contains(err.Error(), teleport.MaxLeases) { // user has exceeded their max concurrent ssh connections. 
+ userSessionLimitHitCount.Inc() if err := s.EmitAuditEvent(s.ctx, &events.SessionReject{ Metadata: events.Metadata{ Type: events.SessionRejectedEvent, diff --git a/lib/sshutils/server.go b/lib/sshutils/server.go index 0a865584bac10..5da60ae806492 100644 --- a/lib/sshutils/server.go +++ b/lib/sshutils/server.go @@ -37,6 +37,7 @@ import ( "github.com/gravitational/trace" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -391,6 +392,19 @@ func (s *Server) trackConnections(delta int32) int32 { return atomic.AddInt32(&s.conns, delta) } +var ( + proxyConnectionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricProxyConnectionLimitHit, + Help: "Number of times proxy connection limit was exceeded", + }, + ) +) + +func init() { + prometheus.MustRegister(proxyConnectionLimitHitCount) +} + // HandleConnection is called every time an SSH server accepts a new // connection from a client. // @@ -407,6 +421,9 @@ func (s *Server) HandleConnection(conn net.Conn) { log.Errorf(err.Error()) } if err := s.limiter.AcquireConnection(remoteAddr); err != nil { + if trace.IsLimitExceeded(err) { + proxyConnectionLimitHitCount.Inc() + } log.Errorf(err.Error()) conn.Close() return diff --git a/metrics.go b/metrics.go index b37812ef3c876..e737da6d7135c 100644 --- a/metrics.go +++ b/metrics.go @@ -43,6 +43,33 @@ const ( // MetricTrustedClusters counts trusted clusters MetricTrustedClusters = "trusted_clusters" + // MetricClusterNameNotFound counts times a cluster name was not found + MetricClusterNameNotFound = "cluster_name_not_found_total" + + // MetricFailedLoginAttempts counts failed login attempts + MetricFailedLoginAttempts = "failed_login_attempts_total" + + // MetricFailedConnectToNodeAttempts counts failed ssh attempts + MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total" + + // MetricUserMaxConcurrentSessionsHit counts number of times the user exceeded their max concurrent ssh 
connections + MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total" + + // MetricProxyConnectionLimitHit counts the number of times proxy connection limit was exceeded + MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total" + + // MetricUserLoginCount counts user logins + MetricUserLoginCount = "user_login_total" + + // MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth + MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total" + + // MetricCertificateMistmatch counts login failures due to cert mismatch + MetricCertificateMistmatch = "certificate_mismatch_total" + + // MetricHeartbeatsMissed counts the nodes that failed to heartbeat + MetricHeartbeatsMissed = "heartbeats_missed_total" + // TagCluster is a metric tag for a cluster TagCluster = "cluster" )