From f3fdcbe2693fe8636f885225f1e8e38b2eab5c22 Mon Sep 17 00:00:00 2001 From: Jane Quintero Date: Mon, 3 May 2021 15:54:25 -0700 Subject: [PATCH] add various prometheus metrics --- docs/pages/metrics-logs-reference.mdx | 9 +++++++ lib/auth/auth.go | 37 ++++++++++++++++++++++++++- lib/auth/grpcserver.go | 18 ++++++++++++- lib/auth/methods.go | 1 + lib/auth/sessions.go | 2 +- lib/services/local/configuration.go | 17 ++++++++++++ lib/services/server.go | 7 +++++ lib/srv/authhandlers.go | 24 +++++++++++++++++ lib/srv/regular/proxy.go | 15 +++++++++++ lib/srv/regular/sshserver.go | 15 +++++++++++ lib/sshutils/server.go | 17 ++++++++++++ metrics.go | 27 +++++++++++++++++++ 12 files changed, 186 insertions(+), 3 deletions(-) diff --git a/docs/pages/metrics-logs-reference.mdx b/docs/pages/metrics-logs-reference.mdx index 2bec15ec08728..690ba26abf463 100644 --- a/docs/pages/metrics-logs-reference.mdx +++ b/docs/pages/metrics-logs-reference.mdx @@ -106,3 +106,12 @@ Now you can see the monitoring information by visiting several endpoints: | `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys | | `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys | | `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests | +| `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster name was not found | +| `heartbeat_connections_received_total` | counter | Teleport Auth | Number of times auth received a heartbeat connection | +| `heartbeats_missed_total` | gauge | Teleport Auth | Number of times auth did not receive a heartbeat from a node | +| `user_login_total` | counter | Teleport Auth | Number of user logins | +| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node | +| `proxy_connection_limit_exceeded_total` | counter | Teleport 
Proxy | Number of connections that exceeded the proxy connection limit | +| `certificate_mismatch_total` | counter | Teleport Proxy | Number of times there was a certificate mismatch | +| `failed_login_attempts_total` | counter | Teleport Proxy | Number of failed `tsh login` or `tsh ssh` logins | +| `user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit | \ No newline at end of file diff --git a/lib/auth/auth.go b/lib/auth/auth.go index d9873279eee7f..464698416e998 100644 --- a/lib/auth/auth.go +++ b/lib/auth/auth.go @@ -56,6 +56,7 @@ import ( "github.com/gravitational/teleport/lib/sshutils" "github.com/gravitational/teleport/lib/tlsca" "github.com/gravitational/teleport/lib/utils" + "github.com/gravitational/teleport/lib/utils/interval" "github.com/coreos/go-oidc/oauth2" "github.com/coreos/go-oidc/oidc" @@ -201,6 +202,19 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 16), }, ) + // UserLoginCount counts user logins + UserLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserLoginCount, + Help: "Number of times there was a user login", + }, + ) + heartbeatsMissedByAuth = prometheus.NewGauge( + prometheus.GaugeOpts{ + Name: teleport.MetricHeartbeatsMissed, + Help: "Number of hearbeats missed by auth server", + }, + ) ) // Server keeps the cluster together. 
It acts as a certificate authority (CA) for @@ -284,7 +298,14 @@ func (a *Server) runPeriodicOperations() { period := defaults.HighResPollingPeriod + time.Duration(r.Intn(int(defaults.HighResPollingPeriod/time.Second)))*time.Second log.Debugf("Ticking with period: %v.", period) ticker := time.NewTicker(period) + // Create a ticker with jitter + heartbeatCheckTicker := interval.New(interval.Config{ + Duration: defaults.ServerKeepAliveTTL * 2, + Jitter: utils.NewSeventhJitter(), + }) + missedKeepAliveCount := 0 defer ticker.Stop() + defer heartbeatCheckTicker.Stop() for { select { case <-a.closeCtx.Done(): @@ -298,6 +319,18 @@ func (a *Server) runPeriodicOperations() { log.Errorf("Failed to perform cert rotation check: %v.", err) } } + case <-heartbeatCheckTicker.Next(): + nodes, err := a.GetNodes(defaults.Namespace, services.SkipValidation()) + if err != nil { + log.Errorf("Failed to load nodes for heartbeat metric calculation: %v", err) + } + for _, node := range nodes { + if services.NodeHasMissedKeepAlives(node) { + missedKeepAliveCount++ + } + } + // Update prometheus gauge + heartbeatsMissedByAuth.Set(float64(missedKeepAliveCount)) } } } @@ -1659,7 +1692,7 @@ func (a *Server) NewWebSession(req types.NewWebSessionRequest) (services.WebSess BearerTokenExpires: startTime.UTC().Add(bearerTokenTTL), LoginTime: req.LoginTime, } - + UserLoginCount.Inc() return services.NewWebSession(token, services.KindWebSession, services.KindWebSession, sessionSpec), nil } @@ -2534,4 +2567,6 @@ func init() { prometheus.MustRegister(generateThrottledRequestsCount) prometheus.MustRegister(generateRequestsCurrent) prometheus.MustRegister(generateRequestsLatencies) + prometheus.MustRegister(UserLoginCount) + prometheus.MustRegister(heartbeatsMissedByAuth) } diff --git a/lib/auth/grpcserver.go b/lib/auth/grpcserver.go index c92498f28a920..6c96f4e637c71 100644 --- a/lib/auth/grpcserver.go +++ b/lib/auth/grpcserver.go @@ -36,9 +36,11 @@ import ( 
"github.com/gravitational/teleport/lib/session" "github.com/gravitational/teleport/lib/utils" - "github.com/golang/protobuf/ptypes/empty" "github.com/gravitational/trace" "github.com/gravitational/trace/trail" + + "github.com/golang/protobuf/ptypes/empty" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -66,6 +68,19 @@ func (g *GRPCServer) GetServer() (*grpc.Server, error) { return g.server, nil } +var ( + heartbeatConnectionsReceived = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricHeartbeatConnectionsReceived, + Help: "Number of times auth received a heartbeat connection", + }, + ) +) + +func init() { + prometheus.MustRegister(heartbeatConnectionsReceived) +} + // EmitAuditEvent emits audit event func (g *GRPCServer) EmitAuditEvent(ctx context.Context, req *events.OneOf) (*empty.Empty, error) { auth, err := g.authenticate(ctx) @@ -91,6 +106,7 @@ func (g *GRPCServer) SendKeepAlives(stream proto.AuthService_SendKeepAlivesServe return trail.ToGRPC(err) } g.Debugf("Got heartbeat connection from %v.", auth.User.GetName()) + heartbeatConnectionsReceived.Inc() for { keepAlive, err := stream.Recv() if err == io.EOF { diff --git a/lib/auth/methods.go b/lib/auth/methods.go index e810bb5295579..3b69df01efddd 100644 --- a/lib/auth/methods.go +++ b/lib/auth/methods.go @@ -393,6 +393,7 @@ func (s *Server) AuthenticateSSHUser(req AuthenticateSSHRequest) (*SSHLoginRespo if err != nil { return nil, trace.Wrap(err) } + UserLoginCount.Inc() return &SSHLoginResponse{ Username: req.Username, Cert: certs.ssh, diff --git a/lib/auth/sessions.go b/lib/auth/sessions.go index 4c19130aace59..4667c8f2edba7 100644 --- a/lib/auth/sessions.go +++ b/lib/auth/sessions.go @@ -98,7 +98,7 @@ func (s *Server) CreateAppSession(ctx context.Context, req services.CreateAppSes return nil, trace.Wrap(err) } log.Debugf("Generated application web session for %v with TTL %v.", req.Username, ttl) 
- + UserLoginCount.Inc() return session, nil } diff --git a/lib/services/local/configuration.go b/lib/services/local/configuration.go index 2468973bec741..36f4aade62238 100644 --- a/lib/services/local/configuration.go +++ b/lib/services/local/configuration.go @@ -19,12 +19,28 @@ package local import ( "context" + "github.com/gravitational/teleport" "github.com/gravitational/teleport/lib/backend" "github.com/gravitational/teleport/lib/services" "github.com/gravitational/trace" + + "github.com/prometheus/client_golang/prometheus" +) + +var ( + clusterNameNotFound = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricClusterNameNotFound, + Help: "Number of times a cluster name was not found", + }, + ) ) +func init() { + prometheus.MustRegister(clusterNameNotFound) +} + // ClusterConfigurationService is responsible for managing cluster configuration. type ClusterConfigurationService struct { backend.Backend @@ -42,6 +58,7 @@ func (s *ClusterConfigurationService) GetClusterName(opts ...services.MarshalOpt item, err := s.Get(context.TODO(), backend.Key(clusterConfigPrefix, namePrefix)) if err != nil { if trace.IsNotFound(err) { + clusterNameNotFound.Inc() return nil, trace.NotFound("cluster name not found") } return nil, trace.Wrap(err) diff --git a/lib/services/server.go b/lib/services/server.go index 8f38931f66aca..0d1fd31c9f5f2 100644 --- a/lib/services/server.go +++ b/lib/services/server.go @@ -19,6 +19,7 @@ package services import ( "encoding/json" "fmt" + "time" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/lib/defaults" @@ -529,3 +530,9 @@ func MarshalServers(s []Server) ([]byte, error) { return bytes, nil } + +// NodeHasMissedKeepAlives checks if node has missed its keep alive +func NodeHasMissedKeepAlives(s Server) bool { + serverExpiry := s.Expiry() + return serverExpiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2))) +} diff --git a/lib/srv/authhandlers.go 
b/lib/srv/authhandlers.go index 65a54eb422b45..214b6e23ddc69 100644 --- a/lib/srv/authhandlers.go +++ b/lib/srv/authhandlers.go @@ -34,6 +34,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -150,6 +151,27 @@ func (h *AuthHandlers) CheckPortForward(addr string, ctx *ServerContext) error { return nil } +var ( + failedLoginCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedLoginAttempts, + Help: "Number of times there was a failed login", + }, + ) + + certificateMismatchCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricCertificateMistmatch, + Help: "Number of times there was a certificate mismatch", + }, + ) +) + +func init() { + prometheus.MustRegister(failedLoginCount) + prometheus.MustRegister(certificateMismatchCount) +} + // UserKeyAuth implements SSH client authentication using public keys and is // called by the server every time the client connects. 
func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) { @@ -184,6 +206,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s // only failed attempts are logged right now recordFailedLogin := func(err error) { + failedLoginCount.Inc() if err := h.Emitter.EmitAuditEvent(h.Server.Context(), &events.AuthAttempt{ Metadata: events.Metadata{ Type: events.AuthAttemptEvent, @@ -222,6 +245,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s } permissions, err := certChecker.Authenticate(conn, key) if err != nil { + certificateMismatchCount.Inc() recordFailedLogin(err) return nil, trace.Wrap(err) } diff --git a/lib/srv/regular/proxy.go b/lib/srv/regular/proxy.go index ae25898588011..cee559f7612a0 100644 --- a/lib/srv/regular/proxy.go +++ b/lib/srv/regular/proxy.go @@ -285,6 +285,20 @@ func (t *proxySubsys) proxyToSite( return nil } +var ( + // failedConnectingToNode counts failed attempts to connect to a node + failedConnectingToNode = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricFailedConnectToNodeAttempts, + Help: "Number of failed attempts to connect to a node", + }, + ) +) + +func init() { + prometheus.MustRegister(failedConnectingToNode) +} + // proxyToHost establishes a proxy connection from the connected SSH client to the // requested remote node (t.host:t.port) via the given site func (t *proxySubsys) proxyToHost( @@ -434,6 +448,7 @@ func (t *proxySubsys) proxyToHost( ConnType: services.NodeTunnel, }) if err != nil { + failedConnectingToNode.Inc() return trace.Wrap(err) } diff --git a/lib/srv/regular/sshserver.go b/lib/srv/regular/sshserver.go index e23ddefa04a91..09640b6affa3a 100644 --- a/lib/srv/regular/sshserver.go +++ b/lib/srv/regular/sshserver.go @@ -52,6 +52,7 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) @@ 
-815,6 +816,19 @@ func (s *Server) serveAgent(ctx *srv.ServerContext) error { return nil } +var ( + userSessionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricUserMaxConcurrentSessionsHit, + Help: "Number of times a user exceeded their max concurrent ssh connections", + }, + ) +) + +func init() { + prometheus.MustRegister(userSessionLimitHitCount) +} + // HandleRequest processes global out-of-band requests. Global out-of-band // requests are processed in order (this way the originator knows which // request we are responding to). If Teleport does not support the request @@ -880,6 +894,7 @@ func (s *Server) HandleNewConn(ctx context.Context, ccx *sshutils.ConnectionCont if err != nil { if strings.Contains(err.Error(), teleport.MaxLeases) { // user has exceeded their max concurrent ssh connections. + userSessionLimitHitCount.Inc() if err := s.EmitAuditEvent(s.ctx, &events.SessionReject{ Metadata: events.Metadata{ Type: events.SessionRejectedEvent, diff --git a/lib/sshutils/server.go b/lib/sshutils/server.go index b9060b0c62390..de8b83279771a 100644 --- a/lib/sshutils/server.go +++ b/lib/sshutils/server.go @@ -37,6 +37,7 @@ import ( "github.com/gravitational/trace" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" ) @@ -391,6 +392,19 @@ func (s *Server) trackConnections(delta int32) int32 { return atomic.AddInt32(&s.conns, delta) } +var ( + proxyConnectionLimitHitCount = prometheus.NewCounter( + prometheus.CounterOpts{ + Name: teleport.MetricProxyConnectionLimitHit, + Help: "Number of times the proxy connection limit was exceeded", + }, + ) +) + +func init() { + prometheus.MustRegister(proxyConnectionLimitHitCount) +} + // HandleConnection is called every time an SSH server accepts a new // connection from a client. 
// @@ -407,6 +421,9 @@ func (s *Server) HandleConnection(conn net.Conn) { log.Errorf(err.Error()) } if err := s.limiter.AcquireConnection(remoteAddr); err != nil { + if trace.IsLimitExceeded(err) { + proxyConnectionLimitHitCount.Inc() + } log.Errorf(err.Error()) conn.Close() return diff --git a/metrics.go b/metrics.go index b37812ef3c876..270377cf0342c 100644 --- a/metrics.go +++ b/metrics.go @@ -43,6 +43,33 @@ const ( // MetricTrustedClusters counts trusted clusters MetricTrustedClusters = "trusted_clusters" + // MetricClusterNameNotFound counts times a cluster name was not found + MetricClusterNameNotFound = "cluster_name_not_found_total" + + // MetricFailedLoginAttempts counts failed login attempts + MetricFailedLoginAttempts = "failed_login_attempts_total" + + // MetricFailedConnectToNodeAttempts counts failed ssh attempts + MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total" + + // MetricUserMaxConcurrentSessionsHit counts number of times a user exceeded their max concurrent ssh connections + MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total" + + // MetricProxyConnectionLimitHit counts the number of times the proxy connection limit was exceeded + MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total" + + // MetricUserLoginCount counts user logins + MetricUserLoginCount = "user_login_total" + + // MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth + MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total" + + // MetricCertificateMistmatch counts login failures due to certificate mismatch + MetricCertificateMistmatch = "certificate_mismatch_total" + + // MetricHeartbeatsMissed counts the nodes that failed to heartbeat + MetricHeartbeatsMissed = "heartbeats_missed_total" + // TagCluster is a metric tag for a cluster TagCluster = "cluster" )