Skip to content

Commit

Permalink
add various metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
quinqu committed Apr 22, 2021
1 parent fbc2e4e commit b138371
Show file tree
Hide file tree
Showing 12 changed files with 168 additions and 4 deletions.
7 changes: 7 additions & 0 deletions docs/pages/metrics-logs-reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -106,3 +106,10 @@ Now you can see the monitoring information by visiting several endpoints:
| `auth_generate_requests_throttled_total` | counter | Teleport Auth | Number of throttled requests to generate new server keys |
| `auth_generate_requests_total` | counter | Teleport Auth | Number of requests to generate new server keys |
| `auth_generate_seconds` | `histogram` | Teleport Auth | Latency for generate requests |
| `cluster_name_not_found_total` | counter | Teleport Auth | Number of times a cluster name was not found |
| `heartbeat_connections_received_total` | counter | Teleport Auth | Number of times auth received a heartbeat connection |
| `user_login_total` | counter | Teleport Auth | Number of user logins |
| `failed_connect_to_node_attempts_total` | counter | Teleport Proxy | Number of times a user failed connecting to a node |
| `proxy_connection_limit_exceeded_total` | counter | Teleport Proxy | Number of connections that exceeded the proxy connection limit |
| `certificate_mismatch_total` | counter | Teleport Proxy | Number of times there was a certificate mismatch |
| `user_max_concurrent_sessions_hit_total` | counter | Teleport Node | Number of times a user exceeded their concurrent session limit |
18 changes: 17 additions & 1 deletion lib/auth/auth.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,20 @@ var (
Buckets: prometheus.ExponentialBuckets(0.001, 2, 16),
},
)
// UserLoginCount is a Prometheus counter of user logins. It is incremented
// when a new web session is built (NewWebSession), after an SSH user has been
// authenticated (AuthenticateSSHUser), and after an application web session
// has been generated (CreateAppSession).
UserLoginCount = prometheus.NewCounter(
prometheus.CounterOpts{
Name: teleport.MetricUserLoginCount,
Help: "Number of times there was a user login",
},
)

// heartbeatsMissedByAuth is a Prometheus gauge for the number of heartbeats
// missed by the auth server.
// NOTE(review): no code that sets this gauge is visible in this section;
// confirm where it is updated.
heartbeatsMissedByAuth = prometheus.NewGauge(
	prometheus.GaugeOpts{
		Name: teleport.MetricHeartbeatsMissed,
		Help: "Number of heartbeats missed by auth server",
	},
)
)

// Server keeps the cluster together. It acts as a certificate authority (CA) for
Expand Down Expand Up @@ -1662,7 +1676,7 @@ func (a *Server) NewWebSession(req types.NewWebSessionRequest) (services.WebSess
BearerTokenExpires: startTime.UTC().Add(bearerTokenTTL),
LoginTime: req.LoginTime,
}

UserLoginCount.Inc()
return services.NewWebSession(token, services.KindWebSession, services.KindWebSession, sessionSpec), nil
}

Expand Down Expand Up @@ -2537,4 +2551,6 @@ func init() {
prometheus.MustRegister(generateThrottledRequestsCount)
prometheus.MustRegister(generateRequestsCurrent)
prometheus.MustRegister(generateRequestsLatencies)
prometheus.MustRegister(UserLoginCount)
prometheus.MustRegister(heartbeatsMissedByAuth)
}
19 changes: 17 additions & 2 deletions lib/auth/grpcserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,11 @@ import (
"github.com/gravitational/teleport/lib/services"
"github.com/gravitational/teleport/lib/session"
"github.com/gravitational/teleport/lib/utils"

"github.com/golang/protobuf/ptypes/empty"
"github.com/gravitational/trace"
"github.com/gravitational/trace/trail"

"github.com/golang/protobuf/ptypes/empty"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
Expand Down Expand Up @@ -66,6 +67,19 @@ func (g *GRPCServer) GetServer() (*grpc.Server, error) {
return g.server, nil
}

var (
	// heartbeatConnectionsReceived counts heartbeat connections received by
	// the auth server. It is incremented once per SendKeepAlives stream,
	// after the caller has been authenticated and before the keep-alive
	// receive loop starts, so it counts connections rather than individual
	// keep-alive messages.
	heartbeatConnectionsReceived = prometheus.NewCounter(
		prometheus.CounterOpts{
			Name: teleport.MetricHeartbeatConnectionsReceived,
			Help: "Number of times auth received a heartbeat connection",
		},
	)
)

// init registers the heartbeat connection counter with the default Prometheus
// registerer. MustRegister panics if the registration fails.
func init() {
prometheus.MustRegister(heartbeatConnectionsReceived)
}

// EmitAuditEvent emits audit event
func (g *GRPCServer) EmitAuditEvent(ctx context.Context, req *events.OneOf) (*empty.Empty, error) {
auth, err := g.authenticate(ctx)
Expand All @@ -91,6 +105,7 @@ func (g *GRPCServer) SendKeepAlives(stream proto.AuthService_SendKeepAlivesServe
return trail.ToGRPC(err)
}
g.Debugf("Got heartbeat connection from %v.", auth.User.GetName())
heartbeatConnectionsReceived.Inc()
for {
keepAlive, err := stream.Recv()
if err == io.EOF {
Expand Down
1 change: 1 addition & 0 deletions lib/auth/methods.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ func (s *Server) AuthenticateSSHUser(req AuthenticateSSHRequest) (*SSHLoginRespo
if err != nil {
return nil, trace.Wrap(err)
}
UserLoginCount.Inc()
return &SSHLoginResponse{
Username: req.Username,
Cert: certs.ssh,
Expand Down
2 changes: 1 addition & 1 deletion lib/auth/sessions.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ func (s *Server) CreateAppSession(ctx context.Context, req services.CreateAppSes
return nil, trace.Wrap(err)
}
log.Debugf("Generated application web session for %v with TTL %v.", req.Username, ttl)

UserLoginCount.Inc()
return session, nil
}

Expand Down
17 changes: 17 additions & 0 deletions lib/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (

"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
"go.uber.org/atomic"
)
Expand Down Expand Up @@ -1014,10 +1015,26 @@ func (c *Cache) GetClusterConfig(opts ...services.MarshalOption) (services.Clust
return rg.clusterConfig.GetClusterConfig(services.AddOptions(opts, services.SkipValidation())...)
}

// clusterNameNotFound counts GetClusterName calls in which acquiring the
// cache read guard failed with a not-found error.
var clusterNameNotFound = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricClusterNameNotFound,
	Help: "Number of times a cluster name was not found",
})

// init registers the cluster-name-not-found counter with the default
// Prometheus registerer. MustRegister panics if the registration fails.
func init() {
prometheus.MustRegister(clusterNameNotFound)
}

// GetClusterName gets the name of the cluster from the backend.
func (c *Cache) GetClusterName(opts ...services.MarshalOption) (services.ClusterName, error) {
rg, err := c.read()
if err != nil {
if trace.IsNotFound(err) {
clusterNameNotFound.Inc()
}
return nil, trace.Wrap(err)
}
defer rg.Release()
Expand Down
10 changes: 10 additions & 0 deletions lib/services/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package services
import (
"encoding/json"
"fmt"
"time"

"github.com/gravitational/teleport/api/types"
"github.com/gravitational/teleport/lib/defaults"
Expand Down Expand Up @@ -498,3 +499,12 @@ func MarshalServers(s []Server) ([]byte, error) {

return bytes, nil
}

// NodeHasMissedKeepAlives reports whether the node s appears to have missed
// its keep alives.
//
// The decision is based solely on the server's expiry: it returns true when
// s.Expiry() is earlier than now + ServerAnnounceTTL - 2*ServerKeepAliveTTL,
// and false otherwise.
// NOTE(review): presumably every successful keep alive pushes the expiry
// forward, so an expiry behind this threshold means keep alives were skipped;
// confirm against the heartbeat implementation.
func NodeHasMissedKeepAlives(s Server) bool {
	expiry := s.Expiry()
	return expiry.Before(time.Now().Add(defaults.ServerAnnounceTTL - (defaults.ServerKeepAliveTTL * 2)))
}
24 changes: 24 additions & 0 deletions lib/srv/authhandlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/gravitational/trace"

"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -150,6 +151,27 @@ func (h *AuthHandlers) CheckPortForward(addr string, ctx *ServerContext) error {
return nil
}

// failedLoginCount counts failed login attempts. It is incremented by the
// recordFailedLogin helper inside UserKeyAuth.
var failedLoginCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricFailedLoginAttempts,
	Help: "Number of times there was a failed login",
})

// certificateMismatchCount counts certificate mismatches. It is incremented in
// UserKeyAuth when the certificate checker fails to authenticate the
// presented key.
var certificateMismatchCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricCertificateMistmatch,
	Help: "Number of times there was a certificate mismatch",
})

// init registers the login-failure and certificate-mismatch counters with the
// default Prometheus registerer. MustRegister panics if a registration fails.
func init() {
	prometheus.MustRegister(failedLoginCount, certificateMismatchCount)
}

// UserKeyAuth implements SSH client authentication using public keys and is
// called by the server every time the client connects.
func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*ssh.Permissions, error) {
Expand Down Expand Up @@ -184,6 +206,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s

// only failed attempts are logged right now
recordFailedLogin := func(err error) {
failedLoginCount.Inc()
if err := h.Emitter.EmitAuditEvent(h.Server.Context(), &events.AuthAttempt{
Metadata: events.Metadata{
Type: events.AuthAttemptEvent,
Expand Down Expand Up @@ -222,6 +245,7 @@ func (h *AuthHandlers) UserKeyAuth(conn ssh.ConnMetadata, key ssh.PublicKey) (*s
}
permissions, err := certChecker.Authenticate(conn, key)
if err != nil {
certificateMismatchCount.Inc()
recordFailedLogin(err)
return nil, trace.Wrap(err)
}
Expand Down
15 changes: 15 additions & 0 deletions lib/srv/regular/proxy.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,20 @@ func (t *proxySubsys) proxyToSite(
return nil
}

// failedConnectingToNode counts failed attempts to connect to a node. It is
// incremented in proxyToHost when establishing the connection to the target
// node returns an error.
var failedConnectingToNode = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricFailedConnectToNodeAttempts,
	Help: "Number of times client failed to connect to a node",
})

// init registers the failed-node-connection counter with the default
// Prometheus registerer. MustRegister panics if the registration fails.
func init() {
prometheus.MustRegister(failedConnectingToNode)
}

// proxyToHost establishes a proxy connection from the connected SSH client to the
// requested remote node (t.host:t.port) via the given site
func (t *proxySubsys) proxyToHost(
Expand Down Expand Up @@ -434,6 +448,7 @@ func (t *proxySubsys) proxyToHost(
ConnType: services.NodeTunnel,
})
if err != nil {
failedConnectingToNode.Inc()
return trace.Wrap(err)
}

Expand Down
15 changes: 15 additions & 0 deletions lib/srv/regular/sshserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import (
"github.com/gravitational/trace"

"github.com/jonboulle/clockwork"
"github.com/prometheus/client_golang/prometheus"
"github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -815,6 +816,19 @@ func (s *Server) serveAgent(ctx *srv.ServerContext) error {
return nil
}

// userSessionLimitHitCount counts how often a user exceeded their maximum
// number of concurrent SSH connections. It is incremented in HandleNewConn
// when the returned error mentions teleport.MaxLeases.
var userSessionLimitHitCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricUserMaxConcurrentSessionsHit,
	Help: "Number of times the user exceeded their max concurrent ssh connections",
})

// init registers the concurrent-session-limit counter with the default
// Prometheus registerer. MustRegister panics if the registration fails.
func init() {
prometheus.MustRegister(userSessionLimitHitCount)
}

// HandleRequest processes global out-of-band requests. Global out-of-band
// requests are processed in order (this way the originator knows which
// request we are responding to). If Teleport does not support the request
Expand Down Expand Up @@ -880,6 +894,7 @@ func (s *Server) HandleNewConn(ctx context.Context, ccx *sshutils.ConnectionCont
if err != nil {
if strings.Contains(err.Error(), teleport.MaxLeases) {
// user has exceeded their max concurrent ssh connections.
userSessionLimitHitCount.Inc()
if err := s.EmitAuditEvent(s.ctx, &events.SessionReject{
Metadata: events.Metadata{
Type: events.SessionRejectedEvent,
Expand Down
17 changes: 17 additions & 0 deletions lib/sshutils/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (

"github.com/gravitational/trace"

"github.com/prometheus/client_golang/prometheus"
log "github.com/sirupsen/logrus"
)

Expand Down Expand Up @@ -391,6 +392,19 @@ func (s *Server) trackConnections(delta int32) int32 {
return atomic.AddInt32(&s.conns, delta)
}

// proxyConnectionLimitHitCount counts connections rejected because the
// connection limit was exceeded. It is incremented in HandleConnection when
// the limiter's AcquireConnection fails with a limit-exceeded error.
var proxyConnectionLimitHitCount = prometheus.NewCounter(prometheus.CounterOpts{
	Name: teleport.MetricProxyConnectionLimitHit,
	Help: "Number of times proxy connection limit was exceeded",
})

// init registers the connection-limit counter with the default Prometheus
// registerer. MustRegister panics if the registration fails.
func init() {
prometheus.MustRegister(proxyConnectionLimitHitCount)
}

// HandleConnection is called every time an SSH server accepts a new
// connection from a client.
//
Expand All @@ -407,6 +421,9 @@ func (s *Server) HandleConnection(conn net.Conn) {
log.Errorf(err.Error())
}
if err := s.limiter.AcquireConnection(remoteAddr); err != nil {
if trace.IsLimitExceeded(err) {
proxyConnectionLimitHitCount.Inc()
}
log.Errorf(err.Error())
conn.Close()
return
Expand Down
27 changes: 27 additions & 0 deletions metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,33 @@ const (
// MetricTrustedClusters counts trusted clusters
MetricTrustedClusters = "trusted_clusters"

// MetricClusterNameNotFound counts times a cluster name was not found
MetricClusterNameNotFound = "cluster_name_not_found_total"

// MetricFailedLoginAttempts counts failed login attempts
MetricFailedLoginAttempts = "failed_login_attempts_total"

// MetricFailedConnectToNodeAttempts counts failed attempts to connect to a node
MetricFailedConnectToNodeAttempts = "failed_connect_to_node_attempts_total"

// MetricUserMaxConcurrentSessionsHit counts number of times the user exceeded their max concurrent ssh connections
MetricUserMaxConcurrentSessionsHit = "user_max_concurrent_sessions_hit_total"

// MetricProxyConnectionLimitHit counts the number of times proxy connection limit was exceeded
MetricProxyConnectionLimitHit = "proxy_connection_limit_exceeded_total"

// MetricUserLoginCount counts user logins
MetricUserLoginCount = "user_login_total"

// MetricHeartbeatConnectionsReceived counts heartbeat connections received by auth
MetricHeartbeatConnectionsReceived = "heartbeat_connections_received_total"

// MetricCertificateMistmatch counts login failures due to cert mismatch.
// NOTE(review): the identifier is spelled "Mistmatch" (sic); it is referenced
// under this spelling elsewhere, so a rename must update those uses as well.
MetricCertificateMistmatch = "certificate_mismatch_total"

// MetricHeartbeatsMissed is the name of the metric tracking heartbeats missed
// by the auth server.
// NOTE(review): this name is registered as a gauge, yet it ends in "_total",
// a suffix Prometheus naming conventions use for counters; confirm the
// intended metric type or name.
MetricHeartbeatsMissed = "heartbeats_missed_total"

// TagCluster is a metric tag for a cluster
TagCluster = "cluster"
)
Expand Down

0 comments on commit b138371

Please sign in to comment.