Skip to content

Commit

Permalink
Adding a metric for hosts not being found in resolver (#5414)
Browse files Browse the repository at this point in the history
Adds a metric that is emitted when a host cannot be found in the membership resolver. It is intended for debugging and early alerting. It is normal for this metric to fire temporarily during rebalancing or deployments, but if it keeps firing for a long period, that may indicate a problem with service discovery.
  • Loading branch information
davidporter-id-au authored Dec 22, 2023
1 parent bfe23d8 commit a031fe6
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 2 deletions.
1 change: 1 addition & 0 deletions cmd/server/cadence/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ func (s *server) startService() common.Daemon {
params.MembershipResolver, err = membership.NewResolver(
peerProvider,
params.Logger,
params.MetricsClient,
)
if err != nil {
log.Fatalf("error creating membership monitor: %v", err)
Expand Down
10 changes: 8 additions & 2 deletions common/membership/resolver.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/uber/cadence/common"
"github.com/uber/cadence/common/log"
"github.com/uber/cadence/common/log/tag"
"github.com/uber/cadence/common/metrics"
"github.com/uber/cadence/common/service"
)

Expand Down Expand Up @@ -80,7 +81,8 @@ type (

// MultiringResolver uses ring-per-service for membership information
type MultiringResolver struct {
status int32
metrics metrics.Client
status int32

provider PeerProvider
rings map[string]*ring
Expand All @@ -92,20 +94,23 @@ var _ Resolver = (*MultiringResolver)(nil)
func NewResolver(
provider PeerProvider,
logger log.Logger,
metrics metrics.Client,
) (*MultiringResolver, error) {
return NewMultiringResolver(service.List, provider, logger.WithTags(tag.ComponentServiceResolver)), nil
return NewMultiringResolver(service.List, provider, logger.WithTags(tag.ComponentServiceResolver), metrics), nil
}

// NewMultiringResolver creates hashrings for all services
func NewMultiringResolver(
services []string,
provider PeerProvider,
logger log.Logger,
metrics metrics.Client,
) *MultiringResolver {
rpo := &MultiringResolver{
status: common.DaemonStatusInitialized,
provider: provider,
rings: make(map[string]*ring),
metrics: metrics,
}

for _, s := range services {
Expand Down Expand Up @@ -208,6 +213,7 @@ func (rpo *MultiringResolver) LookupByAddress(service, address string) (HostInfo
return m, nil
}
}
rpo.metrics.Scope(metrics.ResolverHostNotFoundScope).IncCounter(1)
return HostInfo{}, errors.New("host not found")
}

Expand Down
2 changes: 2 additions & 0 deletions common/membership/resolver_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (

"github.com/uber/cadence/common"
"github.com/uber/cadence/common/log"
"github.com/uber/cadence/common/metrics"
)

var testServices = []string{"test-worker", "test-services"}
Expand Down Expand Up @@ -136,5 +137,6 @@ func newTestResolver(t *testing.T) (*MultiringResolver, *MockPeerProvider) {
testServices,
pp,
log.NewNoop(),
metrics.NewNoopMetricsClient(),
), pp
}
4 changes: 4 additions & 0 deletions common/metrics/defs.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ const (
PersistenceUpdateDynamicConfigScope
// PersistenceShardRequestCountScope tracks number of persistence calls made to each shard
PersistenceShardRequestCountScope

// ResolverHostNotFoundScope tracks host lookups that failed in the membership resolver (a simple, low-level error signal)
ResolverHostNotFoundScope
// HistoryClientStartWorkflowExecutionScope tracks RPC calls to history service
HistoryClientStartWorkflowExecutionScope
// HistoryClientDescribeHistoryHostScope tracks RPC calls to history service
Expand Down Expand Up @@ -1392,6 +1395,7 @@ var ScopeDefs = map[ServiceIdx]map[int]scopeDefinition{
PersistenceFetchDynamicConfigScope: {operation: "FetchDynamicConfig"},
PersistenceUpdateDynamicConfigScope: {operation: "UpdateDynamicConfig"},
PersistenceShardRequestCountScope: {operation: "ShardIdPersistenceRequest"},
ResolverHostNotFoundScope: {operation: "ResolverHostNotFound"},

ClusterMetadataArchivalConfigScope: {operation: "ArchivalConfig"},

Expand Down

0 comments on commit a031fe6

Please sign in to comment.