Skip to content

Commit

Permalink
Optimize poller counting (#6651)
Browse files Browse the repository at this point in the history
Instead of iterating through the entire history cache, use the count where the values aren't needed.

Additionally remove the tasklist isolation specific logic from emitMisconfiguredPartitionMetrics. This could add additional CPU usage when enabling tasklist isolation and accurate when tasklist <-> isolation group assignment is implemented.

Generally we need to revisit this metric as the source of truth for partition count is moving from the dynamic config to persistence, but this metric still relies only on dynamic config. If partition autoscaling is enabled then it seems strange to alert in this scenario.
  • Loading branch information
natemort authored Feb 4, 2025
1 parent 9c01fa8 commit 3945418
Show file tree
Hide file tree
Showing 3 changed files with 17 additions and 10 deletions.
5 changes: 5 additions & 0 deletions service/matching/poller/history.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ type (
History interface {
UpdatePollerInfo(id Identity, info Info)
HasPollerAfter(earliestAccessTime time.Time) bool
GetPollerCount() int
GetPollerInfo(earliestAccessTime time.Time) []*types.PollerInfo
GetPollerIsolationGroups(earliestAccessTime time.Time) map[string]int
}
Expand Down Expand Up @@ -106,6 +107,10 @@ func (pollers *history) HasPollerAfter(earliestAccessTime time.Time) bool {
return false
}

func (pollers *history) GetPollerCount() int {
return pollers.historyCache.Size()
}

func (pollers *history) GetPollerInfo(earliestAccessTime time.Time) []*types.PollerInfo {
var result []*types.PollerInfo
// optimistic size get, it can change before Iterator call.
Expand Down
10 changes: 10 additions & 0 deletions service/matching/poller/history_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,16 @@ func TestHistory_HasPollerAfter(t *testing.T) {
})
}

func TestGetPollerCount(t *testing.T) {
mockCtrl := gomock.NewController(t)
mockCache := cache.NewMockCache(mockCtrl)
mockCache.EXPECT().Size().Return(10)
p := &history{
historyCache: mockCache,
}
assert.Equal(t, 10, p.GetPollerCount())
}

func TestGetPollerInfo(t *testing.T) {
t.Run("with_time_filter", func(t *testing.T) {
mockCtrl := gomock.NewController(t)
Expand Down
12 changes: 2 additions & 10 deletions service/matching/tasklist/task_list_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ func NewManager(

tlMgr.pollerHistory = poller.NewPollerHistory(func() {
scope.UpdateGauge(metrics.PollerPerTaskListCounter,
float64(len(tlMgr.pollerHistory.GetPollerInfo(time.Time{}))))
float64(tlMgr.pollerHistory.GetPollerCount()))
}, timeSource)

livenessInterval := taskListConfig.IdleTasklistCheckInterval()
Expand Down Expand Up @@ -945,15 +945,7 @@ func (c *taskListManagerImpl) emitMisconfiguredPartitionMetrics() {
if c.config.NumReadPartitions() != c.config.NumWritePartitions() {
c.scope.UpdateGauge(metrics.TaskListReadWritePartitionMismatchGauge, 1)
}
pollerCount := len(c.pollerHistory.GetPollerInfo(time.Time{}))
if c.enableIsolation { // if isolation enabled, get the minimum poller count among the isolation groups
pollerCountsByIsolationGroup := c.pollerHistory.GetPollerIsolationGroups(time.Time{})
for _, count := range pollerCountsByIsolationGroup {
if count < pollerCount {
pollerCount = count
}
}
}
pollerCount := c.pollerHistory.GetPollerCount()
if pollerCount < c.config.NumReadPartitions() || pollerCount < c.config.NumWritePartitions() {
c.scope.Tagged(metrics.IsolationEnabledTag(c.enableIsolation)).UpdateGauge(metrics.TaskListPollerPartitionMismatchGauge, 1)
}
Expand Down

0 comments on commit 3945418

Please sign in to comment.