Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enforce decision task start to close timeout upper limit #2271

Merged
merged 7 commits into from
Jul 24, 2019
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions common/service/dynamicconfig/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ var keys = map[Key]string{
EnableDomainNotActiveAutoForwarding: "system.enableDomainNotActiveAutoForwarding",
TransactionSizeLimit: "system.transactionSizeLimit",
MinRetentionDays: "system.minRetentionDays",
MaxDecisionStartToCloseSeconds: "system.maxDecisionStartToCloseSeconds",
EnableBatcher: "worker.enableBatcher",

// size limit
Expand Down Expand Up @@ -254,6 +255,8 @@ const (
TransactionSizeLimit
// MinRetentionDays is the minimal allowed retention days for domain
MinRetentionDays
// MaxDecisionStartToCloseSeconds is the maximum allowed decision start to close timeout in seconds
MaxDecisionStartToCloseSeconds

// BlobSizeLimitError is the per event blob size limit
BlobSizeLimitError
Expand Down
26 changes: 24 additions & 2 deletions service/history/historyEngine.go
Original file line number Diff line number Diff line change
Expand Up @@ -370,8 +370,19 @@ func (e *historyEngineImpl) StartWorkflowExecution(
if err != nil {
return nil, err
}

workflowID := request.GetWorkflowId()

maxDecisionStartToCloseTimeoutSeconds := int32(e.config.MaxDecisionStartToCloseSeconds(
domainEntry.GetInfo().Name,
))
if request.GetTaskStartToCloseTimeoutSeconds() > maxDecisionStartToCloseTimeoutSeconds {
e.logger.WithTags(
tag.WorkflowDomainID(domainID),
tag.WorkflowID(workflowID),
).Info("force override decision start to close timeout")
request.TaskStartToCloseTimeoutSeconds = common.Int32Ptr(maxDecisionStartToCloseTimeoutSeconds)
}

// grab the current context as a lock, nothing more
_, currentRelease, err := e.historyCache.getOrCreateCurrentWorkflowExecution(
ctx,
Expand Down Expand Up @@ -1462,8 +1473,19 @@ func (e *historyEngineImpl) SignalWithStartWorkflowExecution(
if err != nil {
return nil, err
}

workflowID := request.GetWorkflowId()

maxDecisionStartToCloseTimeoutSeconds := int32(e.config.MaxDecisionStartToCloseSeconds(
domainEntry.GetInfo().Name,
))
if request.GetTaskStartToCloseTimeoutSeconds() > maxDecisionStartToCloseTimeoutSeconds {
e.logger.WithTags(
tag.WorkflowDomainID(domainID),
tag.WorkflowID(workflowID),
).Info("force override decision start to close timeout")
request.TaskStartToCloseTimeoutSeconds = common.Int32Ptr(maxDecisionStartToCloseTimeoutSeconds)
}

// grab the current context as a lock, nothing more
_, currentRelease, err := e.historyCache.getOrCreateCurrentWorkflowExecution(
ctx,
Expand Down
2 changes: 2 additions & 0 deletions service/history/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type Config struct {
EnableVisibilityToKafka dynamicconfig.BoolPropertyFn
EmitShardDiffLog dynamicconfig.BoolPropertyFn
MaxAutoResetPoints dynamicconfig.IntPropertyFnWithDomainFilter
MaxDecisionStartToCloseSeconds dynamicconfig.IntPropertyFnWithDomainFilter

// HistoryCache settings
// Change of these configs require shard restart
Expand Down Expand Up @@ -174,6 +175,7 @@ func NewConfig(dc *dynamicconfig.Collection, numberOfShards int, enableVisibilit
VisibilityOpenMaxQPS: dc.GetIntPropertyFilteredByDomain(dynamicconfig.HistoryVisibilityOpenMaxQPS, 300),
VisibilityClosedMaxQPS: dc.GetIntPropertyFilteredByDomain(dynamicconfig.HistoryVisibilityClosedMaxQPS, 300),
MaxAutoResetPoints: dc.GetIntPropertyFilteredByDomain(dynamicconfig.HistoryMaxAutoResetPoints, defaultHistoryMaxAutoResetPoints),
MaxDecisionStartToCloseSeconds: dc.GetIntPropertyFilteredByDomain(dynamicconfig.MaxDecisionStartToCloseSeconds, 60),
EnableVisibilityToKafka: dc.GetBoolProperty(dynamicconfig.EnableVisibilityToKafka, enableVisibilityToKafka),
EmitShardDiffLog: dc.GetBoolProperty(dynamicconfig.EmitShardDiffLog, false),
HistoryCacheInitialSize: dc.GetIntProperty(dynamicconfig.HistoryCacheInitialSize, 128),
Expand Down
4 changes: 2 additions & 2 deletions service/history/timerQueueStandbyProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -588,7 +588,7 @@ func (t *timerQueueStandbyProcessorImpl) discardTask(
) bool {

// the current time got from shard is already delayed by t.shard.GetConfig().StandbyClusterDelay()
// so discard will be true if task is delayed by 2*t.shard.GetConfig().StandbyClusterDelay()
// so discard will be true if task is delayed by 4*t.shard.GetConfig().StandbyClusterDelay()
now := t.shard.GetCurrentTime(t.clusterName)
return now.Sub(timerTask.GetVisibilityTimestamp()) > t.shard.GetConfig().StandbyClusterDelay()
return now.Sub(timerTask.GetVisibilityTimestamp()) > 3*t.shard.GetConfig().StandbyClusterDelay()
}
10 changes: 5 additions & 5 deletions service/history/timerQueueStandbyProcessor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessExpiredUserTimer_Pending()
_, err = s.timerQueueStandbyProcessor.process(timerTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
timerTask.DomainID, timerTask.WorkflowID,
timerTask.RunID, nextEventID,
Expand Down Expand Up @@ -438,7 +438,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessActivityTimeout_Pending() {
_, err = s.timerQueueStandbyProcessor.process(timerTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
timerTask.DomainID, timerTask.WorkflowID,
timerTask.RunID, nextEventID,
Expand Down Expand Up @@ -680,7 +680,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessDecisionTimeout_Pending() {
_, err = s.timerQueueStandbyProcessor.process(timerTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
timerTask.DomainID, timerTask.WorkflowID,
timerTask.RunID, nextEventID,
Expand Down Expand Up @@ -809,7 +809,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessWorkflowBackoffTimer_Pendin
_, err = s.timerQueueStandbyProcessor.process(timerTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
timerTask.DomainID, timerTask.WorkflowID,
timerTask.RunID, nextEventID,
Expand Down Expand Up @@ -913,7 +913,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessWorkflowTimeout_Pending() {
_, err = s.timerQueueStandbyProcessor.process(timerTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
timerTask.DomainID, timerTask.WorkflowID,
timerTask.RunID, nextEventID,
Expand Down
4 changes: 2 additions & 2 deletions service/history/transferQueueStandbyProcessor.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,7 +629,7 @@ func (t *transferQueueStandbyProcessorImpl) discardTask(
) bool {

// the current time got from shard is already delayed by t.shard.GetConfig().StandbyClusterDelay()
// so discard will be true if task is delayed by 2*t.shard.GetConfig().StandbyClusterDelay()
// so discard will be true if task is delayed by 4*t.shard.GetConfig().StandbyClusterDelay()
now := t.shard.GetCurrentTime(t.clusterName)
return now.Sub(transferTask.GetVisibilityTimestamp()) > t.shard.GetConfig().StandbyClusterDelay()
return now.Sub(transferTask.GetVisibilityTimestamp()) > 3*t.shard.GetConfig().StandbyClusterDelay()
}
10 changes: 5 additions & 5 deletions service/history/transferQueueStandbyProcessor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ func (s *transferQueueStandbyProcessorSuite) TestProcessActivityTask_Pending_Pus
activityType := "some random activity type"
event, _ = addActivityTaskScheduledEvent(msBuilder, event.GetEventId(), activityID, activityType, taskListName, []byte{}, 1, 1, 1)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
transferTask := &persistence.TransferTaskInfo{
Version: version,
DomainID: s.domainID,
Expand Down Expand Up @@ -440,7 +440,7 @@ func (s *transferQueueStandbyProcessorSuite) TestProcessDecisionTask_Pending_Pus
taskID := int64(59)
di := addDecisionTaskScheduledEvent(msBuilder)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
transferTask := &persistence.TransferTaskInfo{
Version: version,
DomainID: s.domainID,
Expand Down Expand Up @@ -685,7 +685,7 @@ func (s *transferQueueStandbyProcessorSuite) TestProcessCancelExecution_Pending(
_, err = s.transferQueueStandbyProcessor.process(transferTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
transferTask.DomainID, transferTask.WorkflowID,
transferTask.RunID, nextEventID,
Expand Down Expand Up @@ -823,7 +823,7 @@ func (s *transferQueueStandbyProcessorSuite) TestProcessSignalExecution_Pending(
_, err = s.transferQueueStandbyProcessor.process(transferTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
transferTask.DomainID, transferTask.WorkflowID,
transferTask.RunID, nextEventID,
Expand Down Expand Up @@ -961,7 +961,7 @@ func (s *transferQueueStandbyProcessorSuite) TestProcessStartChildExecution_Pend
_, err = s.transferQueueStandbyProcessor.process(transferTask, true)
s.Equal(ErrTaskRetry, err)

s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(3*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockShard.SetCurrentTime(s.clusterName, time.Now().Add(5*s.mockShard.GetConfig().StandbyClusterDelay()))
s.mockHistoryRereplicator.On("SendMultiWorkflowHistory",
transferTask.DomainID, transferTask.WorkflowID,
transferTask.RunID, nextEventID,
Expand Down