From e18bb92aa87cce12eba570d43e6af8755f1fa34d Mon Sep 17 00:00:00 2001 From: wxing1292 Date: Thu, 18 Jul 2019 13:33:06 -0700 Subject: [PATCH] Refactor workflow execution context & mutable state (#2221) * Remove redundant append first event batch APIs * Remove mutableStateSessionUpdates in favor of WorkflowSnapshot & WorkflowMutation * Move force fail decision for failover / buffered events size limit into CloseTransactionAsMutation & CloseTransactionAsSnapshot * Mutable state will now keep track of all transfer / timer tasks by AddTransferTasks & AddTimerTasks * Remove the requirement for transactionID from shard when create / update workflow * Remove special handling of LimitExceededError since buffered events size limit is handled before close transaction --- client/clientBean.go | 2 - common/cache/domainCache.go | 21 +- common/service/config/rpc.go | 2 +- service/history/MockMutableState.go | 116 +- .../history/MockWorkflowExecutionContext.go | 224 +-- service/history/conflictResolver.go | 4 + service/history/decisionHandler.go | 42 +- service/history/historyEngine.go | 35 +- service/history/historyEngine_test.go | 4 +- service/history/historyReplicator.go | 74 +- service/history/historyReplicator_test.go | 182 +- service/history/mutableState.go | 9 +- service/history/mutableStateBuilder.go | 379 +++-- service/history/mutableStateSessionUpdates.go | 50 - service/history/mutableStateUtil.go | 11 + service/history/queueProcessor.go | 6 - service/history/queueProcessor_test.go | 5 - .../history/replicatorQueueProcessor_test.go | 3 + service/history/stateBuilder.go | 7 +- service/history/stateBuilder_test.go | 6 + service/history/timerBuilder.go | 4 +- service/history/timerBuilder_test.go | 10 +- service/history/timerQueueActiveProcessor.go | 51 +- service/history/timerQueueProcessorBase.go | 14 +- .../history/timerQueueProcessorBase_test.go | 5 - service/history/timerQueueProcessor_test.go | 30 +- service/history/timerQueueStandbyProcessor.go | 12 +- .../timerQueueStandbyProcessor_test.go | 12 +- .../history/transferQueueActiveProcessor.go | 13 +- service/history/workflowExecutionContext.go | 1504 +++++++---------- service/history/workflowExecutionUtil.go | 84 + service/history/workflowResetor.go | 16 +- service/history/xdcUtil.go | 8 +- 33 files changed, 1349 insertions(+), 1596 deletions(-) delete mode 100644 service/history/mutableStateSessionUpdates.go create mode 100644 service/history/workflowExecutionUtil.go diff --git a/client/clientBean.go b/client/clientBean.go index 304f2b2f10e..9054a507655 100644 --- a/client/clientBean.go +++ b/client/clientBean.go @@ -268,8 +268,6 @@ func (d *dnsUpdater) Start() { d.logger.Error("Failed to update peerList", tag.Error(err), tag.Address(d.dnsAddress)) } d.currentPeers = res.newPeers - } else { - d.logger.Debug("No change in DNS lookup", tag.Address(d.dnsAddress)) } sleepDu := now.Add(d.interval).Sub(now) time.Sleep(sleepDu) diff --git a/common/cache/domainCache.go b/common/cache/domainCache.go index bb24b532501..ac308229411 100644 --- a/common/cache/domainCache.go +++ b/common/cache/domainCache.go @@ -40,6 +40,18 @@ import ( "github.com/uber/cadence/common/persistence" ) +// ReplicationPolicy is the domain's replication policy, +// derived from domain's replication config +type ReplicationPolicy int + +const ( + // ReplicationPolicyOneCluster indicate that workflows does not need to be replicated + // applicable to local domain & global domain with one cluster + ReplicationPolicyOneCluster ReplicationPolicy = 0 + // 
ReplicationPolicyMultiCluster indicate that workflows need to be replicated + ReplicationPolicyMultiCluster ReplicationPolicy = 1 +) + const ( domainCacheInitialSize = 10 * 1024 domainCacheMaxSize = 64 * 1024 @@ -685,11 +697,14 @@ func (entry *DomainCacheEntry) IsDomainActive() bool { return entry.clusterMetadata.GetCurrentClusterName() == entry.replicationConfig.ActiveClusterName } -// CanReplicateEvent return whether the workflows within this domain should be replicated -func (entry *DomainCacheEntry) CanReplicateEvent() bool { +// GetReplicationPolicy return the derived workflow replication policy +func (entry *DomainCacheEntry) GetReplicationPolicy() ReplicationPolicy { // frontend guarantee that the clusters always contains the active domain, so if the # of clusters is 1 // then we do not need to send out any events for replication - return entry.isGlobalDomain && len(entry.replicationConfig.Clusters) > 1 + if entry.isGlobalDomain && len(entry.replicationConfig.Clusters) > 1 { + return ReplicationPolicyMultiCluster + } + return ReplicationPolicyOneCluster } // GetDomainNotActiveErr return err if domain is not active, nil otherwise diff --git a/common/service/config/rpc.go b/common/service/config/rpc.go index 30dc3d154a9..f4715cb62b9 100644 --- a/common/service/config/rpc.go +++ b/common/service/config/rpc.go @@ -71,7 +71,7 @@ func (d *RPCFactory) CreateDispatcher() *yarpc.Dispatcher { func (d *RPCFactory) CreateDispatcherForOutbound( callerName, serviceName, hostName string) *yarpc.Dispatcher { // Setup dispatcher(outbound) for onebox - d.logger.Info("Created RPC dispatcher outbound for service '%v' for host '%v'", tag.Service(d.serviceName), tag.Address(hostName)) + d.logger.Info("Created RPC dispatcher outbound", tag.Service(d.serviceName), tag.Address(hostName)) dispatcher := yarpc.NewDispatcher(yarpc.Config{ Name: callerName, Outbounds: yarpc.Outbounds{ diff --git a/service/history/MockMutableState.go b/service/history/MockMutableState.go index 8f201600f39..ffce0d7f4f4 100644 --- a/service/history/MockMutableState.go +++ b/service/history/MockMutableState.go @@ -1122,29 +1122,6 @@ func (_m *mockMutableState) ClearStickyness() { _m.Called() } -// CloseUpdateSession provides a mock function with given fields: -func (_m *mockMutableState) CloseUpdateSession() (*mutableStateSessionUpdates, error) { - ret := _m.Called() - - var r0 *mutableStateSessionUpdates - if rf, ok := ret.Get(0).(func() *mutableStateSessionUpdates); ok { - r0 = rf() - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(*mutableStateSessionUpdates) - } - } - - var r1 error - if rf, ok := ret.Get(1).(func() error); ok { - r1 = rf() - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - // CopyToPersistence provides a mock function with given fields: func (_m *mockMutableState) CopyToPersistence() *persistence.WorkflowMutableState { ret := _m.Called() @@ -1467,22 +1444,6 @@ func (_m *mockMutableState) GetCompletionEvent() (*shared.HistoryEvent, bool) { return r0, r1 } -// GetContinueAsNew provides a mock function with given fields: -func (_m *mockMutableState) GetContinueAsNew() *persistence.WorkflowSnapshot { - ret := _m.Called() - - var r0 *persistence.WorkflowSnapshot - if rf, ok := ret.Get(0).(func() *persistence.WorkflowSnapshot); ok { - r0 = rf() - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).(*persistence.WorkflowSnapshot) - } - } - - return r0 -} - // GetCronBackoffDuration provides a mock function with given fields: func (_m *mockMutableState) GetCronBackoffDuration() time.Duration { ret := 
_m.Called() @@ -2730,6 +2691,11 @@ func (_m *mockMutableState) UpdateDecision(_a0 *decisionInfo) { _m.Called(_a0) } +// UpdateReplicationPolicy provides a mock function with given fields: _a0 +func (_m *mockMutableState) UpdateReplicationPolicy(_a0 cache.ReplicationPolicy) { + _m.Called(_a0) +} + // UpdateReplicationStateLastEventID provides a mock function with given fields: _a0, _a1 func (_m *mockMutableState) UpdateReplicationStateLastEventID(_a0 int64, _a1 int64) { _m.Called(_a0, _a1) @@ -2745,23 +2711,55 @@ func (_m *mockMutableState) UpdateUserTimer(_a0 string, _a1 *persistence.TimerIn _m.Called(_a0, _a1) } -// UpdateUserTimer provides a mock function with given fields: _a0 +// AddTransferTasks provides a mock function with given fields: _a0 func (_m *mockMutableState) AddTransferTasks(_a0 ...persistence.Task) { _m.Called(_a0) } -// UpdateUserTimer provides a mock function with given fields: _a0 +// GetTransferTasks provides a mock function with given fields: +func (_m *mockMutableState) GetTransferTasks() []persistence.Task { + ret := _m.Called() + + var r0 []persistence.Task + if rf, ok := ret.Get(0).(func() []persistence.Task); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]persistence.Task) + } + } + + return r0 +} + +// AddTimerTasks provides a mock function with given fields: _a0 func (_m *mockMutableState) AddTimerTasks(_a0 ...persistence.Task) { _m.Called(_a0) } -// UpdateUserTimer provides a mock function with given fields: _a0 -func (_m *mockMutableState) CloseTransactionAsMutation(_a0 time.Time) (*persistence.WorkflowMutation, []*persistence.WorkflowEvents, error) { - ret := _m.Called(_a0) +// GetTimerTasks provides a mock function with given fields: +func (_m *mockMutableState) GetTimerTasks() []persistence.Task { + ret := _m.Called() + + var r0 []persistence.Task + if rf, ok := ret.Get(0).(func() []persistence.Task); ok { + r0 = rf() + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).([]persistence.Task) + } + } + + return r0 +} + +// CloseTransactionAsMutation provides a mock function with given fields: _a0, _a1 +func (_m *mockMutableState) CloseTransactionAsMutation(_a0 time.Time, _a1 transactionPolicy) (*persistence.WorkflowMutation, []*persistence.WorkflowEvents, error) { + ret := _m.Called(_a0, _a1) var r0 *persistence.WorkflowMutation - if rf, ok := ret.Get(0).(func(_a0 time.Time) *persistence.WorkflowMutation); ok { - r0 = rf(_a0) + if rf, ok := ret.Get(0).(func(time.Time, transactionPolicy) *persistence.WorkflowMutation); ok { + r0 = rf(_a0, _a1) } else { if ret.Get(0) != nil { r0 = ret.Get(0).(*persistence.WorkflowMutation) @@ -2769,8 +2767,8 @@ func (_m *mockMutableState) CloseTransactionAsMutation(_a0 time.Time) (*persiste } var r1 []*persistence.WorkflowEvents - if rf, ok := ret.Get(1).(func(_a0 time.Time) []*persistence.WorkflowEvents); ok { - r1 = rf(_a0) + if rf, ok := ret.Get(1).(func(time.Time, transactionPolicy) []*persistence.WorkflowEvents); ok { + r1 = rf(_a0, _a1) } else { if ret.Get(1) != nil { r1 = ret.Get(1).([]*persistence.WorkflowEvents) @@ -2778,8 +2776,8 @@ func (_m *mockMutableState) CloseTransactionAsMutation(_a0 time.Time) (*persiste } var r2 error - if rf, ok := ret.Get(2).(func(_a0 time.Time) error); ok { - r2 = rf(_a0) + if rf, ok := ret.Get(2).(func(time.Time, transactionPolicy) error); ok { + r2 = rf(_a0, _a1) } else { r2 = ret.Error(2) } @@ -2787,13 +2785,13 @@ func (_m *mockMutableState) CloseTransactionAsMutation(_a0 time.Time) (*persiste return r0, r1, r2 } -// UpdateUserTimer provides a mock function 
with given fields: _a0, _a1 -func (_m *mockMutableState) CloseTransactionAsSnapshot(_a0 time.Time) (*persistence.WorkflowSnapshot, []*persistence.WorkflowEvents, error) { - ret := _m.Called(_a0) +// CloseTransactionAsSnapshot provides a mock function with given fields: _a0, _a1 +func (_m *mockMutableState) CloseTransactionAsSnapshot(_a0 time.Time, _a1 transactionPolicy) (*persistence.WorkflowSnapshot, []*persistence.WorkflowEvents, error) { + ret := _m.Called(_a0, _a1) var r0 *persistence.WorkflowSnapshot - if rf, ok := ret.Get(0).(func(_a0 time.Time) *persistence.WorkflowSnapshot); ok { - r0 = rf(_a0) + if rf, ok := ret.Get(0).(func(time.Time, transactionPolicy) *persistence.WorkflowSnapshot); ok { + r0 = rf(_a0, _a1) } else { if ret.Get(0) != nil { r0 = ret.Get(0).(*persistence.WorkflowSnapshot) @@ -2801,8 +2799,8 @@ func (_m *mockMutableState) CloseTransactionAsSnapshot(_a0 time.Time) (*persiste } var r1 []*persistence.WorkflowEvents - if rf, ok := ret.Get(1).(func(_a0 time.Time) []*persistence.WorkflowEvents); ok { - r1 = rf(_a0) + if rf, ok := ret.Get(1).(func(time.Time, transactionPolicy) []*persistence.WorkflowEvents); ok { + r1 = rf(_a0, _a1) } else { if ret.Get(1) != nil { r1 = ret.Get(1).([]*persistence.WorkflowEvents) @@ -2810,8 +2808,8 @@ func (_m *mockMutableState) CloseTransactionAsSnapshot(_a0 time.Time) (*persiste } var r2 error - if rf, ok := ret.Get(2).(func(_a0 time.Time) error); ok { - r2 = rf(_a0) + if rf, ok := ret.Get(2).(func(time.Time, transactionPolicy) error); ok { + r2 = rf(_a0, _a1) } else { r2 = ret.Error(2) } diff --git a/service/history/MockWorkflowExecutionContext.go b/service/history/MockWorkflowExecutionContext.go index fab7ae20cc3..b60e026f35f 100644 --- a/service/history/MockWorkflowExecutionContext.go +++ b/service/history/MockWorkflowExecutionContext.go @@ -27,7 +27,6 @@ import ( "github.com/uber/cadence/common/log" "github.com/stretchr/testify/mock" - h "github.com/uber/cadence/.gen/go/history" workflow "github.com/uber/cadence/.gen/go/shared" "github.com/uber/cadence/common/persistence" ) @@ -39,82 +38,10 @@ type mockWorkflowExecutionContext struct { var _ workflowExecutionContext = (*mockWorkflowExecutionContext)(nil) -func (_m *mockWorkflowExecutionContext) appendFirstBatchEventsForActive(_a0 mutableState, _a1 bool) (int64, persistence.Task, error) { - ret := _m.Called(_a0, _a1) - - var r0 int64 - if rf, ok := ret.Get(0).(func(mutableState, bool) int64); ok { - r0 = rf(_a0, _a1) - } else { - r0 = ret.Get(0).(int64) - } - - var r1 persistence.Task - if rf, ok := ret.Get(1).(func(mutableState, bool) persistence.Task); ok { - r1 = rf(_a0, _a1) - } else { - if ret.Get(1) != nil { - r1 = ret.Get(1).(persistence.Task) - } - } - - var r2 error - if rf, ok := ret.Get(2).(func(mutableState, bool) error); ok { - r2 = rf(_a0, _a1) - } else { - r2 = ret.Error(2) - } - - return r0, r1, r2 -} - -func (_m *mockWorkflowExecutionContext) appendFirstBatchEventsForStandby(_a0 mutableState, _a1 []*workflow.HistoryEvent) (int64, persistence.Task, error) { - ret := _m.Called(_a0, _a1) - - var r0 int64 - if rf, ok := ret.Get(0).(func(mutableState, []*workflow.HistoryEvent) int64); ok { - r0 = rf(_a0, _a1) - } else { - r0 = ret.Get(0).(int64) - } - - var r1 persistence.Task - if rf, ok := ret.Get(1).(func(mutableState, []*workflow.HistoryEvent) persistence.Task); ok { - r1 = rf(_a0, _a1) - } else { - if ret.Get(1) != nil { - r1 = ret.Get(1).(persistence.Task) - } - } - - var r2 error - if rf, ok := ret.Get(2).(func(mutableState, []*workflow.HistoryEvent) error); ok { - 
r2 = rf(_a0, _a1) - } else { - r2 = ret.Error(2) - } - - return r0, r1, r2 -} - func (_m *mockWorkflowExecutionContext) clear() { _m.Called() } -func (_m *mockWorkflowExecutionContext) createWorkflowExecution(_a0 *persistence.WorkflowSnapshot, _a1 int64, _a2 time.Time, _a3 int, _a4 string, _a5 int64) error { - - ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5) - - var r0 error - if rf, ok := ret.Get(0).(func(*persistence.WorkflowSnapshot, int64, time.Time, int, string, int64) error); ok { - r0 = rf(_a0, _a1, _a2, _a3, _a4, _a5) - } else { - r0 = ret.Error(0) - } - - return r0 -} - func (_m *mockWorkflowExecutionContext) getDomainID() string { ret := _m.Called() @@ -160,6 +87,40 @@ func (_m *mockWorkflowExecutionContext) getLogger() log.Logger { return r0 } +func (_m *mockWorkflowExecutionContext) lock(_a0 context.Context) error { + ret := _m.Called(_a0) + + var r0 error + if rf, ok := ret.Get(0).(func(context.Context) error); ok { + r0 = rf(_a0) + } else { + r0 = ret.Error(0) + } + + return r0 +} + +func (_m *mockWorkflowExecutionContext) unlock() { + _m.Called() +} + +func (_m *mockWorkflowExecutionContext) getHistorySize() int64 { + ret := _m.Called() + + var r0 int64 + if rf, ok := ret.Get(0).(func() int64); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(int64) + } + + return r0 +} + +func (_m *mockWorkflowExecutionContext) setHistorySize(_a0 int64) { + _m.Called(_a0) +} + func (_m *mockWorkflowExecutionContext) loadWorkflowExecution() (mutableState, error) { ret := _m.Called() @@ -204,52 +165,6 @@ func (_m *mockWorkflowExecutionContext) loadExecutionStats() (*persistence.Execu return r0, r1 } -func (_m *mockWorkflowExecutionContext) lock(_a0 context.Context) error { - ret := _m.Called(_a0) - - var r0 error - if rf, ok := ret.Get(0).(func(context.Context) error); ok { - r0 = rf(_a0) - } else { - r0 = ret.Error(0) - } - - return r0 -} - -func (_m *mockWorkflowExecutionContext) appendFirstBatchHistoryForContinueAsNew(_a0 mutableState, _a1 int64) (int64, error) { - ret := _m.Called(_a0, _a1) - - var r0 int64 - if rf, ok := ret.Get(0).(func(mutableState, int64) int64); ok { - r0 = rf(_a0, _a1) - } else { - r0 = ret.Get(0).(int64) - } - - var r1 error - if rf, ok := ret.Get(1).(func(mutableState, int64) error); ok { - r1 = rf(_a0, _a1) - } else { - r1 = ret.Error(1) - } - - return r0, r1 -} - -func (_m *mockWorkflowExecutionContext) replicateWorkflowExecution(_a0 *h.ReplicateEventsRequest, _a1 []persistence.Task, _a2 []persistence.Task, _a3 int64, _a4 time.Time) error { - ret := _m.Called(_a0, _a1, _a2, _a3, _a4) - - var r0 error - if rf, ok := ret.Get(0).(func(*h.ReplicateEventsRequest, []persistence.Task, []persistence.Task, int64, time.Time) error); ok { - r0 = rf(_a0, _a1, _a2, _a3, _a4) - } else { - r0 = ret.Error(0) - } - - return r0 -} - func (_m *mockWorkflowExecutionContext) resetMutableState(_a0 string, _a1 int64, _a2 int, _a3 []persistence.Task, _a4 []persistence.Task, _a5 []persistence.Task, _a6 mutableState, _a7 int64) (mutableState, error) { ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5, _a6, _a7) @@ -284,47 +199,52 @@ func (_m *mockWorkflowExecutionContext) resetWorkflowExecution(_a0 mutableState, return r0 } -func (_m *mockWorkflowExecutionContext) scheduleNewDecision(_a0 []persistence.Task, _a1 []persistence.Task) ([]persistence.Task, []persistence.Task, error) { - ret := _m.Called(_a0, _a1) +func (_m *mockWorkflowExecutionContext) createWorkflowExecution(_a0 *persistence.WorkflowSnapshot, _a1 int64, _a2 time.Time, _a3 int, _a4 string, _a5 int64) error { - var r0 
[]persistence.Task - if rf, ok := ret.Get(0).(func([]persistence.Task, []persistence.Task) []persistence.Task); ok { - r0 = rf(_a0, _a1) - } else { - if ret.Get(0) != nil { - r0 = ret.Get(0).([]persistence.Task) - } - } + ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5) - var r1 []persistence.Task - if rf, ok := ret.Get(1).(func([]persistence.Task, []persistence.Task) []persistence.Task); ok { - r1 = rf(_a0, _a1) + var r0 error + if rf, ok := ret.Get(0).(func(*persistence.WorkflowSnapshot, int64, time.Time, int, string, int64) error); ok { + r0 = rf(_a0, _a1, _a2, _a3, _a4, _a5) } else { - if ret.Get(1) != nil { - r1 = ret.Get(1).([]persistence.Task) - } + r0 = ret.Error(0) } - var r2 error - if rf, ok := ret.Get(2).(func([]persistence.Task, []persistence.Task) error); ok { - r2 = rf(_a0, _a1) + return r0 +} + +func (_m *mockWorkflowExecutionContext) updateWorkflowExecutionAsActive(_a0 time.Time) error { + ret := _m.Called(_a0) + + var r0 error + if rf, ok := ret.Get(0).(func(time.Time) error); ok { + r0 = rf(_a0) } else { - r2 = ret.Error(2) + r0 = ret.Error(0) } - return r0, r1, r2 + return r0 } -func (_m *mockWorkflowExecutionContext) unlock() { - _m.Called() +func (_m *mockWorkflowExecutionContext) updateWorkflowExecutionWithNewAsActive(_a0 time.Time, _a1 workflowExecutionContext, _a2 mutableState) error { + ret := _m.Called(_a0, _a1, _a2) + + var r0 error + if rf, ok := ret.Get(0).(func(time.Time, workflowExecutionContext, mutableState) error); ok { + r0 = rf(_a0, _a1, _a2) + } else { + r0 = ret.Error(0) + } + + return r0 } -func (_m *mockWorkflowExecutionContext) updateAsPassive(_a0 []persistence.Task, _a1 []persistence.Task, _a2 int64, _a3 time.Time, _a4 bool, _a5 *historyBuilder, _a6 string) error { - ret := _m.Called(_a0, _a1, _a2, _a3, _a4, _a5, _a6) +func (_m *mockWorkflowExecutionContext) updateWorkflowExecutionAsPassive(_a0 time.Time) error { + ret := _m.Called(_a0) var r0 error - if rf, ok := ret.Get(0).(func([]persistence.Task, []persistence.Task, int64, time.Time, bool, *historyBuilder, string) error); ok { - r0 = rf(_a0, _a1, _a2, _a3, _a4, _a5, _a6) + if rf, ok := ret.Get(0).(func(time.Time) error); ok { + r0 = rf(_a0) } else { r0 = ret.Error(0) } @@ -332,11 +252,11 @@ func (_m *mockWorkflowExecutionContext) updateAsPassive(_a0 []persistence.Task, return r0 } -func (_m *mockWorkflowExecutionContext) updateAsActive(_a0 []persistence.Task, _a1 []persistence.Task, _a2 int64) error { +func (_m *mockWorkflowExecutionContext) updateWorkflowExecutionWithNewAsPassive(_a0 time.Time, _a1 workflowExecutionContext, _a2 mutableState) error { ret := _m.Called(_a0, _a1, _a2) var r0 error - if rf, ok := ret.Get(0).(func([]persistence.Task, []persistence.Task, int64) error); ok { + if rf, ok := ret.Get(0).(func(time.Time, workflowExecutionContext, mutableState) error); ok { r0 = rf(_a0, _a1, _a2) } else { r0 = ret.Error(0) @@ -345,12 +265,12 @@ func (_m *mockWorkflowExecutionContext) updateAsActive(_a0 []persistence.Task, _ return r0 } -func (_m *mockWorkflowExecutionContext) updateAsActiveWithNew(_a0 []persistence.Task, _a1 []persistence.Task, _a2 int64, _a3 mutableState) error { - ret := _m.Called(_a0, _a1, _a2, _a3) +func (_m *mockWorkflowExecutionContext) updateWorkflowExecutionWithNew(_a0 time.Time, _a1 workflowExecutionContext, _a2 mutableState, _a3 transactionPolicy, _a4 *transactionPolicy) error { + ret := _m.Called(_a0, _a1, _a2, _a3, _a4) var r0 error - if rf, ok := ret.Get(0).(func([]persistence.Task, []persistence.Task, int64, mutableState) error); ok { - r0 = rf(_a0, _a1, _a2, 
_a3) + if rf, ok := ret.Get(0).(func(time.Time, workflowExecutionContext, mutableState, transactionPolicy, *transactionPolicy) error); ok { + r0 = rf(_a0, _a1, _a2, _a3, _a4) } else { r0 = ret.Error(0) } diff --git a/service/history/conflictResolver.go b/service/history/conflictResolver.go index 4a3ab4a2904..4ba9746dc04 100644 --- a/service/history/conflictResolver.go +++ b/service/history/conflictResolver.go @@ -23,6 +23,7 @@ package history import ( "github.com/uber/cadence/.gen/go/shared" "github.com/uber/cadence/common" + "github.com/uber/cadence/common/cache" "github.com/uber/cadence/common/cluster" "github.com/uber/cadence/common/log" "github.com/uber/cadence/common/log/tag" @@ -119,6 +120,9 @@ func (r *conflictResolverImpl) reset( r.shard.GetEventsCache(), r.logger, firstEvent.GetVersion(), + // if can see replication task, meaning that domain is + // global domain with > 1 target clusters + cache.ReplicationPolicyMultiCluster, ) resetMutableStateBuilder.executionInfo.EventStoreVersion = eventStoreVersion diff --git a/service/history/decisionHandler.go b/service/history/decisionHandler.go index 1810cee27fb..2def6847ff9 100644 --- a/service/history/decisionHandler.go +++ b/service/history/decisionHandler.go @@ -512,22 +512,32 @@ Update_History_Loop: timerTasks = append(timerTasks, timerT) } - // Generate a transaction ID for appending events to history - transactionID, err := handler.shard.GetNextTransferTaskID() - if err != nil { - return nil, err - } + msBuilder.AddTransferTasks(transferTasks...) + msBuilder.AddTimerTasks(timerTasks...) // We apply the update to execution using optimistic concurrency. If it fails due to a conflict then reload // the history and try the operation again. var updateErr error if continueAsNewBuilder != nil { - continueAsNewTimerTasks = msBuilder.GetContinueAsNew().TimerTasks - - updateErr = context.updateAsActiveWithNew(transferTasks, timerTasks, transactionID, continueAsNewBuilder) + continueAsNewTimerTasks = continueAsNewBuilder.GetTimerTasks() + + continueAsNewExecutionInfo := continueAsNewBuilder.GetExecutionInfo() + updateErr = context.updateWorkflowExecutionWithNewAsActive( + handler.shard.GetTimeSource().Now(), + newWorkflowExecutionContext( + continueAsNewExecutionInfo.DomainID, + workflow.WorkflowExecution{ + WorkflowId: common.StringPtr(continueAsNewExecutionInfo.WorkflowID), + RunId: common.StringPtr(continueAsNewExecutionInfo.RunID), + }, + handler.shard, + handler.shard.GetExecutionManager(), + handler.logger, + ), + continueAsNewBuilder, + ) } else { - updateErr = context.updateAsActive(transferTasks, timerTasks, - transactionID) + updateErr = context.updateWorkflowExecutionAsActive(handler.shard.GetTimeSource().Now()) } if updateErr != nil { @@ -554,17 +564,13 @@ Update_History_Loop: if err != nil { return nil, err } - tranT, timerT, err := handler.historyEngine.getWorkflowHistoryCleanupTasks(domainID, workflowExecution.GetWorkflowId(), tBuilder) - if err != nil { - return nil, err - } - transferTasks = []persistence.Task{tranT} - timerTasks = []persistence.Task{timerT} - transactionID, err = handler.shard.GetNextTransferTaskID() + transferTask, timerTask, err := handler.historyEngine.getWorkflowHistoryCleanupTasks(domainID, workflowExecution.GetWorkflowId(), tBuilder) if err != nil { return nil, err } - if err := context.updateAsActive(transferTasks, timerTasks, transactionID); err != nil { + msBuilder.AddTransferTasks(transferTask) + msBuilder.AddTimerTasks(timerTask) + if err := 
context.updateWorkflowExecutionAsActive(handler.shard.GetTimeSource().Now()); err != nil { return nil, err } handler.timerProcessor.NotifyNewTimers(handler.currentClusterName, handler.shard.GetCurrentTime(handler.currentClusterName), timerTasks) diff --git a/service/history/historyEngine.go b/service/history/historyEngine.go index 09541b106a0..c60b08f6fdd 100644 --- a/service/history/historyEngine.go +++ b/service/history/historyEngine.go @@ -122,8 +122,6 @@ var ( ErrSignalOverSize = &workflow.BadRequestError{Message: "Signal input size is over 256K."} // ErrCancellationAlreadyRequested is the error indicating cancellation for target workflow is already requested ErrCancellationAlreadyRequested = &workflow.CancellationAlreadyRequestedError{Message: "Cancellation already requested for this workflow execution."} - // ErrBufferedEventsLimitExceeded is the error indicating limit reached for maximum number of buffered events - ErrBufferedEventsLimitExceeded = &workflow.LimitExceededError{Message: "Exceeded workflow execution limit for buffered events"} // ErrSignalsLimitExceeded is the error indicating limit reached for maximum number of signal events ErrSignalsLimitExceeded = &workflow.LimitExceededError{Message: "Exceeded workflow execution limit for signal events"} // ErrEventsAterWorkflowFinish is the error indicating server error trying to write events after workflow finish event @@ -325,6 +323,7 @@ func (e *historyEngineImpl) createMutableState( e.shard.GetEventsCache(), e.logger, domainEntry.GetFailoverVersion(), + domainEntry.GetReplicationPolicy(), ) } else { msBuilder = newMutableStateBuilder( @@ -452,7 +451,10 @@ func (e *historyEngineImpl) StartWorkflowExecution( msBuilder.AddTimerTasks(timerTasks...) now := e.timeSource.Now() - newWorkflow, newWorkflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot(now) + newWorkflow, newWorkflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot( + now, + transactionPolicyActive, + ) if err != nil { return nil, err } @@ -1450,16 +1452,12 @@ func (e *historyEngineImpl) SignalWithStartWorkflowExecution( timerTasks = append(timerTasks, stickyTaskTimeoutTimer) } } - // Generate a transaction ID for appending events to history - var transactionID int64 - transactionID, err = e.shard.GetNextTransferTaskID() - if err != nil { - return nil, err - } // We apply the update to execution using optimistic concurrency. If it fails due to a conflict then reload // the history and try the operation again. - if err := context.updateAsActive(transferTasks, timerTasks, transactionID); err != nil { + msBuilder.AddTransferTasks(transferTasks...) + msBuilder.AddTimerTasks(timerTasks...) + if err := context.updateWorkflowExecutionAsActive(e.shard.GetTimeSource().Now()); err != nil { if err == ErrConflict { continue Just_Signal_Loop } @@ -1566,7 +1564,10 @@ func (e *historyEngineImpl) SignalWithStartWorkflowExecution( msBuilder.AddTimerTasks(timerTasks...) now := e.timeSource.Now() - newWorkflow, newWorkflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot(now) + newWorkflow, newWorkflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot( + now, + transactionPolicyActive, + ) if err != nil { return nil, err } @@ -1951,15 +1952,11 @@ Update_History_Loop: } } - // Generate a transaction ID for appending events to history - transactionID, err2 := e.shard.GetNextTransferTaskID() - if err2 != nil { - return err2 - } - // We apply the update to execution using optimistic concurrency. 
If it fails due to a conflict then reload // the history and try the operation again. - if err := context.updateAsActive(transferTasks, timerTasks, transactionID); err != nil { + msBuilder.AddTransferTasks(transferTasks...) + msBuilder.AddTimerTasks(timerTasks...) + if err := context.updateWorkflowExecutionAsActive(e.shard.GetTimeSource().Now()); err != nil { if err == ErrConflict { continue Update_History_Loop } @@ -2093,7 +2090,7 @@ func (e *historyEngineImpl) getTimerBuilder( ) *timerBuilder { log := e.logger.WithTags(tag.WorkflowID(we.GetWorkflowId()), tag.WorkflowRunID(we.GetRunId())) - return newTimerBuilder(e.shard.GetConfig(), log, clock.NewRealTimeSource()) + return newTimerBuilder(log, clock.NewRealTimeSource()) } func (s *shardContextWrapper) UpdateWorkflowExecution( diff --git a/service/history/historyEngine_test.go b/service/history/historyEngine_test.go index 7b25db255c1..3732421ab78 100644 --- a/service/history/historyEngine_test.go +++ b/service/history/historyEngine_test.go @@ -4602,7 +4602,7 @@ func (s *engineSuite) TestCancelTimer_RespondDecisionTaskCompleted_TimerFired() di2 := addDecisionTaskScheduledEvent(msBuilder) addDecisionTaskStartedEvent(msBuilder, di2.ScheduleID, tl, identity) addTimerFiredEvent(msBuilder, di2.ScheduleID, timerID) - _, err := msBuilder.CloseUpdateSession() + _, _, err := msBuilder.CloseTransactionAsMutation(time.Now(), transactionPolicyActive) s.Nil(err) ms := createMutableState(msBuilder) @@ -5120,7 +5120,7 @@ func newMutableStateBuilderWithEventV2(shard ShardContext, eventsCache eventsCac func newMutableStateBuilderWithReplicationStateWithEventV2(shard ShardContext, eventsCache eventsCache, logger log.Logger, version int64, runID string) *mutableStateBuilder { - msBuilder := newMutableStateBuilderWithReplicationState(shard, eventsCache, logger, version) + msBuilder := newMutableStateBuilderWithReplicationState(shard, eventsCache, logger, version, cache.ReplicationPolicyOneCluster) _ = msBuilder.SetHistoryTree(runID) return msBuilder diff --git a/service/history/historyReplicator.go b/service/history/historyReplicator.go index cd141462c90..b4548144150 100644 --- a/service/history/historyReplicator.go +++ b/service/history/historyReplicator.go @@ -152,6 +152,9 @@ func newHistoryReplicator( shard.GetEventsCache(), logger, version, + // if can see replication task, meaning that domain is + // global domain with > 1 target clusters + cache.ReplicationPolicyMultiCluster, ) }, } @@ -274,12 +277,13 @@ func (r *historyReplicator) SyncActivity( timerTasks := []persistence.Task{} timeSource := clock.NewEventTimeSource() timeSource.Update(now) - timerBuilder := newTimerBuilder(r.shard.GetConfig(), r.logger, timeSource) + timerBuilder := newTimerBuilder(r.logger, timeSource) if tt := timerBuilder.GetActivityTimerTaskIfNeeded(msBuilder); tt != nil { timerTasks = append(timerTasks, tt) } - return r.updateMutableStateWithTimer(context, msBuilder, now, timerTasks) + msBuilder.AddTimerTasks(timerTasks...) 
+ return context.updateWorkflowExecutionAsPassive(now) } func (r *historyReplicator) ApplyRawEvents( @@ -709,7 +713,7 @@ func (r *historyReplicator) ApplyReplicationTask( } // directly use stateBuilder to apply events for other events(including continueAsNew) - lastEvent, _, _, err := sBuilder.applyEvents(domainID, requestID, execution, request.History.Events, newRunHistory, request.GetEventStoreVersion(), request.GetNewRunEventStoreVersion()) + lastEvent, _, newMutableState, err := sBuilder.applyEvents(domainID, requestID, execution, request.History.Events, newRunHistory, request.GetEventStoreVersion(), request.GetNewRunEventStoreVersion()) if err != nil { return err } @@ -720,9 +724,21 @@ func (r *historyReplicator) ApplyReplicationTask( err = r.replicateWorkflowStarted(ctx, context, msBuilder, request.History, sBuilder, logger) default: now := time.Unix(0, lastEvent.GetTimestamp()) - // TODO remove this set history builder once the update path is re-written - msBuilder.SetHistoryBuilder(newHistoryBuilder(msBuilder, r.logger)) - err = context.replicateWorkflowExecution(request, sBuilder.getTransferTasks(), sBuilder.getTimerTasks(), lastEvent.GetEventId(), now) + var newContext workflowExecutionContext + if newMutableState != nil { + newExecutionInfo := newMutableState.GetExecutionInfo() + newContext = newWorkflowExecutionContext( + newExecutionInfo.DomainID, + workflow.WorkflowExecution{ + WorkflowId: common.StringPtr(newExecutionInfo.WorkflowID), + RunId: common.StringPtr(newExecutionInfo.RunID), + }, + r.shard, + r.shard.GetExecutionManager(), + r.logger, + ) + } + err = context.updateWorkflowExecutionWithNewAsPassive(now, newContext, newMutableState) } if err == nil { @@ -753,7 +769,10 @@ func (r *historyReplicator) replicateWorkflowStarted( lastEvent := history.Events[len(history.Events)-1] now := time.Unix(0, lastEvent.GetTimestamp()) - newWorkflow, workflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot(now) + newWorkflow, workflowEventsSeq, err := msBuilder.CloseTransactionAsSnapshot( + now, + transactionPolicyPassive, + ) if err != nil { return err } @@ -1152,33 +1171,6 @@ func (r *historyReplicator) resetMutableState( return msBuilder, nil } -func (r *historyReplicator) updateMutableStateOnly( - context workflowExecutionContext, - msBuilder mutableState, -) error { - return r.updateMutableStateWithTimer(context, msBuilder, time.Time{}, nil) -} - -func (r *historyReplicator) updateMutableStateWithTimer( - context workflowExecutionContext, - msBuilder mutableState, - now time.Time, - timerTasks []persistence.Task, -) error { - - // Generate a transaction ID for appending events to history - transactionID, err := r.shard.GetNextTransferTaskID() - if err != nil { - return err - } - // we need to handcraft some of the variables - // since this is a persisting the buffer replication task, - // so nothing on the replication state should be changed - lastWriteVersion := msBuilder.GetLastWriteVersion() - sourceCluster := r.clusterMetadata.ClusterNameForFailoverVersion(lastWriteVersion) - return context.updateAsPassive(nil, timerTasks, transactionID, now, false, nil, sourceCluster) -} - func (r *historyReplicator) deserializeBlob( blob *workflow.DataBlob, ) ([]*workflow.HistoryEvent, error) { @@ -1408,27 +1400,25 @@ func (r *historyReplicator) persistWorkflowMutation( if err != nil { return ErrWorkflowMutationDecision } - transferTasks = append(transferTasks, &persistence.DecisionTask{ + msBuilder.AddTransferTasks(&persistence.DecisionTask{ DomainID: executionInfo.DomainID, 
TaskList: di.TaskList, ScheduleID: di.ScheduleID, }) + if msBuilder.IsStickyTaskListEnabled() { - tBuilder := newTimerBuilder(r.shard.GetConfig(), r.logger, r.timeSource) + tBuilder := newTimerBuilder(r.logger, r.timeSource) stickyTaskTimeoutTimer := tBuilder.AddScheduleToStartDecisionTimoutTask( di.ScheduleID, di.Attempt, executionInfo.StickyScheduleToStartTimeout, ) - timerTasks = append(timerTasks, stickyTaskTimeoutTimer) + msBuilder.AddTimerTasks(stickyTaskTimeoutTimer) } } - transactionID, err := r.shard.GetNextTransferTaskID() - if err != nil { - return err - } - return context.updateAsActive(transferTasks, timerTasks, transactionID) + now := clock.NewRealTimeSource().Now() // this is on behalf of active logic + return context.updateWorkflowExecutionAsActive(now) } func logError( diff --git a/service/history/historyReplicator_test.go b/service/history/historyReplicator_test.go index df27832ae55..72e9d54a161 100644 --- a/service/history/historyReplicator_test.go +++ b/service/history/historyReplicator_test.go @@ -251,6 +251,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_IncomingScheduleIDLarger_Incom LastWriteEventID: nextEventID - 1, }) msBuilder.On("GetLastWriteVersion").Return(lastWriteVersion) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", lastWriteVersion, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -311,6 +312,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_IncomingScheduleIDLarger_Incom LastWriteEventID: nextEventID - 1, }) msBuilder.On("GetLastWriteVersion").Return(lastWriteVersion) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", lastWriteVersion, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -370,6 +372,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityCompleted() { LastWriteVersion: lastWriteVersion, LastWriteEventID: nextEventID - 1, }) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", lastWriteVersion, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -430,6 +433,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_LocalActivityV LastWriteVersion: lastWriteVersion, LastWriteEventID: nextEventID - 1, }) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", lastWriteVersion, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -466,7 +470,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_SameVer startedTime := scheduledTime.Add(time.Minute) heartBeatUpdatedTime := startedTime.Add(time.Minute) attempt := int32(0) - details := []byte("some random actitity heartbeat progress") + details := []byte("some random activity heartbeat progress") nextEventID := scheduleID + 10 @@ -503,6 +507,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_SameVer LastWriteVersion: version, LastWriteEventID: nextEventID - 1, }) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() 
msBuilder.On("UpdateReplicationStateVersion", version, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -546,7 +551,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_SameVer startedTime := scheduledTime.Add(time.Minute) heartBeatUpdatedTime := startedTime.Add(time.Minute) attempt := int32(100) - details := []byte("some random actitity heartbeat progress") + details := []byte("some random activity heartbeat progress") nextEventID := scheduleID + 10 @@ -583,6 +588,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_SameVer LastWriteVersion: version, LastWriteEventID: nextEventID - 1, }) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", version, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -626,7 +632,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_LargerV startedTime := scheduledTime.Add(time.Minute) heartBeatUpdatedTime := startedTime.Add(time.Minute) attempt := int32(100) - details := []byte("some random actitity heartbeat progress") + details := []byte("some random activity heartbeat progress") nextEventID := scheduleID + 10 @@ -663,6 +669,7 @@ func (s *historyReplicatorSuite) TestSyncActivity_ActivityRunning_Update_LargerV LastWriteVersion: version, LastWriteEventID: nextEventID - 1, }) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", version, false).Once() s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( &persistence.GetDomainResponse{ @@ -857,8 +864,7 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsMissingMutableState_Incomin s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - - contextCurrent.On("updateAsActive", []persistence.Task{}, []persistence.Task{}, mock.Anything).Return(nil).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() err := s.historyReplicator.ApplyOtherEventsMissingMutableState(ctx.Background(), domainID, workflowID, runID, req, s.logger) s.Nil(err) @@ -960,19 +966,17 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsMissingMutableState_Incomin s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - contextCurrent.On("updateAsActive", - []persistence.Task{&persistence.DecisionTask{ - DomainID: domainID, - TaskList: currentDecisionStickyTasklist, - ScheduleID: newDecision.ScheduleID, - }}, - []persistence.Task{newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( - newDecision.ScheduleID, - newDecision.Attempt, - currentDecisionStickyTimeout, - )}, - mock.Anything, - ).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", []persistence.Task{&persistence.DecisionTask{ + DomainID: domainID, + TaskList: currentDecisionStickyTasklist, + ScheduleID: newDecision.ScheduleID, + }}).Once() + msBuilderCurrent.On("AddTimerTasks", []persistence.Task{newTimerBuilder(s.logger, 
clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( + newDecision.ScheduleID, + newDecision.Attempt, + currentDecisionStickyTimeout, + )}).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() err := s.historyReplicator.ApplyOtherEventsMissingMutableState(ctx.Background(), domainID, workflowID, runID, req, s.logger) s.Nil(err) @@ -1193,7 +1197,9 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsMissingMutableState_Incomin msBuilderCurrent.On("AddWorkflowExecutionTerminatedEvent", workflowTerminationReason, mock.Anything, workflowTerminationIdentity).Return(&workflow.HistoryEvent{}, nil) - contextCurrent.On("updateAsActive", mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", mock.Anything).Once() + msBuilderCurrent.On("AddTimerTasks", mock.Anything).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() s.mockTimerProcessor.On("NotifyNewTimers", currentClusterName, mock.Anything, mock.Anything) err := s.historyReplicator.ApplyOtherEventsMissingMutableState(ctx.Background(), domainID, workflowID, runID, req, s.logger) @@ -1574,7 +1580,7 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsVersionChecking_IncomingLes s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentLastWriteVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - contextCurrent.On("updateAsActive", []persistence.Task{}, []persistence.Task{}, mock.Anything).Return(nil).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() msBuilderOut, err := s.historyReplicator.ApplyOtherEventsVersionChecking(ctx.Background(), context, msBuilderIn, request, s.logger) @@ -1684,19 +1690,17 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsVersionChecking_IncomingLes s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentLastWriteVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - contextCurrent.On("updateAsActive", - []persistence.Task{&persistence.DecisionTask{ - DomainID: domainID, - TaskList: decisionStickyTasklist, - ScheduleID: newDecision.ScheduleID, - }}, - []persistence.Task{newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( - newDecision.ScheduleID, - newDecision.Attempt, - decisionStickyTimeout, - )}, - mock.Anything, - ).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", []persistence.Task{&persistence.DecisionTask{ + DomainID: domainID, + TaskList: decisionStickyTasklist, + ScheduleID: newDecision.ScheduleID, + }}).Once() + msBuilderCurrent.On("AddTimerTasks", []persistence.Task{newTimerBuilder(s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( + newDecision.ScheduleID, + newDecision.Attempt, + decisionStickyTimeout, + )}).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() msBuilderOut, err := s.historyReplicator.ApplyOtherEventsVersionChecking(ctx.Background(), context, msBuilderIn, request, s.logger) @@ -1777,7 +1781,7 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsVersionChecking_IncomingLes s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentLastWriteVersion).Return(cluster.TestCurrentClusterName) 
s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - context.On("updateAsActive", []persistence.Task{}, []persistence.Task{}, mock.Anything).Return(nil).Once() + context.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() msBuilderOut, err := s.historyReplicator.ApplyOtherEventsVersionChecking(ctx.Background(), context, msBuilderIn, request, s.logger) s.Nil(msBuilderOut) @@ -1852,19 +1856,17 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsVersionChecking_IncomingLes s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentLastWriteVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) - context.On("updateAsActive", - []persistence.Task{&persistence.DecisionTask{ - DomainID: domainID, - TaskList: decisionStickyTasklist, - ScheduleID: newDecision.ScheduleID, - }}, - []persistence.Task{newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( - newDecision.ScheduleID, - newDecision.Attempt, - decisionStickyTimeout, - )}, - mock.Anything, - ).Return(nil).Once() + msBuilderIn.On("AddTransferTasks", []persistence.Task{&persistence.DecisionTask{ + DomainID: domainID, + TaskList: decisionStickyTasklist, + ScheduleID: newDecision.ScheduleID, + }}).Once() + msBuilderIn.On("AddTimerTasks", []persistence.Task{newTimerBuilder(s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( + newDecision.ScheduleID, + newDecision.Attempt, + decisionStickyTimeout, + )}).Once() + context.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() msBuilderOut, err := s.historyReplicator.ApplyOtherEventsVersionChecking(ctx.Background(), context, msBuilderIn, request, s.logger) @@ -2322,19 +2324,17 @@ func (s *historyReplicatorSuite) TestApplyOtherEventsVersionChecking_IncomingGre } msBuilderIn.On("AddDecisionTaskScheduledEvent").Return(newDecision, nil) msBuilderIn.On("IsStickyTaskListEnabled").Return(true) - context.On("updateAsActive", - []persistence.Task{&persistence.DecisionTask{ - DomainID: domainID, - TaskList: decisionStickyTasklist, - ScheduleID: newDecision.ScheduleID, - }}, - []persistence.Task{newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( - newDecision.ScheduleID, - newDecision.Attempt, - decisionStickyTimeout, - )}, - mock.Anything, - ).Return(nil).Once() + msBuilderIn.On("AddTransferTasks", []persistence.Task{&persistence.DecisionTask{ + DomainID: domainID, + TaskList: decisionStickyTasklist, + ScheduleID: newDecision.ScheduleID, + }}).Once() + msBuilderIn.On("AddTimerTasks", []persistence.Task{newTimerBuilder(s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( + newDecision.ScheduleID, + newDecision.Attempt, + decisionStickyTimeout, + )}).Once() + context.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() // after the flush, the pending buffered events are gone, however, the last event ID should increase msBuilderIn.On("GetReplicationState").Return(&persistence.ReplicationState{ @@ -2543,7 +2543,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_BrandNew() { BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), 
transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() s.mockExecutionMgr.On("CreateWorkflowExecution", mock.MatchedBy(func(input *persistence.CreateWorkflowExecutionRequest) bool { input.RangeID = 0 @@ -2655,12 +2655,12 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_ISE() { BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() errRet := &shared.InternalServiceError{} // the test above already assert the create workflow request, so here just use anyting - s.mockExecutionMgr.On("CreateWorkflowExecution", mock.Anything).Return(nil, errRet).Once() - s.mockShardManager.On("UpdateShard", mock.Anything).Return(nil).Once() // this is called when err is returned, and shard will try to update + s.mockExecutionMgr.On("CreateWorkflowExecution", mock.Anything).Return(nil, errRet) + s.mockShardManager.On("UpdateShard", mock.Anything).Return(nil) // this is called when err is returned, and shard will try to update s.mockMetadataMgr.On("GetDomain", mock.Anything).Return(&persistence.GetDomainResponse{ Info: &persistence.DomainInfo{ID: domainID, Name: "domain name"}, @@ -2762,7 +2762,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_SameRunID() { BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version @@ -2892,7 +2892,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentComplete_In BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version + 1 @@ -3021,7 +3021,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentComplete_In BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version @@ -3150,7 +3150,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentComplete_In BranchToken: executionInfo.GetCurrentBranch(), Events: 
history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version - 1 @@ -3281,7 +3281,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version + 1 @@ -3439,7 +3439,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version + 1 @@ -3497,7 +3497,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc }, nil).Once() msBuilderCurrent.On("UpdateReplicationStateVersion", currentVersion, true).Once() msBuilderCurrent.On("HasPendingDecisionTask").Return(true) - contextCurrent.On("updateAsActive", []persistence.Task{}, []persistence.Task{}, mock.Anything).Return(nil).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) @@ -3611,7 +3611,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version + 1 @@ -3691,19 +3691,17 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc } msBuilderCurrent.On("AddDecisionTaskScheduledEvent").Return(newDecision, nil) msBuilderCurrent.On("IsStickyTaskListEnabled").Return(true) - contextCurrent.On("updateAsActive", - []persistence.Task{&persistence.DecisionTask{ - DomainID: domainID, - TaskList: currentDecisionStickyTasklist, - ScheduleID: newDecision.ScheduleID, - }}, - []persistence.Task{newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( - newDecision.ScheduleID, - newDecision.Attempt, - currentDecisionStickyTimeout, - )}, - mock.Anything, - 
).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", []persistence.Task{&persistence.DecisionTask{ + DomainID: domainID, + TaskList: currentDecisionStickyTasklist, + ScheduleID: newDecision.ScheduleID, + }}).Once() + msBuilderCurrent.On("AddTimerTasks", []persistence.Task{newTimerBuilder(s.logger, clock.NewEventTimeSource()).AddScheduleToStartDecisionTimoutTask( + newDecision.ScheduleID, + newDecision.Attempt, + currentDecisionStickyTimeout, + )}).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() s.mockClusterMetadata.On("ClusterNameForFailoverVersion", currentVersion).Return(cluster.TestCurrentClusterName) s.mockClusterMetadata.On("GetCurrentClusterName").Return(cluster.TestCurrentClusterName) @@ -3801,7 +3799,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version @@ -3948,7 +3946,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version @@ -4112,7 +4110,7 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc BranchToken: executionInfo.GetCurrentBranch(), Events: history.Events, }} - msBuilder.On("CloseTransactionAsSnapshot", now.Local()).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) + msBuilder.On("CloseTransactionAsSnapshot", now.Local(), transactionPolicyPassive).Return(newWorkflowSnapshot, newWorkflowEventsSeq, nil) s.mockHistoryV2Mgr.On("AppendHistoryNodes", mock.Anything).Return(&p.AppendHistoryNodesResponse{Size: historySize}, nil).Once() currentVersion := version - 1 @@ -4186,7 +4184,9 @@ func (s *historyReplicatorSuite) TestReplicateWorkflowStarted_CurrentRunning_Inc msBuilderCurrent.On("AddWorkflowExecutionTerminatedEvent", workflowTerminationReason, mock.Anything, workflowTerminationIdentity).Return(&workflow.HistoryEvent{}, nil) - contextCurrent.On("updateAsActive", mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", mock.Anything).Once() + msBuilderCurrent.On("AddTimerTasks", mock.Anything).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() s.mockTimerProcessor.On("NotifyNewTimers", currentClusterName, mock.Anything, mock.Anything) err := s.historyReplicator.replicateWorkflowStarted(ctx.Background(), context, msBuilder, history, sBuilder, s.logger) @@ -4337,7 +4337,9 @@ func (s *historyReplicatorSuite) TestConflictResolutionTerminateCurrentRunningIf msBuilderCurrent.On("AddWorkflowExecutionTerminatedEvent", workflowTerminationReason, mock.Anything, 
workflowTerminationIdentity).Return(&workflow.HistoryEvent{}, nil) - contextCurrent.On("updateAsActive", mock.Anything, mock.Anything, mock.Anything).Return(nil).Once() + msBuilderCurrent.On("AddTransferTasks", mock.Anything).Once() + msBuilderCurrent.On("AddTimerTasks", mock.Anything).Once() + contextCurrent.On("updateWorkflowExecutionAsActive", mock.Anything).Return(nil).Once() s.mockTimerProcessor.On("NotifyNewTimers", currentCluster, mock.Anything, mock.Anything) prevRunID, prevLastWriteVersion, prevState, err := s.historyReplicator.conflictResolutionTerminateCurrentRunningIfNotSelf(ctx.Background(), msBuilderTarget, incomingVersion, incomingTimestamp, s.logger) diff --git a/service/history/mutableState.go b/service/history/mutableState.go index fdcfb5d79eb..220ac669510 100644 --- a/service/history/mutableState.go +++ b/service/history/mutableState.go @@ -91,7 +91,6 @@ type ( AddWorkflowExecutionStartedEvent(*cache.DomainCacheEntry, workflow.WorkflowExecution, *h.StartWorkflowExecutionRequest) (*workflow.HistoryEvent, error) AddWorkflowExecutionTerminatedEvent(reason string, details []byte, identity string) (*workflow.HistoryEvent, error) ClearStickyness() - CloseUpdateSession() (*mutableStateSessionUpdates, error) CheckResettable() error CopyToPersistence() *persistence.WorkflowMutableState CreateActivityRetryTimer(*persistence.ActivityInfo, string) persistence.Task @@ -114,7 +113,6 @@ type ( GetChildExecutionInitiatedEvent(int64) (*workflow.HistoryEvent, bool) GetCompletionEvent() (*workflow.HistoryEvent, bool) GetStartEvent() (*workflow.HistoryEvent, bool) - GetContinueAsNew() *persistence.WorkflowSnapshot GetCurrentBranch() []byte GetCurrentVersion() int64 GetExecutionInfo() *persistence.WorkflowExecutionInfo @@ -196,6 +194,7 @@ type ( SetHistoryTree(treeID string) error UpdateActivity(*persistence.ActivityInfo) error UpdateActivityProgress(ai *persistence.ActivityInfo, request *workflow.RecordActivityTaskHeartbeatRequest) + UpdateReplicationPolicy(cache.ReplicationPolicy) UpdateDecision(*decisionInfo) UpdateReplicationStateVersion(int64, bool) UpdateReplicationStateLastEventID(int64, int64) @@ -203,8 +202,10 @@ type ( AddTransferTasks(transferTasks ...persistence.Task) AddTimerTasks(timerTasks ...persistence.Task) + GetTransferTasks() []persistence.Task + GetTimerTasks() []persistence.Task - CloseTransactionAsMutation(now time.Time) (*persistence.WorkflowMutation, []*persistence.WorkflowEvents, error) - CloseTransactionAsSnapshot(now time.Time) (*persistence.WorkflowSnapshot, []*persistence.WorkflowEvents, error) + CloseTransactionAsMutation(now time.Time, transactionPolicy transactionPolicy) (*persistence.WorkflowMutation, []*persistence.WorkflowEvents, error) + CloseTransactionAsSnapshot(now time.Time, transactionPolicy transactionPolicy) (*persistence.WorkflowSnapshot, []*persistence.WorkflowEvents, error) } ) diff --git a/service/history/mutableStateBuilder.go b/service/history/mutableStateBuilder.go index e58fc6c4cea..3456d2d622a 100644 --- a/service/history/mutableStateBuilder.go +++ b/service/history/mutableStateBuilder.go @@ -49,6 +49,8 @@ const ( var ( // ErrWorkflowFinished indicates trying to mutate mutable state after workflow finished ErrWorkflowFinished = &workflow.InternalServiceError{Message: "invalid mutable state action: mutation after finish"} + + emptyTasks = []persistence.Task{} ) type ( @@ -83,17 +85,17 @@ type ( updateBufferedEvents []*workflow.HistoryEvent // buffered history events that needs to be persisted clearBufferedEvents bool // delete buffered 
events from persistence - executionInfo *persistence.WorkflowExecutionInfo // Workflow mutable state info. - replicationState *persistence.ReplicationState - continueAsNew *persistence.WorkflowSnapshot - continueAsNewWorkflowEvents *persistence.WorkflowEvents - hBuilder *historyBuilder + executionInfo *persistence.WorkflowExecutionInfo // Workflow mutable state info. + replicationState *persistence.ReplicationState + hBuilder *historyBuilder // in memory only attributes // indicates whether there are buffered events in persistence hasBufferedEventsInPersistence bool - // indicates the previous next event ID + // indicates the next event ID in DB, for condition update condition int64 + // indicate whether can do replication + replicationPolicy cache.ReplicationPolicy insertTransferTasks []persistence.Task insertReplicationTasks []persistence.Task @@ -174,6 +176,7 @@ func newMutableStateBuilderWithReplicationState( eventsCache eventsCache, logger log.Logger, version int64, + replicationPolicy cache.ReplicationPolicy, ) *mutableStateBuilder { s := newMutableStateBuilder(shard, eventsCache, logger) s.replicationState = &persistence.ReplicationState{ @@ -183,6 +186,7 @@ func newMutableStateBuilderWithReplicationState( LastWriteEventID: common.EmptyEventID, LastReplicationInfo: make(map[string]*persistence.ReplicationInfo), } + s.replicationPolicy = replicationPolicy return s } @@ -418,7 +422,18 @@ func (e *mutableStateBuilder) GetLastWriteVersion() int64 { return e.replicationState.LastWriteVersion } -func (e *mutableStateBuilder) UpdateReplicationStateVersion(version int64, forceUpdate bool) { +func (e *mutableStateBuilder) UpdateReplicationPolicy( + replicationPolicy cache.ReplicationPolicy, +) { + + e.replicationPolicy = replicationPolicy +} + +func (e *mutableStateBuilder) UpdateReplicationStateVersion( + version int64, + forceUpdate bool, +) { + if version > e.replicationState.CurrentVersion || forceUpdate { e.replicationState.CurrentVersion = version } @@ -448,65 +463,6 @@ func (e *mutableStateBuilder) UpdateReplicationStateLastEventID( } } -func (e *mutableStateBuilder) CloseUpdateSession() (*mutableStateSessionUpdates, error) { - - if err := e.FlushBufferedEvents(); err != nil { - return nil, err - } - - updates := &mutableStateSessionUpdates{ - executionInfo: e.executionInfo, - newEventsBuilder: e.hBuilder, - updateActivityInfos: convertUpdateActivityInfos(e.updateActivityInfos), - deleteActivityInfos: convertDeleteActivityInfos(e.deleteActivityInfos), - syncActivityTasks: convertSyncActivityInfos(e.pendingActivityInfoIDs, e.syncActivityTasks), - updateTimerInfos: convertUpdateTimerInfos(e.updateTimerInfos), - deleteTimerInfos: convertDeleteTimerInfos(e.deleteTimerInfos), - updateChildExecutionInfos: convertUpdateChildExecutionInfos(e.updateChildExecutionInfos), - deleteChildExecutionInfo: e.deleteChildExecutionInfo, - updateCancelExecutionInfos: convertUpdateRequestCancelInfos(e.updateRequestCancelInfos), - deleteCancelExecutionInfo: e.deleteRequestCancelInfo, - updateSignalInfos: convertUpdateSignalInfos(e.updateSignalInfos), - deleteSignalInfo: e.deleteSignalInfo, - updateSignalRequestedIDs: convertSignalRequestedIDs(e.updateSignalRequestedIDs), - deleteSignalRequestedID: e.deleteSignalRequestedID, - continueAsNew: e.continueAsNew, - continueAsNewWorkflowEvents: e.continueAsNewWorkflowEvents, - newBufferedEvents: e.updateBufferedEvents, - clearBufferedEvents: e.clearBufferedEvents, - } - - // Clear all updates to prepare for the next session - e.hBuilder = newHistoryBuilder(e, 
e.logger) - e.updateActivityInfos = make(map[*persistence.ActivityInfo]struct{}) - e.deleteActivityInfos = make(map[int64]struct{}) - e.syncActivityTasks = make(map[int64]struct{}) - e.updateTimerInfos = make(map[*persistence.TimerInfo]struct{}) - e.deleteTimerInfos = make(map[string]struct{}) - e.updateChildExecutionInfos = make(map[*persistence.ChildExecutionInfo]struct{}) - e.deleteChildExecutionInfo = nil - e.updateRequestCancelInfos = make(map[*persistence.RequestCancelInfo]struct{}) - e.deleteRequestCancelInfo = nil - e.updateSignalInfos = make(map[*persistence.SignalInfo]struct{}) - e.deleteSignalInfo = nil - e.updateSignalRequestedIDs = make(map[string]struct{}) - e.deleteSignalRequestedID = "" - e.continueAsNew = nil - e.continueAsNewWorkflowEvents = nil - e.clearBufferedEvents = false - if e.updateBufferedEvents != nil { - e.bufferedEvents = append(e.bufferedEvents, e.updateBufferedEvents...) - e.updateBufferedEvents = nil - } - if len(e.bufferedEvents) > e.config.MaximumBufferedEventsBatch() { - return nil, ErrBufferedEventsLimitExceeded - } - - e.hasBufferedEventsInPersistence = len(e.bufferedEvents) > 0 - - return updates, nil -} - func (e *mutableStateBuilder) checkAndClearTimerFiredEvent(timerID string) *workflow.HistoryEvent { var timerEvent *workflow.HistoryEvent @@ -2982,11 +2938,15 @@ func (e *mutableStateBuilder) AddContinueAsNewEvent( firstRunID := currentStartEvent.GetWorkflowExecutionStartedEventAttributes().GetFirstExecutionRunId() var newStateBuilder *mutableStateBuilder - if domainEntry.IsGlobalDomain() { - // all workflows within a global domain should have replication state, no matter whether it will be replicated to multiple - // target clusters or not - newStateBuilder = newMutableStateBuilderWithReplicationState(e.shard, e.eventsCache, e.logger, - domainEntry.GetFailoverVersion()) + if e.GetReplicationState() != nil { + // continued as new workflow should have the same replication properties + newStateBuilder = newMutableStateBuilderWithReplicationState( + e.shard, + e.eventsCache, + e.logger, + e.GetCurrentVersion(), + e.replicationPolicy, + ) } else { newStateBuilder = newMutableStateBuilder(e.shard, e.eventsCache, e.logger) } @@ -3005,18 +2965,6 @@ func (e *mutableStateBuilder) AddContinueAsNewEvent( } } - // call FlushBufferedEvents to assign task id to event - // as well as update last event task id in new state builder - // NOTE: must flush current mutable state first - // so the task IDs assigned can be used for comparison for cross DC - err = e.FlushBufferedEvents() - if err != nil { - return nil, nil, err - } - err = newStateBuilder.FlushBufferedEvents() - if err != nil { - return nil, nil, err - } if err = e.ReplicateWorkflowExecutionContinuedAsNewEvent( firstEventID, domainID, @@ -3115,12 +3063,7 @@ func (e *mutableStateBuilder) ReplicateWorkflowExecutionContinuedAsNewEvent( return err } } - newWorkflowSnapshot, newWorkflowEventsSeq, err := newStateBuilder.CloseTransactionAsSnapshot(newStartedTime) - if err != nil { - return err - } - e.continueAsNew = newWorkflowSnapshot - e.continueAsNewWorkflowEvents = newWorkflowEventsSeq[0] + return nil } @@ -3486,10 +3429,6 @@ func (e *mutableStateBuilder) CreateActivityRetryTimer( return retryTask } -func (e *mutableStateBuilder) GetContinueAsNew() *persistence.WorkflowSnapshot { - return e.continueAsNew -} - // TODO mutable state should generate corresponding transfer / timer tasks according to // updates accumulated, while currently all transfer / timer tasks are managed manually @@ -3501,6 +3440,10 @@ 
func (e *mutableStateBuilder) AddTransferTasks( e.insertTransferTasks = append(e.insertTransferTasks, transferTasks...) } +func (e *mutableStateBuilder) GetTransferTasks() []persistence.Task { + return e.insertTransferTasks +} + // TODO convert AddTransferTasks to prepareTimerTasks func (e *mutableStateBuilder) AddTimerTasks( timerTasks ...persistence.Task, @@ -3509,15 +3452,20 @@ func (e *mutableStateBuilder) AddTimerTasks( e.insertTimerTasks = append(e.insertTimerTasks, timerTasks...) } +func (e *mutableStateBuilder) GetTimerTasks() []persistence.Task { + return e.insertTimerTasks +} + func (e *mutableStateBuilder) CloseTransactionAsMutation( now time.Time, + transactionPolicy transactionPolicy, ) (*persistence.WorkflowMutation, []*persistence.WorkflowEvents, error) { - if err := e.FlushBufferedEvents(); err != nil { + if err := e.prepareTransaction(transactionPolicy); err != nil { return nil, nil, err } - workflowEventsSeq, err := e.prepareEventsAndReplicationTasks() + workflowEventsSeq, err := e.prepareEventsAndReplicationTasks(transactionPolicy) if err != nil { return nil, nil, err } @@ -3533,6 +3481,9 @@ func (e *mutableStateBuilder) CloseTransactionAsMutation( setTaskInfo(e.GetCurrentVersion(), now, e.insertTransferTasks, e.insertTimerTasks) + // update last update time + e.executionInfo.LastUpdatedTimestamp = now + workflowMutation := &persistence.WorkflowMutation{ ExecutionInfo: e.executionInfo, ReplicationState: e.replicationState, @@ -3559,7 +3510,7 @@ func (e *mutableStateBuilder) CloseTransactionAsMutation( Condition: e.condition, } - if err := e.closeSession(); err != nil { + if err := e.cleanupTransaction(transactionPolicy); err != nil { return nil, nil, err } return workflowMutation, workflowEventsSeq, nil @@ -3567,13 +3518,14 @@ func (e *mutableStateBuilder) CloseTransactionAsMutation( func (e *mutableStateBuilder) CloseTransactionAsSnapshot( now time.Time, + transactionPolicy transactionPolicy, ) (*persistence.WorkflowSnapshot, []*persistence.WorkflowEvents, error) { - if err := e.FlushBufferedEvents(); err != nil { + if err := e.prepareTransaction(transactionPolicy); err != nil { return nil, nil, err } - workflowEventsSeq, err := e.prepareEventsAndReplicationTasks() + workflowEventsSeq, err := e.prepareEventsAndReplicationTasks(transactionPolicy) if err != nil { return nil, nil, err } @@ -3601,6 +3553,9 @@ func (e *mutableStateBuilder) CloseTransactionAsSnapshot( setTaskInfo(e.GetCurrentVersion(), now, e.insertTransferTasks, e.insertTimerTasks) + // update last update time + e.executionInfo.LastUpdatedTimestamp = now + workflowSnapshot := &persistence.WorkflowSnapshot{ ExecutionInfo: e.executionInfo, ReplicationState: e.replicationState, @@ -3619,38 +3574,75 @@ func (e *mutableStateBuilder) CloseTransactionAsSnapshot( Condition: e.condition, } - if err := e.closeSession(); err != nil { + if err := e.cleanupTransaction(transactionPolicy); err != nil { return nil, nil, err } return workflowSnapshot, workflowEventsSeq, nil } -func (e *mutableStateBuilder) closeSession() error { +func (e *mutableStateBuilder) prepareTransaction( + transactionPolicy transactionPolicy, +) error { + + if err := e.closeTransactionWithPolicyCheck( + transactionPolicy, + ); err != nil { + return err + } + + if err := e.closeTransactionHandleDecisionFailover( + transactionPolicy, + ); err != nil { + return err + } + + if err := e.closeTransactionHandleBufferedEventsLimit( + transactionPolicy, + ); err != nil { + return err + } + + // flushing buffered events should happen at very last + if 
transactionPolicy == transactionPolicyActive { + if err := e.FlushBufferedEvents(); err != nil { + return err + } + } + + return nil +} + +func (e *mutableStateBuilder) cleanupTransaction( + transactionPolicy transactionPolicy, +) error { + // Clear all updates to prepare for the next session e.hBuilder = newHistoryBuilder(e, e.logger) + e.updateActivityInfos = make(map[*persistence.ActivityInfo]struct{}) e.deleteActivityInfos = make(map[int64]struct{}) e.syncActivityTasks = make(map[int64]struct{}) + e.updateTimerInfos = make(map[*persistence.TimerInfo]struct{}) e.deleteTimerInfos = make(map[string]struct{}) + e.updateChildExecutionInfos = make(map[*persistence.ChildExecutionInfo]struct{}) e.deleteChildExecutionInfo = nil + e.updateRequestCancelInfos = make(map[*persistence.RequestCancelInfo]struct{}) e.deleteRequestCancelInfo = nil + e.updateSignalInfos = make(map[*persistence.SignalInfo]struct{}) e.deleteSignalInfo = nil + e.updateSignalRequestedIDs = make(map[string]struct{}) e.deleteSignalRequestedID = "" - e.continueAsNew = nil - e.continueAsNewWorkflowEvents = nil + e.clearBufferedEvents = false if e.updateBufferedEvents != nil { e.bufferedEvents = append(e.bufferedEvents, e.updateBufferedEvents...) e.updateBufferedEvents = nil } - if len(e.bufferedEvents) > e.config.MaximumBufferedEventsBatch() { - return ErrBufferedEventsLimitExceeded - } e.hasBufferedEventsInPersistence = len(e.bufferedEvents) > 0 e.condition = e.GetNextEventID() @@ -3662,7 +3654,9 @@ func (e *mutableStateBuilder) closeSession() error { return nil } -func (e *mutableStateBuilder) prepareEventsAndReplicationTasks() ([]*persistence.WorkflowEvents, error) { +func (e *mutableStateBuilder) prepareEventsAndReplicationTasks( + transactionPolicy transactionPolicy, +) ([]*persistence.WorkflowEvents, error) { workflowEventsSeq := []*persistence.WorkflowEvents{} if len(e.hBuilder.transientHistory) != 0 { @@ -3684,31 +3678,41 @@ func (e *mutableStateBuilder) prepareEventsAndReplicationTasks() ([]*persistence }) } - if err := e.validateNoEventsAfterWorkflowFinish(e.hBuilder.history); err != nil { + if err := e.validateNoEventsAfterWorkflowFinish( + transactionPolicy, + e.hBuilder.history, + ); err != nil { return nil, err } for _, workflowEvents := range workflowEventsSeq { e.insertReplicationTasks = append(e.insertReplicationTasks, - e.eventsToReplicationTask(workflowEvents.Events)..., + e.eventsToReplicationTask(transactionPolicy, workflowEvents.Events)..., ) } e.insertReplicationTasks = append(e.insertReplicationTasks, - convertSyncActivityInfos( - e.pendingActivityInfoIDs, - e.syncActivityTasks, - )...) 
+ e.syncActivityToReplicationTask(transactionPolicy)..., + ) + + if transactionPolicy == transactionPolicyPassive && len(e.insertReplicationTasks) > 0 { + return nil, &workflow.InternalServiceError{ + Message: "should not generate replication task when close transaction as passive", + } + } return workflowEventsSeq, nil } func (e *mutableStateBuilder) eventsToReplicationTask( + transactionPolicy transactionPolicy, events []*workflow.HistoryEvent, ) []persistence.Task { - if len(events) == 0 || e.GetReplicationState() == nil { - return []persistence.Task{} + if transactionPolicy == transactionPolicyPassive || + !e.canReplicateEvents() || + len(events) == 0 { + return emptyTasks } firstEvent := events[0] @@ -3719,7 +3723,7 @@ func (e *mutableStateBuilder) eventsToReplicationTask( currentCluster := e.clusterMetadata.GetCurrentClusterName() if currentCluster != sourceCluster { - return []persistence.Task{} + return emptyTasks } return []persistence.Task{ &persistence.HistoryReplicationTask{ @@ -3735,14 +3739,36 @@ func (e *mutableStateBuilder) eventsToReplicationTask( } } +func (e *mutableStateBuilder) syncActivityToReplicationTask( + transactionPolicy transactionPolicy, +) []persistence.Task { + + if transactionPolicy == transactionPolicyPassive || + !e.canReplicateEvents() { + return emptyTasks + } + + return convertSyncActivityInfos( + e.pendingActivityInfoIDs, + e.syncActivityTasks, + ) +} + +func (e *mutableStateBuilder) canReplicateEvents() bool { + return e.GetReplicationState() != nil && + e.replicationPolicy == cache.ReplicationPolicyMultiCluster +} + // validateNoEventsAfterWorkflowFinish perform check on history event batch // NOTE: do not apply this check on every batch, since transient // decision && workflow finish will be broken (the first batch) func (e *mutableStateBuilder) validateNoEventsAfterWorkflowFinish( + transactionPolicy transactionPolicy, events []*workflow.HistoryEvent, ) error { - if len(events) == 0 { + if transactionPolicy == transactionPolicyPassive || + len(events) == 0 { return nil } @@ -3777,6 +3803,127 @@ func (e *mutableStateBuilder) validateNoEventsAfterWorkflowFinish( } } +func (e *mutableStateBuilder) closeTransactionWithPolicyCheck( + transactionPolicy transactionPolicy, +) error { + + if transactionPolicy == transactionPolicyPassive || + e.GetReplicationState() == nil { + return nil + } + + activeCluster := e.clusterMetadata.ClusterNameForFailoverVersion(e.GetCurrentVersion()) + currentCluster := e.clusterMetadata.GetCurrentClusterName() + + if activeCluster != currentCluster { + domainID := e.GetExecutionInfo().DomainID + return errors.NewDomainNotActiveError(domainID, currentCluster, activeCluster) + } + return nil +} + +func (e *mutableStateBuilder) closeTransactionHandleDecisionFailover( + transactionPolicy transactionPolicy, +) error { + + if transactionPolicy == transactionPolicyPassive || + !e.IsWorkflowExecutionRunning() || + e.GetReplicationState() == nil { + return nil + } + + // Handling mutable state turn from standby to active, while having a decision on the fly + di, ok := e.GetInFlightDecisionTask() + if ok && di.Version < e.GetCurrentVersion() { + // we have a decision on the fly with a lower version, fail it + if err := failDecision( + e, + di, + workflow.DecisionTaskFailedCauseFailoverCloseDecision, + ); err != nil { + return err + } + + err := scheduleDecision(e, e.timeSource, e.logger) + if err != nil { + return err + } + } + return nil +} + +func (e *mutableStateBuilder) closeTransactionHandleBufferedEventsLimit( + 
transactionPolicy transactionPolicy, +) error { + + if transactionPolicy == transactionPolicyPassive || + !e.IsWorkflowExecutionRunning() { + return nil + } + + if len(e.bufferedEvents) < e.config.MaximumBufferedEventsBatch() { + return nil + } + + // Handling buffered events size issue + if di, ok := e.GetInFlightDecisionTask(); ok { + // we have a decision on the fly with a lower version, fail it + if err := failDecision( + e, + di, + workflow.DecisionTaskFailedCauseForceCloseDecision, + ); err != nil { + return err + } + + err := scheduleDecision(e, e.timeSource, e.logger) + if err != nil { + return err + } + } + return nil +} + +func (e *mutableStateBuilder) closeTransactionHandleWorkflowReset( + transactionPolicy transactionPolicy, +) error { + + if transactionPolicy == transactionPolicyPassive || + !e.IsWorkflowExecutionRunning() { + return nil + } + + // compare with bad client binary checksum and schedule a reset task + + // only schedule reset task if current doesn't have childWFs. + // TODO: This will be removed once our reset allows childWFs + if len(e.GetPendingChildExecutionInfos()) != 0 { + return nil + } + + executionInfo := e.GetExecutionInfo() + domainEntry, err := e.shard.GetDomainCache().GetDomainByID(executionInfo.DomainID) + if err != nil { + return err + } + if _, pt := FindAutoResetPoint( + e.timeSource, + &domainEntry.GetConfig().BadBinaries, + e.GetExecutionInfo().AutoResetPoints, + ); pt != nil { + e.AddTransferTasks(&persistence.ResetWorkflowTask{}) + e.logger.Info("Auto-Reset task is scheduled", + tag.WorkflowDomainName(domainEntry.GetInfo().Name), + tag.WorkflowID(executionInfo.WorkflowID), + tag.WorkflowRunID(executionInfo.RunID), + tag.WorkflowResetBaseRunID(pt.GetRunId()), + tag.WorkflowEventID(pt.GetFirstDecisionCompletedId()), + tag.WorkflowBinaryChecksum(pt.GetBinaryChecksum()), + ) + } + return nil +} + func (e *mutableStateBuilder) checkMutability( actionTag tag.Tag, ) error { diff --git a/service/history/mutableStateSessionUpdates.go b/service/history/mutableStateSessionUpdates.go deleted file mode 100644 index f21b40d54c1..00000000000 --- a/service/history/mutableStateSessionUpdates.go +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2017 Uber Technologies, Inc. -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -// THE SOFTWARE. 
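Illustrative sketch (reviewer aid, not part of this change): the mutableStateBuilder.go hunks above gate replication-task generation on both the transaction policy and the domain's replication policy (eventsToReplicationTask, syncActivityToReplicationTask, canReplicateEvents). The standalone helper below, with the hypothetical name shouldReplicate and import paths as used elsewhere in this repo, condenses that gating into one predicate.

package history

import (
	workflow "github.com/uber/cadence/.gen/go/shared"
	"github.com/uber/cadence/common/cache"
	"github.com/uber/cadence/common/persistence"
)

// shouldReplicate condenses when closing a transaction produces a history
// replication task: only an active transaction, for a global domain configured
// with more than one cluster, and with at least one new event.
func shouldReplicate(
	policy transactionPolicy,
	replicationState *persistence.ReplicationState,
	replicationPolicy cache.ReplicationPolicy,
	events []*workflow.HistoryEvent,
) bool {
	if policy == transactionPolicyPassive {
		// passive transactions (events applied from a remote cluster) must
		// never generate replication tasks
		return false
	}
	if replicationState == nil || replicationPolicy != cache.ReplicationPolicyMultiCluster {
		// local domain, or global domain replicated to a single cluster
		return false
	}
	return len(events) > 0
}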
- -package history - -import ( - workflow "github.com/uber/cadence/.gen/go/shared" - "github.com/uber/cadence/common/persistence" -) - -type ( - mutableStateSessionUpdates struct { - executionInfo *persistence.WorkflowExecutionInfo - newEventsBuilder *historyBuilder - updateActivityInfos []*persistence.ActivityInfo - deleteActivityInfos []int64 - syncActivityTasks []persistence.Task - updateTimerInfos []*persistence.TimerInfo - deleteTimerInfos []string - updateChildExecutionInfos []*persistence.ChildExecutionInfo - deleteChildExecutionInfo *int64 - updateCancelExecutionInfos []*persistence.RequestCancelInfo - deleteCancelExecutionInfo *int64 - updateSignalInfos []*persistence.SignalInfo - deleteSignalInfo *int64 - updateSignalRequestedIDs []string - deleteSignalRequestedID string - continueAsNew *persistence.WorkflowSnapshot - continueAsNewWorkflowEvents *persistence.WorkflowEvents - newBufferedEvents []*workflow.HistoryEvent - clearBufferedEvents bool - } -) diff --git a/service/history/mutableStateUtil.go b/service/history/mutableStateUtil.go index 5d44f3a621e..1b024bcf42d 100644 --- a/service/history/mutableStateUtil.go +++ b/service/history/mutableStateUtil.go @@ -24,6 +24,17 @@ import ( "github.com/uber/cadence/common/persistence" ) +type transactionPolicy int + +const ( + transactionPolicyActive transactionPolicy = 0 + transactionPolicyPassive transactionPolicy = 1 +) + +func (policy transactionPolicy) ptr() *transactionPolicy { + return &policy +} + // NOTE: do not use make(type, len(input)) // since this will assume initial length being len(inputs) // always use make(type, 0, len(input)) diff --git a/service/history/queueProcessor.go b/service/history/queueProcessor.go index 19427ed9499..339b96b0724 100644 --- a/service/history/queueProcessor.go +++ b/service/history/queueProcessor.go @@ -394,12 +394,6 @@ func (p *queueProcessorBase) handleTaskError(scope int, startTime time.Time, return nil } - if _, ok := err.(*workflow.LimitExceededError); ok { - p.metricsClient.IncCounter(scope, metrics.TaskLimitExceededCounter) - logger.Error("Task encounter limit exceeded error.", tag.Error(err), tag.LifeCycleProcessingFailed) - return err - } - logger.Error("Fail to process task", tag.Error(err), tag.LifeCycleProcessingFailed) return err } diff --git a/service/history/queueProcessor_test.go b/service/history/queueProcessor_test.go index 42a42c4d25f..5912dd00adb 100644 --- a/service/history/queueProcessor_test.go +++ b/service/history/queueProcessor_test.go @@ -222,11 +222,6 @@ func (s *queueProcessorSuite) TestHandleTaskError_CurrentWorkflowConditionFailed s.Nil(s.queueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) } -func (s *queueProcessorSuite) TestHandleTaskError_LimitExceededError() { - err := &workflow.LimitExceededError{} - s.Equal(err, s.queueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) -} - func (s *queueProcessorSuite) TestHandleTaskError_RandomErr() { err := errors.New("random error") s.Equal(err, s.queueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) diff --git a/service/history/replicatorQueueProcessor_test.go b/service/history/replicatorQueueProcessor_test.go index 562ad1ca073..412ee17273e 100644 --- a/service/history/replicatorQueueProcessor_test.go +++ b/service/history/replicatorQueueProcessor_test.go @@ -227,6 +227,7 @@ func (s *replicatorQueueProcessorSuite) TestSyncActivity_ActivityCompleted() { LastWriteEventID: nextEventID - 1, }) 
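Illustrative sketch (not from this patch): how the transactionPolicy constants and the ptr() helper added in mutableStateUtil.go above are meant to be combined when calling updateWorkflowExecutionWithNew on workflowExecutionContext (shown later in this diff). The helper name policyPairFor is hypothetical, and the passive-with-new-run pairing is inferred from the active wrappers rather than spelled out here.

// policyPairFor (hypothetical) returns the policy for the current workflow and
// the optional policy pointer for a continued-as-new / reset run, mirroring the
// convenience wrappers updateWorkflowExecutionAsActive, ...WithNewAsActive,
// ...AsPassive and ...WithNewAsPassive.
func policyPairFor(isActive bool, hasNewRun bool) (transactionPolicy, *transactionPolicy) {
	current := transactionPolicyPassive
	if isActive {
		current = transactionPolicyActive
	}
	if !hasNewRun {
		// no new run being created: the new-workflow policy stays nil
		return current, nil
	}
	// the new run is closed with the same policy as the current run
	return current, current.ptr()
}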
msBuilder.On("GetLastWriteVersion").Return(version) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", version, false).Once() msBuilder.On("GetActivityInfo", scheduleID).Return(nil, false) s.mockMetadataMgr.On("GetDomain", &persistence.GetDomainRequest{ID: domainID}).Return( @@ -298,6 +299,7 @@ func (s *replicatorQueueProcessorSuite) TestSyncActivity_ActivityRetry() { LastWriteEventID: nextEventID - 1, }) msBuilder.On("GetLastWriteVersion").Return(version) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", version, false).Once() msBuilder.On("GetActivityInfo", scheduleID).Return(&persistence.ActivityInfo{ Version: activityVersion, @@ -398,6 +400,7 @@ func (s *replicatorQueueProcessorSuite) TestSyncActivity_ActivityRunning() { LastWriteEventID: nextEventID - 1, }) msBuilder.On("GetLastWriteVersion").Return(version) + msBuilder.On("UpdateReplicationPolicy", cache.ReplicationPolicyOneCluster).Once() msBuilder.On("UpdateReplicationStateVersion", version, false).Once() msBuilder.On("GetActivityInfo", scheduleID).Return(&persistence.ActivityInfo{ Version: activityVersion, diff --git a/service/history/stateBuilder.go b/service/history/stateBuilder.go index 44d57b6b611..97b97644610 100644 --- a/service/history/stateBuilder.go +++ b/service/history/stateBuilder.go @@ -510,11 +510,16 @@ func (b *stateBuilderImpl) applyEvents(domainID, requestID string, execution sha } newRunStartedEvent := newRunHistory[0] // Create mutable state updates for the new run + domainEntry, err := b.domainCache.GetDomainByID(domainID) + if err != nil { + return nil, nil, nil, err + } newRunMutableStateBuilder = newMutableStateBuilderWithReplicationState( b.shard, b.shard.GetEventsCache(), b.logger, newRunStartedEvent.GetVersion(), + domainEntry.GetReplicationPolicy(), ) newRunStateBuilder := newStateBuilder(b.shard, newRunMutableStateBuilder, b.logger) @@ -693,7 +698,7 @@ func (b *stateBuilderImpl) getTimerBuilder(event *shared.HistoryEvent) *timerBui now := time.Unix(0, event.GetTimestamp()) timeSource.Update(now) - return newTimerBuilder(b.shard.GetConfig(), b.logger, timeSource) + return newTimerBuilder(b.logger, timeSource) } func (b *stateBuilderImpl) appendTasksForFinishedExecutions(event *shared.HistoryEvent, domainID, workflowID string) error { diff --git a/service/history/stateBuilder_test.go b/service/history/stateBuilder_test.go index 38ab3754d91..0a2a77bdd8c 100644 --- a/service/history/stateBuilder_test.go +++ b/service/history/stateBuilder_test.go @@ -520,6 +520,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA ActiveClusterName: cluster.TestCurrentClusterName, Clusters: []*persistence.ClusterReplicationConfig{ {ClusterName: cluster.TestCurrentClusterName}, + {ClusterName: cluster.TestAlternativeClusterName}, }, }, IsGlobalDomain: true, @@ -534,6 +535,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA ActiveClusterName: cluster.TestCurrentClusterName, Clusters: []*persistence.ClusterReplicationConfig{ {ClusterName: cluster.TestCurrentClusterName}, + {ClusterName: cluster.TestAlternativeClusterName}, }, }, IsGlobalDomain: true, @@ -572,6 +574,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA s.mockShard.GetEventsCache(), s.logger, newRunStartedEvent.GetVersion(), + cache.ReplicationPolicyMultiCluster, ) err = 
expectedNewRunStateBuilder.ReplicateWorkflowExecutionStartedEvent( cache.NewLocalDomainCacheEntryForTest(&persistence.DomainInfo{ID: domainID}, &persistence.DomainConfig{}, "", nil), @@ -846,6 +849,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA ActiveClusterName: cluster.TestCurrentClusterName, Clusters: []*persistence.ClusterReplicationConfig{ {ClusterName: cluster.TestCurrentClusterName}, + {ClusterName: cluster.TestAlternativeClusterName}, }, }, IsGlobalDomain: true, @@ -860,6 +864,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA ActiveClusterName: cluster.TestCurrentClusterName, Clusters: []*persistence.ClusterReplicationConfig{ {ClusterName: cluster.TestCurrentClusterName}, + {ClusterName: cluster.TestAlternativeClusterName}, }, }, IsGlobalDomain: true, @@ -899,6 +904,7 @@ func (s *stateBuilderSuite) TestApplyEvents_EventTypeWorkflowExecutionContinuedA s.mockShard.GetEventsCache(), s.logger, newRunStartedEvent.GetVersion(), + cache.ReplicationPolicyMultiCluster, ) err = expectedNewRunStateBuilder.ReplicateWorkflowExecutionStartedEvent( cache.NewLocalDomainCacheEntryForTest(&persistence.DomainInfo{ID: domainID}, &persistence.DomainConfig{}, "", nil), diff --git a/service/history/timerBuilder.go b/service/history/timerBuilder.go index bd7677e3545..6d981df2ebb 100644 --- a/service/history/timerBuilder.go +++ b/service/history/timerBuilder.go @@ -69,7 +69,6 @@ type ( activityTimers timers pendingActivityTimers map[int64]*persistence.ActivityInfo isLoadedActivityTimers bool - config *Config logger log.Logger localSeqNumGen SequenceNumberGenerator // This one used to order in-memory list. timeSource clock.TimeSource @@ -120,13 +119,12 @@ func (l *localSeqNumGenerator) NextSeq() int64 { } // newTimerBuilder creates a timer builder. -func newTimerBuilder(config *Config, logger log.Logger, timeSource clock.TimeSource) *timerBuilder { +func newTimerBuilder(logger log.Logger, timeSource clock.TimeSource) *timerBuilder { return &timerBuilder{ userTimers: timers{}, pendingUserTimers: make(map[string]*persistence.TimerInfo), activityTimers: timers{}, pendingActivityTimers: make(map[int64]*persistence.ActivityInfo), - config: config, logger: logger.WithTags(tag.ComponentTimerBuilder), localSeqNumGen: &localSeqNumGenerator{counter: 1}, timeSource: timeSource, diff --git a/service/history/timerBuilder_test.go b/service/history/timerBuilder_test.go index 567593df043..ae21f3d2043 100644 --- a/service/history/timerBuilder_test.go +++ b/service/history/timerBuilder_test.go @@ -83,7 +83,7 @@ func (s *timerBuilderProcessorSuite) SetupTest() { } func (s *timerBuilderProcessorSuite) TestTimerBuilderSingleUserTimer() { - tb := newTimerBuilder(s.config, s.logger, &mockTimeSource{currTime: time.Now()}) + tb := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) // Add one timer. msb := newMutableStateBuilder(s.mockShard, s.mockEventsCache, s.logger) @@ -114,7 +114,7 @@ func (s *timerBuilderProcessorSuite) TestTimerBuilderSingleUserTimer() { } func (s *timerBuilderProcessorSuite) TestTimerBuilderMulitpleUserTimer() { - tb := newTimerBuilder(s.config, s.logger, &mockTimeSource{currTime: time.Now()}) + tb := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) // Add two timers. 
(before and after) tp := &persistence.TimerInfo{TimerID: "tid1", StartedID: 201, TaskID: 101, ExpiryTime: time.Now().Add(10 * time.Second)} @@ -145,7 +145,7 @@ func (s *timerBuilderProcessorSuite) TestTimerBuilderMulitpleUserTimer() { s.Equal(tiBefore.ExpiryTime.Unix(), t1.(*persistence.UserTimerTask).VisibilityTimestamp.Unix()) // Mutable state with out a timer task. - tb = newTimerBuilder(s.config, s.logger, &mockTimeSource{currTime: time.Now()}) + tb = newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tp2 := &persistence.TimerInfo{TimerID: "tid1", StartedID: 201, TaskID: TimerTaskStatusNone, ExpiryTime: time.Now().Add(10 * time.Second)} timerInfos = map[string]*persistence.TimerInfo{"tid1": tp2} msb = newMutableStateBuilder(s.mockShard, s.mockEventsCache, s.logger) @@ -204,7 +204,7 @@ func (s *timerBuilderProcessorSuite) TestTimerBuilder_GetActivityTimer() { }) s.Nil(err) // create a schedule to start timeout - tb := newTimerBuilder(s.config, s.logger, &mockTimeSource{currTime: time.Now()}) + tb := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tb.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) s.Equal(workflow.TimeoutTypeScheduleToStart, workflow.TimeoutType(tt.(*persistence.ActivityTimeoutTask).TimeoutType)) @@ -212,7 +212,7 @@ func (s *timerBuilderProcessorSuite) TestTimerBuilder_GetActivityTimer() { builder.AddActivityTaskStartedEvent(ai, *ase.EventId, uuid.New(), "") // create a heart beat timeout - tb = newTimerBuilder(s.config, s.logger, &mockTimeSource{currTime: time.Now()}) + tb = newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt = tb.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) s.Equal(workflow.TimeoutTypeHeartbeat, workflow.TimeoutType(tt.(*persistence.ActivityTimeoutTask).TimeoutType)) diff --git a/service/history/timerQueueActiveProcessor.go b/service/history/timerQueueActiveProcessor.go index c3500f0e709..6fc93933400 100644 --- a/service/history/timerQueueActiveProcessor.go +++ b/service/history/timerQueueActiveProcessor.go @@ -757,7 +757,7 @@ Update_History_Loop: if t.config.EnableEventsV2(domainEntry.GetInfo().Name) { eventStoreVersion = persistence.EventStoreVersionV2 } - _, continueAsNewBuilder, err := msBuilder.AddContinueAsNewEvent(msBuilder.GetNextEventID(), common.EmptyEventID, domainEntry, startAttributes.GetParentWorkflowDomain(), continueAsnewAttributes, eventStoreVersion) + _, newMutableState, err := msBuilder.AddContinueAsNewEvent(msBuilder.GetNextEventID(), common.EmptyEventID, domainEntry, startAttributes.GetParentWorkflowDomain(), continueAsnewAttributes, eventStoreVersion) if err != nil { return err } @@ -776,16 +776,26 @@ Update_History_Loop: transferTasks = append(transferTasks, tranT) timerTasks = append(timerTasks, timerT) - // Generate a transaction ID for appending events to history - transactionID, err3 := t.shard.GetNextTransferTaskID() - if err3 != nil { - return err3 - } - - timersToNotify := append(timerTasks, msBuilder.GetContinueAsNew().TimerTasks...) - - err = context.updateAsActiveWithNew(transferTasks, timerTasks, transactionID, continueAsNewBuilder) - + timersToNotify := append(timerTasks, newMutableState.GetTimerTasks()...) + + msBuilder.AddTransferTasks(transferTasks...) + msBuilder.AddTimerTasks(timerTasks...) 
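Illustrative sketch (not part of the patch) of the calling convention this hunk and the updateWorkflowExecutionWithNewAsActive call that follows switch to: transfer and timer tasks are attached to mutable state via AddTransferTasks / AddTimerTasks, and the context persists everything when the transaction is closed, with no explicit transactionID from the shard. The function name and parameters below are hypothetical; it assumes the service/history package and its existing imports (time, persistence).

// updateAsActivePattern (hypothetical) shows the shape of an active-side update
// after this refactor: callers no longer pass task slices or a transactionID to
// the workflow execution context.
func updateAsActivePattern(
	context workflowExecutionContext,
	msBuilder mutableState,
	now time.Time,
	transferTasks []persistence.Task,
	timerTasks []persistence.Task,
) error {
	msBuilder.AddTransferTasks(transferTasks...)
	msBuilder.AddTimerTasks(timerTasks...)
	// closes the mutable state transaction as active (may emit replication
	// tasks and runs the failover / buffered-events checks) and persists it
	return context.updateWorkflowExecutionAsActive(now)
}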
+ + newExecutionInfo := newMutableState.GetExecutionInfo() + err = context.updateWorkflowExecutionWithNewAsActive( + t.shard.GetTimeSource().Now(), + newWorkflowExecutionContext( + newExecutionInfo.DomainID, + workflow.WorkflowExecution{ + WorkflowId: common.StringPtr(newExecutionInfo.WorkflowID), + RunId: common.StringPtr(newExecutionInfo.RunID), + }, + t.shard, + t.shard.GetExecutionManager(), + t.logger, + ), + newMutableState, + ) if err != nil { if err == ErrConflict { continue Update_History_Loop @@ -806,11 +816,10 @@ func (t *timerQueueActiveProcessorImpl) updateWorkflowExecution( timerTasks []persistence.Task, ) error { executionInfo := msBuilder.GetExecutionInfo() - var transferTasks []persistence.Task var err error if scheduleNewDecision { // Schedule a new decision. - transferTasks, timerTasks, err = context.scheduleNewDecision(transferTasks, timerTasks) + err = scheduleDecision(msBuilder, t.shard.GetTimeSource(), t.logger) if err != nil { return err } @@ -818,24 +827,20 @@ func (t *timerQueueActiveProcessorImpl) updateWorkflowExecution( if createDeletionTask { tBuilder := t.historyService.getTimerBuilder(context.getExecution()) - tranT, timerT, err := t.historyService.getWorkflowHistoryCleanupTasks( + transferTask, timerTask, err := t.historyService.getWorkflowHistoryCleanupTasks( executionInfo.DomainID, executionInfo.WorkflowID, tBuilder) if err != nil { return err } - transferTasks = append(transferTasks, tranT) - timerTasks = append(timerTasks, timerT) - } - - // Generate a transaction ID for appending events to history - transactionID, err1 := t.historyService.shard.GetNextTransferTaskID() - if err1 != nil { - return err1 + msBuilder.AddTransferTasks(transferTask) + msBuilder.AddTimerTasks(timerTask) } + msBuilder.AddTimerTasks(timerTasks...) - err = context.updateAsActive(transferTasks, timerTasks, transactionID) + now := t.shard.GetTimeSource().Now() + err = context.updateWorkflowExecutionAsActive(now) if err != nil { if isShardOwnershiptLostError(err) { // Shard is stolen. 
Stop timer processing to reduce duplicates diff --git a/service/history/timerQueueProcessorBase.go b/service/history/timerQueueProcessorBase.go index 6b17781b89f..df98196ab03 100644 --- a/service/history/timerQueueProcessorBase.go +++ b/service/history/timerQueueProcessorBase.go @@ -515,12 +515,6 @@ func (t *timerQueueProcessorBase) handleTaskError(scope int, startTime time.Time return nil } - if _, ok := err.(*workflow.LimitExceededError); ok { - t.metricsClient.IncCounter(scope, metrics.TaskLimitExceededCounter) - logger.Error("Task encounter limit exceeded error.", tag.Error(err), tag.LifeCycleProcessingFailed) - return err - } - logger.Error("Fail to process task", tag.Error(err), tag.LifeCycleProcessingFailed) return err } @@ -541,13 +535,15 @@ func (t *timerQueueProcessorBase) ackTaskOnce(task *persistence.TimerTaskInfo, s func (t *timerQueueProcessorBase) initializeLoggerForTask(task *persistence.TimerTaskInfo) log.Logger { logger := t.logger.WithTags( + tag.ShardID(t.shard.GetShardID()), + tag.WorkflowDomainID(task.DomainID), tag.WorkflowID(task.WorkflowID), tag.WorkflowRunID(task.RunID), - tag.WorkflowDomainID(task.DomainID), - tag.ShardID(t.shard.GetShardID()), tag.TaskID(task.GetTaskID()), tag.FailoverVersion(task.GetVersion()), - tag.TaskType(task.GetTaskType())) + tag.TaskType(task.GetTaskType()), + tag.WorkflowTimeoutType(int64(task.TimeoutType)), + ) logger.Debug(fmt.Sprintf("Processing timer task: %v, type: %v", task.GetTaskID(), task.GetTaskType())) return logger } diff --git a/service/history/timerQueueProcessorBase_test.go b/service/history/timerQueueProcessorBase_test.go index b7de8d95b30..f5d0507530e 100644 --- a/service/history/timerQueueProcessorBase_test.go +++ b/service/history/timerQueueProcessorBase_test.go @@ -254,11 +254,6 @@ func (s *timerQueueProcessorBaseSuite) TestHandleTaskError_CurrentWorkflowCondit s.Nil(s.timerQueueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) } -func (s *timerQueueProcessorBaseSuite) TestHandleTaskError_LimitExceededError() { - err := &workflow.LimitExceededError{} - s.Equal(err, s.timerQueueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) -} - func (s *timerQueueProcessorBaseSuite) TestHandleTaskError_RandomErr() { err := errors.New("random error") s.Equal(err, s.timerQueueProcessor.handleTaskError(s.scope, time.Now(), s.notificationChan, err, s.logger)) diff --git a/service/history/timerQueueProcessor_test.go b/service/history/timerQueueProcessor_test.go index 7e7bb4c6159..71eb8a99c11 100644 --- a/service/history/timerQueueProcessor_test.go +++ b/service/history/timerQueueProcessor_test.go @@ -164,7 +164,7 @@ func (s *timerQueueProcessorSuite) createExecutionWithTimers(domainID string, we timerTasks := []persistence.Task{} timerInfos := []*persistence.TimerInfo{} decisionCompletedID := int64(4) - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) for _, timeOut := range timeOuts { _, ti, err := builder.AddTimerStartedEvent(decisionCompletedID, @@ -340,7 +340,7 @@ func (s *timerQueueProcessorSuite) TestTimerTaskAfterProcessorStart() { processor := s.engineImpl.timerProcessor.(*timerQueueProcessorImpl) processor.Start() - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) tt := s.addDecisionTimer(domainID, workflowExecution, tBuilder) 
processor.NotifyNewTimers(cluster.TestCurrentClusterName, s.ShardContext.GetCurrentTime(cluster.TestCurrentClusterName), tt) @@ -434,7 +434,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToStart_WithOutS s.Nil(err) s.NotNil(activityScheduledEvent) - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) timerTasks := []persistence.Task{tt} @@ -482,7 +482,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToStart_WithStar s.Nil(err) // create a schedule to start timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) timerTasks := []persistence.Task{tt} @@ -528,7 +528,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToStart_MoreThan s.Nil(err) s.NotNil(activityScheduledEvent) - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) s.Equal(workflow.TimeoutTypeScheduleToStart, workflow.TimeoutType(tt.(*persistence.ActivityTimeoutTask).TimeoutType)) @@ -576,7 +576,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskStartToClose_WithStart() s.Nil(err) // create a start to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) timerTasks := []persistence.Task{tt} @@ -627,7 +627,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskStartToClose_CompletedAc s.Nil(err) // create a start to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) t, err := tBuilder.AddStartToCloseActivityTimeout(ai) s.NoError(err) s.NotNil(t) @@ -673,7 +673,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToClose_JustSche s.NotNil(ase) // create a schedule to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) timerTasks := []persistence.Task{tt} @@ -721,7 +721,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToClose_Started( s.Nil(err) // create a schedule to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) tt := tBuilder.GetActivityTimerTaskIfNeeded(builder) s.NotNil(tt) timerTasks := []persistence.Task{tt} @@ -773,7 +773,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskScheduleToClose_Complete s.Nil(err) // create a schedule to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, 
&mockTimeSource{currTime: time.Now()}) t, err := tBuilder.AddScheduleToCloseActivityTimeout(ai) s.NoError(err) s.NotNil(t) @@ -802,7 +802,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTaskHeartBeat_JustStarted() p := s.engineImpl.timerProcessor.(*timerQueueProcessorImpl) p.Start() - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) ase, timerTasks := s.addHeartBeatTimer(domainID, workflowExecution, tBuilder) p.NotifyNewTimers(cluster.TestCurrentClusterName, s.ShardContext.GetCurrentTime(cluster.TestCurrentClusterName), timerTasks) @@ -854,7 +854,7 @@ func (s *timerQueueProcessorSuite) TestTimerActivityTask_SameExpiry() { s.NotNil(ase2) // create a schedule to close timeout - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) t, err := tBuilder.AddScheduleToCloseActivityTimeout(ai1) s.NoError(err) s.NotNil(t) @@ -895,7 +895,7 @@ func (s *timerQueueProcessorSuite) TestTimerUserTimers() { p := s.engineImpl.timerProcessor.(*timerQueueProcessorImpl) p.Start() - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) timerID := "tid1" timerTasks := s.addUserTimer(domainID, workflowExecution, timerID, tBuilder) p.NotifyNewTimers(cluster.TestCurrentClusterName, s.ShardContext.GetCurrentTime(cluster.TestCurrentClusterName), timerTasks) @@ -926,7 +926,7 @@ func (s *timerQueueProcessorSuite) TestTimerUserTimers_SameExpiry() { condition := state.ExecutionInfo.NextEventID // load any timers. - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now().Add(-1 * time.Second)}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now().Add(-1 * time.Second)}) timerTasks := []persistence.Task{} // create two user timers. 
@@ -973,7 +973,7 @@ func (s *timerQueueProcessorSuite) TestTimersOnClosedWorkflow() { p := s.engineImpl.timerProcessor.(*timerQueueProcessorImpl) p.Start() - tBuilder := newTimerBuilder(s.ShardContext.GetConfig(), s.logger, &mockTimeSource{currTime: time.Now()}) + tBuilder := newTimerBuilder(s.logger, &mockTimeSource{currTime: time.Now()}) // Start of one of each timers each s.addDecisionTimer(domainID, workflowExecution, tBuilder) diff --git a/service/history/timerQueueStandbyProcessor.go b/service/history/timerQueueStandbyProcessor.go index 417eb49fd6c..c16051194cb 100644 --- a/service/history/timerQueueStandbyProcessor.go +++ b/service/history/timerQueueStandbyProcessor.go @@ -291,7 +291,6 @@ func (t *timerQueueStandbyProcessorImpl) processActivityTimeout(timerTask *persi // need to clear the activity heartbeat timer task marks doUpdate := false lastWriteVersion := msBuilder.GetLastWriteVersion() - sourceCluster := t.clusterMetadata.ClusterNameForFailoverVersion(lastWriteVersion) isHeartBeatTask := timerTask.TimeoutType == int(workflow.TimeoutTypeHeartbeat) if activityInfo, pending := msBuilder.GetActivityInfo(timerTask.EventID); isHeartBeatTask && pending { doUpdate = true @@ -310,17 +309,14 @@ func (t *timerQueueStandbyProcessorImpl) processActivityTimeout(timerTask *persi return nil } - // code below does the update of activity and possible generation of a new activity timer task - transactionID, err := t.shard.GetNextTransferTaskID() - if err != nil { - return err - } now := t.getStandbyClusterTime() // we need to handcraft some of the variables // since the job being done here is update the activity and possibly write a timer task to DB // also need to reset the current version. msBuilder.UpdateReplicationStateVersion(lastWriteVersion, true) - err = context.updateAsPassive(nil, newTimerTasks, transactionID, now, false, nil, sourceCluster) + + msBuilder.AddTimerTasks(newTimerTasks...) 
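Illustrative sketch (not part of the patch): the passive counterpart used by the standby timer processor in the hunk around this point, where timer tasks are attached to mutable state and the context call on the next line closes the transaction as passive. The function name is hypothetical and assumes the same package scope as the earlier sketch.

// updateAsPassivePattern (hypothetical): standby-side updates attach their
// timer tasks to mutable state and close the transaction as passive, which
// skips decision failover / buffered-events handling and must not emit
// replication tasks.
func updateAsPassivePattern(
	context workflowExecutionContext,
	msBuilder mutableState,
	now time.Time,
	newTimerTasks []persistence.Task,
) error {
	msBuilder.AddTimerTasks(newTimerTasks...)
	return context.updateWorkflowExecutionAsPassive(now)
}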
+ err := context.updateWorkflowExecutionAsPassive(now) if err == nil { t.notifyNewTimers(newTimerTasks) } @@ -464,7 +460,7 @@ func (t *timerQueueStandbyProcessorImpl) getTimerBuilder() *timerBuilder { timeSource := clock.NewEventTimeSource() now := t.getStandbyClusterTime() timeSource.Update(now) - return newTimerBuilder(t.shard.GetConfig(), t.logger, timeSource) + return newTimerBuilder(t.logger, timeSource) } func (t *timerQueueStandbyProcessorImpl) processTimer(timerTask *persistence.TimerTaskInfo, diff --git a/service/history/timerQueueStandbyProcessor_test.go b/service/history/timerQueueStandbyProcessor_test.go index 7808e6574e7..2a29039cf09 100644 --- a/service/history/timerQueueStandbyProcessor_test.go +++ b/service/history/timerQueueStandbyProcessor_test.go @@ -208,7 +208,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessExpiredUserTimer_Pending() event, timerInfo := addTimerStartedEvent(msBuilder, event.GetEventId(), timerID, int64(timerTimeout.Seconds())) nextEventID := event.GetEventId() + 1 - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) tBuilder.AddUserTimer(timerInfo, msBuilder) timerTask := &persistence.TimerTaskInfo{ Version: version, @@ -273,7 +273,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessExpiredUserTimer_Success() timerTimeout := 2 * time.Second event, timerInfo := addTimerStartedEvent(msBuilder, event.GetEventId(), timerID, int64(timerTimeout.Seconds())) - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) tBuilder.AddUserTimer(timerInfo, msBuilder) timerTask := &persistence.TimerTaskInfo{ Version: version, @@ -335,7 +335,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessExpiredUserTimer_Multiple() timerTimeout2 := 50 * time.Second _, timerInfo2 := addTimerStartedEvent(msBuilder, event.GetEventId(), timerID2, int64(timerTimeout2.Seconds())) - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) tBuilder.AddUserTimer(timerInfo1, msBuilder) tBuilder.AddUserTimer(timerInfo2, msBuilder) @@ -399,7 +399,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessActivityTimeout_Pending() { int32(timerTimeout.Seconds()), int32(timerTimeout.Seconds()), int32(timerTimeout.Seconds())) nextEventID := scheduledEvent.GetEventId() + 1 - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) timerTask := &persistence.TimerTaskInfo{ Version: version, @@ -469,7 +469,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessActivityTimeout_Success() { int32(timerTimeout.Seconds()), int32(timerTimeout.Seconds()), int32(timerTimeout.Seconds())) startedEvent := addActivityTaskStartedEvent(msBuilder, scheduleEvent.GetEventId(), identity) - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) _, err = tBuilder.AddStartToCloseActivityTimeout(timerInfo) s.Nil(err) @@ -546,7 +546,7 @@ func (s *timerQueueStandbyProcessorSuite) TestProcessActivityTimeout_Multiple_Ca activityInfo2.TimerTaskStatus |= TimerTaskStatusCreatedHeartbeat activityInfo2.LastHeartBeatUpdatedTime = time.Now() - tBuilder := newTimerBuilder(s.mockShard.GetConfig(), s.logger, 
clock.NewRealTimeSource()) + tBuilder := newTimerBuilder(s.logger, clock.NewRealTimeSource()) _, err = tBuilder.AddStartToCloseActivityTimeout(timerInfo1) s.Nil(err) _, err = tBuilder.AddScheduleToCloseActivityTimeout(timerInfo2) diff --git a/service/history/transferQueueActiveProcessor.go b/service/history/transferQueueActiveProcessor.go index 82bf356ea2b..9d7ca22f74d 100644 --- a/service/history/transferQueueActiveProcessor.go +++ b/service/history/transferQueueActiveProcessor.go @@ -1252,22 +1252,17 @@ Update_History_Loop: if createDecisionTask { // Create a transfer task to schedule a decision task - var err error - transferTasks, timerTasks, err = context.scheduleNewDecision(transferTasks, timerTasks) + err := scheduleDecision(msBuilder, t.shard.GetTimeSource(), t.logger) if err != nil { return err } } - // Generate a transaction ID for appending events to history - transactionID, err2 := t.shard.GetNextTransferTaskID() - if err2 != nil { - return err2 - } - // We apply the update to execution using optimistic concurrency. If it fails due to a conflict then reload // the history and try the operation again. - if err := context.updateAsActive(transferTasks, timerTasks, transactionID); err != nil { + msBuilder.AddTransferTasks(transferTasks...) + msBuilder.AddTimerTasks(timerTasks...) + if err := context.updateWorkflowExecutionAsActive(t.shard.GetTimeSource().Now()); err != nil { if err == ErrConflict { continue Update_History_Loop } diff --git a/service/history/workflowExecutionContext.go b/service/history/workflowExecutionContext.go index c1476eb2482..96dacc943f1 100644 --- a/service/history/workflowExecutionContext.go +++ b/service/history/workflowExecutionContext.go @@ -25,13 +25,11 @@ import ( "fmt" "time" - h "github.com/uber/cadence/.gen/go/history" workflow "github.com/uber/cadence/.gen/go/shared" "github.com/uber/cadence/common" "github.com/uber/cadence/common/backoff" "github.com/uber/cadence/common/clock" "github.com/uber/cadence/common/cluster" - "github.com/uber/cadence/common/errors" "github.com/uber/cadence/common/locks" "github.com/uber/cadence/common/log" "github.com/uber/cadence/common/log/tag" @@ -56,6 +54,9 @@ type ( lock(ctx context.Context) error unlock() + getHistorySize() int64 + setHistorySize(size int64) + persistFirstWorkflowEvents( workflowEvents *persistence.WorkflowEvents, ) (int64, error) @@ -63,15 +64,6 @@ type ( workflowEvents *persistence.WorkflowEvents, ) (int64, error) - appendFirstBatchEventsForActive( - msBuilder mutableState, - createReplicationTask bool, - ) (int64, persistence.Task, error) - appendFirstBatchEventsForStandby( - msBuilder mutableState, - history []*workflow.HistoryEvent, - ) (int64, persistence.Task, error) - createWorkflowExecution( newWorkflow *persistence.WorkflowSnapshot, historySize int64, @@ -80,14 +72,30 @@ type ( prevRunID string, prevLastWriteVersion int64, ) error - - replicateWorkflowExecution( - request *h.ReplicateEventsRequest, - transferTasks []persistence.Task, - timerTasks []persistence.Task, - lastEventID int64, + updateWorkflowExecutionAsActive( + now time.Time, + ) error + updateWorkflowExecutionWithNewAsActive( + now time.Time, + newContext workflowExecutionContext, + newMutableState mutableState, + ) error + updateWorkflowExecutionAsPassive( + now time.Time, + ) error + updateWorkflowExecutionWithNewAsPassive( now time.Time, + newContext workflowExecutionContext, + newMutableState mutableState, + ) error + updateWorkflowExecutionWithNew( + now time.Time, + newContext workflowExecutionContext, + newMutableState 
mutableState, + currentWorkflowTransactionPolicy transactionPolicy, + newWorkflowTransactionPolicy *transactionPolicy, ) error + resetMutableState( prevRunID string, prevLastWriteVersion int64, @@ -112,31 +120,6 @@ type ( baseRunID string, baseRunNextEventID int64, ) (retError error) - scheduleNewDecision( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - ) ([]persistence.Task, []persistence.Task, error) - - updateAsActive( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - ) error - updateAsActiveWithNew( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - newStateBuilder mutableState, - ) error - updateAsPassive( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - now time.Time, - createReplicationTask bool, - standbyHistoryBuilder *historyBuilder, - sourceCluster string, - ) error } ) @@ -151,11 +134,10 @@ type ( metricsClient metrics.Client timeSource clock.TimeSource - locker locks.Mutex - msBuilder mutableState - stats *persistence.ExecutionStats - updateCondition int64 - createReplicationTask bool + mutex locks.Mutex + msBuilder mutableState + stats *persistence.ExecutionStats + updateCondition int64 } ) @@ -188,16 +170,25 @@ func newWorkflowExecutionContext( logger: lg, metricsClient: shard.GetMetricsClient(), timeSource: shard.GetTimeSource(), - locker: locks.NewMutex(), + mutex: locks.NewMutex(), + stats: &persistence.ExecutionStats{ + HistorySize: 0, + }, } } func (c *workflowExecutionContextImpl) lock(ctx context.Context) error { - return c.locker.Lock(ctx) + return c.mutex.Lock(ctx) } func (c *workflowExecutionContextImpl) unlock() { - c.locker.Unlock() + c.mutex.Unlock() +} + +func (c *workflowExecutionContextImpl) clear() { + c.metricsClient.IncCounter(metrics.WorkflowContextScope, metrics.WorkflowContextCleared) + c.msBuilder = nil + c.stats = nil } func (c *workflowExecutionContextImpl) getDomainID() string { @@ -212,6 +203,22 @@ func (c *workflowExecutionContextImpl) getLogger() log.Logger { return c.logger } +func (c *workflowExecutionContextImpl) getDomainName() string { + domainEntry, err := c.shard.GetDomainCache().GetDomainByID(c.domainID) + if err != nil { + return "" + } + return domainEntry.GetInfo().Name +} + +func (c *workflowExecutionContextImpl) getHistorySize() int64 { + return c.stats.HistorySize +} + +func (c *workflowExecutionContextImpl) setHistorySize(size int64) { + c.stats.HistorySize = size +} + func (c *workflowExecutionContextImpl) loadExecutionStats() (*persistence.ExecutionStats, error) { _, err := c.loadWorkflowExecution() if err != nil { @@ -242,11 +249,6 @@ func (c *workflowExecutionContextImpl) loadWorkflowExecutionInternal() error { Execution: c.workflowExecution, }) if err != nil { - if common.IsPersistenceTransientError(err) { - c.logger.Error("Persistent store operation failure", - tag.StoreOperationGetWorkflowExecution, - tag.Error(err)) - } return err } @@ -269,6 +271,25 @@ func (c *workflowExecutionContextImpl) loadWorkflowExecutionInternal() error { return nil } +func (c *workflowExecutionContextImpl) updateVersion() error { + if c.shard.GetService().GetClusterMetadata().IsGlobalDomainEnabled() && c.msBuilder.GetReplicationState() != nil { + if !c.msBuilder.IsWorkflowExecutionRunning() { + // we should not update the version on mutable state when the workflow is finished + return nil + } + // Support for global domains is enabled and we are performing an update for global domain + 
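// The domain lookup below stamps the domain's current failover version onto
// the replication state and records the derived replication policy on the
// mutable state; downstream code that closes the transaction can then decide
// whether replication tasks are needed for this update.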
domainEntry, err := c.shard.GetDomainCache().GetDomainByID(c.domainID) + if err != nil { + return err + } + c.msBuilder.UpdateReplicationStateVersion(domainEntry.GetFailoverVersion(), false) + + // this is a hack, only create replication task if have # target cluster > 1, for more see #868 + c.msBuilder.UpdateReplicationPolicy(domainEntry.GetReplicationPolicy()) + } + return nil +} + func (c *workflowExecutionContextImpl) createWorkflowExecution( newWorkflow *persistence.WorkflowSnapshot, historySize int64, @@ -286,554 +307,147 @@ func (c *workflowExecutionContextImpl) createWorkflowExecution( NewWorkflowSnapshot: *newWorkflow, } - // the history size needs to be special treated + createRequest.NewWorkflowSnapshot.ExecutionStats = &persistence.ExecutionStats{ HistorySize: historySize, } - _, err := c.shard.CreateWorkflowExecution(createRequest) + _, err := c.createWorkflowExecutionWithRetry(createRequest) return err } -func (c *workflowExecutionContextImpl) resetMutableState( - prevRunID string, - prevLastWriteVersion int64, - prevState int, - replicationTasks []persistence.Task, - transferTasks []persistence.Task, - timerTasks []persistence.Task, - resetBuilder mutableState, - resetHistorySize int64, -) (mutableState, error) { +func (c *workflowExecutionContextImpl) updateWorkflowExecutionAsActive( + now time.Time, +) error { - // this only resets one mutableState for a workflow - snapshotRequest := resetBuilder.ResetSnapshot( // TODO - prevRunID, - prevLastWriteVersion, - prevState, - replicationTasks, - transferTasks, - timerTasks, + return c.updateWorkflowExecutionWithNew( + now, + nil, + nil, + transactionPolicyActive, + nil, ) - snapshotRequest.ResetWorkflowSnapshot.ExecutionStats = &persistence.ExecutionStats{ - HistorySize: resetHistorySize, - } - snapshotRequest.ResetWorkflowSnapshot.Condition = c.updateCondition +} - err := c.shard.ResetMutableState(snapshotRequest) - if err != nil { - return nil, err - } +func (c *workflowExecutionContextImpl) updateWorkflowExecutionWithNewAsActive( + now time.Time, + newContext workflowExecutionContext, + newMutableState mutableState, +) error { - c.clear() - return c.loadWorkflowExecution() + return c.updateWorkflowExecutionWithNew( + now, + newContext, + newMutableState, + transactionPolicyActive, + transactionPolicyActive.ptr(), + ) } -// this reset is more complex than "resetMutableState", it involes currentMutableState and newMutableState: -// 1. append history to new run -// 2. append history to current run if current run is not closed -// 3. 
update mutableState(terminate current run if not closed) and create new run -func (c *workflowExecutionContextImpl) resetWorkflowExecution( - currMutableState mutableState, - updateCurr bool, - closeTask persistence.Task, - cleanupTask persistence.Task, - newMutableState mutableState, - newHistorySize int64, - newTransferTasks []persistence.Task, - newTimerTasks []persistence.Task, - currReplicationTasks []persistence.Task, - newReplicationTasks []persistence.Task, - baseRunID string, - baseRunNextEventID int64, -) (retError error) { +func (c *workflowExecutionContextImpl) updateWorkflowExecutionAsPassive( + now time.Time, +) error { - now := c.timeSource.Now() - currTransferTasks := []persistence.Task{} - currTimerTasks := []persistence.Task{} - if closeTask != nil { - currTransferTasks = append(currTransferTasks, closeTask) - } - if cleanupTask != nil { - currTimerTasks = append(currTimerTasks, cleanupTask) - } - setTaskInfo(currMutableState.GetCurrentVersion(), now, currTransferTasks, currTimerTasks) - setTaskInfo(newMutableState.GetCurrentVersion(), now, newTransferTasks, newTimerTasks) + return c.updateWorkflowExecutionWithNew( + now, + nil, + nil, + transactionPolicyPassive, + nil, + ) +} - transactionID, retError := c.shard.GetNextTransferTaskID() - if retError != nil { - return retError - } +func (c *workflowExecutionContextImpl) updateWorkflowExecutionWithNewAsPassive( + now time.Time, + newContext workflowExecutionContext, + newMutableState mutableState, +) error { - // Since we always reset to decision task, there shouldn't be any buffered events. - // Therefore currently ResetWorkflowExecution persistence API doesn't implement setting buffered events. - if newMutableState.HasBufferedEvents() { - retError = &workflow.InternalServiceError{ - Message: fmt.Sprintf("reset workflow execution shouldn't have buffered events"), - } - return - } + return c.updateWorkflowExecutionWithNew( + now, + newContext, + newMutableState, + transactionPolicyPassive, + transactionPolicyPassive.ptr(), + ) +} - // call FlushBufferedEvents to assign task id to event - // as well as update last event task id in ms state builder - retError = currMutableState.FlushBufferedEvents() - if retError != nil { - return retError - } - retError = newMutableState.FlushBufferedEvents() - if retError != nil { - return retError - } +func (c *workflowExecutionContextImpl) updateWorkflowExecutionWithNew( + now time.Time, + newContext workflowExecutionContext, + newMutableState mutableState, + currentWorkflowTransactionPolicy transactionPolicy, + newWorkflowTransactionPolicy *transactionPolicy, +) (retError error) { - if updateCurr { - hBuilder := currMutableState.GetHistoryBuilder() - var size int - // TODO workflow execution reset logic generates replication tasks in its own business logic - // should use append history events in the future - size, _, retError = c.appendHistoryEvents(hBuilder.GetHistory().GetEvents(), transactionID, true, false, nil) + defer func() { if retError != nil { - return + c.clear() } - c.stats.HistorySize += int64(size) - } + }() - // Note: we already made sure that newMutableState is using eventsV2 - hBuilder := newMutableState.GetHistoryBuilder() - size, retError := c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ - IsNewBranch: false, - BranchToken: newMutableState.GetCurrentBranch(), - Events: hBuilder.GetHistory().GetEvents(), - TransactionID: transactionID, - }, c.domainID, c.workflowExecution) - if retError != nil { - return + currentWorkflow, workflowEventsSeq, err 
:= c.msBuilder.CloseTransactionAsMutation(now, currentWorkflowTransactionPolicy) + if err != nil { + return err } - newHistorySize += int64(size) - - // ResetSnapshot function used here really does rely on inputs below - snapshotRequest := newMutableState.ResetSnapshot("", 0, 0, nil, nil, nil) - if len(snapshotRequest.ResetWorkflowSnapshot.ChildExecutionInfos) > 0 || - len(snapshotRequest.ResetWorkflowSnapshot.SignalInfos) > 0 || - len(snapshotRequest.ResetWorkflowSnapshot.SignalRequestedIDs) > 0 { - return &workflow.InternalServiceError{ - Message: fmt.Sprintf("something went wrong, we shouldn't see any pending childWF, sending Signal or signal requested"), + currentWorkflowSize := c.getHistorySize() + for _, workflowEvents := range workflowEventsSeq { + eventsSize, err := c.persistNonFirstWorkflowEvents(workflowEvents) + if err != nil { + return err } + currentWorkflowSize += eventsSize + } + c.setHistorySize(currentWorkflowSize) + currentWorkflow.ExecutionStats = &persistence.ExecutionStats{ + HistorySize: currentWorkflowSize, } - resetWFReq := &persistence.ResetWorkflowExecutionRequest{ - BaseRunID: baseRunID, - BaseRunNextEventID: baseRunNextEventID, + var newWorkflow *persistence.WorkflowSnapshot + if newContext != nil && newMutableState != nil && newWorkflowTransactionPolicy != nil { + defer func() { + if retError != nil { + newContext.clear() + } + }() - CurrentRunID: currMutableState.GetExecutionInfo().RunID, - CurrentRunNextEventID: currMutableState.GetExecutionInfo().NextEventID, + newWorkflow, workflowEventsSeq, err = newMutableState.CloseTransactionAsSnapshot(now, *newWorkflowTransactionPolicy) + if err != nil { + return err + } - CurrentWorkflowMutation: nil, + newWorkflowSizeSize := newContext.getHistorySize() + eventsSize, err := c.persistFirstWorkflowEvents(workflowEventsSeq[0]) + if err != nil { + return err + } + newWorkflowSizeSize += eventsSize - NewWorkflowSnapshot: persistence.WorkflowSnapshot{ - ExecutionInfo: newMutableState.GetExecutionInfo(), - ExecutionStats: &persistence.ExecutionStats{ - HistorySize: newHistorySize, - }, - ReplicationState: newMutableState.GetReplicationState(), + newContext.setHistorySize(currentWorkflowSize) + newWorkflow.ExecutionStats = &persistence.ExecutionStats{ + HistorySize: currentWorkflowSize, + } + } - ActivityInfos: snapshotRequest.ResetWorkflowSnapshot.ActivityInfos, - TimerInfos: snapshotRequest.ResetWorkflowSnapshot.TimerInfos, - ChildExecutionInfos: snapshotRequest.ResetWorkflowSnapshot.ChildExecutionInfos, - RequestCancelInfos: snapshotRequest.ResetWorkflowSnapshot.RequestCancelInfos, - SignalInfos: snapshotRequest.ResetWorkflowSnapshot.SignalInfos, - SignalRequestedIDs: snapshotRequest.ResetWorkflowSnapshot.SignalRequestedIDs, + if err := c.mergeContinueAsNewReplicationTasks( + currentWorkflow, + newWorkflow, + ); err != nil { + return err + } - TransferTasks: newTransferTasks, - ReplicationTasks: newReplicationTasks, - TimerTasks: newTimerTasks, - }, + resp, err := c.updateWorkflowExecutionWithRetry(&persistence.UpdateWorkflowExecutionRequest{ + // RangeID , this is set by shard context + UpdateWorkflowMutation: *currentWorkflow, + NewWorkflowSnapshot: newWorkflow, + // Encoding, this is set by shard context + }) + if err != nil { + return err } - if updateCurr { - resetWFReq.CurrentWorkflowMutation = &persistence.WorkflowMutation{ - ExecutionInfo: currMutableState.GetExecutionInfo(), - ExecutionStats: &persistence.ExecutionStats{ - HistorySize: c.stats.HistorySize, - }, - ReplicationState: 
currMutableState.GetReplicationState(), - - UpsertActivityInfos: []*persistence.ActivityInfo{}, - DeleteActivityInfos: []int64{}, - UpserTimerInfos: []*persistence.TimerInfo{}, - DeleteTimerInfos: []string{}, - UpsertChildExecutionInfos: []*persistence.ChildExecutionInfo{}, - DeleteChildExecutionInfo: nil, - UpsertRequestCancelInfos: []*persistence.RequestCancelInfo{}, - DeleteRequestCancelInfo: nil, - UpsertSignalInfos: []*persistence.SignalInfo{}, - DeleteSignalInfo: nil, - UpsertSignalRequestedIDs: []string{}, - DeleteSignalRequestedID: "", - NewBufferedEvents: []*workflow.HistoryEvent{}, - ClearBufferedEvents: false, - - TransferTasks: currTransferTasks, - ReplicationTasks: currReplicationTasks, - TimerTasks: currTimerTasks, - - Condition: c.updateCondition, - } - } - - return c.shard.ResetWorkflowExecution(resetWFReq) -} - -func (c *workflowExecutionContextImpl) replicateWorkflowExecution( - request *h.ReplicateEventsRequest, - transferTasks []persistence.Task, - timerTasks []persistence.Task, - lastEventID int64, - now time.Time, -) error { - - transactionID, err := c.shard.GetNextTransferTaskID() - if err != nil { - return err - } - - nextEventID := lastEventID + 1 - c.msBuilder.GetExecutionInfo().SetNextEventID(nextEventID) - - standbyHistoryBuilder := newHistoryBuilderFromEvents(request.History.Events, c.logger) - return c.updateAsPassive( - transferTasks, - timerTasks, - transactionID, - now, - false, - standbyHistoryBuilder, - request.GetSourceCluster(), - ) -} - -func (c *workflowExecutionContextImpl) updateVersion() error { - if c.shard.GetService().GetClusterMetadata().IsGlobalDomainEnabled() && c.msBuilder.GetReplicationState() != nil { - if !c.msBuilder.IsWorkflowExecutionRunning() { - // we should not update the version on mutable state when the workflow is finished - return nil - } - // Support for global domains is enabled and we are performing an update for global domain - domainEntry, err := c.shard.GetDomainCache().GetDomainByID(c.domainID) - if err != nil { - return err - } - c.msBuilder.UpdateReplicationStateVersion(domainEntry.GetFailoverVersion(), false) - - // this is a hack, only create replication task if have # target cluster > 1, for more see #868 - c.createReplicationTask = domainEntry.CanReplicateEvent() - } - return nil -} - -func (c *workflowExecutionContextImpl) updateAsActive( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, -) error { - return c.updateAsActiveWithNew(transferTasks, timerTasks, transactionID, nil) -} - -func (c *workflowExecutionContextImpl) updateAsActiveWithNew( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - newStateBuilder mutableState, -) error { - - if c.msBuilder.GetReplicationState() != nil { - currentVersion := c.msBuilder.GetCurrentVersion() - - activeCluster := c.clusterMetadata.ClusterNameForFailoverVersion(currentVersion) - currentCluster := c.clusterMetadata.GetCurrentClusterName() - if activeCluster != currentCluster { - domainID := c.msBuilder.GetExecutionInfo().DomainID - c.clear() - return errors.NewDomainNotActiveError(domainID, currentCluster, activeCluster) - } - - // Handling mutable state turn from standby to active, while having a decision on the fly - if di, ok := c.msBuilder.GetInFlightDecisionTask(); ok && c.msBuilder.IsWorkflowExecutionRunning() { - if di.Version < currentVersion { - // we have a decision on the fly with a lower version, fail it - if _, err := c.msBuilder.AddDecisionTaskFailedEvent( - di.ScheduleID, - 
di.StartedID, - workflow.DecisionTaskFailedCauseFailoverCloseDecision, - nil, - identityHistoryService, - "", - "", - "", - 0, - ); err != nil { - return err - } - - var transT, timerT []persistence.Task - transT, timerT, err := c.scheduleNewDecision(transT, timerT) - if err != nil { - return err - } - transferTasks = append(transferTasks, transT...) - timerTasks = append(timerTasks, timerT...) - } - } - } - - if !c.createReplicationTask { - c.logger.Debug(fmt.Sprintf( - "Skipping replication task creation: %v, workflowID: %v, runID: %v, firstEventID: %v, nextEventID: %v.", - c.domainID, - c.workflowExecution.GetWorkflowId(), - c.workflowExecution.GetRunId(), - c.msBuilder.GetExecutionInfo().LastFirstEventID, - c.msBuilder.GetExecutionInfo().NextEventID), - ) - } - - // compare with bad binaries and schedule a reset task - if len(c.msBuilder.GetPendingChildExecutionInfos()) == 0 { - // only schedule reset task if current doesn't have childWFs. - // TODO: This will be removed once our reset allows childWFs - - domainEntry, err := c.shard.GetDomainCache().GetDomainByID(c.domainID) - if err != nil { - return err - } - _, pt := FindAutoResetPoint(c.timeSource, &domainEntry.GetConfig().BadBinaries, c.msBuilder.GetExecutionInfo().AutoResetPoints) - if pt != nil { - transferTasks = append(transferTasks, &persistence.ResetWorkflowTask{}) - c.logger.Info("Auto-Reset task is scheduled", - tag.WorkflowDomainName(domainEntry.GetInfo().Name), - tag.WorkflowID(c.msBuilder.GetExecutionInfo().WorkflowID), - tag.WorkflowRunID(c.msBuilder.GetExecutionInfo().RunID), - tag.WorkflowResetBaseRunID(pt.GetRunId()), - tag.WorkflowEventID(pt.GetFirstDecisionCompletedId()), - tag.WorkflowBinaryChecksum(pt.GetBinaryChecksum())) - } - } - - now := c.timeSource.Now() - return c.update( - transferTasks, - timerTasks, - transactionID, - now, - c.createReplicationTask, - nil, - "", - newStateBuilder, - ) -} - -func (c *workflowExecutionContextImpl) updateAsPassive( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - now time.Time, - createReplicationTask bool, - standbyHistoryBuilder *historyBuilder, - sourceCluster string, -) error { - - return c.update( - transferTasks, - timerTasks, - transactionID, - now, - createReplicationTask, - standbyHistoryBuilder, - sourceCluster, - nil, - ) -} - -func (c *workflowExecutionContextImpl) update( - transferTasks []persistence.Task, - timerTasks []persistence.Task, - transactionID int64, - now time.Time, - createReplicationTask bool, - standbyHistoryBuilder *historyBuilder, - sourceCluster string, - newStateBuilder mutableState, -) (errRet error) { - - defer func() { - if errRet != nil { - // Clear all cached state in case of error - c.clear() - } - }() - - // Take a snapshot of all updates we have accumulated for this execution - updates, err := c.msBuilder.CloseUpdateSession() - if err != nil { - if err == ErrBufferedEventsLimitExceeded { - if err1 := c.failInflightDecision(); err1 != nil { - return err1 - } - - // Buffered events are flushed, we want upper layer to retry - return ErrConflict - } - return err - } - - executionInfo := c.msBuilder.GetExecutionInfo() - - // this builder has events generated locally - hasNewStandbyHistoryEvents := standbyHistoryBuilder != nil && len(standbyHistoryBuilder.history) > 0 - activeHistoryBuilder := updates.newEventsBuilder - hasNewActiveHistoryEvents := len(activeHistoryBuilder.history) > 0 - - if hasNewStandbyHistoryEvents && hasNewActiveHistoryEvents { - c.logger.Fatal("Both standby and active 
history builder has events.", - tag.WorkflowID(executionInfo.WorkflowID), - tag.WorkflowRunID(executionInfo.RunID), - tag.WorkflowDomainID(executionInfo.DomainID), - tag.WorkflowFirstEventID(executionInfo.LastFirstEventID), - tag.WorkflowNextEventID(executionInfo.NextEventID), - tag.ReplicationState(c.msBuilder.GetReplicationState()), - ) - } - - // Replication state should only be updated after the UpdateSession is closed. IDs for certain events are only - // generated on CloseSession as they could be buffered events. The value for NextEventID will be wrong on - // mutable state if read before flushing the buffered events. - crossDCEnabled := c.msBuilder.GetReplicationState() != nil - if crossDCEnabled { - // always standby history first - if hasNewStandbyHistoryEvents { - lastEvent := standbyHistoryBuilder.history[len(standbyHistoryBuilder.history)-1] - c.msBuilder.UpdateReplicationStateLastEventID( - lastEvent.GetVersion(), - lastEvent.GetEventId(), - ) - } - - if hasNewActiveHistoryEvents { - c.msBuilder.UpdateReplicationStateLastEventID( - c.msBuilder.GetCurrentVersion(), - executionInfo.NextEventID-1, - ) - } - } - - historySize := 0 - var replicationTasks []persistence.Task - - // always standby history first - if hasNewStandbyHistoryEvents { - firstEvent := standbyHistoryBuilder.GetFirstEvent() - // Note: standby events has no transient decision events - historySize, _, err = c.appendHistoryEvents(standbyHistoryBuilder.history, transactionID, true, false, nil) - if err != nil { - return err - } - - executionInfo.SetLastFirstEventID(firstEvent.GetEventId()) - } - - // Some operations only update the mutable state. For example RecordActivityTaskHeartbeat. - if hasNewActiveHistoryEvents { - var newReplicationTask persistence.Task - - // Transient decision events need to be written as a separate batch - if activeHistoryBuilder.HasTransientEvents() { - // transient decision events batch should not perform last event check - size, newReplicationTask, err := c.appendHistoryEvents(activeHistoryBuilder.transientHistory, transactionID, false, createReplicationTask, newStateBuilder) - if err != nil { - return err - } - if newReplicationTask != nil { - replicationTasks = append(replicationTasks, newReplicationTask) - } - executionInfo.SetLastFirstEventID(activeHistoryBuilder.transientHistory[0].GetEventId()) - historySize += size - } - - size, newReplicationTask, err := c.appendHistoryEvents(activeHistoryBuilder.history, transactionID, true, createReplicationTask, newStateBuilder) - if err != nil { - return err - } - if newReplicationTask != nil { - replicationTasks = append(replicationTasks, newReplicationTask) - } - - executionInfo.SetLastFirstEventID(activeHistoryBuilder.history[0].GetEventId()) - historySize += size - } // end of update history events for active builder - - if executionInfo.State == persistence.WorkflowStateCompleted { - // clear stickyness - c.msBuilder.ClearStickyness() - } - - if createReplicationTask { - replicationTasks = append(replicationTasks, updates.syncActivityTasks...) 
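// Callers of the removed updateAsActive built up transferTasks/timerTasks
// slices, fetched a transaction ID from the shard and passed everything in.
// With this refactor the tasks are handed to the mutable state and the
// context closes the transaction itself, as in the
// transferQueueActiveProcessor hunk earlier in this patch:
//
//	msBuilder.AddTransferTasks(transferTasks...)
//	msBuilder.AddTimerTasks(timerTasks...)
//	if err := context.updateWorkflowExecutionAsActive(t.shard.GetTimeSource().Now()); err != nil {
//		// handle ErrConflict / other errors
//	}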
- } - setTaskInfo(c.msBuilder.GetCurrentVersion(), now, transferTasks, timerTasks) - - // Update history size on mutableState before calling UpdateWorkflowExecution - c.stats.HistorySize += int64(historySize) - if updates.continueAsNew != nil { - newHistorySize, err := c.appendFirstBatchHistoryForContinueAsNew(updates.continueAsNewWorkflowEvents, transactionID) - if err != nil { - return err - } - updates.continueAsNew.ExecutionStats = &persistence.ExecutionStats{ - HistorySize: newHistorySize, - } - } - - var resp *persistence.UpdateWorkflowExecutionResponse - var err1 error - if resp, err1 = c.updateWorkflowExecutionWithRetry(&persistence.UpdateWorkflowExecutionRequest{ - UpdateWorkflowMutation: persistence.WorkflowMutation{ - ExecutionInfo: executionInfo, - ExecutionStats: c.stats, - ReplicationState: c.msBuilder.GetReplicationState(), - TransferTasks: transferTasks, - ReplicationTasks: replicationTasks, - TimerTasks: timerTasks, - Condition: c.updateCondition, - UpsertActivityInfos: updates.updateActivityInfos, - DeleteActivityInfos: updates.deleteActivityInfos, - UpserTimerInfos: updates.updateTimerInfos, - DeleteTimerInfos: updates.deleteTimerInfos, - UpsertChildExecutionInfos: updates.updateChildExecutionInfos, - DeleteChildExecutionInfo: updates.deleteChildExecutionInfo, - UpsertRequestCancelInfos: updates.updateCancelExecutionInfos, - DeleteRequestCancelInfo: updates.deleteCancelExecutionInfo, - UpsertSignalInfos: updates.updateSignalInfos, - DeleteSignalInfo: updates.deleteSignalInfo, - UpsertSignalRequestedIDs: updates.updateSignalRequestedIDs, - DeleteSignalRequestedID: updates.deleteSignalRequestedID, - NewBufferedEvents: updates.newBufferedEvents, - ClearBufferedEvents: updates.clearBufferedEvents, - }, - NewWorkflowSnapshot: updates.continueAsNew, - }); err1 != nil { - switch err1.(type) { - case *persistence.ConditionFailedError: - return ErrConflict - } - - c.logger.Error("Persistent store operation failure", - tag.StoreOperationUpdateWorkflowExecution, - tag.Error(err), tag.Number(c.updateCondition)) - return err1 - } - - // Update went through so update the condition for new updates - c.updateCondition = c.msBuilder.GetNextEventID() - c.msBuilder.GetExecutionInfo().LastUpdatedTimestamp = c.timeSource.Now() + // TODO remove updateCondition in favor of condition in mutable state + c.updateCondition = currentWorkflow.ExecutionInfo.NextEventID // for any change in the workflow, send a event _ = c.shard.NotifyNewHistoryEvent(newHistoryEventNotification( @@ -851,7 +465,7 @@ func (c *workflowExecutionContextImpl) update( c.metricsClient, domainName, int(c.stats.HistorySize), - int(executionInfo.NextEventID-1), + int(c.msBuilder.GetNextEventID()-1), ) emitSessionUpdateStats( c.metricsClient, @@ -860,7 +474,7 @@ func (c *workflowExecutionContextImpl) update( ) // emit workflow completion stats if any - if executionInfo.State == persistence.WorkflowStateCompleted { + if currentWorkflow.ExecutionInfo.State == persistence.WorkflowStateCompleted { if event, ok := c.msBuilder.GetCompletionEvent(); ok { emitWorkflowCompletionStats( c.metricsClient, @@ -873,457 +487,477 @@ func (c *workflowExecutionContextImpl) update( return nil } -func (c *workflowExecutionContextImpl) appendFirstBatchEventsForActive( - msBuilder mutableState, - createReplicationTask bool, -) (int64, persistence.Task, error) { - - // call FlushBufferedEvents to assign task id to event - // as well as update last event task id in mutable state builder - err := msBuilder.FlushBufferedEvents() - if err != nil { - return 
0, nil, err +func (c *workflowExecutionContextImpl) mergeContinueAsNewReplicationTasks( + currentWorkflowMutation *persistence.WorkflowMutation, + newWorkflowSnapshot *persistence.WorkflowSnapshot, +) error { + if currentWorkflowMutation.ExecutionInfo.CloseStatus != persistence.WorkflowCloseStatusContinuedAsNew { + return nil } - events := msBuilder.GetHistoryBuilder().GetHistory().Events - return c.appendFirstBatchEvents(msBuilder, events, createReplicationTask) -} - -func (c *workflowExecutionContextImpl) appendFirstBatchEventsForStandby( - msBuilder mutableState, - history []*workflow.HistoryEvent, -) (int64, persistence.Task, error) { - - return c.appendFirstBatchEvents(msBuilder, history, false) -} -func (c *workflowExecutionContextImpl) appendFirstBatchEvents( - msBuilder mutableState, - history []*workflow.HistoryEvent, - replicateEvents bool, -) (int64, persistence.Task, error) { - - firstEvent := history[0] - lastEvent := history[len(history)-1] - var historySize int - var err error - - if msBuilder.GetEventStoreVersion() == persistence.EventStoreVersionV2 { - historySize, err = c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ - IsNewBranch: true, - Info: historyGarbageCleanupInfo( - c.domainID, - c.workflowExecution.GetWorkflowId(), - c.workflowExecution.GetRunId(), - ), - BranchToken: msBuilder.GetCurrentBranch(), - Events: history, - // It is ok to use 0 for TransactionID because RunID is unique so there are - // no potential duplicates to override. - TransactionID: 0, - }, c.domainID, c.workflowExecution) - } else { - historySize, err = c.shard.AppendHistoryEvents(&persistence.AppendHistoryEventsRequest{ - DomainID: c.domainID, - Execution: c.workflowExecution, - // It is ok to use 0 for TransactionID because RunID is unique so there are - // no potential duplicates to override. 
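// (The persistFirstWorkflowEvents / persistNonFirstWorkflowEvents helpers that
// replace this append path obtain a real transaction ID from the shard via
// c.shard.GetNextTransferTaskID() rather than hard-coding 0.)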
- TransactionID: 0, - FirstEventID: firstEvent.GetEventId(), - EventBatchVersion: firstEvent.GetVersion(), - Events: history, - }) - } + // current workflow is doing continue as new - var replicationTask persistence.Task - if err == nil { - if replicateEvents && msBuilder.GetReplicationState() != nil { - replicationTask = &persistence.HistoryReplicationTask{ - FirstEventID: firstEvent.GetEventId(), - NextEventID: lastEvent.GetEventId() + 1, - Version: firstEvent.GetVersion(), - LastReplicationInfo: msBuilder.GetReplicationState().LastReplicationInfo, - EventStoreVersion: msBuilder.GetEventStoreVersion(), - BranchToken: msBuilder.GetCurrentBranch(), - NewRunEventStoreVersion: 0, // no new run - NewRunBranchToken: nil, // no new run - } - } + // it is possible that continue as new is done as part of passive logic + if len(currentWorkflowMutation.ReplicationTasks) == 0 { + return nil } - return int64(historySize), replicationTask, err -} -func (c *workflowExecutionContextImpl) appendHistoryEvents( - history []*workflow.HistoryEvent, - transactionID int64, - doLastEventValidation bool, - replicateEvents bool, - newStateBuilder mutableState, -) (int, persistence.Task, error) { - - if doLastEventValidation { - if err := c.validateNoEventsAfterWorkflowFinish(history); err != nil { - return 0, nil, err + if newWorkflowSnapshot == nil || len(newWorkflowSnapshot.ReplicationTasks) != 1 { + return &workflow.InternalServiceError{ + Message: "unable to find replication task from new workflow for continue as new replication", } } - firstEvent := history[0] - lastEvent := history[len(history)-1] - var historySize int - var err error + // merge the new run first event batch replication task + // to current event batch replication task + newRunTask := newWorkflowSnapshot.ReplicationTasks[0].(*persistence.HistoryReplicationTask) + newWorkflowSnapshot.ReplicationTasks = nil - if c.msBuilder.GetEventStoreVersion() == persistence.EventStoreVersionV2 { - historySize, err = c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ - IsNewBranch: false, - BranchToken: c.msBuilder.GetCurrentBranch(), - Events: history, - TransactionID: transactionID, - }, c.domainID, c.workflowExecution) - } else { - historySize, err = c.shard.AppendHistoryEvents(&persistence.AppendHistoryEventsRequest{ - DomainID: c.domainID, - Execution: c.workflowExecution, - TransactionID: transactionID, - FirstEventID: firstEvent.GetEventId(), - EventBatchVersion: firstEvent.GetVersion(), - Events: history, - }) + newRunBranchToken := newRunTask.BranchToken + newRunEventStoreVersion := newRunTask.EventStoreVersion + taskUpdated := false + for _, replicationTask := range currentWorkflowMutation.ReplicationTasks { + if task, ok := replicationTask.(*persistence.HistoryReplicationTask); ok { + taskUpdated = true + task.NewRunBranchToken = newRunBranchToken + task.NewRunEventStoreVersion = newRunEventStoreVersion + } } - - if err != nil { - switch err.(type) { - case *persistence.ConditionFailedError: - return historySize, nil, ErrConflict + if !taskUpdated { + return &workflow.InternalServiceError{ + Message: "unable to find replication task from current workflow for continue as new replication", } - - c.logger.Error("Persistent store operation failure", - tag.StoreOperationUpdateWorkflowExecution, - tag.Error(err), - tag.Number(c.updateCondition)) - return historySize, nil, err } + return nil +} - var replicationTask persistence.Task - if replicateEvents && c.msBuilder.GetReplicationState() != nil { - var newRunEventStoreVersion int32 
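// The new-run linkage assembled here from newStateBuilder is now handled by
// mergeContinueAsNewReplicationTasks after the transaction is closed: the
// single HistoryReplicationTask produced by CloseTransactionAsSnapshot for
// the new run is consumed, and its branch token / event store version are
// copied onto the current run's HistoryReplicationTask, roughly:
//
//	newRunTask := newWorkflowSnapshot.ReplicationTasks[0].(*persistence.HistoryReplicationTask)
//	newWorkflowSnapshot.ReplicationTasks = nil
//	for _, task := range currentWorkflowMutation.ReplicationTasks {
//		if t, ok := task.(*persistence.HistoryReplicationTask); ok {
//			t.NewRunBranchToken = newRunTask.BranchToken
//			t.NewRunEventStoreVersion = newRunTask.EventStoreVersion
//		}
//	}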
- var newRunBranchToken []byte - if newStateBuilder != nil { - newRunEventStoreVersion = newStateBuilder.GetEventStoreVersion() - newRunBranchToken = newStateBuilder.GetCurrentBranch() - } +func (c *workflowExecutionContextImpl) persistFirstWorkflowEvents( + workflowEvents *persistence.WorkflowEvents, +) (int64, error) { - replicationTask = &persistence.HistoryReplicationTask{ - FirstEventID: firstEvent.GetEventId(), - NextEventID: lastEvent.GetEventId() + 1, - Version: firstEvent.GetVersion(), - LastReplicationInfo: c.msBuilder.GetReplicationState().LastReplicationInfo, - EventStoreVersion: c.msBuilder.GetEventStoreVersion(), - BranchToken: c.msBuilder.GetCurrentBranch(), - NewRunEventStoreVersion: newRunEventStoreVersion, - NewRunBranchToken: newRunBranchToken, + if len(workflowEvents.Events) == 0 { + return 0, &workflow.InternalServiceError{ + Message: "cannot persist first workflow events with empty events", } } - return historySize, replicationTask, nil -} -func (c *workflowExecutionContextImpl) appendFirstBatchHistoryForContinueAsNew( - newWorkflowEvents *persistence.WorkflowEvents, - transactionID int64, -) (int64, error) { + transactionID, err := c.shard.GetNextTransferTaskID() + if err != nil { + return 0, err + } - domainID := newWorkflowEvents.DomainID - workflowID := newWorkflowEvents.WorkflowID - runID := newWorkflowEvents.RunID + domainID := workflowEvents.DomainID + workflowID := workflowEvents.WorkflowID + runID := workflowEvents.RunID execution := workflow.WorkflowExecution{ - WorkflowId: common.StringPtr(workflowID), - RunId: common.StringPtr(runID), + WorkflowId: common.StringPtr(workflowEvents.WorkflowID), + RunId: common.StringPtr(workflowEvents.RunID), } + branchToken := workflowEvents.BranchToken + events := workflowEvents.Events + firstEvent := events[0] - firstEvent := newWorkflowEvents.Events[0] - var historySize int - var err error - if len(newWorkflowEvents.BranchToken) != 0 { - historySize, err = c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ - IsNewBranch: true, - Info: historyGarbageCleanupInfo(domainID, workflowID, runID), - BranchToken: newWorkflowEvents.BranchToken, - Events: newWorkflowEvents.Events, - TransactionID: transactionID, - }, domainID, execution) - } else { - historySize, err = c.shard.AppendHistoryEvents(&persistence.AppendHistoryEventsRequest{ + if len(branchToken) == 0 { + size, err := c.appendHistoryEventsWithRetry(&persistence.AppendHistoryEventsRequest{ DomainID: domainID, Execution: execution, TransactionID: transactionID, FirstEventID: firstEvent.GetEventId(), EventBatchVersion: firstEvent.GetVersion(), - Events: newWorkflowEvents.Events, + Events: events, }) + return int64(size), err } - return int64(historySize), err + size, err := c.appendHistoryV2EventsWithRetry( + domainID, + execution, + &persistence.AppendHistoryNodesRequest{ + IsNewBranch: true, + Info: historyGarbageCleanupInfo(domainID, workflowID, runID), + BranchToken: branchToken, + Events: events, + TransactionID: transactionID, + }, + ) + return int64(size), err } -func (c *workflowExecutionContextImpl) getWorkflowExecutionWithRetry( - request *persistence.GetWorkflowExecutionRequest, -) (*persistence.GetWorkflowExecutionResponse, error) { - - var response *persistence.GetWorkflowExecutionResponse - op := func() error { - var err error - response, err = c.executionManager.GetWorkflowExecution(request) +func (c *workflowExecutionContextImpl) persistNonFirstWorkflowEvents( + workflowEvents *persistence.WorkflowEvents, +) (int64, error) { - return err 
+ if len(workflowEvents.Events) == 0 { + return 0, nil // allow update workflow without events } - err := backoff.Retry(op, persistenceOperationRetryPolicy, common.IsPersistenceTransientError) + transactionID, err := c.shard.GetNextTransferTaskID() if err != nil { - return nil, err + return 0, err } - return response, nil -} - -func (c *workflowExecutionContextImpl) updateWorkflowExecutionWithRetry( - request *persistence.UpdateWorkflowExecutionRequest, -) (*persistence.UpdateWorkflowExecutionResponse, error) { + domainID := workflowEvents.DomainID + execution := workflow.WorkflowExecution{ + WorkflowId: common.StringPtr(workflowEvents.WorkflowID), + RunId: common.StringPtr(workflowEvents.RunID), + } + branchToken := workflowEvents.BranchToken + events := workflowEvents.Events + firstEvent := events[0] + + if len(branchToken) == 0 { + size, err := c.appendHistoryEventsWithRetry(&persistence.AppendHistoryEventsRequest{ + DomainID: domainID, + Execution: execution, + TransactionID: transactionID, + FirstEventID: firstEvent.GetEventId(), + EventBatchVersion: firstEvent.GetVersion(), + Events: events, + }) + return int64(size), err + } - resp := &persistence.UpdateWorkflowExecutionResponse{} + size, err := c.appendHistoryV2EventsWithRetry( + domainID, + execution, + &persistence.AppendHistoryNodesRequest{ + IsNewBranch: false, + BranchToken: branchToken, + Events: events, + TransactionID: transactionID, + }, + ) + return int64(size), err +} + +func (c *workflowExecutionContextImpl) appendHistoryEventsWithRetry( + request *persistence.AppendHistoryEventsRequest, +) (int64, error) { + + resp := 0 op := func() error { var err error - resp, err = c.shard.UpdateWorkflowExecution(request) + resp, err = c.shard.AppendHistoryEvents(request) return err } - err := backoff.Retry(op, persistenceOperationRetryPolicy, common.IsPersistenceTransientError) - return resp, err + err := backoff.Retry( + op, + persistenceOperationRetryPolicy, + common.IsPersistenceTransientError, + ) + return int64(resp), err } -func (c *workflowExecutionContextImpl) clear() { - c.metricsClient.IncCounter(metrics.WorkflowContextScope, metrics.WorkflowContextCleared) - c.msBuilder = nil - c.stats = nil +func (c *workflowExecutionContextImpl) appendHistoryV2EventsWithRetry( + domainID string, + execution workflow.WorkflowExecution, + request *persistence.AppendHistoryNodesRequest, +) (int64, error) { + + resp := 0 + op := func() error { + var err error + resp, err = c.shard.AppendHistoryV2Events(request, domainID, execution) + return err + } + + err := backoff.Retry( + op, + persistenceOperationRetryPolicy, + common.IsPersistenceTransientError, + ) + return int64(resp), err } -// scheduleNewDecision is helper method which has the logic for scheduling new decision for a workflow execution. -// This function takes in a slice of transferTasks and timerTasks already scheduled for the current transaction -// and may append more tasks to it. It also returns back the slice with new tasks appended to it. It is expected -// caller to assign returned slice to original passed in slices. For this reason we return the original slices -// even if the method fails due to an error on loading workflow execution. 
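// The slice-returning contract described above is what the scheduleDecision
// helper in workflowExecutionUtil.go (added later in this patch) replaces:
// instead of returning transfer/timer tasks for the caller to re-assign, it
// appends them to the mutable state directly via AddTransferTasks and
// AddTimerTasks, so call sites reduce to, e.g.:
//
//	err := scheduleDecision(msBuilder, t.shard.GetTimeSource(), t.logger)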
-func (c *workflowExecutionContextImpl) scheduleNewDecision( - transferTasks []persistence.Task, - timerTasks []persistence.Task, -) ([]persistence.Task, []persistence.Task, error) { +func (c *workflowExecutionContextImpl) createWorkflowExecutionWithRetry( + request *persistence.CreateWorkflowExecutionRequest, +) (*persistence.CreateWorkflowExecutionResponse, error) { - msBuilder, err := c.loadWorkflowExecution() - if err != nil { - return transferTasks, timerTasks, err + var resp *persistence.CreateWorkflowExecutionResponse + op := func() error { + var err error + resp, err = c.shard.CreateWorkflowExecution(request) + return err } - executionInfo := msBuilder.GetExecutionInfo() - if !msBuilder.HasPendingDecisionTask() { - di, err := msBuilder.AddDecisionTaskScheduledEvent() - if err != nil { - return nil, nil, &workflow.InternalServiceError{Message: "Failed to add decision scheduled event."} - } - transferTasks = append(transferTasks, &persistence.DecisionTask{ - DomainID: executionInfo.DomainID, - TaskList: di.TaskList, - ScheduleID: di.ScheduleID, - }) - if msBuilder.IsStickyTaskListEnabled() { - tBuilder := newTimerBuilder(c.shard.GetConfig(), c.logger, clock.NewRealTimeSource()) - stickyTaskTimeoutTimer := tBuilder.AddScheduleToStartDecisionTimoutTask(di.ScheduleID, di.Attempt, - executionInfo.StickyScheduleToStartTimeout) - timerTasks = append(timerTasks, stickyTaskTimeoutTimer) - } + err := backoff.Retry( + op, + persistenceOperationRetryPolicy, + common.IsPersistenceTransientError, + ) + switch err.(type) { + case nil: + return resp, nil + case *persistence.WorkflowExecutionAlreadyStartedError: + // it is possible that workflow already exists and caller need to apply + // workflow ID reuse policy + return nil, err + default: + c.logger.Error( + "Persistent store operation failure", + tag.StoreOperationCreateWorkflowExecution, + tag.Error(err), + ) + return nil, err } - - return transferTasks, timerTasks, nil } -func (c *workflowExecutionContextImpl) failInflightDecision() error { - c.clear() +func (c *workflowExecutionContextImpl) getWorkflowExecutionWithRetry( + request *persistence.GetWorkflowExecutionRequest, +) (*persistence.GetWorkflowExecutionResponse, error) { + + var resp *persistence.GetWorkflowExecutionResponse + op := func() error { + var err error + resp, err = c.executionManager.GetWorkflowExecution(request) - // Reload workflow execution so we can apply the decision task failure event - msBuilder, err := c.loadWorkflowExecution() - if err != nil { return err } - if di, ok := msBuilder.GetInFlightDecisionTask(); ok { - if _, err := msBuilder.AddDecisionTaskFailedEvent( - di.ScheduleID, - di.StartedID, - workflow.DecisionTaskFailedCauseForceCloseDecision, - nil, - identityHistoryService, - "", - "", - "", - 0, - ); err != nil { - return err - } + err := backoff.Retry( + op, + persistenceOperationRetryPolicy, + common.IsPersistenceTransientError, + ) + switch err.(type) { + case nil: + return resp, nil + case *workflow.EntityNotExistsError: + // it is possible that workflow does not exists + return nil, err + default: + c.logger.Error( + "Persistent fetch operation failure", + tag.StoreOperationGetWorkflowExecution, + tag.Error(err), + ) + return nil, err + } +} - var transT, timerT []persistence.Task - transT, timerT, err = c.scheduleNewDecision(transT, timerT) - if err != nil { - return err - } +func (c *workflowExecutionContextImpl) updateWorkflowExecutionWithRetry( + request *persistence.UpdateWorkflowExecutionRequest, +) 
(*persistence.UpdateWorkflowExecutionResponse, error) { - // Generate a transaction ID for appending events to history - transactionID, err := c.shard.GetNextTransferTaskID() - if err != nil { - return err - } - err = c.updateAsActive(transT, timerT, transactionID) - if err != nil { - return err - } + var resp *persistence.UpdateWorkflowExecutionResponse + op := func() error { + var err error + resp, err = c.shard.UpdateWorkflowExecution(request) + return err } - return nil -} -func (c *workflowExecutionContextImpl) getDomainName() string { - domainEntry, err := c.shard.GetDomainCache().GetDomainByID(c.domainID) - if err != nil { - return "" + err := backoff.Retry( + op, persistenceOperationRetryPolicy, + common.IsPersistenceTransientError, + ) + switch err.(type) { + case nil: + return resp, nil + case *persistence.ConditionFailedError: + // TODO get rid of ErrConflict + return nil, ErrConflict + default: + c.logger.Error( + "Persistent store operation failure", + tag.StoreOperationUpdateWorkflowExecution, + tag.Error(err), + tag.Number(c.updateCondition), + ) + return nil, err } - return domainEntry.GetInfo().Name } -// validateNoEventsAfterWorkflowFinish perform check on history event batch -// NOTE: do not apply this check on every batch, since transient -// decision && workflow finish will be broken (the first batch) -func (c *workflowExecutionContextImpl) validateNoEventsAfterWorkflowFinish( - input []*workflow.HistoryEvent, -) error { +func (c *workflowExecutionContextImpl) resetMutableState( + prevRunID string, + prevLastWriteVersion int64, + prevState int, + replicationTasks []persistence.Task, + transferTasks []persistence.Task, + timerTasks []persistence.Task, + resetBuilder mutableState, + resetHistorySize int64, +) (mutableState, error) { - if len(input) == 0 { - return nil + // this only resets one mutableState for a workflow + snapshotRequest := resetBuilder.ResetSnapshot( // TODO + prevRunID, + prevLastWriteVersion, + prevState, + replicationTasks, + transferTasks, + timerTasks, + ) + snapshotRequest.ResetWorkflowSnapshot.ExecutionStats = &persistence.ExecutionStats{ + HistorySize: resetHistorySize, } + snapshotRequest.ResetWorkflowSnapshot.Condition = c.updateCondition - // if workflow is still running, no check is necessary - if c.msBuilder.IsWorkflowExecutionRunning() { - return nil + err := c.shard.ResetMutableState(snapshotRequest) + if err != nil { + return nil, err } - // workflow close - // this will perform check on the last event of last batch - // NOTE: do not apply this check on every batch, since transient - // decision && workflow finish will be broken (the first batch) - lastEvent := input[len(input)-1] - switch lastEvent.GetEventType() { - case workflow.EventTypeWorkflowExecutionCompleted, - workflow.EventTypeWorkflowExecutionFailed, - workflow.EventTypeWorkflowExecutionTimedOut, - workflow.EventTypeWorkflowExecutionTerminated, - workflow.EventTypeWorkflowExecutionContinuedAsNew, - workflow.EventTypeWorkflowExecutionCanceled: - return nil + c.clear() + return c.loadWorkflowExecution() +} - default: - c.logger.Error("encounter case where events appears after workflow finish.", - tag.WorkflowID(c.workflowExecution.GetWorkflowId()), - tag.WorkflowRunID(c.workflowExecution.GetRunId()), - tag.WorkflowDomainID(c.domainID)) +// this reset is more complex than "resetMutableState", it involes currentMutableState and newMutableState: +// 1. append history to new run +// 2. append history to current run if current run is not closed +// 3. 
update mutableState(terminate current run if not closed) and create new run +func (c *workflowExecutionContextImpl) resetWorkflowExecution( + currMutableState mutableState, + updateCurr bool, + closeTask persistence.Task, + cleanupTask persistence.Task, + newMutableState mutableState, + newHistorySize int64, + newTransferTasks []persistence.Task, + newTimerTasks []persistence.Task, + currReplicationTasks []persistence.Task, + newReplicationTasks []persistence.Task, + baseRunID string, + baseRunNextEventID int64, +) (retError error) { - return ErrEventsAterWorkflowFinish + now := c.timeSource.Now() + currTransferTasks := []persistence.Task{} + currTimerTasks := []persistence.Task{} + if closeTask != nil { + currTransferTasks = append(currTransferTasks, closeTask) } + if cleanupTask != nil { + currTimerTasks = append(currTimerTasks, cleanupTask) + } + setTaskInfo(currMutableState.GetCurrentVersion(), now, currTransferTasks, currTimerTasks) + setTaskInfo(newMutableState.GetCurrentVersion(), now, newTransferTasks, newTimerTasks) -} - -func (c *workflowExecutionContextImpl) persistFirstWorkflowEvents( - workflowEvents *persistence.WorkflowEvents, -) (int64, error) { + transactionID, retError := c.shard.GetNextTransferTaskID() + if retError != nil { + return retError + } - if len(workflowEvents.Events) == 0 { - return 0, &workflow.InternalServiceError{ - Message: "cannot persist first workflow events with empty events", + // Since we always reset to decision task, there shouldn't be any buffered events. + // Therefore currently ResetWorkflowExecution persistence API doesn't implement setting buffered events. + if newMutableState.HasBufferedEvents() { + retError = &workflow.InternalServiceError{ + Message: fmt.Sprintf("reset workflow execution shouldn't have buffered events"), } + return } - transactionID, err := c.shard.GetNextTransferTaskID() - if err != nil { - return 0, err + // call FlushBufferedEvents to assign task id to event + // as well as update last event task id in ms state builder + retError = currMutableState.FlushBufferedEvents() + if retError != nil { + return retError } - - domainID := workflowEvents.DomainID - workflowID := workflowEvents.WorkflowID - runID := workflowEvents.RunID - execution := workflow.WorkflowExecution{ - WorkflowId: common.StringPtr(workflowEvents.WorkflowID), - RunId: common.StringPtr(workflowEvents.RunID), + retError = newMutableState.FlushBufferedEvents() + if retError != nil { + return retError } - branchToken := workflowEvents.BranchToken - events := workflowEvents.Events - firstEvent := events[0] - if len(branchToken) == 0 { - size, err := c.shard.AppendHistoryEvents(&persistence.AppendHistoryEventsRequest{ - DomainID: domainID, - Execution: execution, - TransactionID: transactionID, - FirstEventID: firstEvent.GetEventId(), - EventBatchVersion: firstEvent.GetVersion(), - Events: events, + if updateCurr { + hBuilder := currMutableState.GetHistoryBuilder() + var size int64 + // TODO workflow execution reset logic generates replication tasks in its own business logic + currentExecutionInfo := currMutableState.GetExecutionInfo() + size, retError = c.persistNonFirstWorkflowEvents(&persistence.WorkflowEvents{ + DomainID: currentExecutionInfo.DomainID, + WorkflowID: currentExecutionInfo.WorkflowID, + RunID: currentExecutionInfo.RunID, + BranchToken: currMutableState.GetCurrentBranch(), + Events: hBuilder.GetHistory().GetEvents(), }) - return int64(size), err + if retError != nil { + return + } + c.stats.HistorySize += int64(size) } - size, err := 
c.shard.AppendHistoryV2Events( - &persistence.AppendHistoryNodesRequest{ - IsNewBranch: true, - Info: historyGarbageCleanupInfo(domainID, workflowID, runID), - BranchToken: branchToken, - Events: events, - TransactionID: transactionID, - }, - domainID, - execution, - ) - return int64(size), err -} - -func (c *workflowExecutionContextImpl) persistNonFirstWorkflowEvents( - workflowEvents *persistence.WorkflowEvents, -) (int64, error) { - - if len(workflowEvents.Events) == 0 { - return 0, nil // allow update workflow without events + // Note: we already made sure that newMutableState is using eventsV2 + hBuilder := newMutableState.GetHistoryBuilder() + size, retError := c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ + IsNewBranch: false, + BranchToken: newMutableState.GetCurrentBranch(), + Events: hBuilder.GetHistory().GetEvents(), + TransactionID: transactionID, + }, c.domainID, c.workflowExecution) + if retError != nil { + return } + newHistorySize += int64(size) - transactionID, err := c.shard.GetNextTransferTaskID() - if err != nil { - return 0, err + // ResetSnapshot function used here really does rely on inputs below + snapshotRequest := newMutableState.ResetSnapshot("", 0, 0, nil, nil, nil) + if len(snapshotRequest.ResetWorkflowSnapshot.ChildExecutionInfos) > 0 || + len(snapshotRequest.ResetWorkflowSnapshot.SignalInfos) > 0 || + len(snapshotRequest.ResetWorkflowSnapshot.SignalRequestedIDs) > 0 { + return &workflow.InternalServiceError{ + Message: fmt.Sprintf("something went wrong, we shouldn't see any pending childWF, sending Signal or signal requested"), + } } - domainID := workflowEvents.DomainID - execution := workflow.WorkflowExecution{ - WorkflowId: common.StringPtr(workflowEvents.WorkflowID), - RunId: common.StringPtr(workflowEvents.RunID), + resetWFReq := &persistence.ResetWorkflowExecutionRequest{ + BaseRunID: baseRunID, + BaseRunNextEventID: baseRunNextEventID, + + CurrentRunID: currMutableState.GetExecutionInfo().RunID, + CurrentRunNextEventID: currMutableState.GetExecutionInfo().NextEventID, + + CurrentWorkflowMutation: nil, + + NewWorkflowSnapshot: persistence.WorkflowSnapshot{ + ExecutionInfo: newMutableState.GetExecutionInfo(), + ExecutionStats: &persistence.ExecutionStats{ + HistorySize: newHistorySize, + }, + ReplicationState: newMutableState.GetReplicationState(), + + ActivityInfos: snapshotRequest.ResetWorkflowSnapshot.ActivityInfos, + TimerInfos: snapshotRequest.ResetWorkflowSnapshot.TimerInfos, + ChildExecutionInfos: snapshotRequest.ResetWorkflowSnapshot.ChildExecutionInfos, + RequestCancelInfos: snapshotRequest.ResetWorkflowSnapshot.RequestCancelInfos, + SignalInfos: snapshotRequest.ResetWorkflowSnapshot.SignalInfos, + SignalRequestedIDs: snapshotRequest.ResetWorkflowSnapshot.SignalRequestedIDs, + + TransferTasks: newTransferTasks, + ReplicationTasks: newReplicationTasks, + TimerTasks: newTimerTasks, + }, } - branchToken := workflowEvents.BranchToken - events := workflowEvents.Events - firstEvent := events[0] - if len(branchToken) == 0 { - size, err := c.shard.AppendHistoryEvents(&persistence.AppendHistoryEventsRequest{ - DomainID: domainID, - Execution: execution, - TransactionID: transactionID, - FirstEventID: firstEvent.GetEventId(), - EventBatchVersion: firstEvent.GetVersion(), - Events: events, - }) - return int64(size), err + if updateCurr { + resetWFReq.CurrentWorkflowMutation = &persistence.WorkflowMutation{ + ExecutionInfo: currMutableState.GetExecutionInfo(), + ExecutionStats: &persistence.ExecutionStats{ + HistorySize: 
c.stats.HistorySize, + }, + ReplicationState: currMutableState.GetReplicationState(), + + UpsertActivityInfos: []*persistence.ActivityInfo{}, + DeleteActivityInfos: []int64{}, + UpserTimerInfos: []*persistence.TimerInfo{}, + DeleteTimerInfos: []string{}, + UpsertChildExecutionInfos: []*persistence.ChildExecutionInfo{}, + DeleteChildExecutionInfo: nil, + UpsertRequestCancelInfos: []*persistence.RequestCancelInfo{}, + DeleteRequestCancelInfo: nil, + UpsertSignalInfos: []*persistence.SignalInfo{}, + DeleteSignalInfo: nil, + UpsertSignalRequestedIDs: []string{}, + DeleteSignalRequestedID: "", + NewBufferedEvents: []*workflow.HistoryEvent{}, + ClearBufferedEvents: false, + + TransferTasks: currTransferTasks, + ReplicationTasks: currReplicationTasks, + TimerTasks: currTimerTasks, + + Condition: c.updateCondition, + } } - size, err := c.shard.AppendHistoryV2Events(&persistence.AppendHistoryNodesRequest{ - IsNewBranch: false, - BranchToken: branchToken, - Events: events, - TransactionID: transactionID, - }, domainID, execution) - return int64(size), err + return c.shard.ResetWorkflowExecution(resetWFReq) } diff --git a/service/history/workflowExecutionUtil.go b/service/history/workflowExecutionUtil.go new file mode 100644 index 00000000000..46350eac0b8 --- /dev/null +++ b/service/history/workflowExecutionUtil.go @@ -0,0 +1,84 @@ +// Copyright (c) 2019 Uber Technologies, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. 
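// The helpers below let callers stage decision-related events and tasks
// directly on mutable state instead of threading task slices around. An
// illustrative sketch only (assuming msBuilder, shard, logger and context are
// in scope; not an excerpt from this patch):
//
//	if di, ok := msBuilder.GetInFlightDecisionTask(); ok {
//		if err := failDecision(msBuilder, di, workflow.DecisionTaskFailedCauseForceCloseDecision); err != nil {
//			return err
//		}
//	}
//	if err := scheduleDecision(msBuilder, shard.GetTimeSource(), logger); err != nil {
//		return err
//	}
//	return context.updateWorkflowExecutionAsActive(shard.GetTimeSource().Now())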
+ +package history + +import ( + workflow "github.com/uber/cadence/.gen/go/shared" + "github.com/uber/cadence/common/clock" + "github.com/uber/cadence/common/log" + "github.com/uber/cadence/common/persistence" +) + +func failDecision( + mutableState mutableState, + di *decisionInfo, + decisionFailureCause workflow.DecisionTaskFailedCause, +) error { + + _, err := mutableState.AddDecisionTaskFailedEvent( + di.ScheduleID, + di.StartedID, + decisionFailureCause, + nil, + identityHistoryService, + "", + "", + "", + 0, + ) + return err +} + +func scheduleDecision( + mutableState mutableState, + timeSource clock.TimeSource, + logger log.Logger, +) error { + + if mutableState.HasPendingDecisionTask() { + return nil + } + + di, err := mutableState.AddDecisionTaskScheduledEvent() + if err != nil { + return &workflow.InternalServiceError{Message: "Failed to add decision scheduled event."} + } + + executionInfo := mutableState.GetExecutionInfo() + transferTask := &persistence.DecisionTask{ + DomainID: executionInfo.DomainID, + TaskList: di.TaskList, + ScheduleID: di.ScheduleID, + } + mutableState.AddTransferTasks(transferTask) + + if mutableState.IsStickyTaskListEnabled() { + tBuilder := newTimerBuilder(logger, timeSource) + timerTask := tBuilder.AddScheduleToStartDecisionTimoutTask( + di.ScheduleID, + di.Attempt, + executionInfo.StickyScheduleToStartTimeout, + ) + mutableState.AddTimerTasks(timerTask) + } + + return nil +} diff --git a/service/history/workflowResetor.go b/service/history/workflowResetor.go index 8e14f94eec4..fe96c1785c9 100644 --- a/service/history/workflowResetor.go +++ b/service/history/workflowResetor.go @@ -259,7 +259,7 @@ func (w *workflowResetorImpl) buildNewMutableStateForReset( // replay history to reset point(exclusive) to rebuild mutableState forkEventVersion, wfTimeoutSecs, receivedSignals, continueRunID, newStateBuilder, historySize, retError := w.replayHistoryEvents( - resetDecisionCompletedEventID, requestedID, baseMutableState, newRunID, + domainEntry, resetDecisionCompletedEventID, requestedID, baseMutableState, newRunID, ) if retError != nil { return @@ -268,7 +268,10 @@ func (w *workflowResetorImpl) buildNewMutableStateForReset( // before this, the mutable state is in replay mode // need to close / flush the mutable state for new changes - _, _, retError = newMutableState.CloseTransactionAsSnapshot(w.timeSource.Now()) + _, _, retError = newMutableState.CloseTransactionAsSnapshot( + w.timeSource.Now(), + transactionPolicyPassive, + ) if retError != nil { return } @@ -404,7 +407,7 @@ func (w *workflowResetorImpl) generateReplicationTasksForReset( if terminateCurr { // we will generate 2 replication tasks for this case firstEventIDForCurr := w.setEventIDsWithHistory(currMutableState) - if domainEntry.CanReplicateEvent() { + if domainEntry.GetReplicationPolicy() == cache.ReplicationPolicyMultiCluster { replicationTask := &persistence.HistoryReplicationTask{ Version: currMutableState.GetCurrentVersion(), LastReplicationInfo: currMutableState.GetReplicationState().LastReplicationInfo, @@ -417,7 +420,7 @@ func (w *workflowResetorImpl) generateReplicationTasksForReset( } } firstEventIDForNew := w.setEventIDsWithHistory(newMutableState) - if domainEntry.CanReplicateEvent() { + if domainEntry.GetReplicationPolicy() == cache.ReplicationPolicyMultiCluster { replicationTask := &persistence.HistoryReplicationTask{ Version: newMutableState.GetCurrentVersion(), LastReplicationInfo: newMutableState.GetReplicationState().LastReplicationInfo, @@ -559,6 +562,7 @@ func 
getRespondActivityTaskFailedRequestFromActivity(ai *persistence.ActivityInf // TODO: @shreyassrivatsan reduce the number of return parameters from this method or return a struct func (w *workflowResetorImpl) replayHistoryEvents( + domainEntry *cache.DomainCacheEntry, decisionFinishEventID int64, requestID string, prevMutableState mutableState, @@ -623,6 +627,7 @@ func (w *workflowResetorImpl) replayHistoryEvents( w.eng.shard.GetEventsCache(), w.eng.logger, firstEvent.GetVersion(), + domainEntry.GetReplicationPolicy(), ) } else { resetMutableState = newMutableStateBuilder(w.eng.shard, w.eng.shard.GetEventsCache(), w.eng.logger) @@ -858,6 +863,9 @@ func (w *workflowResetorImpl) replicateResetEvent( w.eng.shard.GetEventsCache(), w.eng.logger, firstEvent.GetVersion(), + // if can see replication task, meaning that domain is + // global domain with > 1 target clusters + cache.ReplicationPolicyMultiCluster, ) newMsBuilder.GetExecutionInfo().EventStoreVersion = persistence.EventStoreVersionV2 sBuilder = newStateBuilder(w.eng.shard, newMsBuilder, w.eng.logger) diff --git a/service/history/xdcUtil.go b/service/history/xdcUtil.go index 0a3658f9f02..bbc4a8284b7 100644 --- a/service/history/xdcUtil.go +++ b/service/history/xdcUtil.go @@ -22,6 +22,7 @@ package history import ( "fmt" + workflow "github.com/uber/cadence/.gen/go/shared" "github.com/uber/cadence/common/log" "github.com/uber/cadence/common/log/tag" @@ -58,6 +59,7 @@ var ( tag.WorkflowDomainID(timerTask.DomainID), tag.TaskID(timerTask.TaskID), tag.TaskType(timerTask.TaskType), + tag.WorkflowTimeoutType(int64(timerTask.TimeoutType)), tag.FailoverVersion(timerTask.GetVersion()), tag.Timestamp(timerTask.VisibilityTimestamp), tag.WorkflowEventID(timerTask.EventID)) @@ -110,7 +112,6 @@ func loadMutableStateForTransferTask(context workflowExecutionContext, transferT if transferTask.ScheduleID >= msBuilder.GetNextEventID() && !isDecisionRetry { metricsClient.IncCounter(metrics.TransferQueueProcessorScope, metrics.StaleMutableStateCounter) - logger.Debug(fmt.Sprintf("Transfer Task Processor: task event ID: %v >= MS NextEventID: %v.", transferTask.ScheduleID, msBuilder.GetNextEventID())) context.clear() msBuilder, err = context.loadWorkflowExecution() @@ -119,7 +120,7 @@ func loadMutableStateForTransferTask(context workflowExecutionContext, transferT } // after refresh, still mutable state's next event ID <= task ID if transferTask.ScheduleID >= msBuilder.GetNextEventID() { - logger.Info("Transfer Task Processor: task event ID: %v >= MS NextEventID: %v, skip.", + logger.Info("Transfer Task Processor: task event ID >= MS NextEventID, skip.", tag.WorkflowScheduleID(transferTask.ScheduleID), tag.WorkflowNextEventID(msBuilder.GetNextEventID())) return nil, nil @@ -150,7 +151,6 @@ func loadMutableStateForTimerTask(context workflowExecutionContext, timerTask *p if timerTask.EventID >= msBuilder.GetNextEventID() && !isDecisionRetry { metricsClient.IncCounter(metrics.TimerQueueProcessorScope, metrics.StaleMutableStateCounter) - logger.Debug(fmt.Sprintf("Timer Task Processor: task event ID: %v >= MS NextEventID: %v.", timerTask.EventID, msBuilder.GetNextEventID())) context.clear() msBuilder, err = context.loadWorkflowExecution() @@ -159,7 +159,7 @@ func loadMutableStateForTimerTask(context workflowExecutionContext, timerTask *p } // after refresh, still mutable state's next event ID <= task ID if timerTask.EventID >= msBuilder.GetNextEventID() { - logger.Info("Timer Task Processor: task event ID: %v >= MS NextEventID: %v, skip.", + logger.Info("Timer Task 
Processor: task event ID >= MS NextEventID, skip.", tag.WorkflowEventID(timerTask.EventID), tag.WorkflowNextEventID(msBuilder.GetNextEventID())) return nil, nil
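
Reviewer note: for anyone who wants to see the intended call pattern of the new workflowExecutionUtil.go helpers without loading the full service/history package, below is a minimal, self-contained sketch. All names in it (mutableStateSketch, decisionInfoSketch, scheduleDecisionSketch) are hypothetical stand-ins, not the real mutableState interface; it only illustrates the transaction shape visible in scheduleDecision above: scheduling is a no-op when a decision is already pending, and the resulting transfer/timer tasks are recorded on the mutable state (AddTransferTasks / AddTimerTasks) rather than returned to the caller.

package main

import (
	"errors"
	"fmt"
	"time"
)

// decisionInfoSketch and mutableStateSketch are hypothetical, trimmed-down
// stand-ins for decisionInfo / mutableState; they model only what the
// example needs.
type decisionInfoSketch struct {
	ScheduleID int64
	TaskList   string
}

type mutableStateSketch struct {
	nextEventID     int64
	pendingDecision *decisionInfoSketch
	stickyTaskList  bool

	// transfer / timer tasks accumulate on the mutable state itself,
	// mirroring the AddTransferTasks / AddTimerTasks calls in scheduleDecision.
	transferTasks []string
	timerTasks    []string
}

func (ms *mutableStateSketch) hasPendingDecisionTask() bool {
	return ms.pendingDecision != nil
}

func (ms *mutableStateSketch) addDecisionTaskScheduledEvent(taskList string) (*decisionInfoSketch, error) {
	if ms.nextEventID <= 0 {
		return nil, errors.New("mutable state not initialized")
	}
	di := &decisionInfoSketch{ScheduleID: ms.nextEventID, TaskList: taskList}
	ms.nextEventID++
	ms.pendingDecision = di
	return di, nil
}

// scheduleDecisionSketch mirrors the shape of scheduleDecision: no-op if a
// decision is already pending, otherwise schedule one and record the
// resulting transfer task (plus a schedule-to-start timer when the sticky
// task list is enabled) directly on the mutable state.
func scheduleDecisionSketch(ms *mutableStateSketch, taskList string, stickyTimeout time.Duration) error {
	if ms.hasPendingDecisionTask() {
		return nil
	}
	di, err := ms.addDecisionTaskScheduledEvent(taskList)
	if err != nil {
		return err
	}
	ms.transferTasks = append(ms.transferTasks,
		fmt.Sprintf("DecisionTask{scheduleID: %d, taskList: %q}", di.ScheduleID, taskList))
	if ms.stickyTaskList {
		ms.timerTasks = append(ms.timerTasks,
			fmt.Sprintf("DecisionScheduleToStartTimeout{scheduleID: %d, after: %s}", di.ScheduleID, stickyTimeout))
	}
	return nil
}

func main() {
	ms := &mutableStateSketch{nextEventID: 5, stickyTaskList: true}
	if err := scheduleDecisionSketch(ms, "sample-tasklist", 10*time.Second); err != nil {
		panic(err)
	}
	fmt.Println(ms.transferTasks)
	fmt.Println(ms.timerTasks)

	// calling again is a no-op because a decision is already pending
	_ = scheduleDecisionSketch(ms, "sample-tasklist", 10*time.Second)
	fmt.Println(len(ms.transferTasks)) // still 1
}

The sketch is only meant to show why the caller in the real code no longer has to thread transfer/timer task slices back out of the helper: once the tasks live on the mutable state, the workflow execution context can emit them together with the event updates when it closes the transaction.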