Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Wf-Diagnostics] Incorporate retry diagnostics in workflow diagnostics workflow #6532

Merged
merged 2 commits into from
Dec 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions service/worker/diagnostics/activities.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/uber/cadence/service/worker/diagnostics/analytics"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -82,6 +83,15 @@ func (w *dw) identifyIssues(ctx context.Context, info identifyIssuesParams) ([]i
}
result = append(result, failureIssues...)

retryInvariant := retry.NewInvariant(retry.Params{
WorkflowExecutionHistory: info.History,
})
retryIssues, err := retryInvariant.Check(ctx)
if err != nil {
return nil, err
}
result = append(result, retryIssues...)

return result, nil
}

Expand Down
22 changes: 22 additions & 0 deletions service/worker/diagnostics/activities_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import (
"github.com/uber/cadence/service/worker/diagnostics/analytics"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -85,6 +86,10 @@ func Test__identifyIssues(t *testing.T) {
ActivityScheduled: &types.ActivityTaskScheduledEventAttributes{
ActivityID: "101",
ActivityType: &types.ActivityType{Name: "test-activity"},
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
},
ActivityStarted: &types.ActivityTaskStartedEventAttributes{
Identity: "localhost",
Expand All @@ -93,6 +98,14 @@ func Test__identifyIssues(t *testing.T) {
}
actMetadataInBytes, err := json.Marshal(actMetadata)
require.NoError(t, err)
retryMetadata := retry.RetryMetadata{
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
}
retryMetadataInBytes, err := json.Marshal(retryMetadata)
require.NoError(t, err)
expectedResult := []invariant.InvariantCheckResult{
{
InvariantType: timeout.TimeoutTypeExecution.String(),
Expand All @@ -104,6 +117,11 @@ func Test__identifyIssues(t *testing.T) {
Reason: failure.GenericError.String(),
Metadata: actMetadataInBytes,
},
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
}
result, err := dwtest.identifyIssues(context.Background(), identifyIssuesParams{History: testWorkflowExecutionHistoryResponse()})
require.NoError(t, err)
Expand Down Expand Up @@ -219,6 +237,10 @@ func testWorkflowExecutionHistoryResponse() *types.GetWorkflowExecutionHistoryRe
ActivityTaskScheduledEventAttributes: &types.ActivityTaskScheduledEventAttributes{
ActivityID: "101",
ActivityType: &types.ActivityType{Name: "test-activity"},
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
},
},
{
Expand Down
40 changes: 40 additions & 0 deletions service/worker/diagnostics/workflow.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand All @@ -54,6 +55,7 @@ type DiagnosticsWorkflowInput struct {
type DiagnosticsWorkflowResult struct {
Timeouts *timeoutDiagnostics
Failures *failureDiagnostics
Retries *retryDiagnostics
}

type timeoutDiagnostics struct {
Expand Down Expand Up @@ -89,6 +91,17 @@ type failuresIssuesResult struct {
Metadata failure.FailureMetadata
}

type retryDiagnostics struct {
Issues []*retryIssuesResult
Runbooks []string
}

type retryIssuesResult struct {
InvariantType string
Reason string
Metadata retry.RetryMetadata
}

func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflowInput) (*DiagnosticsWorkflowResult, error) {
scope := w.metricsClient.Scope(metrics.DiagnosticsWorkflowScope, metrics.DomainTag(params.Domain))
scope.IncCounter(metrics.DiagnosticsWorkflowStartedCount)
Expand All @@ -97,6 +110,7 @@ func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflo

var timeoutsResult timeoutDiagnostics
var failureResult failureDiagnostics
var retryResult retryDiagnostics
var checkResult []invariant.InvariantCheckResult
var rootCauseResult []invariant.InvariantRootCauseResult

Expand Down Expand Up @@ -158,10 +172,17 @@ func (w *dw) DiagnosticsWorkflow(ctx workflow.Context, params DiagnosticsWorkflo
failureResult.RootCause = retrieveFailureRootCause(rootCauseResult)
failureResult.Runbooks = []string{linkToFailuresRunbook}

retryIssues, err := retrieveRetryIssues(checkResult)
if err != nil {
return nil, fmt.Errorf("RetrieveRetryIssues: %w", err)
}
retryResult.Issues = retryIssues

scope.IncCounter(metrics.DiagnosticsWorkflowSuccess)
return &DiagnosticsWorkflowResult{
Timeouts: &timeoutsResult,
Failures: &failureResult,
Retries: &retryResult,
}, nil
}

Expand Down Expand Up @@ -276,6 +297,25 @@ func retrieveFailureRootCause(rootCause []invariant.InvariantRootCauseResult) []
return result
}

func retrieveRetryIssues(issues []invariant.InvariantCheckResult) ([]*retryIssuesResult, error) {
result := make([]*retryIssuesResult, 0)
for _, issue := range issues {
if issue.InvariantType == retry.WorkflowRetryIssue.String() || issue.InvariantType == retry.WorkflowRetryInfo.String() || issue.InvariantType == retry.ActivityRetryIssue.String() {
var data retry.RetryMetadata
err := json.Unmarshal(issue.Metadata, &data)
if err != nil {
return nil, err
}
result = append(result, &retryIssuesResult{
InvariantType: issue.InvariantType,
Reason: issue.Reason,
Metadata: data,
})
}
}
return result, nil
}

func rootCauseHeartBeatRelated(rootCause invariant.RootCause) bool {
for _, rc := range []invariant.RootCause{invariant.RootCauseTypeNoHeartBeatTimeoutNoRetryPolicy,
invariant.RootCauseTypeHeartBeatingNotEnabledWithRetryPolicy,
Expand Down
39 changes: 39 additions & 0 deletions service/worker/diagnostics/workflow_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import (
"github.com/uber/cadence/common/types"
"github.com/uber/cadence/service/worker/diagnostics/invariant"
"github.com/uber/cadence/service/worker/diagnostics/invariant/failure"
"github.com/uber/cadence/service/worker/diagnostics/invariant/retry"
"github.com/uber/cadence/service/worker/diagnostics/invariant/timeout"
)

Expand Down Expand Up @@ -318,3 +319,41 @@ func (s *diagnosticsWorkflowTestSuite) Test__retrieveFailureIssues() {
s.NoError(err)
s.Equal(failureIssues, result)
}

func (s *diagnosticsWorkflowTestSuite) Test__retrieveRetryIssues() {
retryMetadata := retry.RetryMetadata{
RetryPolicy: &types.RetryPolicy{
InitialIntervalInSeconds: 1,
MaximumAttempts: 1,
},
}
retryMetadataInBytes, err := json.Marshal(retryMetadata)
s.NoError(err)
issues := []invariant.InvariantCheckResult{
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
{
InvariantType: retry.WorkflowRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadataInBytes,
},
}
retryIssues := []*retryIssuesResult{
{
InvariantType: retry.ActivityRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadata,
},
{
InvariantType: retry.WorkflowRetryIssue.String(),
Reason: retry.RetryPolicyValidationMaxAttempts.String(),
Metadata: retryMetadata,
},
}
result, err := retrieveRetryIssues(issues)
s.NoError(err)
s.Equal(retryIssues, result)
}