-
Notifications
You must be signed in to change notification settings - Fork 814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add emitWorkflowVersionMetrics for pinot #6190
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,8 @@ | |
cclient "go.uber.org/cadence/client" | ||
"go.uber.org/cadence/workflow" | ||
"go.uber.org/zap" | ||
|
||
"github.com/uber/cadence/common/pinot" | ||
) | ||
|
||
const ( | ||
|
@@ -146,6 +148,47 @@ | |
`, domain.GetInfo().ID), nil | ||
} | ||
|
||
func (w *Workflow) getWorkflowTypePinotQuery(domainName string) (string, error) { | ||
domain, err := w.analyzer.domainCache.GetDomain(domainName) | ||
if err != nil { | ||
return "", err | ||
} | ||
// exclude uninitialized workflow executions by checking whether record has start time field | ||
// there's a "LIMIT 10" because in ES, Aggr clause by default returns the top 10 results | ||
return fmt.Sprintf(` | ||
SELECT WorkflowType, COUNT(*) AS count | ||
FROM %s | ||
WHERE DomainID = '%s' | ||
AND CloseStatus = -1 | ||
AND StartTime > 0 | ||
GROUP BY WorkflowType | ||
ORDER BY count DESC | ||
LIMIT 10 | ||
OFFSET 0 | ||
`, w.analyzer.pinotTableName, domain.GetInfo().ID), nil | ||
} | ||
|
||
func (w *Workflow) getWorkflowVersionPinotQuery(domainName string, wfType string) (string, error) { | ||
domain, err := w.analyzer.domainCache.GetDomain(domainName) | ||
if err != nil { | ||
return "", err | ||
} | ||
// exclude uninitialized workflow executions by checking whether record has start time field | ||
// there's a "LIMIT 10" because in ES, Aggr clause by default returns the top 10 results | ||
return fmt.Sprintf(` | ||
SELECT JSON_EXTRACT_SCALAR(Attr, '$.CadenceChangeVersion', 'STRING_ARRAY') AS CadenceChangeVersion, COUNT(*) AS count | ||
FROM %s | ||
WHERE DomainID = '%s' | ||
AND CloseStatus = -1 | ||
AND StartTime > 0 | ||
AND WorkflowType = '%s' | ||
GROUP BY JSON_EXTRACT_SCALAR(Attr, '$.CadenceChangeVersion', 'STRING_ARRAY') AS CadenceChangeVersion | ||
ORDER BY count DESC | ||
LIMIT 10 | ||
OFFSET 0 | ||
`, w.analyzer.pinotTableName, domain.GetInfo().ID, wfType), nil | ||
} | ||
|
||
// emitWorkflowVersionMetrics is an activity that emits the running WF versions of a domain | ||
func (w *Workflow) emitWorkflowVersionMetrics(ctx context.Context) error { | ||
logger := activity.GetLogger(ctx) | ||
|
@@ -160,6 +203,8 @@ | |
switch w.analyzer.readMode { | ||
case ES: | ||
err = w.emitWorkflowVersionMetricsES(ctx, domainName, logger) | ||
case Pinot: | ||
err = w.emitWorkflowVersionMetricsPinot(domainName, logger) | ||
default: | ||
err = w.emitWorkflowVersionMetricsES(ctx, domainName, logger) | ||
} | ||
|
@@ -171,6 +216,123 @@ | |
return nil | ||
} | ||
|
||
func (w *Workflow) emitWorkflowVersionMetricsPinot(domainName string, logger *zap.Logger) error { | ||
wfVersionPinotQuery, err := w.getWorkflowTypePinotQuery(domainName) | ||
if err != nil { | ||
logger.Error("Failed to get Pinot query to find workflow type Info", | ||
zap.Error(err), | ||
zap.String("DomainName", domainName), | ||
) | ||
return err | ||
} | ||
response, err := w.analyzer.pinotClient.SearchAggr(&pinot.SearchRequest{Query: wfVersionPinotQuery}) | ||
if err != nil { | ||
logger.Error("Failed to query Pinot to find workflow type count Info", | ||
zap.Error(err), | ||
zap.String("VisibilityQuery", wfVersionPinotQuery), | ||
zap.String("DomainName", domainName), | ||
) | ||
return fmt.Errorf("failed to query Pinot to find workflow type count Info: %s, error: %s", domainName, err.Error()) | ||
} | ||
foundAggregation := len(response) > 0 | ||
|
||
if !foundAggregation { | ||
logger.Error("Pinot error: aggregation failed.", | ||
zap.Error(err), | ||
zap.String("Aggregation", fmt.Sprintf("%v", response)), | ||
zap.String("DomainName", domainName), | ||
zap.String("VisibilityQuery", wfVersionPinotQuery), | ||
) | ||
return fmt.Errorf("aggregation failed for domain in Pinot: %s", domainName) | ||
} | ||
var domainWorkflowVersionCount DomainWorkflowVersionCount | ||
for _, row := range response { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 10x latency might be an issue for metrics emission. Could you parallelize it? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we do this in parallel with multiple threads, is there a risk when metrics are emitted, the workflow still doesn't have all the data? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This metrics doesn't care about the latency, since we run it every 5 or 10 minutes. But we can eliminate the calls when we aggregate by both version and type There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed this with Ender offline as well. We are going to keep this approach. |
||
workflowType := row[0].(string) | ||
workflowCount, ok := row[1].(int) | ||
if !ok { | ||
logger.Error("Error parsing workflow count", | ||
zap.Error(err), | ||
zap.String("WorkflowType", workflowType), | ||
zap.String("DomainName", domainName), | ||
) | ||
return fmt.Errorf("error parsing workflow count for workflow type %s", workflowType) | ||
} | ||
workflowVersions, err := w.queryWorkflowVersionsWithType(domainName, workflowType, logger) | ||
|
||
if err != nil { | ||
logger.Error("Error querying workflow versions", | ||
zap.Error(err), | ||
zap.String("WorkflowType", workflowType), | ||
zap.String("DomainName", domainName), | ||
) | ||
return fmt.Errorf("error querying workflow versions for workflow type: %s: error: %s", workflowType, err.Error()) | ||
} | ||
|
||
domainWorkflowVersionCount.WorkflowTypes = append(domainWorkflowVersionCount.WorkflowTypes, WorkflowTypeCount{ | ||
EsAggregateCount: EsAggregateCount{ | ||
AggregateKey: workflowType, | ||
AggregateCount: int64(workflowCount), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. workflowCount is from first call; this will be different from the summation of counts from subsequent calls by workflowtypes. But you could instead use the summation to be at least self consistent. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here's one sample result from ES:
The count of workflow type is different from the summation of the counts of CadenceChangeVersions. I was thinking if this is designed on purpose. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about we group by WorkflowType and CadenceChangeVersion, so it can have the count per version and per type. I tried and it is working
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That query means to count all the workflowTypes which has CadenceChangeVersion. This is different from the ES result. For that ES query, it means to first, find the top 10 workflow types by count, and then, within these 10 workflow types, identify the top 10 CadenceChangeVersions count for each. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Discussed offline, group by version and type will filter the records without CadenceChangeVersion. Need to verify if we need to emit that count, if not we can go with this approach. |
||
}, | ||
WorkflowVersions: workflowVersions, | ||
}) | ||
} | ||
|
||
for _, workflowType := range domainWorkflowVersionCount.WorkflowTypes { | ||
for _, workflowVersion := range workflowType.WorkflowVersions.WorkflowVersions { | ||
w.analyzer.tallyScope.Tagged( | ||
map[string]string{domainTag: domainName, workflowVersionTag: workflowVersion.AggregateKey, workflowTypeTag: workflowType.AggregateKey}, | ||
).Gauge(workflowVersionCountMetrics).Update(float64(workflowVersion.AggregateCount)) | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func (w *Workflow) queryWorkflowVersionsWithType(domainName string, wfType string, logger *zap.Logger) (WorkflowVersionCount, error) { | ||
wfVersionPinotQuery, err := w.getWorkflowVersionPinotQuery(domainName, wfType) | ||
if err != nil { | ||
logger.Error("Failed to get Pinot query to find workflow version Info", | ||
zap.Error(err), | ||
zap.String("DomainName", domainName), | ||
) | ||
return WorkflowVersionCount{}, err | ||
} | ||
|
||
response, err := w.analyzer.pinotClient.SearchAggr(&pinot.SearchRequest{Query: wfVersionPinotQuery}) | ||
if err != nil { | ||
logger.Error("Failed to query Pinot to find workflow type count Info", | ||
zap.Error(err), | ||
zap.String("VisibilityQuery", wfVersionPinotQuery), | ||
zap.String("DomainName", domainName), | ||
) | ||
return WorkflowVersionCount{}, err | ||
} | ||
foundAggregation := len(response) > 0 | ||
|
||
// if no CadenceChangeVersion is found, return an empty WorkflowVersionCount, no errors | ||
if !foundAggregation { | ||
return WorkflowVersionCount{}, nil | ||
} | ||
|
||
var workflowVersions WorkflowVersionCount | ||
for _, row := range response { | ||
workflowVersion := row[0].(string) | ||
workflowCount, ok := row[1].(int) | ||
if !ok { | ||
logger.Error("Error parsing workflow count", | ||
zap.Error(err), | ||
zap.String("WorkflowVersion", workflowVersion), | ||
zap.String("DomainName", domainName), | ||
) | ||
return WorkflowVersionCount{}, fmt.Errorf("error parsing workflow count for workflow version %s", workflowVersion) | ||
} | ||
workflowVersions.WorkflowVersions = append(workflowVersions.WorkflowVersions, EsAggregateCount{ | ||
AggregateKey: workflowVersion, | ||
AggregateCount: int64(workflowCount), | ||
}) | ||
} | ||
return workflowVersions, nil | ||
} | ||
|
||
func (w *Workflow) emitWorkflowVersionMetricsES(ctx context.Context, domainName string, logger *zap.Logger) error { | ||
wfVersionEsQuery, err := w.getWorkflowVersionQuery(domainName) | ||
if err != nil { | ||
|
@@ -198,7 +360,7 @@ | |
zap.String("DomainName", domainName), | ||
zap.String("VisibilityQuery", wfVersionEsQuery), | ||
) | ||
return err | ||
return fmt.Errorf("aggregation failed for domain in ES: %s", domainName) | ||
} | ||
var domainWorkflowVersionCount DomainWorkflowVersionCount | ||
err = json.Unmarshal(agg, &domainWorkflowVersionCount) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: use %q to replace '%s', according to https://pkg.go.dev/fmt
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For strings, %q returns a double-quoted string safely escaped with Go syntax, but in Pinot, Where DomainID = "" doesn't work. It has to be single quoted.