From 6b756c3e82554461f72af608ac4c5dc2fa72346c Mon Sep 17 00:00:00 2001 From: Rohit Agrawal Date: Sat, 6 Nov 2021 16:00:45 -0400 Subject: [PATCH] feat(analysis): Add Dry-Run Mode Signed-off-by: Rohit Agrawal --- analysis/analysis.go | 192 ++++++++++++------ analysis/analysis_test.go | 163 ++++++++++----- controller/metrics/analysis.go | 18 +- controller/metrics/analysis_test.go | 42 ++-- controller/metrics/prommetrics.go | 6 +- docs/features/analysis.md | 47 +++++ manifests/crds/analysis-run-crd.yaml | 4 + manifests/crds/analysis-template-crd.yaml | 2 + .../crds/cluster-analysis-template-crd.yaml | 2 + manifests/install.yaml | 8 + manifests/namespace-install.yaml | 8 + pkg/apis/rollouts/v1alpha1/analysis_types.go | 4 + pkg/apis/rollouts/v1alpha1/generated.pb.go | 1 + pkg/apis/rollouts/v1alpha1/generated.proto | 6 + .../rollouts/v1alpha1/openapi_generated.go | 14 ++ 15 files changed, 381 insertions(+), 136 deletions(-) diff --git a/analysis/analysis.go b/analysis/analysis.go index 046e3899b8..99e69137cd 100644 --- a/analysis/analysis.go +++ b/analysis/analysis.go @@ -26,7 +26,10 @@ const ( DefaultMeasurementHistoryLimit = 10 // DefaultErrorRetryInterval is the default interval to retry a measurement upon error, in the // event an interval was not specified - DefaultErrorRetryInterval time.Duration = 10 * time.Second + DefaultErrorRetryInterval = 10 * time.Second + // SuccessfulAssessmentRunTerminatedResult is used for logging purposes when the metrics evaluation + // is successful and the run is terminated. + SuccessfulAssessmentRunTerminatedResult = "Metric Assessment Result - Successful: Run Terminated" ) // metricTask holds the metric which need to be measured during this reconciliation along with @@ -36,11 +39,19 @@ type metricTask struct { incompleteMeasurement *v1alpha1.Measurement } +// dryRunStatus holds the stats of the metrics being evaluated in the Dry-Run mode. 
+type dryRunStatus struct { + totalMetrics int32 + failedMetrics int32 + inconclusiveMetrics int32 + successfulMetrics int32 +} + func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alpha1.AnalysisRun { if origRun.Status.Phase.Completed() { return origRun } - log := logutil.WithAnalysisRun(origRun) + logger := logutil.WithAnalysisRun(origRun) run := origRun.DeepCopy() if run.Status.MetricResults == nil { @@ -49,8 +60,8 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph resolvedMetrics, err := getResolvedMetricsWithoutSecrets(run.Spec.Metrics, run.Spec.Args) if err != nil { - message := fmt.Sprintf("unable to resolve metric arguments: %v", err) - log.Warn(message) + message := fmt.Sprintf("Unable to resolve metric arguments: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -59,8 +70,8 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph err = analysisutil.ValidateMetrics(resolvedMetrics) if err != nil { - message := fmt.Sprintf("analysis spec invalid: %v", err) - log.Warn(message) + message := fmt.Sprintf("Analysis spec invalid: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -68,11 +79,11 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph } tasks := generateMetricTasks(run, resolvedMetrics) - log.Infof("taking %d measurements", len(tasks)) + logger.Infof("Taking %d Measurement(s)...", len(tasks)) err = c.runMeasurements(run, tasks) if err != nil { - message := fmt.Sprintf("unable to resolve metric arguments: %v", err) - log.Warn(message) + message := fmt.Sprintf("Unable to resolve metric arguments: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -91,7 +102,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit) if err != nil { // TODO(jessesuen): surface errors to controller so they can be retried - log.Warnf("Failed to garbage collect measurements: %v", err) + logger.Warnf("Failed to garbage collect measurements: %v", err) } nextReconcileTime := calculateNextReconcileTime(run, resolvedMetrics) @@ -100,7 +111,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph if enqueueSeconds < 0 { enqueueSeconds = 0 } - log.Infof("enqueueing analysis after %v", enqueueSeconds) + logger.Infof("Enqueueing analysis after %v", enqueueSeconds) c.enqueueAnalysisAfter(run, enqueueSeconds) } return run @@ -133,7 +144,7 @@ func (c *Controller) recordAnalysisRunCompletionEvent(run *v1alpha1.AnalysisRun) case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: eventType = corev1.EventTypeWarning } - c.recorder.Eventf(run, record.EventOptions{EventType: eventType, EventReason: "AnalysisRun" + string(run.Status.Phase)}, "analysis completed %s", run.Status.Phase) + c.recorder.Eventf(run, record.EventOptions{EventType: eventType, EventReason: "AnalysisRun" + string(run.Status.Phase)}, "Analysis Completed. Result: %s", run.Status.Phase) } // generateMetricTasks generates a list of metrics tasks needed to be measured as part of this @@ -141,7 +152,7 @@ func (c *Controller) recordAnalysisRunCompletionEvent(run *v1alpha1.AnalysisRun) // terminating (e.g. 
due to manual termination or failing metric), will not schedule further // measurements other than to resume any in-flight measurements. func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) []metricTask { - log := logutil.WithAnalysisRun(run) + logger := logutil.WithAnalysisRun(run) var tasks []metricTask terminating := analysisutil.IsTerminating(run) @@ -149,7 +160,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ if analysisutil.MetricCompleted(run, metric.Name) { continue } - logCtx := log.WithField("metric", metric.Name) + logCtx := logger.WithField("metric", metric.Name) lastMeasurement := analysisutil.LastMeasurement(run, metric.Name) if lastMeasurement != nil && lastMeasurement.FinishedAt == nil { now := metav1.Now() @@ -157,7 +168,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } // last measurement is still in-progress. need to complete it - logCtx.Infof("resuming in-progress measurement") + logCtx.Infof("Resuming in-progress measurement") tasks = append(tasks, metricTask{ metric: run.Spec.Metrics[i], incompleteMeasurement: lastMeasurement, @@ -165,7 +176,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } if terminating { - logCtx.Infof("skipping measurement: run is terminating") + logCtx.Infof("Skipping measurement: run is terminating") continue } if lastMeasurement == nil { @@ -179,13 +190,13 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } if run.Status.StartedAt.Add(duration).After(time.Now()) { - logCtx.Infof("waiting until start delay duration passes") + logCtx.Infof("Waiting until start delay duration passes") continue } } // measurement never taken tasks = append(tasks, metricTask{metric: run.Spec.Metrics[i]}) - logCtx.Infof("running initial measurement") + logCtx.Infof("Running initial measurement") continue } metricResult := analysisutil.GetResult(run, metric.Name) @@ -201,22 +212,32 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ if lastMeasurement.Phase == v1alpha1.AnalysisPhaseError { interval = DefaultErrorRetryInterval } else if metric.Interval != "" { - metricInterval, err := metric.Interval.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.Interval) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - interval = metricInterval + interval = parsedInterval } if time.Now().After(lastMeasurement.FinishedAt.Add(interval)) { tasks = append(tasks, metricTask{metric: run.Spec.Metrics[i]}) - logCtx.Infof("running overdue measurement") + logCtx.Infof("Running overdue measurement") continue } } return tasks } +// parseMetricInterval is a helper method to parse the given metric interval and return the +// parsed duration or error (if any) +func parseMetricInterval(logCtx log.Entry, metricDurationString v1alpha1.DurationString) (time.Duration, error) { + metricInterval, err := metricDurationString.Duration() + if err != nil { + logCtx.Warnf("Failed to parse interval: %v", err) + return -1, err + } + return metricInterval, nil +} + // resolveArgs resolves args for metricTasks, including secret references // returns resolved metricTasks and secrets for log redaction func (c *Controller) resolveArgs(tasks []metricTask, args []v1alpha1.Argument, namespace string) ([]metricTask, []string, error) { @@ -286,7 +307,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa go 
func(t metricTask) { defer wg.Done() //redact secret values from logs - log := logutil.WithRedactor(*logutil.WithAnalysisRun(run).WithField("metric", t.metric.Name), secrets) + logger := logutil.WithRedactor(*logutil.WithAnalysisRun(run).WithField("metric", t.metric.Name), secrets) resultsLock.Lock() metricResult := analysisutil.GetResult(run, t.metric.Name) @@ -294,13 +315,14 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa if metricResult == nil { metricResult = &v1alpha1.MetricResult{ - Name: t.metric.Name, - Phase: v1alpha1.AnalysisPhaseRunning, + Name: t.metric.Name, + Phase: v1alpha1.AnalysisPhaseRunning, + DryRun: t.metric.DryRun, } } var newMeasurement v1alpha1.Measurement - provider, err := c.newProvider(*log, t.metric) + provider, err := c.newProvider(*logger, t.metric) if err != nil { if t.incompleteMeasurement != nil { newMeasurement = *t.incompleteMeasurement @@ -316,10 +338,10 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa } else { // metric is incomplete. either terminate or resume it if terminating { - log.Infof("terminating in-progress measurement") + logger.Infof("Terminating in-progress measurement") newMeasurement = provider.Terminate(run, t.metric, *t.incompleteMeasurement) if newMeasurement.Phase == v1alpha1.AnalysisPhaseSuccessful { - newMeasurement.Message = "metric terminated" + newMeasurement.Message = "Metric Terminated" } } else { newMeasurement = provider.Resume(run, t.metric, *t.incompleteMeasurement) @@ -328,7 +350,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa } if newMeasurement.Phase.Completed() { - log.Infof("measurement completed %s", newMeasurement.Phase) + logger.Infof("Measurement Completed. Result: %s", newMeasurement.Phase) if newMeasurement.FinishedAt == nil { finishedAt := metav1.Now() newMeasurement.FinishedAt = &finishedAt @@ -349,7 +371,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa case v1alpha1.AnalysisPhaseError: metricResult.Error++ metricResult.ConsecutiveError++ - log.Warnf("measurement had error: %s", newMeasurement.Message) + logger.Warnf("Measurement had error: %s", newMeasurement.Message) } } @@ -391,23 +413,33 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph run.Status.StartedAt = &now } if run.Spec.Terminate { - worstMessage = "run terminated" + worstMessage = "Run Terminated" } // Iterate all metrics and update MetricResult.Phase fields based on latest measurement(s) + dryRunStatus := dryRunStatus{ + totalMetrics: 0, + failedMetrics: 0, + inconclusiveMetrics: 0, + successfulMetrics: 0, + } for _, metric := range metrics { + if metric.DryRun { + log.Infof("Metric '%s' is running in Dry-Run mode.", metric.Name) + dryRunStatus.totalMetrics++ + } if result := analysisutil.GetResult(run, metric.Name); result != nil { - log := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) + logger := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) metricStatus := assessMetricStatus(metric, *result, terminating) if result.Phase != metricStatus { - log.Infof("metric transitioned from %s -> %s", result.Phase, metricStatus) + logger.Infof("Metric '%s' transitioned from %s -> %s", metric.Name, result.Phase, metricStatus) if metricStatus.Completed() { eventType := corev1.EventTypeNormal switch metricStatus { case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: eventType = corev1.EventTypeWarning } - c.recorder.Eventf(run, 
record.EventOptions{EventType: eventType, EventReason: "Metric" + string(metricStatus)}, "Metric '%s' Completed. Result: %s", metric.Name, metricStatus) } if lastMeasurement := analysisutil.LastMeasurement(run, metric.Name); lastMeasurement != nil { result.Message = lastMeasurement.Message @@ -419,31 +451,58 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph // if any metric is in-progress, then entire analysis run will be considered running everythingCompleted = false } else { + phase, message := assessMetricFailureInconclusiveOrError(metric, *result) + // NOTE: if the metric is marked as Dry-Run, we don't factor its status into the worst status; // otherwise, remember the worst status of all completed metric results - if worstStatus == "" || analysisutil.IsWorse(worstStatus, metricStatus) { - worstStatus = metricStatus - _, message := assessMetricFailureInconclusiveOrError(metric, *result) + if !metric.DryRun { + if worstStatus == "" || analysisutil.IsWorse(worstStatus, metricStatus) { + worstStatus = metricStatus + if message != "" { + worstMessage = fmt.Sprintf("Metric \"%s\" assessed %s due to %s", metric.Name, metricStatus, message) + if result.Message != "" { + worstMessage += fmt.Sprintf(": \"Error Message: %s\"", result.Message) + } + } + } + } else { + // Update the metric result message if message != "" { - worstMessage = fmt.Sprintf("metric \"%s\" assessed %s due to %s", metric.Name, metricStatus, message) + failureMessage := fmt.Sprintf("Metric assessed %s due to %s", metricStatus, message) if result.Message != "" { - worstMessage += fmt.Sprintf(": \"Error Message: %s\"", result.Message) + result.Message = fmt.Sprintf("%s: \"Error Message: %s\"", failureMessage, result.Message) + } else { + result.Message = failureMessage } + analysisutil.SetResult(run, *result) + } + // Update the Dry-Run stats + switch phase { + case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: + dryRunStatus.failedMetrics++ + case v1alpha1.AnalysisPhaseInconclusive: + dryRunStatus.inconclusiveMetrics++ + case v1alpha1.AnalysisPhaseSuccessful: + dryRunStatus.successfulMetrics++ + default: + // Count any unrecognized phase as successful by default. + dryRunStatus.successfulMetrics++ } } } } else { - // metric hasn't started running. possible cases where some of the metrics starts with delay + // metric hasn't started running. possible when some metrics start with a delay everythingCompleted = false } } - + // Append the Dry-Run metric results, if any. + worstMessage = appendDryRunResults(strings.TrimSpace(worstMessage), dryRunStatus) if terminating { if worstStatus == "" { // we have yet to take a single measurement, but have already been instructed to stop - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + log.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful, worstMessage } - log.Infof("metric assessed %s: run terminated", worstStatus) + log.Infof("Metric Assessment Result - %s: Run Terminated", worstStatus) return worstStatus, worstMessage } if !everythingCompleted || worstStatus == "" { @@ -452,26 +511,39 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph return worstStatus, worstMessage } +// appendDryRunResults is a helper function to append the Dry-Run metrics status to the AnalysisRun message.
+func appendDryRunResults(worstMessage string, dryRunStatus dryRunStatus) string { + if dryRunStatus.totalMetrics > 0 { + dryRunResults := fmt.Sprintf("Dry-Run Summary: Total=%d, Successful=%d, Failed=%d, Inconclusive=%d", dryRunStatus.totalMetrics, dryRunStatus.successfulMetrics, dryRunStatus.failedMetrics, dryRunStatus.inconclusiveMetrics) + log.Info(dryRunResults) + if worstMessage == "" { + return dryRunResults + } + return fmt.Sprintf("%s; %s", worstMessage, dryRunResults) + } + return worstMessage +} + // assessMetricStatus assesses the status of a single metric based on: -// * current/latest measurement status +// * current or latest measurement status // * parameters given by the metric (failureLimit, count, etc...) -// * whether or not we are terminating (e.g. due to failing run, or termination request) +// * whether we are terminating (e.g. due to failing run, or termination request) func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, terminating bool) v1alpha1.AnalysisPhase { if result.Phase.Completed() { return result.Phase } - log := log.WithField("metric", metric.Name) + logger := log.WithField("metric", metric.Name) if len(result.Measurements) == 0 { if terminating { // we have yet to take a single measurement, but have already been instructed to stop - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + logger.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful } return v1alpha1.AnalysisPhasePending } lastMeasurement := result.Measurements[len(result.Measurements)-1] if !lastMeasurement.Phase.Completed() { - // we still have a in-flight measurement + // we still have an in-flight measurement return v1alpha1.AnalysisPhaseRunning } @@ -479,7 +551,7 @@ func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, te // If true, then return AnalysisRunPhase as Failed, Inconclusive, or Error respectively phaseFailureInconclusiveOrError, message := assessMetricFailureInconclusiveOrError(metric, result) if phaseFailureInconclusiveOrError != "" { - log.Infof("metric assessed %s: %s", phaseFailureInconclusiveOrError, message) + logger.Infof("Metric Assessment Result - %s: %s", phaseFailureInconclusiveOrError, message) return phaseFailureInconclusiveOrError } @@ -488,12 +560,12 @@ func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, te // taken into consideration above, and we do not want to fail if failures < failureLimit.
effectiveCount := metric.EffectiveCount() if effectiveCount != nil && result.Count >= int32(effectiveCount.IntValue()) { - log.Infof("metric assessed %s: count (%s) reached", v1alpha1.AnalysisPhaseSuccessful, effectiveCount.String()) + logger.Infof("Metric Assessment Result - %s: Count (%s) Reached", v1alpha1.AnalysisPhaseSuccessful, effectiveCount.String()) return v1alpha1.AnalysisPhaseSuccessful } // if we get here, this metric runs indefinitely if terminating { - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + logger.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful } return v1alpha1.AnalysisPhaseRunning @@ -546,19 +618,18 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me if run.Status.StartedAt != nil { startTime = *run.Status.StartedAt } - duration, err := metric.InitialDelay.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.InitialDelay) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - endInitialDelay := startTime.Add(duration) + endInitialDelay := startTime.Add(parsedInterval) if reconcileTime == nil || reconcileTime.After(endInitialDelay) { reconcileTime = &endInitialDelay } continue } // no measurement was started . we should never get here - logCtx.Warnf("metric never started. not factored into enqueue time") + logCtx.Warnf("Metric never started. Not factored into enqueue time.") continue } if lastMeasurement.FinishedAt == nil { @@ -580,18 +651,17 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me if lastMeasurement.Phase == v1alpha1.AnalysisPhaseError { interval = DefaultErrorRetryInterval } else if metric.Interval != "" { - metricInterval, err := metric.Interval.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.Interval) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - interval = metricInterval + interval = parsedInterval } else { // if we get here, an interval was not set (meaning reoccurrence was not desired), and // there was no error (meaning we don't need to retry). no need to requeue this metric. // NOTE: we shouldn't ever get here since it means we are not doing proper bookkeeping // of count. - logCtx.Warnf("skipping requeue. no interval or error (count: %d, effectiveCount: %s)", metricResult.Count, metric.EffectiveCount().String()) + logCtx.Warnf("Skipping requeue. 
No interval or error (count: %d, effectiveCount: %s)", metricResult.Count, metric.EffectiveCount().String()) continue } // Take the earliest time of all metrics @@ -619,8 +689,8 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit if !ok { continue } - log := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) - provider, err := c.newProvider(*log, metric) + logger := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) + provider, err := c.newProvider(*logger, metric) if err != nil { errors = append(errors, err) continue diff --git a/analysis/analysis_test.go b/analysis/analysis_test.go index 5b1fa56062..919edd7a15 100644 --- a/analysis/analysis_test.go +++ b/analysis/analysis_test.go @@ -91,18 +91,20 @@ func newRun() *v1alpha1.AnalysisRun { } // newTerminatingRun returns a run which is terminating because of the given status -func newTerminatingRun(status v1alpha1.AnalysisPhase) *v1alpha1.AnalysisRun { +func newTerminatingRun(status v1alpha1.AnalysisPhase, isDryRun bool) *v1alpha1.AnalysisRun { run := v1alpha1.AnalysisRun{ Spec: v1alpha1.AnalysisRunSpec{ Metrics: []v1alpha1.Metric{ { - Name: "run-forever", + Name: "run-forever", + DryRun: isDryRun, Provider: v1alpha1.MetricProvider{ Job: &v1alpha1.JobMetric{}, }, }, { - Name: "failed-metric", + Name: "failed-metric", + DryRun: isDryRun, Provider: v1alpha1.MetricProvider{ Job: &v1alpha1.JobMetric{}, }, @@ -113,16 +115,18 @@ func newTerminatingRun(status v1alpha1.AnalysisPhase) *v1alpha1.AnalysisRun { Phase: v1alpha1.AnalysisPhaseRunning, MetricResults: []v1alpha1.MetricResult{ { - Name: "run-forever", - Phase: v1alpha1.AnalysisPhaseRunning, + Name: "run-forever", + DryRun: isDryRun, + Phase: v1alpha1.AnalysisPhaseRunning, Measurements: []v1alpha1.Measurement{{ Phase: v1alpha1.AnalysisPhaseRunning, StartedAt: timePtr(metav1.NewTime(time.Now().Add(-60 * time.Second))), }}, }, { - Name: "failed-metric", - Count: 1, + Name: "failed-metric", + Count: 1, + DryRun: isDryRun, Measurements: []v1alpha1.Measurement{{ Phase: status, StartedAt: timePtr(metav1.NewTime(time.Now().Add(-60 * time.Second))), @@ -941,7 +945,7 @@ func TestReconcileAnalysisRunTerminateSiblingAfterFail(t *testing.T) { f.provider.On("Terminate", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseSuccessful), nil) for _, status := range []v1alpha1.AnalysisPhase{v1alpha1.AnalysisPhaseFailed, v1alpha1.AnalysisPhaseInconclusive, v1alpha1.AnalysisPhaseError} { - run := newTerminatingRun(status) + run := newTerminatingRun(status, false) newRun := c.reconcileAnalysisRun(run) assert.Equal(t, status, newRun.Status.Phase) @@ -950,8 +954,8 @@ func TestReconcileAnalysisRunTerminateSiblingAfterFail(t *testing.T) { // ensure the in-progress measurement is now terminated assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.MetricResults[0].Measurements[0].Phase) assert.NotNil(t, newRun.Status.MetricResults[0].Measurements[0].FinishedAt) - assert.Equal(t, "metric terminated", newRun.Status.MetricResults[0].Message) - assert.Equal(t, "metric terminated", newRun.Status.MetricResults[0].Measurements[0].Message) + assert.Equal(t, "Metric Terminated", newRun.Status.MetricResults[0].Message) + assert.Equal(t, "Metric Terminated", newRun.Status.MetricResults[0].Measurements[0].Message) } } @@ -1069,22 +1073,26 @@ func TestResolveMetricArgsUnableToSubstitute(t *testing.T) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := &v1alpha1.AnalysisRun{ - 
Spec: v1alpha1.AnalysisRunSpec{ - Metrics: []v1alpha1.Metric{{ - Name: "rate", - SuccessCondition: "{{args.does-not-exist}}", - Provider: v1alpha1.MetricProvider{ - Prometheus: &v1alpha1.PrometheusMetric{ - Query: "{{args.metric-name}}", + // Dry-Run or not, if the args resolution fails, we should fail the analysis + for _, isDryRun := range []bool{false, true} { + run := &v1alpha1.AnalysisRun{ + Spec: v1alpha1.AnalysisRunSpec{ + Metrics: []v1alpha1.Metric{{ + Name: "rate", + DryRun: isDryRun, + SuccessCondition: "{{args.does-not-exist}}", + Provider: v1alpha1.MetricProvider{ + Prometheus: &v1alpha1.PrometheusMetric{ + Query: "{{args.metric-name}}", + }, - }, - }}, - }, + }}, + }, + } + newRun := c.reconcileAnalysisRun(run) + assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase) + assert.Equal(t, "Unable to resolve metric arguments: failed to resolve {{args.metric-name}}", newRun.Status.Message) } - newRun := c.reconcileAnalysisRun(run) - assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase) - assert.Equal(t, "unable to resolve metric arguments: failed to resolve {{args.metric-name}}", newRun.Status.Message) } // TestSecretContentReferenceSuccess verifies that secret arguments are properly resolved @@ -1396,72 +1404,114 @@ func TestAssessMetricFailureInconclusiveOrError(t *testing.T) { assert.Equal(t, phase, assessMetricStatus(metric, result, true)) } -func TestAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T) { +func StartAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful - status, message := c.assessRunStatus(run, run.Spec.Metrics) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +func TestAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T) { + status, message := StartAssessRunStatusErrorMessageAnalysisPhaseFail(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) - assert.Equal(t, "metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0)", message) + assert.Equal(t, "Metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0)", message) } -// TestAssessRunStatusErrorMessageFromProvider verifies that the message returned by assessRunStatus -// includes the error message from the provider -func TestAssessRunStatusErrorMessageFromProvider(t *testing.T) { +func TestAssessRunStatusErrorMessageAnalysisPhaseFailInDryRunMode(t *testing.T) { + status, message := StartAssessRunStatusErrorMessageAnalysisPhaseFail(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0", message) +} + +func StartAssessRunStatusErrorMessageFromProvider(t *testing.T, providerMessage string, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful // All metrics must complete, or assessRunStatus will not return message - - providerMessage := "Provider error" run.Status.MetricResults[1].Message = providerMessage - status,
message := c.assessRunStatus(run, run.Spec.Metrics) - expectedMessage := fmt.Sprintf("metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0): \"Error Message: %s\"", providerMessage) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +// TestAssessRunStatusErrorMessageFromProvider verifies that the message returned by assessRunStatus +// includes the error message from the provider +func TestAssessRunStatusErrorMessageFromProvider(t *testing.T) { + providerMessage := "Provider Error" + status, message := StartAssessRunStatusErrorMessageFromProvider(t, providerMessage, false) + expectedMessage := fmt.Sprintf("Metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0): \"Error Message: %s\"", providerMessage) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) assert.Equal(t, expectedMessage, message) } -// TestAssessRunStatusMultipleFailures verifies that if there are multiple failed metrics, assessRunStatus returns the message -// from the first failed metric -func TestAssessRunStatusMultipleFailures(t *testing.T) { +func TestAssessRunStatusErrorMessageFromProviderInDryRunMode(t *testing.T) { + providerMessage := "Provider Error" + status, message := StartAssessRunStatusErrorMessageFromProvider(t, providerMessage, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0", message) +} + +func StartAssessRunStatusMultipleFailures(t *testing.T, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseFailed run.Status.MetricResults[0].Failed = 1 - status, message := c.assessRunStatus(run, run.Spec.Metrics) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +// TestAssessRunStatusMultipleFailures verifies that if there are multiple failed metrics, assessRunStatus returns the message +// from the first failed metric +func TestAssessRunStatusMultipleFailures(t *testing.T) { + status, message := StartAssessRunStatusMultipleFailures(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) - assert.Equal(t, "metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", message) + assert.Equal(t, "Metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", message) } -// TestAssessRunStatusWorstMessageInReconcileAnalysisRun verifies that the worstMessage returned by assessRunStatus is set as the -// status of the AnalysisRun returned by reconcileAnalysisRun -func TestAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T) { +func TestAssessRunStatusMultipleFailuresInDryRunMode(t *testing.T) { + status, message := StartAssessRunStatusMultipleFailures(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=0, Failed=2, Inconclusive=0", message) +} + +func StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T, isDryRun bool) *v1alpha1.AnalysisRun { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseFailed run.Status.MetricResults[0].Failed = 1 f.provider.On("Run", 
mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseFailed), nil) - newRun := c.reconcileAnalysisRun(run) + return c.reconcileAnalysisRun(run) +} + +// TestAssessRunStatusWorstMessageInReconcileAnalysisRun verifies that the worstMessage returned by assessRunStatus is set as the +// status of the AnalysisRun returned by reconcileAnalysisRun +func TestAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T) { + newRun := StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, newRun.Status.Phase) - assert.Equal(t, "metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.Message) + assert.Equal(t, "Metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.Message) } -func TestTerminateAnalysisRun(t *testing.T) { +func TestAssessRunStatusWorstMessageInReconcileAnalysisRunInDryRunMode(t *testing.T) { + newRun := StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=0, Failed=2, Inconclusive=0", newRun.Status.Message) + assert.Equal(t, "Metric assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.MetricResults[0].Message) + assert.Equal(t, "Metric assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.MetricResults[1].Message) +} + +func StartTerminatingAnalysisRun(t *testing.T, isDryRun bool) *v1alpha1.AnalysisRun { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) @@ -1480,6 +1530,7 @@ func TestTerminateAnalysisRun(t *testing.T) { }, Metrics: []v1alpha1.Metric{{ Name: "success-rate", + DryRun: isDryRun, InitialDelay: "20s", Interval: "20s", SuccessCondition: "result[0] > 0.90", @@ -1493,7 +1544,17 @@ func TestTerminateAnalysisRun(t *testing.T) { Phase: v1alpha1.AnalysisPhaseRunning, }, } - newRun := c.reconcileAnalysisRun(run) + return c.reconcileAnalysisRun(run) +} + +func TestTerminateAnalysisRun(t *testing.T) { + newRun := StartTerminatingAnalysisRun(t, false) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) + assert.Equal(t, "Run Terminated", newRun.Status.Message) +} + +func TestTerminateAnalysisRunInDryRunMode(t *testing.T) { + newRun := StartTerminatingAnalysisRun(t, true) assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) - assert.Equal(t, "run terminated", newRun.Status.Message) + assert.Equal(t, "Run Terminated; Dry-Run Summary: Total=1, Successful=0, Failed=0, Inconclusive=0", newRun.Status.Message) } diff --git a/controller/metrics/analysis.go b/controller/metrics/analysis.go index d8634b92c6..1f7a95eb38 100644 --- a/controller/metrics/analysis.go +++ b/controller/metrics/analysis.go @@ -1,6 +1,8 @@ package metrics import ( + "fmt" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/labels" @@ -83,17 +85,17 @@ func collectAnalysisRuns(ch chan<- prometheus.Metric, ar *v1alpha1.AnalysisRun) for _, metric := range ar.Spec.Metrics { metricType := metricproviders.Type(metric) metricResult := analysis.GetResult(ar, metric.Name) - addGauge(MetricAnalysisRunMetricType, 1, metric.Name, metricType) + addGauge(MetricAnalysisRunMetricType, 1, metric.Name, metricType, fmt.Sprint(metric.DryRun)) calculatedPhase := v1alpha1.AnalysisPhase("") if metricResult != nil { calculatedPhase = metricResult.Phase } - 
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, string(v1alpha1.AnalysisPhasePending)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, string(v1alpha1.AnalysisPhaseError)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, string(v1alpha1.AnalysisPhaseFailed)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, string(v1alpha1.AnalysisPhaseSuccessful)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, string(v1alpha1.AnalysisPhaseRunning)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, string(v1alpha1.AnalysisPhaseInconclusive)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhasePending)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseError)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseFailed)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseSuccessful)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseRunning)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseInconclusive)) } } @@ -106,6 +108,6 @@ func collectAnalysisTemplate(ch chan<- prometheus.Metric, namespace, name string for _, metric := range at.Metrics { metricType := metricproviders.Type(metric) - addGauge(MetricAnalysisTemplateMetricInfo, 1, metricType, metric.Name) + addGauge(MetricAnalysisTemplateMetricInfo, 1, metricType, metric.Name, fmt.Sprint(metric.DryRun)) } } diff --git a/controller/metrics/analysis_test.go b/controller/metrics/analysis_test.go index 4c8a6f70fd..3cda3e5091 100644 --- a/controller/metrics/analysis_test.go +++ b/controller/metrics/analysis_test.go @@ -54,12 +54,19 @@ metadata: namespace: jesse-test spec: metrics: - - name: webmetric + - name: web-metric-1 provider: web: jsonPath: . url: https://www.google.com successCondition: "true" + - name: web-metric-2 + dryRun: true + provider: + web: + jsonPath: . + url: https://www.msn.com + successCondition: "false" ` fakeClusterAnalysisTemplate = ` @@ -67,15 +74,22 @@ apiVersion: argoproj.io/v1alpha1 kind: ClusterAnalysisTemplate metadata: creationTimestamp: "2020-03-16T20:01:13Z" - name: http-benchmark-test + name: http-benchmark-cluster-test spec: metrics: - - name: webmetric + - name: web-metric-1 provider: web: jsonPath: . 
url: https://www.google.com successCondition: "true" + - name: web-metric-2 + dryRun: true + provider: + web: + jsonPath: . + url: https://www.msn.com + successCondition: "false" ` ) const expectedAnalysisRunResponse = `# HELP analysis_run_info Information about analysis run. @@ -83,15 +97,15 @@ const expectedAnalysisRunResponse = `# HELP analysis_run_info Information about analysis_run_info{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1 # HELP analysis_run_metric_phase Information on the duration of a specific metric in the Analysis Run # TYPE analysis_run_metric_phase gauge -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0 # HELP analysis_run_metric_type Information on the type of a specific metric in the Analysis Runs # TYPE analysis_run_metric_type gauge -analysis_run_metric_type{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1 +analysis_run_metric_type{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1 # HELP analysis_run_phase Information on the state of the Analysis Run (DEPRECATED - use analysis_run_info) # TYPE analysis_run_phase gauge analysis_run_phase{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1 @@ -175,12 +189,14 @@ analysis_run_reconcile_count{name="ar-test",namespace="ar-namespace"} 1` func TestAnalysisTemplateDescribe(t *testing.T) { expectedResponse := `# TYPE analysis_template_info gauge -analysis_template_info{name="http-benchmark-test",namespace=""} 1 +analysis_template_info{name="http-benchmark-cluster-test",namespace=""} 1 analysis_template_info{name="http-benchmark-test",namespace="jesse-test"} 1 # HELP analysis_template_metric_info Information on metrics in analysis templates. 
# TYPE analysis_template_metric_info gauge -analysis_template_metric_info{metric="webmetric",name="http-benchmark-test",namespace="",type="Web"} 1 -analysis_template_metric_info{metric="webmetric",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 +analysis_template_metric_info{dryRun="false",metric="web-metric-1",name="http-benchmark-cluster-test",namespace="",type="Web"} 1 +analysis_template_metric_info{dryRun="false",metric="web-metric-1",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 +analysis_template_metric_info{dryRun="true",metric="web-metric-2",name="http-benchmark-cluster-test",namespace="",type="Web"} 1 +analysis_template_metric_info{dryRun="true",metric="web-metric-2",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 ` registry := prometheus.NewRegistry() at := newFakeAnalysisTemplate(fakeAnalysisTemplate) diff --git a/controller/metrics/prommetrics.go b/controller/metrics/prommetrics.go index ff2017765e..eb763e73ef 100644 --- a/controller/metrics/prommetrics.go +++ b/controller/metrics/prommetrics.go @@ -108,14 +108,14 @@ var ( MetricAnalysisRunMetricType = prometheus.NewDesc( "analysis_run_metric_type", "Information on the type of a specific metric in the Analysis Runs", - append(namespaceNameLabels, "metric", "type"), + append(namespaceNameLabels, "metric", "type", "dryRun"), nil, ) MetricAnalysisRunMetricPhase = prometheus.NewDesc( "analysis_run_metric_phase", "Information on the duration of a specific metric in the Analysis Run", - append(namespaceNameLabels, "metric", "type", "phase"), + append(namespaceNameLabels, "metric", "type", "dryRun", "phase"), nil, ) ) @@ -132,7 +132,7 @@ var ( MetricAnalysisTemplateMetricInfo = prometheus.NewDesc( "analysis_template_metric_info", "Information on metrics in analysis templates.", - append(namespaceNameLabels, "type", "metric"), + append(namespaceNameLabels, "type", "metric", "dryRun"), nil, ) ) diff --git a/docs/features/analysis.md b/docs/features/analysis.md index 0c563591ad..a9936556e4 100644 --- a/docs/features/analysis.md +++ b/docs/features/analysis.md @@ -541,6 +541,53 @@ encountered. )) ``` +## Dry-Run Mode + +`dryRun` can be used on a metric to control whether or not to evaluate that metric in dry-run mode. A metric running +in dry-run mode won't impact the final state of the rollout or experiment even if it fails or the evaluation comes +out as inconclusive. + +The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors; even if one +or both of them fail, the analysis run will pass. + +```yaml hl_lines="4" + metrics: + - name: total-5xx-errors + interval: 5m + dryRun: true + failureCondition: result[0] >= 10 + failureLimit: 3 + provider: + prometheus: + address: http://prometheus.example.com:9090 + query: | + sum(irate( + istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code=~"5.*"}[5m] + )) + - name: total-4xx-errors + interval: 5m + dryRun: true + failureCondition: result[0] >= 10 + failureLimit: 3 + provider: + prometheus: + address: http://prometheus.example.com:9090 + query: | + sum(irate( + istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code=~"4.*"}[5m] + )) +``` + +If one or more metrics are running in dry-run mode, the summary of the dry-run results gets appended to the analysis +run message.
Assuming that the `total-4xx-errors` metric fails in the above example but the `total-5xx-errors` metric +succeeds, the final dry-run summary will look like this: + +```yaml hl_lines="1" +Message: Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0 +Metric Results: +... +``` + ## Inconclusive Runs Analysis runs can also be considered `Inconclusive`, which indicates the run was neither successful, diff --git a/manifests/crds/analysis-run-crd.yaml b/manifests/crds/analysis-run-crd.yaml index 9d612a9206..22bb325790 100644 --- a/manifests/crds/analysis-run-crd.yaml +++ b/manifests/crds/analysis-run-crd.yaml @@ -81,6 +81,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2628,6 +2630,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer diff --git a/manifests/crds/analysis-template-crd.yaml b/manifests/crds/analysis-template-crd.yaml index 516a5bf5a8..21e9c4ab69 100644 --- a/manifests/crds/analysis-template-crd.yaml +++ b/manifests/crds/analysis-template-crd.yaml @@ -77,6 +77,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/crds/cluster-analysis-template-crd.yaml b/manifests/crds/cluster-analysis-template-crd.yaml index b10ac4a166..c7219eb532 100644 --- a/manifests/crds/cluster-analysis-template-crd.yaml +++ b/manifests/crds/cluster-analysis-template-crd.yaml @@ -77,6 +77,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/install.yaml b/manifests/install.yaml index fb7aa00ee7..1986288727 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -82,6 +82,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2629,6 +2631,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer @@ -2772,6 +2776,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -5390,6 +5396,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index bfa060f41c..6f97d504b5 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -82,6 +82,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2629,6 +2631,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer @@ -2772,6 +2776,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -5390,6 +5396,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/pkg/apis/rollouts/v1alpha1/analysis_types.go b/pkg/apis/rollouts/v1alpha1/analysis_types.go index 65d54789c6..550fbbd735 100644 --- a/pkg/apis/rollouts/v1alpha1/analysis_types.go +++ b/pkg/apis/rollouts/v1alpha1/analysis_types.go @@ -106,6 +106,8 @@ type
Metric struct { ConsecutiveErrorLimit *intstrutil.IntOrString `json:"consecutiveErrorLimit,omitempty" protobuf:"bytes,9,opt,name=consecutiveErrorLimit"` // Provider configuration to the external system to use to verify the analysis Provider MetricProvider `json:"provider" protobuf:"bytes,10,opt,name=provider"` + // Whether to evaluate this metric in a Dry-Run mode + DryRun bool `json:"dryRun,omitempty" protobuf:"varint,11,opt,name=dryRun"` } // EffectiveCount is the effective count based on whether or not count/interval is specified @@ -343,6 +345,8 @@ type MetricResult struct { // ConsecutiveError is the number of times an error was encountered during measurement in succession // Resets to zero when non-errors are encountered ConsecutiveError int32 `json:"consecutiveError,omitempty" protobuf:"varint,10,opt,name=consecutiveError"` + // Whether this metric is running in a Dry-Run mode + DryRun bool `json:"dryRun,omitempty" protobuf:"varint,11,opt,name=dryRun"` } // Measurement is a point in time result value of a single metric, and the time it was measured diff --git a/pkg/apis/rollouts/v1alpha1/generated.pb.go b/pkg/apis/rollouts/v1alpha1/generated.pb.go index 4f0148edd0..69c16f1ab5 100644 --- a/pkg/apis/rollouts/v1alpha1/generated.pb.go +++ b/pkg/apis/rollouts/v1alpha1/generated.pb.go @@ -10090,6 +10090,7 @@ func (this *MetricResult) String() string { `Phase:` + fmt.Sprintf("%v", this.Phase) + `,`, `Measurements:` + repeatedStringForMeasurements + `,`, `Message:` + fmt.Sprintf("%v", this.Message) + `,`, + `DryRun:` + fmt.Sprintf("%v", this.DryRun) + `,`, `Count:` + fmt.Sprintf("%v", this.Count) + `,`, `Successful:` + fmt.Sprintf("%v", this.Successful) + `,`, `Failed:` + fmt.Sprintf("%v", this.Failed) + `,`, diff --git a/pkg/apis/rollouts/v1alpha1/generated.proto b/pkg/apis/rollouts/v1alpha1/generated.proto index a4dc739791..a7e2c6e71a 100644 --- a/pkg/apis/rollouts/v1alpha1/generated.proto +++ b/pkg/apis/rollouts/v1alpha1/generated.proto @@ -772,6 +772,9 @@ message Metric { // Provider configuration to the external system to use to verify the analysis optional MetricProvider provider = 10; + + // Whether to evaluate this metric in a Dry-Run mode + optional bool dryRun = 11; } // MetricProvider which external system to use to verify the analysis @@ -839,6 +842,9 @@ message MetricResult { // ConsecutiveError is the number of times an error was encountered during measurement in succession // Resets to zero when non-errors are encountered optional int32 consecutiveError = 10; + + // Whether this metric is running in a Dry-Run mode + optional bool dryRun = 11; } // NewRelicMetric defines the newrelic query to perform canary analysis diff --git a/pkg/apis/rollouts/v1alpha1/openapi_generated.go b/pkg/apis/rollouts/v1alpha1/openapi_generated.go index 11c0f0a1ca..3c06ebe192 100644 --- a/pkg/apis/rollouts/v1alpha1/openapi_generated.go +++ b/pkg/apis/rollouts/v1alpha1/openapi_generated.go @@ -2280,6 +2280,13 @@ func schema_pkg_apis_rollouts_v1alpha1_Metric(ref common.ReferenceCallback) comm Ref: ref("github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1.MetricProvider"), }, }, + "dryRun": { + SchemaProps: spec.SchemaProps{ + Description: "Whether to evaluate this metric in a Dry-Run mode", + Type: []string{"boolean"}, + Format: "", + }, + }, }, Required: []string{"name", "provider"}, }, @@ -2444,6 +2451,13 @@ func schema_pkg_apis_rollouts_v1alpha1_MetricResult(ref common.ReferenceCallback Format: "int32", }, }, + "dryRun": { + SchemaProps: spec.SchemaProps{ + Description: "Whether this
metric is running in a Dry-Run mode", + Type: []string{"boolean"}, + Format: "", + }, + }, }, Required: []string{"name", "phase"}, },
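As a quick way to see the new message composition in action, the sketch below re-implements the patch's summary logic as a standalone program. `dryRunStats` and `appendDryRunSummary` are renamed stand-ins (chosen for this sketch only) for the unexported `dryRunStatus` struct and `appendDryRunResults` helper above, and the controller's logging call is dropped so the snippet runs with nothing but the standard library:

```go
package main

import "fmt"

// dryRunStats mirrors the dryRunStatus struct added in analysis/analysis.go.
type dryRunStats struct {
	total        int32
	successful   int32
	failed       int32
	inconclusive int32
}

// appendDryRunSummary mirrors appendDryRunResults: when at least one metric
// ran in dry-run mode, a summary line is appended to (or, if the run has no
// other message, becomes) the AnalysisRun message.
func appendDryRunSummary(worstMessage string, s dryRunStats) string {
	if s.total == 0 {
		return worstMessage
	}
	summary := fmt.Sprintf("Dry-Run Summary: Total=%d, Successful=%d, Failed=%d, Inconclusive=%d",
		s.total, s.successful, s.failed, s.inconclusive)
	if worstMessage == "" {
		return summary
	}
	return fmt.Sprintf("%s; %s", worstMessage, summary)
}

func main() {
	// A terminated run with a single dry-run metric, matching the assertion
	// in TestTerminateAnalysisRunInDryRunMode.
	fmt.Println(appendDryRunSummary("Run Terminated", dryRunStats{total: 1}))
}
```

Running it prints `Run Terminated; Dry-Run Summary: Total=1, Successful=0, Failed=0, Inconclusive=0`, the exact message asserted in `TestTerminateAnalysisRunInDryRunMode`.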