From 6b756c3e82554461f72af608ac4c5dc2fa72346c Mon Sep 17 00:00:00 2001 From: Rohit Agrawal Date: Sat, 6 Nov 2021 16:00:45 -0400 Subject: [PATCH] feat(analysis): Add Dry-Run Mode Signed-off-by: Rohit Agrawal --- analysis/analysis.go | 192 ++++++++++++------ analysis/analysis_test.go | 163 ++++++++++----- controller/metrics/analysis.go | 18 +- controller/metrics/analysis_test.go | 42 ++-- controller/metrics/prommetrics.go | 6 +- docs/features/analysis.md | 47 +++++ manifests/crds/analysis-run-crd.yaml | 4 + manifests/crds/analysis-template-crd.yaml | 2 + .../crds/cluster-analysis-template-crd.yaml | 2 + manifests/install.yaml | 8 + manifests/namespace-install.yaml | 8 + pkg/apis/rollouts/v1alpha1/analysis_types.go | 4 + pkg/apis/rollouts/v1alpha1/generated.pb.go | 1 + pkg/apis/rollouts/v1alpha1/generated.proto | 6 + .../rollouts/v1alpha1/openapi_generated.go | 14 ++ 15 files changed, 381 insertions(+), 136 deletions(-) diff --git a/analysis/analysis.go b/analysis/analysis.go index 046e3899b8..99e69137cd 100644 --- a/analysis/analysis.go +++ b/analysis/analysis.go @@ -26,7 +26,10 @@ const ( DefaultMeasurementHistoryLimit = 10 // DefaultErrorRetryInterval is the default interval to retry a measurement upon error, in the // event an interval was not specified - DefaultErrorRetryInterval time.Duration = 10 * time.Second + DefaultErrorRetryInterval = 10 * time.Second + // SuccessfulAssessmentRunTerminatedResult is used for logging purposes when the metrics evaluation + // is successful and the run is terminated. + SuccessfulAssessmentRunTerminatedResult = "Metric Assessment Result - Successful: Run Terminated" ) // metricTask holds the metric which need to be measured during this reconciliation along with @@ -36,11 +39,19 @@ type metricTask struct { incompleteMeasurement *v1alpha1.Measurement } +// dryRunStatus holds the stats of the metrics being evaluated in the Dry-Run mode. 
+type dryRunStatus struct { + totalMetrics int32 + failedMetrics int32 + inconclusiveMetrics int32 + successfulMetrics int32 +} + func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alpha1.AnalysisRun { if origRun.Status.Phase.Completed() { return origRun } - log := logutil.WithAnalysisRun(origRun) + logger := logutil.WithAnalysisRun(origRun) run := origRun.DeepCopy() if run.Status.MetricResults == nil { @@ -49,8 +60,8 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph resolvedMetrics, err := getResolvedMetricsWithoutSecrets(run.Spec.Metrics, run.Spec.Args) if err != nil { - message := fmt.Sprintf("unable to resolve metric arguments: %v", err) - log.Warn(message) + message := fmt.Sprintf("Unable to resolve metric arguments: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -59,8 +70,8 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph err = analysisutil.ValidateMetrics(resolvedMetrics) if err != nil { - message := fmt.Sprintf("analysis spec invalid: %v", err) - log.Warn(message) + message := fmt.Sprintf("Analysis spec invalid: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -68,11 +79,11 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph } tasks := generateMetricTasks(run, resolvedMetrics) - log.Infof("taking %d measurements", len(tasks)) + logger.Infof("Taking %d Measurement(s)...", len(tasks)) err = c.runMeasurements(run, tasks) if err != nil { - message := fmt.Sprintf("unable to resolve metric arguments: %v", err) - log.Warn(message) + message := fmt.Sprintf("Unable to resolve metric arguments: %v", err) + logger.Warn(message) run.Status.Phase = v1alpha1.AnalysisPhaseError run.Status.Message = message c.recordAnalysisRunCompletionEvent(run) @@ -91,7 +102,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph err = c.garbageCollectMeasurements(run, DefaultMeasurementHistoryLimit) if err != nil { // TODO(jessesuen): surface errors to controller so they can be retried - log.Warnf("Failed to garbage collect measurements: %v", err) + logger.Warnf("Failed to garbage collect measurements: %v", err) } nextReconcileTime := calculateNextReconcileTime(run, resolvedMetrics) @@ -100,7 +111,7 @@ func (c *Controller) reconcileAnalysisRun(origRun *v1alpha1.AnalysisRun) *v1alph if enqueueSeconds < 0 { enqueueSeconds = 0 } - log.Infof("enqueueing analysis after %v", enqueueSeconds) + logger.Infof("Enqueueing analysis after %v", enqueueSeconds) c.enqueueAnalysisAfter(run, enqueueSeconds) } return run @@ -133,7 +144,7 @@ func (c *Controller) recordAnalysisRunCompletionEvent(run *v1alpha1.AnalysisRun) case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: eventType = corev1.EventTypeWarning } - c.recorder.Eventf(run, record.EventOptions{EventType: eventType, EventReason: "AnalysisRun" + string(run.Status.Phase)}, "analysis completed %s", run.Status.Phase) + c.recorder.Eventf(run, record.EventOptions{EventType: eventType, EventReason: "AnalysisRun" + string(run.Status.Phase)}, "Analysis Completed. Result: %s", run.Status.Phase) } // generateMetricTasks generates a list of metrics tasks needed to be measured as part of this @@ -141,7 +152,7 @@ func (c *Controller) recordAnalysisRunCompletionEvent(run *v1alpha1.AnalysisRun) // terminating (e.g. 
due to manual termination or failing metric), will not schedule further // measurements other than to resume any in-flight measurements. func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) []metricTask { - log := logutil.WithAnalysisRun(run) + logger := logutil.WithAnalysisRun(run) var tasks []metricTask terminating := analysisutil.IsTerminating(run) @@ -149,7 +160,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ if analysisutil.MetricCompleted(run, metric.Name) { continue } - logCtx := log.WithField("metric", metric.Name) + logCtx := logger.WithField("metric", metric.Name) lastMeasurement := analysisutil.LastMeasurement(run, metric.Name) if lastMeasurement != nil && lastMeasurement.FinishedAt == nil { now := metav1.Now() @@ -157,7 +168,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } // last measurement is still in-progress. need to complete it - logCtx.Infof("resuming in-progress measurement") + logCtx.Infof("Resuming in-progress measurement") tasks = append(tasks, metricTask{ metric: run.Spec.Metrics[i], incompleteMeasurement: lastMeasurement, @@ -165,7 +176,7 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } if terminating { - logCtx.Infof("skipping measurement: run is terminating") + logCtx.Infof("Skipping measurement: run is terminating") continue } if lastMeasurement == nil { @@ -179,13 +190,13 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ continue } if run.Status.StartedAt.Add(duration).After(time.Now()) { - logCtx.Infof("waiting until start delay duration passes") + logCtx.Infof("Waiting until start delay duration passes") continue } } // measurement never taken tasks = append(tasks, metricTask{metric: run.Spec.Metrics[i]}) - logCtx.Infof("running initial measurement") + logCtx.Infof("Running initial measurement") continue } metricResult := analysisutil.GetResult(run, metric.Name) @@ -201,22 +212,32 @@ func generateMetricTasks(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Metric) [ if lastMeasurement.Phase == v1alpha1.AnalysisPhaseError { interval = DefaultErrorRetryInterval } else if metric.Interval != "" { - metricInterval, err := metric.Interval.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.Interval) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - interval = metricInterval + interval = parsedInterval } if time.Now().After(lastMeasurement.FinishedAt.Add(interval)) { tasks = append(tasks, metricTask{metric: run.Spec.Metrics[i]}) - logCtx.Infof("running overdue measurement") + logCtx.Infof("Running overdue measurement") continue } } return tasks } +// parseMetricInterval is a helper method to parse the given metric interval and return the +// parsed duration or error (if any) +func parseMetricInterval(logCtx log.Entry, metricDurationString v1alpha1.DurationString) (time.Duration, error) { + metricInterval, err := metricDurationString.Duration() + if err != nil { + logCtx.Warnf("Failed to parse interval: %v", err) + return -1, err + } + return metricInterval, nil +} + // resolveArgs resolves args for metricTasks, including secret references // returns resolved metricTasks and secrets for log redaction func (c *Controller) resolveArgs(tasks []metricTask, args []v1alpha1.Argument, namespace string) ([]metricTask, []string, error) { @@ -286,7 +307,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa go 
func(t metricTask) { defer wg.Done() //redact secret values from logs - log := logutil.WithRedactor(*logutil.WithAnalysisRun(run).WithField("metric", t.metric.Name), secrets) + logger := logutil.WithRedactor(*logutil.WithAnalysisRun(run).WithField("metric", t.metric.Name), secrets) resultsLock.Lock() metricResult := analysisutil.GetResult(run, t.metric.Name) @@ -294,13 +315,14 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa if metricResult == nil { metricResult = &v1alpha1.MetricResult{ - Name: t.metric.Name, - Phase: v1alpha1.AnalysisPhaseRunning, + Name: t.metric.Name, + Phase: v1alpha1.AnalysisPhaseRunning, + DryRun: t.metric.DryRun, } } var newMeasurement v1alpha1.Measurement - provider, err := c.newProvider(*log, t.metric) + provider, err := c.newProvider(*logger, t.metric) if err != nil { if t.incompleteMeasurement != nil { newMeasurement = *t.incompleteMeasurement @@ -316,10 +338,10 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa } else { // metric is incomplete. either terminate or resume it if terminating { - log.Infof("terminating in-progress measurement") + logger.Infof("Terminating in-progress measurement") newMeasurement = provider.Terminate(run, t.metric, *t.incompleteMeasurement) if newMeasurement.Phase == v1alpha1.AnalysisPhaseSuccessful { - newMeasurement.Message = "metric terminated" + newMeasurement.Message = "Metric Terminated" } } else { newMeasurement = provider.Resume(run, t.metric, *t.incompleteMeasurement) @@ -328,7 +350,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa } if newMeasurement.Phase.Completed() { - log.Infof("measurement completed %s", newMeasurement.Phase) + logger.Infof("Measurement Completed. Result: %s", newMeasurement.Phase) if newMeasurement.FinishedAt == nil { finishedAt := metav1.Now() newMeasurement.FinishedAt = &finishedAt @@ -349,7 +371,7 @@ func (c *Controller) runMeasurements(run *v1alpha1.AnalysisRun, tasks []metricTa case v1alpha1.AnalysisPhaseError: metricResult.Error++ metricResult.ConsecutiveError++ - log.Warnf("measurement had error: %s", newMeasurement.Message) + logger.Warnf("Measurement had error: %s", newMeasurement.Message) } } @@ -391,23 +413,33 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph run.Status.StartedAt = &now } if run.Spec.Terminate { - worstMessage = "run terminated" + worstMessage = "Run Terminated" } // Iterate all metrics and update MetricResult.Phase fields based on latest measurement(s) + dryRunStatus := dryRunStatus{ + totalMetrics: 0, + failedMetrics: 0, + inconclusiveMetrics: 0, + successfulMetrics: 0, + } for _, metric := range metrics { + if metric.DryRun { + log.Infof("Metric '%s' is running in Dry-Run mode.", metric.Name) + dryRunStatus.totalMetrics++ + } if result := analysisutil.GetResult(run, metric.Name); result != nil { - log := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) + logger := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) metricStatus := assessMetricStatus(metric, *result, terminating) if result.Phase != metricStatus { - log.Infof("metric transitioned from %s -> %s", result.Phase, metricStatus) + logger.Infof("Metric '%s' transitioned from %s -> %s", metric.Name, result.Phase, metricStatus) if metricStatus.Completed() { eventType := corev1.EventTypeNormal switch metricStatus { case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: eventType = corev1.EventTypeWarning } - c.recorder.Eventf(run, 
record.EventOptions{EventType: eventType, EventReason: "Metric" + string(metricStatus)}, "Metric '%s' Completed. Result: %s", metric.Name, metricStatus) } if lastMeasurement := analysisutil.LastMeasurement(run, metric.Name); lastMeasurement != nil { result.Message = lastMeasurement.Message @@ -419,31 +451,58 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph // if any metric is in-progress, then entire analysis run will be considered running everythingCompleted = false } else { + phase, message := assessMetricFailureInconclusiveOrError(metric, *result) + // NOTE: if the metric is marked as Dry-Run, we don't factor its status into the worst status; // otherwise, remember the worst status of all completed metric results - if worstStatus == "" || analysisutil.IsWorse(worstStatus, metricStatus) { - worstStatus = metricStatus - _, message := assessMetricFailureInconclusiveOrError(metric, *result) + if !metric.DryRun { + if worstStatus == "" || analysisutil.IsWorse(worstStatus, metricStatus) { + worstStatus = metricStatus + if message != "" { + worstMessage = fmt.Sprintf("Metric \"%s\" assessed %s due to %s", metric.Name, metricStatus, message) + if result.Message != "" { + worstMessage += fmt.Sprintf(": \"Error Message: %s\"", result.Message) + } + } + } + } else { + // Update the metric result message if message != "" { - worstMessage = fmt.Sprintf("metric \"%s\" assessed %s due to %s", metric.Name, metricStatus, message) + failureMessage := fmt.Sprintf("Metric assessed %s due to %s", metricStatus, message) if result.Message != "" { - worstMessage += fmt.Sprintf(": \"Error Message: %s\"", result.Message) + result.Message = fmt.Sprintf("%s: \"Error Message: %s\"", failureMessage, result.Message) + } else { + result.Message = failureMessage } + analysisutil.SetResult(run, *result) + } + // Update the Dry-Run stats + switch phase { + case v1alpha1.AnalysisPhaseError, v1alpha1.AnalysisPhaseFailed: + dryRunStatus.failedMetrics++ + case v1alpha1.AnalysisPhaseInconclusive: + dryRunStatus.inconclusiveMetrics++ + case v1alpha1.AnalysisPhaseSuccessful: + dryRunStatus.successfulMetrics++ + default: + // Count any unrecognized phase as successful by default. + dryRunStatus.successfulMetrics++ } } } } else { - // metric hasn't started running. possible cases where some of the metrics starts with delay + // metric hasn't started running. possible when some metrics start with a delay everythingCompleted = false } } - + // Append the Dry-Run metric results, if any. + worstMessage = appendDryRunResults(strings.TrimSpace(worstMessage), dryRunStatus) if terminating { if worstStatus == "" { // we have yet to take a single measurement, but have already been instructed to stop - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + log.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful, worstMessage } - log.Infof("metric assessed %s: run terminated", worstStatus) + log.Infof("Metric Assessment Result - %s: Run Terminated", worstStatus) return worstStatus, worstMessage } if !everythingCompleted || worstStatus == "" { @@ -452,26 +511,39 @@ func (c *Controller) assessRunStatus(run *v1alpha1.AnalysisRun, metrics []v1alph return worstStatus, worstMessage } +// appendDryRunResults is a helper function to append the Dry-Run metrics status to the AnalysisRun message.
+func appendDryRunResults(worstMessage string, dryRunStatus dryRunStatus) string { + if dryRunStatus.totalMetrics > 0 { + dryRunResults := fmt.Sprintf("Dry-Run Summary: Total=%d, Successful=%d, Failed=%d, Inconclusive=%d", dryRunStatus.totalMetrics, dryRunStatus.successfulMetrics, dryRunStatus.failedMetrics, dryRunStatus.inconclusiveMetrics) + log.Info(dryRunResults) + if worstMessage == "" { + return dryRunResults + } + return fmt.Sprintf("%s; %s", worstMessage, dryRunResults) + } + return worstMessage +} + // assessMetricStatus assesses the status of a single metric based on: -// * current/latest measurement status +// * current or latest measurement status // * parameters given by the metric (failureLimit, count, etc...) -// * whether or not we are terminating (e.g. due to failing run, or termination request) +// * whether we are terminating (e.g. due to failing run, or termination request) func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, terminating bool) v1alpha1.AnalysisPhase { if result.Phase.Completed() { return result.Phase } - log := log.WithField("metric", metric.Name) + logger := log.WithField("metric", metric.Name) if len(result.Measurements) == 0 { if terminating { // we have yet to take a single measurement, but have already been instructed to stop - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + logger.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful } return v1alpha1.AnalysisPhasePending } lastMeasurement := result.Measurements[len(result.Measurements)-1] if !lastMeasurement.Phase.Completed() { - // we still have a in-flight measurement + // we still have an in-flight measurement return v1alpha1.AnalysisPhaseRunning } @@ -479,7 +551,7 @@ func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, te // If true, then return AnalysisRunPhase as Failed, Inconclusive, or Error respectively phaseFailureInconclusiveOrError, message := assessMetricFailureInconclusiveOrError(metric, result) if phaseFailureInconclusiveOrError != "" { - log.Infof("metric assessed %s: %s", phaseFailureInconclusiveOrError, message) + logger.Infof("Metric Assessment Result - %s: %s", phaseFailureInconclusiveOrError, message) return phaseFailureInconclusiveOrError } @@ -488,12 +560,12 @@ func assessMetricStatus(metric v1alpha1.Metric, result v1alpha1.MetricResult, te // taken into consideration above, and we do not want to fail if failures < failureLimit.
effectiveCount := metric.EffectiveCount() if effectiveCount != nil && result.Count >= int32(effectiveCount.IntValue()) { - log.Infof("metric assessed %s: count (%s) reached", v1alpha1.AnalysisPhaseSuccessful, effectiveCount.String()) + logger.Infof("Metric Assessment Result - %s: Count (%s) Reached", v1alpha1.AnalysisPhaseSuccessful, effectiveCount.String()) return v1alpha1.AnalysisPhaseSuccessful } // if we get here, this metric runs indefinitely if terminating { - log.Infof("metric assessed %s: run terminated", v1alpha1.AnalysisPhaseSuccessful) + logger.Infof(SuccessfulAssessmentRunTerminatedResult) return v1alpha1.AnalysisPhaseSuccessful } return v1alpha1.AnalysisPhaseRunning @@ -546,19 +618,18 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me if run.Status.StartedAt != nil { startTime = *run.Status.StartedAt } - duration, err := metric.InitialDelay.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.InitialDelay) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - endInitialDelay := startTime.Add(duration) + endInitialDelay := startTime.Add(parsedInterval) if reconcileTime == nil || reconcileTime.After(endInitialDelay) { reconcileTime = &endInitialDelay } continue } // no measurement was started . we should never get here - logCtx.Warnf("metric never started. not factored into enqueue time") + logCtx.Warnf("Metric never started. Not factored into enqueue time.") continue } if lastMeasurement.FinishedAt == nil { @@ -580,18 +651,17 @@ func calculateNextReconcileTime(run *v1alpha1.AnalysisRun, metrics []v1alpha1.Me if lastMeasurement.Phase == v1alpha1.AnalysisPhaseError { interval = DefaultErrorRetryInterval } else if metric.Interval != "" { - metricInterval, err := metric.Interval.Duration() + parsedInterval, err := parseMetricInterval(*logCtx, metric.Interval) if err != nil { - logCtx.Warnf("failed to parse interval: %v", err) continue } - interval = metricInterval + interval = parsedInterval } else { // if we get here, an interval was not set (meaning reoccurrence was not desired), and // there was no error (meaning we don't need to retry). no need to requeue this metric. // NOTE: we shouldn't ever get here since it means we are not doing proper bookkeeping // of count. - logCtx.Warnf("skipping requeue. no interval or error (count: %d, effectiveCount: %s)", metricResult.Count, metric.EffectiveCount().String()) + logCtx.Warnf("Skipping requeue. 
No interval or error (count: %d, effectiveCount: %s)", metricResult.Count, metric.EffectiveCount().String()) continue } // Take the earliest time of all metrics @@ -619,8 +689,8 @@ func (c *Controller) garbageCollectMeasurements(run *v1alpha1.AnalysisRun, limit if !ok { continue } - log := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) - provider, err := c.newProvider(*log, metric) + logger := logutil.WithAnalysisRun(run).WithField("metric", metric.Name) + provider, err := c.newProvider(*logger, metric) if err != nil { errors = append(errors, err) continue diff --git a/analysis/analysis_test.go b/analysis/analysis_test.go index 5b1fa56062..919edd7a15 100644 --- a/analysis/analysis_test.go +++ b/analysis/analysis_test.go @@ -91,18 +91,20 @@ func newRun() *v1alpha1.AnalysisRun { } // newTerminatingRun returns a run which is terminating because of the given status -func newTerminatingRun(status v1alpha1.AnalysisPhase) *v1alpha1.AnalysisRun { +func newTerminatingRun(status v1alpha1.AnalysisPhase, isDryRun bool) *v1alpha1.AnalysisRun { run := v1alpha1.AnalysisRun{ Spec: v1alpha1.AnalysisRunSpec{ Metrics: []v1alpha1.Metric{ { - Name: "run-forever", + Name: "run-forever", + DryRun: isDryRun, Provider: v1alpha1.MetricProvider{ Job: &v1alpha1.JobMetric{}, }, }, { - Name: "failed-metric", + Name: "failed-metric", + DryRun: isDryRun, Provider: v1alpha1.MetricProvider{ Job: &v1alpha1.JobMetric{}, }, @@ -113,16 +115,18 @@ func newTerminatingRun(status v1alpha1.AnalysisPhase) *v1alpha1.AnalysisRun { Phase: v1alpha1.AnalysisPhaseRunning, MetricResults: []v1alpha1.MetricResult{ { - Name: "run-forever", - Phase: v1alpha1.AnalysisPhaseRunning, + Name: "run-forever", + DryRun: isDryRun, + Phase: v1alpha1.AnalysisPhaseRunning, Measurements: []v1alpha1.Measurement{{ Phase: v1alpha1.AnalysisPhaseRunning, StartedAt: timePtr(metav1.NewTime(time.Now().Add(-60 * time.Second))), }}, }, { - Name: "failed-metric", - Count: 1, + Name: "failed-metric", + Count: 1, + DryRun: isDryRun, Measurements: []v1alpha1.Measurement{{ Phase: status, StartedAt: timePtr(metav1.NewTime(time.Now().Add(-60 * time.Second))), @@ -941,7 +945,7 @@ func TestReconcileAnalysisRunTerminateSiblingAfterFail(t *testing.T) { f.provider.On("Terminate", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseSuccessful), nil) for _, status := range []v1alpha1.AnalysisPhase{v1alpha1.AnalysisPhaseFailed, v1alpha1.AnalysisPhaseInconclusive, v1alpha1.AnalysisPhaseError} { - run := newTerminatingRun(status) + run := newTerminatingRun(status, false) newRun := c.reconcileAnalysisRun(run) assert.Equal(t, status, newRun.Status.Phase) @@ -950,8 +954,8 @@ func TestReconcileAnalysisRunTerminateSiblingAfterFail(t *testing.T) { // ensure the in-progress measurement is now terminated assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.MetricResults[0].Measurements[0].Phase) assert.NotNil(t, newRun.Status.MetricResults[0].Measurements[0].FinishedAt) - assert.Equal(t, "metric terminated", newRun.Status.MetricResults[0].Message) - assert.Equal(t, "metric terminated", newRun.Status.MetricResults[0].Measurements[0].Message) + assert.Equal(t, "Metric Terminated", newRun.Status.MetricResults[0].Message) + assert.Equal(t, "Metric Terminated", newRun.Status.MetricResults[0].Measurements[0].Message) } } @@ -1069,22 +1073,26 @@ func TestResolveMetricArgsUnableToSubstitute(t *testing.T) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := &v1alpha1.AnalysisRun{ - 
Spec: v1alpha1.AnalysisRunSpec{ - Metrics: []v1alpha1.Metric{{ - Name: "rate", - SuccessCondition: "{{args.does-not-exist}}", - Provider: v1alpha1.MetricProvider{ - Prometheus: &v1alpha1.PrometheusMetric{ - Query: "{{args.metric-name}}", + // Dry-Run or not, if the args resolution fails, we should fail the analysis + for _, isDryRun := range []bool{false, true} { + run := &v1alpha1.AnalysisRun{ + Spec: v1alpha1.AnalysisRunSpec{ + Metrics: []v1alpha1.Metric{{ + Name: "rate", + DryRun: isDryRun, + SuccessCondition: "{{args.does-not-exist}}", + Provider: v1alpha1.MetricProvider{ + Prometheus: &v1alpha1.PrometheusMetric{ + Query: "{{args.metric-name}}", + }, - }, - }}, - }, + }}, + }, + } + newRun := c.reconcileAnalysisRun(run) + assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase) + assert.Equal(t, "Unable to resolve metric arguments: failed to resolve {{args.metric-name}}", newRun.Status.Message) } - newRun := c.reconcileAnalysisRun(run) - assert.Equal(t, v1alpha1.AnalysisPhaseError, newRun.Status.Phase) - assert.Equal(t, "unable to resolve metric arguments: failed to resolve {{args.metric-name}}", newRun.Status.Message) } // TestSecretContentReferenceSuccess verifies that secret arguments are properly resolved @@ -1396,72 +1404,114 @@ func TestAssessMetricFailureInconclusiveOrError(t *testing.T) { assert.Equal(t, phase, assessMetricStatus(metric, result, true)) } -func TestAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T) { +func StartAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful - status, message := c.assessRunStatus(run, run.Spec.Metrics) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +func TestAssessRunStatusErrorMessageAnalysisPhaseFail(t *testing.T) { + status, message := StartAssessRunStatusErrorMessageAnalysisPhaseFail(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) - assert.Equal(t, "metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0)", message) + assert.Equal(t, "Metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0)", message) } -// TestAssessRunStatusErrorMessageFromProvider verifies that the message returned by assessRunStatus -// includes the error message from the provider -func TestAssessRunStatusErrorMessageFromProvider(t *testing.T) { +func TestAssessRunStatusErrorMessageAnalysisPhaseFailInDryRunMode(t *testing.T) { + status, message := StartAssessRunStatusErrorMessageAnalysisPhaseFail(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0", message) +} + +func StartAssessRunStatusErrorMessageFromProvider(t *testing.T, providerMessage string, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseSuccessful // All metrics must complete, or assessRunStatus will not return message - - providerMessage := "Provider error" run.Status.MetricResults[1].Message = providerMessage - status,
message := c.assessRunStatus(run, run.Spec.Metrics) - expectedMessage := fmt.Sprintf("metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0): \"Error Message: %s\"", providerMessage) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +// TestAssessRunStatusErrorMessageFromProvider verifies that the message returned by assessRunStatus +// includes the error message from the provider +func TestAssessRunStatusErrorMessageFromProvider(t *testing.T) { + providerMessage := "Provider Error" + status, message := StartAssessRunStatusErrorMessageFromProvider(t, providerMessage, false) + expectedMessage := fmt.Sprintf("Metric \"failed-metric\" assessed Failed due to failed (1) > failureLimit (0): \"Error Message: %s\"", providerMessage) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) assert.Equal(t, expectedMessage, message) } -// TestAssessRunStatusMultipleFailures verifies that if there are multiple failed metrics, assessRunStatus returns the message -// from the first failed metric -func TestAssessRunStatusMultipleFailures(t *testing.T) { +func TestAssessRunStatusErrorMessageFromProviderInDryRunMode(t *testing.T) { + providerMessage := "Provider Error" + status, message := StartAssessRunStatusErrorMessageFromProvider(t, providerMessage, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0", message) +} + +func StartAssessRunStatusMultipleFailures(t *testing.T, isDryRun bool) (v1alpha1.AnalysisPhase, string) { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseFailed run.Status.MetricResults[0].Failed = 1 - status, message := c.assessRunStatus(run, run.Spec.Metrics) + return c.assessRunStatus(run, run.Spec.Metrics) +} + +// TestAssessRunStatusMultipleFailures verifies that if there are multiple failed metrics, assessRunStatus returns the message +// from the first failed metric +func TestAssessRunStatusMultipleFailures(t *testing.T) { + status, message := StartAssessRunStatusMultipleFailures(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, status) - assert.Equal(t, "metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", message) + assert.Equal(t, "Metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", message) } -// TestAssessRunStatusWorstMessageInReconcileAnalysisRun verifies that the worstMessage returned by assessRunStatus is set as the -// status of the AnalysisRun returned by reconcileAnalysisRun -func TestAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T) { +func TestAssessRunStatusMultipleFailuresInDryRunMode(t *testing.T) { + status, message := StartAssessRunStatusMultipleFailures(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, status) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=0, Failed=2, Inconclusive=0", message) +} + +func StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T, isDryRun bool) *v1alpha1.AnalysisRun { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) - run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed) + run := newTerminatingRun(v1alpha1.AnalysisPhaseFailed, isDryRun) run.Status.MetricResults[0].Phase = v1alpha1.AnalysisPhaseFailed run.Status.MetricResults[0].Failed = 1 f.provider.On("Run", 
mock.Anything, mock.Anything, mock.Anything).Return(newMeasurement(v1alpha1.AnalysisPhaseFailed), nil) - newRun := c.reconcileAnalysisRun(run) + return c.reconcileAnalysisRun(run) +} + +// TestAssessRunStatusWorstMessageInReconcileAnalysisRun verifies that the worstMessage returned by assessRunStatus is set as the +// status of the AnalysisRun returned by reconcileAnalysisRun +func TestAssessRunStatusWorstMessageInReconcileAnalysisRun(t *testing.T) { + newRun := StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t, false) assert.Equal(t, v1alpha1.AnalysisPhaseFailed, newRun.Status.Phase) - assert.Equal(t, "metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.Message) + assert.Equal(t, "Metric \"run-forever\" assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.Message) } -func TestTerminateAnalysisRun(t *testing.T) { +func TestAssessRunStatusWorstMessageInReconcileAnalysisRunInDryRunMode(t *testing.T) { + newRun := StartAssessRunStatusWorstMessageInReconcileAnalysisRun(t, true) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) + assert.Equal(t, "Dry-Run Summary: Total=2, Successful=0, Failed=2, Inconclusive=0", newRun.Status.Message) + assert.Equal(t, "Metric assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.MetricResults[0].Message) + assert.Equal(t, "Metric assessed Failed due to failed (1) > failureLimit (0)", newRun.Status.MetricResults[1].Message) +} + +func StartTerminatingAnalysisRun(t *testing.T, isDryRun bool) *v1alpha1.AnalysisRun { f := newFixture(t) defer f.Close() c, _, _ := f.newController(noResyncPeriodFunc) @@ -1480,6 +1530,7 @@ func TestTerminateAnalysisRun(t *testing.T) { }, Metrics: []v1alpha1.Metric{{ Name: "success-rate", + DryRun: isDryRun, InitialDelay: "20s", Interval: "20s", SuccessCondition: "result[0] > 0.90", @@ -1493,7 +1544,17 @@ func TestTerminateAnalysisRun(t *testing.T) { Phase: v1alpha1.AnalysisPhaseRunning, }, } - newRun := c.reconcileAnalysisRun(run) + return c.reconcileAnalysisRun(run) +} + +func TestTerminateAnalysisRun(t *testing.T) { + newRun := StartTerminatingAnalysisRun(t, false) + assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) + assert.Equal(t, "Run Terminated", newRun.Status.Message) +} + +func TestTerminateAnalysisRunInDryRunMode(t *testing.T) { + newRun := StartTerminatingAnalysisRun(t, true) assert.Equal(t, v1alpha1.AnalysisPhaseSuccessful, newRun.Status.Phase) - assert.Equal(t, "run terminated", newRun.Status.Message) + assert.Equal(t, "Run Terminated; Dry-Run Summary: Total=1, Successful=0, Failed=0, Inconclusive=0", newRun.Status.Message) } diff --git a/controller/metrics/analysis.go b/controller/metrics/analysis.go index d8634b92c6..1f7a95eb38 100644 --- a/controller/metrics/analysis.go +++ b/controller/metrics/analysis.go @@ -1,6 +1,8 @@ package metrics import ( + "fmt" + "github.com/prometheus/client_golang/prometheus" log "github.com/sirupsen/logrus" "k8s.io/apimachinery/pkg/labels" @@ -83,17 +85,17 @@ func collectAnalysisRuns(ch chan<- prometheus.Metric, ar *v1alpha1.AnalysisRun) for _, metric := range ar.Spec.Metrics { metricType := metricproviders.Type(metric) metricResult := analysis.GetResult(ar, metric.Name) - addGauge(MetricAnalysisRunMetricType, 1, metric.Name, metricType) + addGauge(MetricAnalysisRunMetricType, 1, metric.Name, metricType, fmt.Sprint(metric.DryRun)) calculatedPhase := v1alpha1.AnalysisPhase("") if metricResult != nil { calculatedPhase = metricResult.Phase } - 
addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, string(v1alpha1.AnalysisPhasePending)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, string(v1alpha1.AnalysisPhaseError)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, string(v1alpha1.AnalysisPhaseFailed)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, string(v1alpha1.AnalysisPhaseSuccessful)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, string(v1alpha1.AnalysisPhaseRunning)) - addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, string(v1alpha1.AnalysisPhaseInconclusive)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhasePending || calculatedPhase == ""), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhasePending)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseError), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseError)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseFailed), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseFailed)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseSuccessful), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseSuccessful)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseRunning), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseRunning)) + addGauge(MetricAnalysisRunMetricPhase, boolFloat64(calculatedPhase == v1alpha1.AnalysisPhaseInconclusive), metric.Name, metricType, fmt.Sprint(metric.DryRun), string(v1alpha1.AnalysisPhaseInconclusive)) } } @@ -106,6 +108,6 @@ func collectAnalysisTemplate(ch chan<- prometheus.Metric, namespace, name string for _, metric := range at.Metrics { metricType := metricproviders.Type(metric) - addGauge(MetricAnalysisTemplateMetricInfo, 1, metricType, metric.Name) + addGauge(MetricAnalysisTemplateMetricInfo, 1, metricType, metric.Name, fmt.Sprint(metric.DryRun)) } } diff --git a/controller/metrics/analysis_test.go b/controller/metrics/analysis_test.go index 4c8a6f70fd..3cda3e5091 100644 --- a/controller/metrics/analysis_test.go +++ b/controller/metrics/analysis_test.go @@ -54,12 +54,19 @@ metadata: namespace: jesse-test spec: metrics: - - name: webmetric + - name: web-metric-1 provider: web: jsonPath: . url: https://www.google.com successCondition: "true" + - name: web-metric-2 + dryRun: true + provider: + web: + jsonPath: . + url: https://www.msn.com + successCondition: "false" ` fakeClusterAnalysisTemplate = ` @@ -67,15 +74,22 @@ apiVersion: argoproj.io/v1alpha1 kind: ClusterAnalysisTemplate metadata: creationTimestamp: "2020-03-16T20:01:13Z" - name: http-benchmark-test + name: http-benchmark-cluster-test spec: metrics: - - name: webmetric + - name: web-metric-1 provider: web: jsonPath: . 
url: https://www.google.com successCondition: "true" + - name: web-metric-2 + dryRun: true + provider: + web: + jsonPath: . + url: https://www.msn.com + successCondition: "false" ` ) const expectedAnalysisRunResponse = `# HELP analysis_run_info Information about analysis run. @@ -83,15 +97,15 @@ const expectedAnalysisRunResponse = `# HELP analysis_run_info Information about analysis_run_info{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1 # HELP analysis_run_metric_phase Information on the duration of a specific metric in the Analysis Run # TYPE analysis_run_metric_phase gauge -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0 -analysis_run_metric_phase{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error",type="Web"} 1 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Failed",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Inconclusive",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Pending",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Running",type="Web"} 0 +analysis_run_metric_phase{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Successful",type="Web"} 0 # HELP analysis_run_metric_type Information on the type of a specific metric in the Analysis Runs # TYPE analysis_run_metric_type gauge -analysis_run_metric_type{metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1 +analysis_run_metric_type{dryRun="false",metric="webmetric",name="http-benchmark-test-tr8rn",namespace="jesse-test",type="Web"} 1 # HELP analysis_run_phase Information on the state of the Analysis Run (DEPRECATED - use analysis_run_info) # TYPE analysis_run_phase gauge analysis_run_phase{name="http-benchmark-test-tr8rn",namespace="jesse-test",phase="Error"} 1 @@ -175,12 +189,14 @@ analysis_run_reconcile_count{name="ar-test",namespace="ar-namespace"} 1` func TestAnalysisTemplateDescribe(t *testing.T) { expectedResponse := `# TYPE analysis_template_info gauge -analysis_template_info{name="http-benchmark-test",namespace=""} 1 +analysis_template_info{name="http-benchmark-cluster-test",namespace=""} 1 analysis_template_info{name="http-benchmark-test",namespace="jesse-test"} 1 # HELP analysis_template_metric_info Information on metrics in analysis templates. 
# TYPE analysis_template_metric_info gauge -analysis_template_metric_info{metric="webmetric",name="http-benchmark-test",namespace="",type="Web"} 1 -analysis_template_metric_info{metric="webmetric",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 +analysis_template_metric_info{dryRun="false",metric="web-metric-1",name="http-benchmark-cluster-test",namespace="",type="Web"} 1 +analysis_template_metric_info{dryRun="false",metric="web-metric-1",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 +analysis_template_metric_info{dryRun="true",metric="web-metric-2",name="http-benchmark-cluster-test",namespace="",type="Web"} 1 +analysis_template_metric_info{dryRun="true",metric="web-metric-2",name="http-benchmark-test",namespace="jesse-test",type="Web"} 1 ` registry := prometheus.NewRegistry() at := newFakeAnalysisTemplate(fakeAnalysisTemplate) diff --git a/controller/metrics/prommetrics.go b/controller/metrics/prommetrics.go index ff2017765e..eb763e73ef 100644 --- a/controller/metrics/prommetrics.go +++ b/controller/metrics/prommetrics.go @@ -108,14 +108,14 @@ var ( MetricAnalysisRunMetricType = prometheus.NewDesc( "analysis_run_metric_type", "Information on the type of a specific metric in the Analysis Runs", - append(namespaceNameLabels, "metric", "type"), + append(namespaceNameLabels, "metric", "type", "dryRun"), nil, ) MetricAnalysisRunMetricPhase = prometheus.NewDesc( "analysis_run_metric_phase", "Information on the duration of a specific metric in the Analysis Run", - append(namespaceNameLabels, "metric", "type", "phase"), + append(namespaceNameLabels, "metric", "type", "dryRun", "phase"), nil, ) ) @@ -132,7 +132,7 @@ var ( MetricAnalysisTemplateMetricInfo = prometheus.NewDesc( "analysis_template_metric_info", "Information on metrics in analysis templates.", - append(namespaceNameLabels, "type", "metric"), + append(namespaceNameLabels, "type", "metric", "dryRun"), nil, ) ) diff --git a/docs/features/analysis.md b/docs/features/analysis.md index 0c563591ad..a9936556e4 100644 --- a/docs/features/analysis.md +++ b/docs/features/analysis.md @@ -541,6 +541,53 @@ encountered. )) ``` +## Dry-Run Mode + +`dryRun` can be used on a metric to control whether or not to evaluate that metric in dry-run mode. A metric running +in dry-run mode won't impact the final state of the rollout or experiment even if it fails or the evaluation comes +out as inconclusive. + +The following example queries Prometheus every 5 minutes to get the total number of 4XX and 5XX errors; even if one +or both of them fail, the analysis run will pass. + +```yaml hl_lines="4" + metrics: + - name: total-5xx-errors + interval: 5m + dryRun: true + failureCondition: result[0] >= 10 + failureLimit: 3 + provider: + prometheus: + address: http://prometheus.example.com:9090 + query: | + sum(irate( + istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code=~"5.*"}[5m] + )) + - name: total-4xx-errors + interval: 5m + dryRun: true + failureCondition: result[0] >= 10 + failureLimit: 3 + provider: + prometheus: + address: http://prometheus.example.com:9090 + query: | + sum(irate( + istio_requests_total{reporter="source",destination_service=~"{{args.service-name}}",response_code=~"4.*"}[5m] + )) +``` + +If one or more metrics are running in dry-run mode, the summary of the dry-run results gets appended to the analysis +run message.
Assuming that the `total-4xx-errors` metric fails in the above example but the `total-5xx-errors` metric +succeeds, the final dry-run summary will look like this: + +```yaml hl_lines="1" +Message: Dry-Run Summary: Total=2, Successful=1, Failed=1, Inconclusive=0 +Metric Results: +... +``` + ## Inconclusive Runs Analysis runs can also be considered `Inconclusive`, which indicates the run was neither successful, diff --git a/manifests/crds/analysis-run-crd.yaml b/manifests/crds/analysis-run-crd.yaml index 9d612a9206..22bb325790 100644 --- a/manifests/crds/analysis-run-crd.yaml +++ b/manifests/crds/analysis-run-crd.yaml @@ -81,6 +81,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2628,6 +2630,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer diff --git a/manifests/crds/analysis-template-crd.yaml b/manifests/crds/analysis-template-crd.yaml index 516a5bf5a8..21e9c4ab69 100644 --- a/manifests/crds/analysis-template-crd.yaml +++ b/manifests/crds/analysis-template-crd.yaml @@ -77,6 +77,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/crds/cluster-analysis-template-crd.yaml b/manifests/crds/cluster-analysis-template-crd.yaml index b10ac4a166..c7219eb532 100644 --- a/manifests/crds/cluster-analysis-template-crd.yaml +++ b/manifests/crds/cluster-analysis-template-crd.yaml @@ -77,6 +77,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/install.yaml b/manifests/install.yaml index fb7aa00ee7..1986288727 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -82,6 +82,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2629,6 +2631,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer @@ -2772,6 +2776,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -5390,6 +5396,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index bfa060f41c..6f97d504b5 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -82,6 +82,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -2629,6 +2631,8 @@ spec: count: format: int32 type: integer + dryRun: + type: boolean error: format: int32 type: integer @@ -2772,6 +2776,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: @@ -5390,6 +5396,8 @@ spec: - type: integer - type: string x-kubernetes-int-or-string: true + dryRun: + type: boolean failureCondition: type: string failureLimit: diff --git a/pkg/apis/rollouts/v1alpha1/analysis_types.go b/pkg/apis/rollouts/v1alpha1/analysis_types.go index 65d54789c6..550fbbd735 100644 --- a/pkg/apis/rollouts/v1alpha1/analysis_types.go +++ b/pkg/apis/rollouts/v1alpha1/analysis_types.go @@ -106,6 +106,8 @@ type
Metric struct { ConsecutiveErrorLimit *intstrutil.IntOrString `json:"consecutiveErrorLimit,omitempty" protobuf:"bytes,9,opt,name=consecutiveErrorLimit"` // Provider configuration to the external system to use to verify the analysis Provider MetricProvider `json:"provider" protobuf:"bytes,10,opt,name=provider"` + // Whether to evaluate this metric in a Dry-Run mode + DryRun bool `json:"dryRun,omitempty" protobuf:"varint,11,opt,name=dryRun"` } // EffectiveCount is the effective count based on whether or not count/interval is specified @@ -343,6 +345,8 @@ type MetricResult struct { // ConsecutiveError is the number of times an error was encountered during measurement in succession // Resets to zero when non-errors are encountered ConsecutiveError int32 `json:"consecutiveError,omitempty" protobuf:"varint,10,opt,name=consecutiveError"` + // Whether this metric is running in a Dry-Run mode + DryRun bool `json:"dryRun,omitempty" protobuf:"varint,11,opt,name=dryRun"` } // Measurement is a point in time result value of a single metric, and the time it was measured diff --git a/pkg/apis/rollouts/v1alpha1/generated.pb.go b/pkg/apis/rollouts/v1alpha1/generated.pb.go index 4f0148edd0..69c16f1ab5 100644 --- a/pkg/apis/rollouts/v1alpha1/generated.pb.go +++ b/pkg/apis/rollouts/v1alpha1/generated.pb.go @@ -10090,6 +10090,7 @@ func (this *MetricResult) String() string { `Phase:` + fmt.Sprintf("%v", this.Phase) + `,`, `Measurements:` + repeatedStringForMeasurements + `,`, `Message:` + fmt.Sprintf("%v", this.Message) + `,`, + `DryRun:` + fmt.Sprintf("%v", this.DryRun) + `,`, `Count:` + fmt.Sprintf("%v", this.Count) + `,`, `Successful:` + fmt.Sprintf("%v", this.Successful) + `,`, `Failed:` + fmt.Sprintf("%v", this.Failed) + `,`, diff --git a/pkg/apis/rollouts/v1alpha1/generated.proto b/pkg/apis/rollouts/v1alpha1/generated.proto index a4dc739791..a7e2c6e71a 100644 --- a/pkg/apis/rollouts/v1alpha1/generated.proto +++ b/pkg/apis/rollouts/v1alpha1/generated.proto @@ -772,6 +772,9 @@ message Metric { // Provider configuration to the external system to use to verify the analysis optional MetricProvider provider = 10; + + // Whether to evaluate this metric in a Dry-Run mode + optional bool dryRun = 11; } // MetricProvider which external system to use to verify the analysis @@ -839,6 +842,9 @@ message MetricResult { // ConsecutiveError is the number of times an error was encountered during measurement in succession // Resets to zero when non-errors are encountered optional int32 consecutiveError = 10; + + // Whether this metric is running in a Dry-Run mode + optional bool dryRun = 11; } // NewRelicMetric defines the newrelic query to perform canary analysis diff --git a/pkg/apis/rollouts/v1alpha1/openapi_generated.go b/pkg/apis/rollouts/v1alpha1/openapi_generated.go index 11c0f0a1ca..3c06ebe192 100644 --- a/pkg/apis/rollouts/v1alpha1/openapi_generated.go +++ b/pkg/apis/rollouts/v1alpha1/openapi_generated.go @@ -2280,6 +2280,13 @@ func schema_pkg_apis_rollouts_v1alpha1_Metric(ref common.ReferenceCallback) comm Ref: ref("github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1.MetricProvider"), }, }, + "dryRun": { + SchemaProps: spec.SchemaProps{ + Description: "Whether to evaluate this metric in a Dry-Run mode", + Type: []string{"boolean"}, + Format: "", + }, + }, }, Required: []string{"name", "provider"}, }, @@ -2444,6 +2451,13 @@ func schema_pkg_apis_rollouts_v1alpha1_MetricResult(ref common.ReferenceCallback Format: "int32", }, }, + "dryRun": { + SchemaProps: spec.SchemaProps{ + Description: "Whether this
metric is running in a Dry-Run mode", + Type: []string{"boolean"}, + Format: "", + }, + }, }, Required: []string{"name", "phase"}, },
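As a quick way to see the new message composition in action, the sketch below re-implements the patch's summary logic as a standalone program. `dryRunStats` and `appendDryRunSummary` are renamed stand-ins (chosen for this sketch only) for the unexported `dryRunStatus` struct and `appendDryRunResults` helper above, and the controller's logging call is dropped so the snippet runs with nothing but the standard library:

```go
package main

import "fmt"

// dryRunStats mirrors the dryRunStatus struct added in analysis/analysis.go.
type dryRunStats struct {
	total        int32
	successful   int32
	failed       int32
	inconclusive int32
}

// appendDryRunSummary mirrors appendDryRunResults: when at least one metric
// ran in dry-run mode, a summary line is appended to (or, if the run has no
// other message, becomes) the AnalysisRun message.
func appendDryRunSummary(worstMessage string, s dryRunStats) string {
	if s.total == 0 {
		return worstMessage
	}
	summary := fmt.Sprintf("Dry-Run Summary: Total=%d, Successful=%d, Failed=%d, Inconclusive=%d",
		s.total, s.successful, s.failed, s.inconclusive)
	if worstMessage == "" {
		return summary
	}
	return fmt.Sprintf("%s; %s", worstMessage, summary)
}

func main() {
	// A terminated run with a single dry-run metric, matching the assertion
	// in TestTerminateAnalysisRunInDryRunMode.
	fmt.Println(appendDryRunSummary("Run Terminated", dryRunStats{total: 1}))
}
```

Running it prints `Run Terminated; Dry-Run Summary: Total=1, Successful=0, Failed=0, Inconclusive=0`, the exact message asserted in `TestTerminateAnalysisRunInDryRunMode`.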