From a5239bfe15acfa76346d6097a92f669a95ad27d5 Mon Sep 17 00:00:00 2001
From: Jesse Suen
Date: Wed, 14 Jul 2021 00:31:56 -0700
Subject: [PATCH] feat: ability to verify AWS TargetGroup after canary stable
 promotion

Signed-off-by: Jesse Suen
---
 docs/features/traffic-management/alb.md       | 113 ++++-
 experiments/replicaset.go                     |   2 +-
 ingress/alb.go                                |   2 +-
 manifests/install.yaml                        |  16 +-
 manifests/namespace-install.yaml              |  16 +-
 rollout/analysis.go                           |   5 +-
 rollout/analysis_test.go                      |   2 -
 rollout/bluegreen.go                          |  41 +-
 rollout/bluegreen_test.go                     |  44 +-
 rollout/canary.go                             |  63 ++-
 rollout/controller_test.go                    |  30 +-
 rollout/ephemeralmetadata_test.go             |   1 -
 rollout/replicaset.go                         |  38 +-
 rollout/replicaset_test.go                    |   2 +
 rollout/service.go                            |  53 ++-
 rollout/service_test.go                       | 432 ++++++++++++++++++
 rollout/sync.go                               |   9 -
 rollout/sync_test.go                          |   1 -
 rollout/trafficrouting.go                     |  15 +-
 rollout/trafficrouting/alb/alb.go             |  11 +-
 rollout/trafficrouting/alb/alb_test.go        |   9 +-
 .../ambassador/ambassador_test.go             |  11 +-
 rollout/trafficrouting/smi/smi_test.go        |  44 +-
 rollout/trafficrouting_test.go                |  23 +-
 test/e2e/aws_test.go                          |  19 +-
 test/e2e/canary_test.go                       |   1 +
 .../e2e/functional/alb-bluegreen-rollout.yaml |  71 +++
 ...b-rollout.yaml => alb-canary-rollout.yaml} |  34 +-
 test/fixtures/e2e_suite.go                    |   2 +
 test/fixtures/when.go                         |  21 +-
 test/util/util.go                             |  10 +-
 utils/aws/aws.go                              |  90 ++--
 utils/aws/aws_test.go                         | 236 +++++++++-
 utils/aws/mocks/ELBv2APIClient.go             |  30 ++
 utils/conditions/conditions.go                |  12 +
 utils/defaults/defaults_test.go               |  27 ++
 utils/istio/istio_test.go                     |   5 +-
 utils/rollout/rolloututil.go                  |  29 +-
 utils/rollout/rolloututil_test.go             |  35 ++
 39 files changed, 1376 insertions(+), 229 deletions(-)
 create mode 100644 test/e2e/functional/alb-bluegreen-rollout.yaml
 rename test/e2e/functional/{alb-rollout.yaml => alb-canary-rollout.yaml} (71%)

diff --git a/docs/features/traffic-management/alb.md b/docs/features/traffic-management/alb.md
index ef614573c7..7f9e8e031e 100644
--- a/docs/features/traffic-management/alb.md
+++ b/docs/features/traffic-management/alb.md
@@ -158,24 +158,107 @@ spec:
 ...
 ```
 
-### Weight verification
+### Zero-Downtime Updates with AWS TargetGroup Verification
+
+Argo Rollouts contains two features to help ensure zero-downtime updates when used with the AWS
+LoadBalancer controller: TargetGroup IP verification and TargetGroup weight verification. Both
+features involve the Rollout controller performing additional safety checks against AWS, to verify
+that the changes made to the Ingress and Service objects are reflected in the underlying AWS
+TargetGroups.
+
+#### TargetGroup IP Verification
+
+!!! note
+
+    Target Group IP verification available since Argo Rollouts v1.1
+
+The AWS LoadBalancer controller can run in one of two modes:
+* [Instance mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#instance-mode)
+* [IP mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#ip-mode)
+
+TargetGroup IP verification is only applicable when the AWS LoadBalancer controller is running in
+IP mode. When using the AWS LoadBalancer controller in IP mode (e.g. using the AWS CNI), the ALB
+LoadBalancer targets individual Pod IPs, as opposed to K8s node instances. Targeting Pod IPs comes
+with an increased risk of downtime during an update, because the Pod IPs behind the underlying AWS
+TargetGroup can more easily fall out of sync with the _actual_ availability and status of pods,
+causing HTTP 502 errors when the TargetGroup points to pods which have already been scaled down.
+
+To mitigate this risk, AWS recommends the use of
+[pod readiness gate injection](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/deploy/pod_readiness_gate/)
+when running the AWS LoadBalancer controller in IP mode. Readiness gates allow the AWS LoadBalancer
+controller to verify that TargetGroups are accurate before marking newly created Pods as "ready",
+preventing premature scale down of the older ReplicaSet.
+
+Pod readiness gate injection uses a mutating webhook which decides to inject readiness gates when a
+pod is created, based on the following conditions:
+* There exists a service matching the pod labels in the same namespace
+* There exists at least one target group binding that refers to the matching service
+
+Another way to describe this is: the AWS LoadBalancer controller injects readiness gates onto Pods
+only if they are "reachable" from an ALB Ingress at the time of pod creation. A pod is considered
+reachable if an (ALB) Ingress references a Service which matches the pod labels. All other Pods are
+ignored.
+
+One challenge with this manner of pod readiness gate injection is that modifications to the Service
+selector labels (`spec.selector`) give the AWS LoadBalancer controller no opportunity to inject the
+readiness gates, because by that time the Pods were already created (and readiness gates are
+immutable). Note that this is an issue when you change the Service selectors of _any_ ALB Service,
+not just ones involved in Argo Rollouts.
+
+Because Argo Rollouts' blue-green strategy works by modifying the activeService selector to the new
+ReplicaSet labels during promotion, it suffers from the issue where readiness gates for the
+`spec.strategy.blueGreen.activeService` fail to be injected. This means there is a possibility of
+downtime in the following problematic scenario during an update from V1 to V2:
+
+1. Update is triggered and the V2 ReplicaSet stack is scaled up
+2. V2 ReplicaSet pods become fully available and ready to be promoted
+3. Rollout promotes V2 by updating the label selectors of the active service to point to the V2 stack (from V1)
+4. Due to unknown issues (e.g. AWS load balancer controller downtime, AWS rate limiting), registration
+   of the V2 Pod IPs to the TargetGroup does not happen or is delayed
+5. V1 ReplicaSet is scaled down to complete the update
+
+After step 5, when the V1 ReplicaSet is scaled down, the outdated TargetGroup would still be pointing
+to the V1 Pod IPs which no longer exist, causing downtime.
+
+To allow for zero-downtime updates, Argo Rollouts has the ability to perform TargetGroup IP
+verification as an additional safety measure during an update. When this feature is enabled, whenever
+a service selector modification is made, the Rollout controller blocks progression of the update
+until it can verify the TargetGroup is accurately targeting the new Pod IPs of the
+`bluegreen.activeService`. Verification is achieved by querying AWS APIs to describe the underlying
+TargetGroup, iterating its registered IPs, and ensuring all Pod IPs of the activeService's
+`Endpoints` list are registered in the TargetGroup. Verification must succeed before running
+postPromotionAnalysis or scaling down the old ReplicaSet.
+
+Similarly, for the canary strategy, after updating the `canary.stableService` selector labels to
+point to the new ReplicaSet, the TargetGroup IP verification feature allows the controller to block
+the scale down of the old ReplicaSet until it verifies the Pod IPs behind the stableService
+TargetGroup are accurate.
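+
+The following is a minimal sketch of what this registration check amounts to, using the
+aws-sdk-go-v2 ELBv2 client. The function name and signature are illustrative, not the
+controller's actual implementation:
+
+```go
+import (
+	"context"
+	"fmt"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2"
+)
+
+// verifyTargetGroupRegistration reports whether every Pod IP from a Service's
+// Endpoints is registered as a target of the given TargetGroup.
+func verifyTargetGroupRegistration(ctx context.Context, client *elbv2.Client,
+	tgARN string, podIPs []string, port int32) (bool, error) {
+
+	out, err := client.DescribeTargetHealth(ctx, &elbv2.DescribeTargetHealthInput{
+		TargetGroupArn: aws.String(tgARN),
+	})
+	if err != nil {
+		return false, err
+	}
+	// collect the "ip:port" pairs currently registered in the TargetGroup
+	registered := make(map[string]bool)
+	for _, desc := range out.TargetHealthDescriptions {
+		t := desc.Target
+		if t != nil && t.Id != nil && t.Port != nil {
+			registered[fmt.Sprintf("%s:%d", *t.Id, *t.Port)] = true
+		}
+	}
+	// every Pod IP behind the Service must already be a registered target
+	for _, ip := range podIPs {
+		if !registered[fmt.Sprintf("%s:%d", ip, port)] {
+			return false, nil // not yet registered; re-verify on a later reconciliation
+		}
+	}
+	return true, nil
+}
+```
+
+When verification fails, the controller does not treat it as an error; it requeues the Rollout
+and re-checks on a later reconciliation, rather than proceeding with the scale down.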
+ +#### TargetGroup Weight Verification !!! note - Since Argo Rollouts v1.0 + TargetGroup weight verification available since Argo Rollouts v1.0 + +TargetGroup weight verification addresses a similar problem to TargetGroup IP verification, but +instead of verifying that the Pod IPs of a service are reflected accurately in the TargetGroup, the +controller verifies that the traffic *_weights_* are accurate from what was set in the ingress +annotations. Weight verification is applicable to AWS LoadBalancer controllers which are running +either in IP mode or Instance mode. + +After Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it moves on to the +next step. However, due to external factors (e.g. AWS rate limiting, AWS load balancer controller +downtime) it is possible that the weight modifications made to the Ingress, did not take effect in +the underlying TargetGroup. This is potentially dangerous as the controller will believe it is safe +to scale down the old stable stack when in reality, the outdated TargetGroup may still be pointing +to it. + +Using the TargetGroup weight verification feature, the rollout controller will additionally *verify* +the canary weight after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer +APIs directly, to confirm that the Rules, Actions, and TargetGroups reflect the desire of Ingress +annotation. -When Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it assumes that -the new weight immediately takes effect and moves on to the next step. However, due to external -factors (e.g. AWS rate limiting, AWS load balancer controller downtime) it is possible that the -ingress modification may take a long time to take effect (or possibly never even made). This is -potentially dangerous when the rollout completes its steps, it will scale down the old stack. If -the ALB Rules/Actions were still directing traffic to the old stack (because the weights never took -effect), then this would cause downtime to the service when the old stack was scaled down. +#### Usage -To mitigate this, the rollout controller has a feature to additionally *verify* the canary weight -after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer APIs directly, -to confirm that the Rules, Actions, and TargetGroups reflect the desire of Ingress annotation. -To enable ALB weight verification, add `--alb-verify-weight` flag to the rollout-controller flags: +To enable AWS target group verification, add `--aws-verify-target-group` flag to the rollout-controller flags: ```yaml apiVersion: apps/v1 @@ -187,7 +270,8 @@ spec: spec: containers: - name: argo-rollouts - args: [--alb-verify-weight] + args: [--aws-verify-target-group] + # NOTE: in v1.0, the --alb-verify-weight flag should be used instead ``` For this feature to work, the argo-rollouts deployment requires the following AWS API permissions @@ -198,6 +282,7 @@ under the [Elastic Load Balancing API](https://docs.aws.amazon.com/elasticloadba * `DescribeListeners` * `DescribeRules` * `DescribeTags` +* `DescribeTargetHealth` There are various ways of granting AWS privileges to the argo-rollouts pods, which is highly dependent to your cluster's AWS environment, and out-of-scope of this documentation. 
Some solutions diff --git a/experiments/replicaset.go b/experiments/replicaset.go index 698d9cfe03..5d97eb15c1 100644 --- a/experiments/replicaset.go +++ b/experiments/replicaset.go @@ -231,7 +231,7 @@ func (ec *experimentContext) addScaleDownDelay(rs *appsv1.ReplicaSet) (bool, err patch := fmt.Sprintf(addScaleDownAtAnnotationsPatch, v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, deadline) _, err := ec.kubeclientset.AppsV1().ReplicaSets(rs.Namespace).Patch(ctx, rs.Name, patchtypes.JSONPatchType, []byte(patch), metav1.PatchOptions{}) if err == nil { - ec.log.Infof("Set '%s' annotation on '%s' to %s (%ds)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) + ec.log.Infof("Set '%s' annotation on '%s' to %s (%s)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) rsIsUpdated = true } return rsIsUpdated, err diff --git a/ingress/alb.go b/ingress/alb.go index 1fb9a33839..b3454d3d8d 100644 --- a/ingress/alb.go +++ b/ingress/alb.go @@ -40,7 +40,7 @@ func (c *Controller) syncALBIngress(ingress *extensionsv1beta1.Ingress, rollouts delete(managedActions, roName) resetALBAction, err := getResetALBActionStr(ingress, actionKey) if err != nil { - log.WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err) + log.WithField(logutil.RolloutKey, roName).WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err) return nil } newIngress.Annotations[actionKey] = resetALBAction diff --git a/manifests/install.yaml b/manifests/install.yaml index 9fe2b1648b..450bc75af7 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -12726,8 +12726,10 @@ rules: - patch - apiGroups: - getambassador.io + - x.getambassador.io resources: - mappings + - ambassadormappings verbs: - create - watch @@ -12736,16 +12738,18 @@ rules: - list - delete - apiGroups: - - x.getambassador.io + - "" resources: - - ambassadormappings + - endpoints verbs: - - create - - watch - get - - update +- apiGroups: + - elbv2.k8s.aws + resources: + - targetgroupbindings + verbs: - list - - delete + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index c971be8405..a0d92de442 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -12726,8 +12726,10 @@ rules: - patch - apiGroups: - getambassador.io + - x.getambassador.io resources: - mappings + - ambassadormappings verbs: - create - watch @@ -12736,16 +12738,18 @@ rules: - list - delete - apiGroups: - - x.getambassador.io + - "" resources: - - ambassadormappings + - endpoints verbs: - - create - - watch - get - - update +- apiGroups: + - elbv2.k8s.aws + resources: + - targetgroupbindings + verbs: - list - - delete + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/rollout/analysis.go b/rollout/analysis.go index bb132941d5..e495b07656 100644 --- a/rollout/analysis.go +++ b/rollout/analysis.go @@ -21,6 +21,7 @@ import ( logutil "github.com/argoproj/argo-rollouts/utils/log" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" + rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" ) const ( @@ -176,7 +177,7 @@ func needsNewAnalysisRun(currentAr *v1alpha1.AnalysisRun, rollout *v1alpha1.Roll // emitAnalysisRunStatusChanges emits a Kubernetes event if the 
analysis run of that type has changed status func (c *rolloutContext) emitAnalysisRunStatusChanges(prevStatus *v1alpha1.RolloutAnalysisRunStatus, ar *v1alpha1.AnalysisRun, arType string) { - if ar != nil { + if ar != nil && ar.Status.Phase != "" { if prevStatus == nil || prevStatus.Name == ar.Name && prevStatus.Status != ar.Status.Phase { prevStatusStr := "NoPreviousStatus" if prevStatus != nil { @@ -318,7 +319,7 @@ func (c *rolloutContext) reconcileBackgroundAnalysisRun() (*v1alpha1.AnalysisRun } // Do not create a background run if the rollout is completely rolled out, just created, before the starting step - if c.rollout.Status.StableRS == c.rollout.Status.CurrentPodHash || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) { + if rolloututil.IsFullyPromoted(c.rollout) || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) { return nil, nil } diff --git a/rollout/analysis_test.go b/rollout/analysis_test.go index e32aa5e982..2678c26b1a 100644 --- a/rollout/analysis_test.go +++ b/rollout/analysis_test.go @@ -1847,7 +1847,6 @@ func TestRolloutPrePromotionAnalysisSwitchServiceAfterSuccess(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutActionWithPatch(r2, OnlyObservedGenerationPatch) f.run(getKey(r2, t)) patch := f.getPatchedRolloutWithoutConditions(patchIndex) @@ -1915,7 +1914,6 @@ func TestRolloutPrePromotionAnalysisHonorAutoPromotionSeconds(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutActionWithPatch(r2, OnlyObservedGenerationPatch) f.run(getKey(r2, t)) patch := f.getPatchedRolloutWithoutConditions(patchIndex) diff --git a/rollout/bluegreen.go b/rollout/bluegreen.go index 672a52ee17..ef6df170ee 100644 --- a/rollout/bluegreen.go +++ b/rollout/bluegreen.go @@ -224,16 +224,23 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name) continue } - + if *targetRS.Spec.Replicas == 0 { + // cannot scale down this ReplicaSet. + continue + } var desiredReplicaCount int32 - annotationedRSs, desiredReplicaCount = c.ScaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + var err error + annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + if err != nil { + return false, err + } - if *(targetRS.Spec.Replicas) == desiredReplicaCount { - // at desired account + if *targetRS.Spec.Replicas == desiredReplicaCount { + // already at desired account, nothing to do continue } // Scale down. 
- _, _, err := c.scaleReplicaSetAndRecordEvent(targetRS, desiredReplicaCount) + _, _, err = c.scaleReplicaSetAndRecordEvent(targetRS, desiredReplicaCount) if err != nil { return false, err } @@ -243,30 +250,6 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re return hasScaled, nil } -func (c *rolloutContext) ScaleDownDelayHelper(rs *appsv1.ReplicaSet, annotationedRSs int32, rolloutReplicas int32) (int32, int32) { - desiredReplicaCount := int32(0) - scaleDownRevisionLimit := GetScaleDownRevisionLimit(c.rollout) - if replicasetutil.HasScaleDownDeadline(rs) { - annotationedRSs++ - if annotationedRSs > scaleDownRevisionLimit { - c.log.Infof("At ScaleDownDelayRevisionLimit (%d) and scaling down the rest", scaleDownRevisionLimit) - } else { - remainingTime, err := replicasetutil.GetTimeRemainingBeforeScaleDownDeadline(rs) - if err != nil { - c.log.Warnf("%v", err) - } else if remainingTime != nil { - c.log.Infof("RS '%s' has not reached the scaleDownTime", rs.Name) - if *remainingTime < c.resyncPeriod { - c.enqueueRolloutAfter(c.rollout, *remainingTime) - } - desiredReplicaCount = rolloutReplicas - } - } - } - - return annotationedRSs, desiredReplicaCount -} - func GetScaleDownRevisionLimit(ro *v1alpha1.Rollout) int32 { if ro.Spec.Strategy.BlueGreen != nil { if ro.Spec.Strategy.BlueGreen.ScaleDownDelayRevisionLimit != nil { diff --git a/rollout/bluegreen_test.go b/rollout/bluegreen_test.go index a016e8ec89..a83b9071c2 100644 --- a/rollout/bluegreen_test.go +++ b/rollout/bluegreen_test.go @@ -500,7 +500,6 @@ func TestBlueGreenHandlePause(t *testing.T) { }` expectedPatch := calculatePatch(r2, fmt.Sprintf(expectedPatchWithoutSubs, rs2PodHash, rs2PodHash, rs2PodHash)) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchRolloutIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) @@ -580,7 +579,6 @@ func TestBlueGreenHandlePause(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) servicePatchIndex := f.expectPatchServiceAction(activeSvc, rs2PodHash) - patchedRSIndex := f.expectPatchReplicaSetAction(rs1) generatedConditions := generateConditionsPatch(true, conditions.ReplicaSetUpdatedReason, rs2, true, "") newSelector := metav1.FormatLabelSelector(rs2.Spec.Selector) @@ -600,7 +598,6 @@ func TestBlueGreenHandlePause(t *testing.T) { patchIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) f.verifyPatchedService(servicePatchIndex, rs2PodHash, "") - f.verifyPatchedReplicaSet(patchedRSIndex, 10) rolloutPatch := f.getPatchedRollout(patchIndex) assert.Equal(t, expectedPatch, rolloutPatch) @@ -730,13 +727,11 @@ func TestBlueGreenHandlePause(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc, previewSvc) servicePatchIndex := f.expectPatchServiceAction(activeSvc, rs2PodHash) - patchedRSIndex := f.expectPatchReplicaSetAction(rs1) unpausePatchIndex := f.expectPatchRolloutAction(r2) patchRolloutIndex := f.expectPatchRolloutAction(r2) f.run(getKey(r2, t)) f.verifyPatchedService(servicePatchIndex, rs2PodHash, "") - f.verifyPatchedReplicaSet(patchedRSIndex, 10) unpausePatch := f.getPatchedRollout(unpausePatchIndex) unpauseConditions := generateConditionsPatch(true, conditions.RolloutResumedReason, rs2, true, "") expectedUnpausePatch := `{ @@ -791,10 +786,8 @@ func TestBlueGreenAddScaleDownDelayToPreviousActiveReplicaSet(t *testing.T) { f.serviceLister = append(f.serviceLister, s) f.expectPatchServiceAction(s, rs2PodHash) - patchedRSIndex := 
f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutAction(r2) f.run(getKey(r2, t)) - f.verifyPatchedReplicaSet(patchedRSIndex, 10) patch := f.getPatchedRollout(patchIndex) expectedPatchWithoutSubs := `{ @@ -997,7 +990,6 @@ func TestPreviewReplicaCountHandleScaleUpPreviewCheckPoint(t *testing.T) { f.kubeobjects = append(f.kubeobjects, activeSvc) f.serviceLister = append(f.serviceLister, activeSvc) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutAction(r1) f.run(getKey(r2, t)) @@ -1452,10 +1444,44 @@ func TestBlueGreenHandlePauseAutoPromoteWithConditions(t *testing.T) { assert.Nil(t, err) expectedPatch := calculatePatch(r2, fmt.Sprintf(expectedPatchWithoutSubs, rs2PodHash, string(availableCondBytes), string(pausedCondBytes), string(progressingCondBytes), rs2PodHash, rs2PodHash)) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchRolloutIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) rolloutPatch := f.getPatchedRollout(patchRolloutIndex) assert.Equal(t, expectedPatch, rolloutPatch) } + +// Verifies with blue-green, we add a scaledown delay to the old ReplicaSet after promoting desired +// ReplicaSet to stable. +// NOTE: As of v1.1, scale down delays are added to ReplicaSets on *subsequent* reconciliations +// after the desired RS has been promoted to stable +func TestBlueGreenAddScaleDownDelay(t *testing.T) { + f := newFixture(t) + defer f.Close() + + r1 := newBlueGreenRollout("foo", 1, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 1, 1) + rs2 := newReplicaSetWithStatus(r2, 1, 1) + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs2PodHash, 1, 1, 2, 1, false, true) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + activeSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} + activeSvc := newService("active", 80, activeSelector, r2) + + f.kubeobjects = append(f.kubeobjects, rs1, rs2, activeSvc) + f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2) + + rs1Patch := f.expectPatchReplicaSetAction(rs1) // set scale-down-deadline annotation + f.run(getKey(r2, t)) + + f.verifyPatchedReplicaSet(rs1Patch, 30) +} diff --git a/rollout/canary.go b/rollout/canary.go index 6f3062b004..11813ed44b 100644 --- a/rollout/canary.go +++ b/rollout/canary.go @@ -157,6 +157,10 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli sort.Sort(sort.Reverse(replicasetutil.ReplicaSetsByRevisionNumber(oldRSs))) + if canProceed, err := c.canProceedWithScaleDownAnnotation(oldRSs); !canProceed || err != nil { + return 0, err + } + annotationedRSs := int32(0) rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas) for _, targetRS := range oldRSs { @@ -169,7 +173,7 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli if maxScaleDown <= 0 { break } - if *(targetRS.Spec.Replicas) == 0 { + if *targetRS.Spec.Replicas == 0 { // cannot scale down this ReplicaSet. 
continue } @@ -177,15 +181,18 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli if c.rollout.Spec.Strategy.Canary.TrafficRouting == nil { // For basic canary, we must scale down all other ReplicaSets because existence of // those pods will cause traffic to be served by them - if *(targetRS.Spec.Replicas) > maxScaleDown { - desiredReplicaCount = *(targetRS.Spec.Replicas) - maxScaleDown + if *targetRS.Spec.Replicas > maxScaleDown { + desiredReplicaCount = *targetRS.Spec.Replicas - maxScaleDown } } else { // For traffic shaped canary, we leave the old ReplicaSets up until scaleDownDelaySeconds - annotationedRSs, desiredReplicaCount = c.ScaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + if err != nil { + return totalScaledDown, err + } } - if *(targetRS.Spec.Replicas) == desiredReplicaCount { - // at desired account + if *targetRS.Spec.Replicas == desiredReplicaCount { + // already at desired account, nothing to do continue } // Scale down. @@ -201,6 +208,50 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli return totalScaledDown, nil } +// canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating +// old replicasets with the scale-down-deadline in the traffic-routed canary strategy. +// This method only matters with ALB canary + the target group verification feature. +// The safety guarantees we provide are that we will not scale down *anything* unless we can verify +// stable target group endpoints are registered properly. +// NOTE: this method was written in a way which avoids AWS API calls. +func (c *rolloutContext) canProceedWithScaleDownAnnotation(oldRSs []*appsv1.ReplicaSet) (bool, error) { + isALBCanary := c.rollout.Spec.Strategy.Canary != nil && c.rollout.Spec.Strategy.Canary.TrafficRouting != nil && c.rollout.Spec.Strategy.Canary.TrafficRouting.ALB != nil + if !isALBCanary { + // Only ALB + return true, nil + } + + needToVerifyTargetGroups := false + for _, targetRS := range oldRSs { + if *targetRS.Spec.Replicas > 0 && !replicasetutil.HasScaleDownDeadline(targetRS) { + // We encountered an old ReplicaSet that is not yet scaled down, and is not annotated + // We only verify target groups if there is something to scale down. + needToVerifyTargetGroups = true + break + } + } + if !needToVerifyTargetGroups { + // All ReplicaSets are either scaled down, or have a scale-down-deadline annotation. + // The presence of the scale-down-deadline on all oldRSs, implies we can proceed with + // scale down, because we only add that annotation when target groups have been verified. + // Therefore, we return true to avoid performing verification again and making unnecessary + // AWS API calls. 
+ return true, nil + } + stableSvc, err := c.servicesLister.Services(c.rollout.Namespace).Get(c.rollout.Spec.Strategy.Canary.StableService) + if err != nil { + return false, err + } + err = c.awsVerifyTargetGroups(stableSvc) + if err != nil { + return false, err + } + + canProceed := c.areTargetsVerified() + c.log.Infof("Proceed with scaledown: %v", canProceed) + return canProceed, nil +} + func (c *rolloutContext) completedCurrentCanaryStep() bool { if c.rollout.Spec.Paused { return false diff --git a/rollout/controller_test.go b/rollout/controller_test.go index 8f9b7d346c..8fa0bd56cb 100644 --- a/rollout/controller_test.go +++ b/rollout/controller_test.go @@ -10,8 +10,6 @@ import ( "testing" "time" - "github.com/argoproj/argo-rollouts/utils/queue" - "github.com/ghodss/yaml" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" @@ -51,6 +49,7 @@ import ( "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" istioutil "github.com/argoproj/argo-rollouts/utils/istio" + "github.com/argoproj/argo-rollouts/utils/queue" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" @@ -102,6 +101,8 @@ type fixture struct { enqueuedObjects map[string]int unfreezeTime func() error + // events holds all the K8s Event Reasons emitted during the run + events []string fakeTrafficRouting *mocks.TrafficRoutingReconciler } @@ -480,7 +481,17 @@ func (f *fixture) newController(resync resyncFunc) (*Controller, informers.Share k8sI := kubeinformers.NewSharedInformerFactory(f.kubeclient, resync()) // Pass in objects to to dynamicClient - dynamicClient := dynamicfake.NewSimpleDynamicClient(runtime.NewScheme()) + scheme := runtime.NewScheme() + v1alpha1.AddToScheme(scheme) + tgbGVR := schema.GroupVersionResource{ + Group: "elbv2.k8s.aws", + Version: "v1beta1", + Resource: "targetgroupbindings", + } + listMapping := map[schema.GroupVersionResource]string{ + tgbGVR: "TargetGroupBindingList", + } + dynamicClient := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, listMapping, f.objects...) 
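+	// The explicit list kind mapping above lets the fake dynamic client serve
+	// List calls for the non-core TargetGroupBinding resource, whose list kind
+	// it cannot infer from the scheme on its own.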
dynamicInformerFactory := dynamicinformer.NewDynamicSharedInformerFactory(dynamicClient, 0) istioVirtualServiceInformer := dynamicInformerFactory.ForResource(istioutil.GetIstioVirtualServiceGVR()).Informer() istioDestinationRuleInformer := dynamicInformerFactory.ForResource(istioutil.GetIstioDestinationRuleGVR()).Informer() @@ -632,6 +643,8 @@ func (f *fixture) runController(rolloutName string, startInformers bool, expectE if len(f.kubeactions) > len(k8sActions) { f.t.Errorf("%d expected actions did not happen:%+v", len(f.kubeactions)-len(k8sActions), f.kubeactions[len(k8sActions):]) } + fakeRecorder := c.recorder.(*record.FakeEventRecorder) + f.events = fakeRecorder.Events return c } @@ -821,6 +834,12 @@ func (f *fixture) expectPatchRolloutActionWithPatch(rollout *v1alpha1.Rollout, p return len } +func (f *fixture) expectGetEndpointsAction(ep *corev1.Endpoints) int { + len := len(f.kubeactions) + f.kubeactions = append(f.kubeactions, core.NewGetAction(schema.GroupVersionResource{Resource: "endpoints"}, ep.Namespace, ep.Name)) + return len +} + func (f *fixture) getCreatedReplicaSet(index int) *appsv1.ReplicaSet { action := filterInformerActions(f.kubeclient.Actions())[index] createAction, ok := action.(core.CreateAction) @@ -1072,6 +1091,11 @@ func (f *fixture) getUpdatedPod(index int) *corev1.Pod { return pod } +func (f *fixture) assertEvents(events []string) { + f.t.Helper() + assert.Equal(f.t, events, f.events) +} + func TestDontSyncRolloutsWithEmptyPodSelector(t *testing.T) { f := newFixture(t) defer f.Close() diff --git a/rollout/ephemeralmetadata_test.go b/rollout/ephemeralmetadata_test.go index a68e4f39d2..24f76d6bac 100644 --- a/rollout/ephemeralmetadata_test.go +++ b/rollout/ephemeralmetadata_test.go @@ -215,7 +215,6 @@ func TestSyncBlueGreenEphemeralMetadataSecondRevision(t *testing.T) { f.expectListPodAction(r1.Namespace) // list pods to patch ephemeral data on revision 1 ReplicaSets pods` podIdx := f.expectUpdatePodAction(&pod) // Update pod with ephemeral data rs1idx := f.expectUpdateReplicaSetAction(rs1) // update stable replicaset with stable metadata - f.expectPatchReplicaSetAction(rs1) // scale revision 1 ReplicaSet down f.expectPatchRolloutAction(r2) // Patch Rollout status f.run(getKey(r2, t)) diff --git a/rollout/replicaset.go b/rollout/replicaset.go index 32e63a589e..e2ff4ebaa3 100644 --- a/rollout/replicaset.go +++ b/rollout/replicaset.go @@ -57,7 +57,7 @@ func (c *rolloutContext) addScaleDownDelay(rs *appsv1.ReplicaSet, scaleDownDelay patch := fmt.Sprintf(addScaleDownAtAnnotationsPatch, v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, deadline) _, err := c.kubeclientset.AppsV1().ReplicaSets(rs.Namespace).Patch(ctx, rs.Name, patchtypes.JSONPatchType, []byte(patch), metav1.PatchOptions{}) if err == nil { - c.log.Infof("Set '%s' annotation on '%s' to %s (%ds)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) + c.log.Infof("Set '%s' annotation on '%s' to %s (%s)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) } return err } @@ -230,3 +230,39 @@ func (c *rolloutContext) cleanupUnhealthyReplicas(oldRSs []*appsv1.ReplicaSet) ( } return oldRSs, totalScaledDown, nil } + +func (c *rolloutContext) scaleDownDelayHelper(rs *appsv1.ReplicaSet, annotationedRSs int32, rolloutReplicas int32) (int32, int32, error) { + desiredReplicaCount := int32(0) + scaleDownRevisionLimit := GetScaleDownRevisionLimit(c.rollout) + if !replicasetutil.HasScaleDownDeadline(rs) && 
*rs.Spec.Replicas > 0 { + // This ReplicaSet is scaled up but does not have a scale down deadline. Add one. + if annotationedRSs < scaleDownRevisionLimit { + annotationedRSs++ + desiredReplicaCount = *rs.Spec.Replicas + scaleDownDelaySeconds := defaults.GetScaleDownDelaySecondsOrDefault(c.rollout) + err := c.addScaleDownDelay(rs, scaleDownDelaySeconds) + if err != nil { + return annotationedRSs, desiredReplicaCount, err + } + c.enqueueRolloutAfter(c.rollout, scaleDownDelaySeconds) + } + } else if replicasetutil.HasScaleDownDeadline(rs) { + annotationedRSs++ + if annotationedRSs > scaleDownRevisionLimit { + c.log.Infof("At ScaleDownDelayRevisionLimit (%d) and scaling down the rest", scaleDownRevisionLimit) + } else { + remainingTime, err := replicasetutil.GetTimeRemainingBeforeScaleDownDeadline(rs) + if err != nil { + c.log.Warnf("%v", err) + } else if remainingTime != nil { + c.log.Infof("RS '%s' has not reached the scaleDownTime", rs.Name) + if *remainingTime < c.resyncPeriod { + c.enqueueRolloutAfter(c.rollout, *remainingTime) + } + desiredReplicaCount = rolloutReplicas + } + } + } + + return annotationedRSs, desiredReplicaCount, nil +} diff --git a/rollout/replicaset_test.go b/rollout/replicaset_test.go index 5bf7ac86ed..481454d843 100644 --- a/rollout/replicaset_test.go +++ b/rollout/replicaset_test.go @@ -10,6 +10,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sfake "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" + "k8s.io/utils/pointer" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/fake" @@ -307,6 +308,7 @@ func TestReconcileOldReplicaSet(t *testing.T) { oldRS.Annotations = map[string]string{annotations.DesiredReplicasAnnotation: strconv.Itoa(test.oldReplicas)} oldRS.Status.AvailableReplicas = int32(test.readyPodsFromOldRS) rollout := newBlueGreenRollout("foo", test.rolloutReplicas, nil, "", "") + rollout.Spec.Strategy.BlueGreen.ScaleDownDelayRevisionLimit = pointer.Int32Ptr(0) rollout.Spec.Selector = &metav1.LabelSelector{MatchLabels: newSelector} f := newFixture(t) defer f.Close() diff --git a/rollout/service.go b/rollout/service.go index 4ce7878c85..12f285b782 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -14,7 +14,9 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/annotations" "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" + logutil "github.com/argoproj/argo-rollouts/utils/log" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" serviceutil "github.com/argoproj/argo-rollouts/utils/service" @@ -115,14 +117,17 @@ func (c *rolloutContext) areTargetsVerified() bool { return c.targetsVerified == nil || *c.targetsVerified } -// awsVerifyTargetGroups examines a Service and verifies that underlying AWS TargetGroup is properly -// targetting the Service's Endpoint IPs and port. Only valid for services which are reachable +// awsVerifyTargetGroups examines a Service and verifies that the underlying AWS TargetGroup has all +// of the Service's Endpoint IPs and ports registered. 
Only valid for services which are reachable // by an ALB Ingress, which can be determined if there exists a TargetGroupBinding object in the // namespace that references the given service func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { if !c.shouldVerifyTargetGroup(svc) { return nil } + logCtx := c.log.WithField(logutil.ServiceKey, svc.Name) + logCtx.Infof("Verifying target group") + ctx := context.TODO() // find all TargetGroupBindings in the namespace which reference the service name + port tgBindings, err := aws.GetTargetGroupBindingsByService(ctx, c.dynamicclientset, *svc) @@ -134,7 +139,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { return nil } - c.targetsVerified = pointer.BoolPtr(true) + c.targetsVerified = pointer.BoolPtr(false) // get endpoints of service endpoints, err := c.kubeclientset.CoreV1().Endpoints(svc.Namespace).Get(ctx, svc.Name, metav1.GetOptions{}) @@ -148,30 +153,40 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { } for _, tgb := range tgBindings { - verified, err := aws.VerifyTargetGroupBinding(ctx, c.log, awsClnt, tgb, endpoints, svc) + verifyRes, err := aws.VerifyTargetGroupBinding(ctx, c.log, awsClnt, tgb, endpoints, svc) if err != nil { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, svc.Name, tgb.Spec.TargetGroupARN, err) return err } - if !verified { - c.targetsVerified = pointer.BoolPtr(false) + if verifyRes == nil { + // verification not applicable + continue + } + if !verifyRes.Verified { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupUnverifiedReason}, conditions.TargetGroupUnverifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered, verifyRes.EndpointsTotal) c.enqueueRolloutAfter(c.rollout, 10*time.Second) return nil } + c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered) } + c.targetsVerified = pointer.BoolPtr(true) return nil } // shouldVerifyTargetGroup returns whether or not we should verify the target group func (c *rolloutContext) shouldVerifyTargetGroup(svc *corev1.Service) bool { if !defaults.VerifyTargetGroup() { + // feature is disabled return false } desiredPodHash := c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] - if c.rollout.Status.StableRS == desiredPodHash { - // we are fully promoted - return false - } if c.rollout.Spec.Strategy.BlueGreen != nil { + if c.rollout.Status.StableRS == desiredPodHash { + // for blue-green, we only verify targets right after switching active service. So if + // we are fully promoted, then there is no need to verify targets. 
+ // NOTE: this is the opposite of canary, where we only verify targets if stable == desired + return false + } svcPodHash := svc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] if svcPodHash != desiredPodHash { // we have not yet switched service selector @@ -181,17 +196,23 @@ func (c *rolloutContext) shouldVerifyTargetGroup(svc *corev1.Service) bool { // we already started post-promotion analysis, so verification already occurred return false } + return true } else if c.rollout.Spec.Strategy.Canary != nil { if c.rollout.Spec.Strategy.Canary.TrafficRouting == nil || c.rollout.Spec.Strategy.Canary.TrafficRouting.ALB == nil { + // not ALB canary, so no need to verify targets return false } - // we don't support target group verification on ALB canary yet - return false - } else { - // should not get here - return false + if c.rollout.Status.StableRS != desiredPodHash { + // for canary, we only verify targets right after switching stable service, which happens + // after the update. So if stable != desired, we are still in the middle of an update + // and there is no need to verify targets. + // NOTE: this is the opposite of blue-green, where we only verify targets if stable != active + return false + } + return true } - return true + // should not get here + return false } func (c *rolloutContext) getPreviewAndActiveServices() (*corev1.Service, *corev1.Service, error) { diff --git a/rollout/service_test.go b/rollout/service_test.go index b4a4d5f059..b3d7f63c87 100644 --- a/rollout/service_test.go +++ b/rollout/service_test.go @@ -2,17 +2,28 @@ package rollout import ( "fmt" + "strconv" "strings" "testing" + elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" + elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" corev1 "k8s.io/api/core/v1" + extensionsv1beta1 "k8s.io/api/extensions/v1beta1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/pointer" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/aws/mocks" "github.com/argoproj/argo-rollouts/utils/conditions" + "github.com/argoproj/argo-rollouts/utils/defaults" + unstructuredutil "github.com/argoproj/argo-rollouts/utils/unstructured" ) func newService(name string, port int, selector map[string]string, ro *v1alpha1.Rollout) *corev1.Service { @@ -162,3 +173,424 @@ func TestPreviewServiceNotFound(t *testing.T) { assert.Equal(t, calculatePatch(r, fmt.Sprintf(expectedPatch, pausedCondition, conditions.InvalidSpecReason, strings.ReplaceAll(errmsg, "\"", "\\\""))), patch) } + +func newEndpoints(name string, ips ...string) *corev1.Endpoints { + ep := corev1.Endpoints{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + Subsets: []corev1.EndpointSubset{ + { + Addresses: []corev1.EndpointAddress{}, + }, + }, + } + for _, ip := range ips { + address := corev1.EndpointAddress{ + IP: ip, + } + ep.Subsets[0].Addresses = append(ep.Subsets[0].Addresses, address) + } + return &ep +} + +func newTargetGroupBinding(name string) *unstructured.Unstructured { + return unstructuredutil.StrToUnstructuredUnsafe(fmt.Sprintf(` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: %s + namespace: default +spec: + serviceRef: + name: %s + port: 
80 + targetGroupARN: arn::1234 + targetType: ip +`, name, name)) +} + +func newIngress(name string, canary, stable *corev1.Service) *extensionsv1beta1.Ingress { + ingress := extensionsv1beta1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + Spec: extensionsv1beta1.IngressSpec{ + Rules: []extensionsv1beta1.IngressRule{ + { + Host: "fakehost.example.com", + IngressRuleValue: extensionsv1beta1.IngressRuleValue{ + HTTP: &extensionsv1beta1.HTTPIngressRuleValue{ + Paths: []extensionsv1beta1.HTTPIngressPath{ + { + Path: "/foo", + Backend: extensionsv1beta1.IngressBackend{ + ServiceName: "root", + ServicePort: intstr.FromString("use-annotations"), + }, + }, + }, + }, + }, + }, + }, + }, + } + return &ingress +} + +// TestBlueGreenAWSVerifyTargetGroupsNotYetReady verifies we don't proceed with setting stable with +// the blue-green strategy until target group verification is successful +func TestBlueGreenAWSVerifyTargetGroupsNotYetReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("active") + ep := newEndpoints("active", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newBlueGreenRollout("foo", 3, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + svc := newService("active", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs1PodHash, 3, 3, 6, 3, false, true) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, svc, ep) + f.serviceLister = append(f.serviceLister, svc) + + f.expectGetEndpointsAction(ep) + patchIndex := f.expectPatchRolloutAction(r2) // update status message + f.run(getKey(r2, t)) + + patch := f.getPatchedRollout(patchIndex) + expectedPatch := `{"status":{"message":"waiting for post-promotion verification to complete"}}` + assert.Equal(t, expectedPatch, 
patch) + f.assertEvents([]string{ + conditions.TargetGroupUnverifiedReason, + }) +} + +// TestBlueGreenAWSVerifyTargetGroupsReady verifies we proceed with setting stable with +// the blue-green strategy when target group verification is successful +func TestBlueGreenAWSVerifyTargetGroupsReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("active") + ep := newEndpoints("active", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newBlueGreenRollout("foo", 3, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + svc := newService("active", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs1PodHash, 3, 3, 6, 3, false, true) + r2.Status.Message = "waiting for post-promotion verification to complete" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, svc, ep) + f.serviceLister = append(f.serviceLister, svc) + + f.expectGetEndpointsAction(ep) + patchIndex := f.expectPatchRolloutAction(r2) // update status message + f.run(getKey(r2, t)) + + patch := f.getPatchedRollout(patchIndex) + expectedPatch := fmt.Sprintf(`{"status":{"message":null,"phase":"Healthy","stableRS":"%s"}}`, rs2PodHash) + assert.Equal(t, expectedPatch, patch) + f.assertEvents([]string{ + conditions.TargetGroupVerifiedReason, + conditions.RolloutCompletedReason, + }) +} + +// TestCanaryAWSVerifyTargetGroupsNotYetReady verifies we don't proceed with scale down of old +// ReplicaSets in the canary strategy until target group verification is successful +func TestCanaryAWSVerifyTargetGroupsNotYetReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + 
defer f.Close() + + tgb := newTargetGroupBinding("stable") + ep := newEndpoints("stable", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromString("25%"), intstr.FromString("25%")) + r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + ALB: &v1alpha1.ALBTrafficRouting{ + Ingress: "ingress", + RootService: "root", + }, + } + r1.Spec.Strategy.Canary.CanaryService = "canary" + r1.Spec.Strategy.Canary.StableService = "stable" + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + rootSvc := newService("root", 80, map[string]string{"app": "foo"}, nil) + stableSvc := newService("canary", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + canarySvc := newService("stable", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + ing := newIngress("ingress", canarySvc, stableSvc) + + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 6, 3, 6, false) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.StableRS = rs2PodHash + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) + completedCondition, _ := newCompletedCondition(false) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, ing, rootSvc, canarySvc, stableSvc, ep) + f.serviceLister = append(f.serviceLister, rootSvc, canarySvc, stableSvc) + + f.expectGetEndpointsAction(ep) + f.run(getKey(r2, t)) + f.assertEvents([]string{ + conditions.TargetGroupUnverifiedReason, + }) +} + +// TestCanaryAWSVerifyTargetGroupsReady verifies we proceed with scale down of old +// ReplicaSets in the canary strategy after target group verification is successful +func TestCanaryAWSVerifyTargetGroupsReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("stable") + ep := newEndpoints("stable", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: 
&elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(80), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromString("25%"), intstr.FromString("25%")) + r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + ALB: &v1alpha1.ALBTrafficRouting{ + Ingress: "ingress", + RootService: "root", + }, + } + r1.Spec.Strategy.Canary.CanaryService = "canary" + r1.Spec.Strategy.Canary.StableService = "stable" + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + rootSvc := newService("root", 80, map[string]string{"app": "foo"}, nil) + stableSvc := newService("canary", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + canarySvc := newService("stable", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + ing := newIngress("ingress", canarySvc, stableSvc) + + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 6, 3, 6, false) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.StableRS = rs2PodHash + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) + completedCondition, _ := newCompletedCondition(false) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, ing, rootSvc, canarySvc, stableSvc, ep) + f.serviceLister = append(f.serviceLister, rootSvc, canarySvc, stableSvc) + + f.expectGetEndpointsAction(ep) + scaleDownRSIndex := f.expectPatchReplicaSetAction(rs1) + f.run(getKey(r2, t)) + f.verifyPatchedReplicaSet(scaleDownRSIndex, 30) + f.assertEvents([]string{ + conditions.TargetGroupVerifiedReason, + }) + +} diff --git a/rollout/sync.go b/rollout/sync.go index 747c82eb09..e54f3375e6 100644 --- a/rollout/sync.go +++ b/rollout/sync.go @@ -936,15 +936,6 @@ func (c *rolloutContext) promoteStable(newStatus *v1alpha1.RolloutStatus, reason revision, _ := replicasetutil.Revision(c.rollout) c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.RolloutCompletedReason}, conditions.RolloutCompletedMessage, revision, newStatus.CurrentPodHash, reason) - // Now that we've marked the desired RS as stable, start the scale-down countdown on the previous stable RS - previousStableRS, _ := replicasetutil.GetReplicaSetByTemplateHash(c.olderRSs, previousStableHash) - if replicasetutil.GetReplicaCountForReplicaSets([]*appsv1.ReplicaSet{previousStableRS}) > 0 { - scaleDownDelaySeconds := defaults.GetScaleDownDelaySecondsOrDefault(c.rollout) - err := c.addScaleDownDelay(previousStableRS, scaleDownDelaySeconds) - if err != nil { - return 
err - } - } } return nil } diff --git a/rollout/sync_test.go b/rollout/sync_test.go index ba3eb6f2c5..2687d55fce 100644 --- a/rollout/sync_test.go +++ b/rollout/sync_test.go @@ -365,7 +365,6 @@ func TestBlueGreenPromoteFull(t *testing.T) { f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) f.expectPatchServiceAction(activeSvc, rs2PodHash) // update active to rs2 - f.expectPatchReplicaSetAction(rs1) // set scaledown delay on rs1 patchRolloutIdx := f.expectPatchRolloutAction(r2) // update rollout status f.run(getKey(r2, t)) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index 27d6740727..4cffd358e5 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -13,6 +13,7 @@ import ( "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" + rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" ) // NewTrafficRoutingReconciler identifies return the TrafficRouting Plugin that the rollout wants to modify @@ -86,7 +87,7 @@ func (c *rolloutContext) reconcileTrafficRouting() error { currentStep, index := replicasetutil.GetCurrentCanaryStep(c.rollout) desiredWeight := int32(0) weightDestinations := make([]trafficrouting.WeightDestination, 0) - if c.rollout.Status.StableRS == c.rollout.Status.CurrentPodHash { + if rolloututil.IsFullyPromoted(c.rollout) { // when we are fully promoted. desired canary weight should be 0 } else if c.pauseContext.IsAborted() { // when promote aborted. desired canary weight should be 0 @@ -142,10 +143,14 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return err } - // If we are at a setWeight step, also perform weight verification. Note that we don't do this - // every reconciliation because weight verification typically involves API calls to the cloud - // provider which could incur rate limiting - if currentStep != nil && currentStep.SetWeight != nil { + // If we are in the middle of an update at a setWeight step, also perform weight verification. 
+ // Note that we don't do this every reconciliation because weight verification typically involves + // API calls to the cloud provider which could incur rate limiting + shouldVerifyWeight := c.rollout.Status.StableRS != "" && + c.rollout.Status.CurrentPodHash != c.rollout.Status.StableRS && + currentStep != nil && currentStep.SetWeight != nil + + if shouldVerifyWeight { weightVerified, err := reconciler.VerifyWeight(desiredWeight) if err != nil { return err diff --git a/rollout/trafficrouting/alb/alb.go b/rollout/trafficrouting/alb/alb.go index 32152f62e1..8f660c3fe9 100644 --- a/rollout/trafficrouting/alb/alb.go +++ b/rollout/trafficrouting/alb/alb.go @@ -17,6 +17,7 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/argoproj/argo-rollouts/utils/diff" ingressutil "github.com/argoproj/argo-rollouts/utils/ingress" @@ -137,6 +138,7 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { } lb, err := r.aws.FindLoadBalancerByDNSName(ctx, lbIngress.Hostname) if err != nil { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, canaryService, "unknown", err.Error()) return false, err } if lb == nil || lb.LoadBalancerArn == nil { @@ -145,6 +147,7 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { } lbTargetGroups, err := r.aws.GetTargetGroupMetadata(ctx, *lb.LoadBalancerArn) if err != nil { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, canaryService, "unknown", err.Error()) return false, err } logCtx := r.log.WithField("lb", *lb.LoadBalancerArn) @@ -153,7 +156,13 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { if tg.Weight != nil { logCtx := logCtx.WithField("tg", *tg.TargetGroupArn) logCtx.Infof("canary weight of %s (desired: %d, current: %d)", resourceID, desiredWeight, *tg.Weight) - return *tg.Weight == desiredWeight, nil + verified := *tg.Weight == desiredWeight + if verified { + r.cfg.Recorder.Eventf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedWeightsMessage, canaryService, *tg.TargetGroupArn, desiredWeight) + } else { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupUnverifiedReason}, conditions.TargetGroupUnverifiedWeightsMessage, canaryService, *tg.TargetGroupArn, desiredWeight, *tg.Weight) + } + return verified, nil } } } diff --git a/rollout/trafficrouting/alb/alb_test.go b/rollout/trafficrouting/alb/alb_test.go index 949a73df38..c8146e6b45 100644 --- a/rollout/trafficrouting/alb/alb_test.go +++ b/rollout/trafficrouting/alb/alb_test.go @@ -271,8 +271,9 @@ func TestErrorPatching(t *testing.T) { } type fakeAWSClient struct { - targetGroups []aws.TargetGroupMeta - loadBalancer *elbv2types.LoadBalancer + targetGroups []aws.TargetGroupMeta + loadBalancer *elbv2types.LoadBalancer + targetHealthDescriptions []elbv2types.TargetHealthDescription } func (f *fakeAWSClient) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]aws.TargetGroupMeta, error) { @@ -283,6 +284,10 @@ func (f *fakeAWSClient) FindLoadBalancerByDNSName(ctx context.Context, dnsName s return 
f.loadBalancer, nil } +func (f *fakeAWSClient) GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { + return f.targetHealthDescriptions, nil +} + func TestVerifyWeight(t *testing.T) { newFakeReconciler := func() (*Reconciler, *fakeAWSClient) { ro := fakeRollout("stable-svc", "canary-svc", "ingress", 443) diff --git a/rollout/trafficrouting/ambassador/ambassador_test.go b/rollout/trafficrouting/ambassador/ambassador_test.go index 4777d109a5..39bebf4d4a 100644 --- a/rollout/trafficrouting/ambassador/ambassador_test.go +++ b/rollout/trafficrouting/ambassador/ambassador_test.go @@ -17,6 +17,7 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting/ambassador" + "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/argoproj/argo-rollouts/utils/record" ) @@ -544,7 +545,7 @@ func TestGetMappingGVR(t *testing.T) { }) t.Run("will get gvr successfully", func(t *testing.T) { // given - ambassador.SetAPIVersion("v1") + defaults.SetAmbassadorAPIVersion("v1") // when gvr := ambassador.GetMappingGVR() @@ -557,7 +558,7 @@ func TestGetMappingGVR(t *testing.T) { t.Run("will get valid gvr even if apiVersion has the wrong domain", func(t *testing.T) { // given apiVersion := "invalid.com/v1alpha1" - ambassador.SetAPIVersion(apiVersion) + defaults.SetAmbassadorAPIVersion(apiVersion) // when gvr := ambassador.GetMappingGVR() @@ -566,12 +567,12 @@ func TestGetMappingGVR(t *testing.T) { assert.Equal(t, "invalid.com", gvr.Group) assert.Equal(t, "v1alpha1", gvr.Version) assert.Equal(t, "mappings", gvr.Resource) - assert.Equal(t, apiVersion, ambassador.GetAPIVersion()) + assert.Equal(t, apiVersion, defaults.GetAmbassadorAPIVersion()) }) t.Run("will get correct gvr for x.getambassador.io api group", func(t *testing.T) { // given apiVersion := "x.getambassador.io/v3alpha1" - ambassador.SetAPIVersion(apiVersion) + defaults.SetAmbassadorAPIVersion(apiVersion) // when gvr := ambassador.GetMappingGVR() @@ -580,7 +581,7 @@ func TestGetMappingGVR(t *testing.T) { assert.Equal(t, "x.getambassador.io", gvr.Group) assert.Equal(t, "v3alpha1", gvr.Version) assert.Equal(t, "ambassadormappings", gvr.Resource) - assert.Equal(t, apiVersion, ambassador.GetAPIVersion()) + assert.Equal(t, apiVersion, defaults.GetAmbassadorAPIVersion()) }) } diff --git a/rollout/trafficrouting/smi/smi_test.go b/rollout/trafficrouting/smi/smi_test.go index ba40903dcb..038c4a7e77 100644 --- a/rollout/trafficrouting/smi/smi_test.go +++ b/rollout/trafficrouting/smi/smi_test.go @@ -65,8 +65,8 @@ func TestType(t *testing.T) { func TestUnsupportedTrafficSplitApiVersionError(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("does-not-exist") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("does-not-exist") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) _, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -114,8 +114,8 @@ func TestReconcileCreateNewTrafficSplit(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer 
defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -145,8 +145,8 @@ func TestReconcileCreateNewTrafficSplit(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -215,8 +215,8 @@ func TestReconcilePatchExistingTrafficSplit(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ts2 := trafficSplitV1Alpha2(ro, objectMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts2) - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -247,8 +247,8 @@ func TestReconcilePatchExistingTrafficSplit(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ts3 := trafficSplitV1Alpha3(ro, objectMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts3) - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -307,8 +307,8 @@ func TestReconcilePatchExistingTrafficSplitNoChange(t *testing.T) { objMeta := objectMeta("traffic-split-v1alpha2", ro, schema.GroupVersionKind{}) ts2 := trafficSplitV1Alpha2(ro, objMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts2) - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -332,8 +332,8 @@ func TestReconcilePatchExistingTrafficSplitNoChange(t *testing.T) { objMeta := objectMeta("traffic-split-v1alpha3", ro, schema.GroupVersionKind{}) ts3 := trafficSplitV1Alpha3(ro, objMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts3) - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -397,8 +397,8 @@ func TestReconcileRolloutDoesNotOwnTrafficSplitError(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ts2 := trafficSplitV1Alpha2(ro, objMeta, "root-service", int32(10)) ts2.OwnerReferences = nil - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset(ts2) r, err := NewReconciler(ReconcilerConfig{ @@ -416,8 +416,8 @@ func TestReconcileRolloutDoesNotOwnTrafficSplitError(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ts3 := trafficSplitV1Alpha3(ro, objMeta, "root-service", int32(10)) ts3.OwnerReferences = nil - 
SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset(ts3) r, err := NewReconciler(ReconcilerConfig{ @@ -490,8 +490,8 @@ func TestCreateTrafficSplitForMultipleBackends(t *testing.T) { }) t.Run("v1alpha2", func(t *testing.T) { - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset() r, err := NewReconciler(ReconcilerConfig{ @@ -534,8 +534,8 @@ func TestCreateTrafficSplitForMultipleBackends(t *testing.T) { }) t.Run("v1alpha3", func(t *testing.T) { - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset() r, err := NewReconciler(ReconcilerConfig{ diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 57fb408f27..e5214ccf99 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -470,7 +470,9 @@ func TestNewTrafficRoutingReconciler(t *testing.T) { } // Verifies with a canary using traffic routing, we add a scaledown delay to the old ReplicaSet -// after promoting desired ReplicaSet to stable +// after promoting desired ReplicaSet to stable. +// NOTE: As of v1.1, scale down delays are added to ReplicaSets on *subsequent* reconciliations +// after the desired RS has been promoted to stable func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { f := newFixture(t) defer f.Close() @@ -483,30 +485,27 @@ func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { } r2 := bumpVersion(r1) rs1 := newReplicaSetWithStatus(r1, 1, 1) - rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] rs2 := newReplicaSetWithStatus(r2, 1, 1) - r2 = updateCanaryRolloutStatus(r2, rs1PodHash, 2, 2, 2, false) rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 2, 1, 2, false) r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.CurrentStepIndex = nil + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) - canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} - stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs1PodHash} - canarySvc := newService("canary", 80, canarySelector, r2) - stableSvc := newService("stable", 80, stableSelector, r2) + selector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} + canarySvc := newService("canary", 80, selector, r2) + stableSvc := newService("stable", 80, selector, r2) f.kubeobjects = append(f.kubeobjects, rs1, rs2, canarySvc, stableSvc) f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) f.rolloutLister = append(f.rolloutLister, r2) f.objects = append(f.objects, r2) - rs1Patch := f.expectPatchReplicaSetAction(rs1) // adds the annotation - patchIndex := f.expectPatchRolloutAction(r2) // updates the rollout status + rs1Patch := f.expectPatchReplicaSetAction(rs1) // set scale-down-deadline annotation f.run(getKey(r2, t)) f.verifyPatchedReplicaSet(rs1Patch, 30) - roPatchObj := f.getPatchedRolloutAsObject(patchIndex) - 
assert.Equal(t, rs2PodHash, roPatchObj.Status.StableRS)
-	assert.Nil(t, roPatchObj.Status.CurrentStepIndex)
 }
 
 // Verifies with a canary using traffic routing, we scale down old ReplicaSets which exceed our limit
diff --git a/test/e2e/aws_test.go b/test/e2e/aws_test.go
index 762a5ecc03..4190449aef 100644
--- a/test/e2e/aws_test.go
+++ b/test/e2e/aws_test.go
@@ -20,17 +20,28 @@ func TestAWSSuite(t *testing.T) {
 }
 
 // TestALBUpdate is a simple integration test which verifies the controller can work in a real AWS
-// environment. It is intended to be run with the `--alb-verify-weight` controller flag. Success of
+// environment. It is intended to be run with the `--aws-verify-target-group` controller flag. Success of
 // this test against a controller using that flag, indicates that the controller was able to perform
 // weight verification using AWS APIs.
 // This test will be skipped unless E2E_ALB_INGESS_ANNOTATIONS is set (can be an empty struct). e.g.:
-// make test-e2e E2E_INSTANCE_ID= E2E_TEST_OPTIONS="-testify.m TestALBUpdate$" E2E_ALB_INGESS_ANNOTATIONS='{"kubernetes.io/ingress.class": "aws-alb", "alb.ingress.kubernetes.io/security-groups": "iks-intuit-cidr-ingress-tcp-443"}'
-func (s *AWSSuite) TestALBUpdate() {
+// make test-e2e E2E_TEST_OPTIONS="-testify.m TestALBCanaryUpdate$" E2E_IMAGE_PREFIX="docker.intuit.com/docker-rmt/" E2E_INSTANCE_ID= E2E_ALB_INGESS_ANNOTATIONS='{"kubernetes.io/ingress.class": "aws-alb", "alb.ingress.kubernetes.io/security-groups": "iks-intuit-cidr-ingress-tcp-443"}'
+func (s *AWSSuite) TestALBCanaryUpdate() {
 	if val, _ := os.LookupEnv(fixtures.EnvVarE2EALBIngressAnnotations); val == "" {
 		s.T().SkipNow()
 	}
 	s.Given().
-		HealthyRollout(`@functional/alb-rollout.yaml`).
+		HealthyRollout(`@functional/alb-canary-rollout.yaml`).
+		When().
+		UpdateSpec().
+		WaitForRolloutStatus("Healthy")
+}
+
+func (s *AWSSuite) TestALBBlueGreenUpdate() {
+	if val, _ := os.LookupEnv(fixtures.EnvVarE2EALBIngressAnnotations); val == "" {
+		s.T().SkipNow()
+	}
+	s.Given().
+		HealthyRollout(`@functional/alb-bluegreen-rollout.yaml`).
 		When().
 		UpdateSpec().
 		WaitForRolloutStatus("Healthy")
diff --git a/test/e2e/canary_test.go b/test/e2e/canary_test.go
index 07fb2e523d..723fc9cddd 100644
--- a/test/e2e/canary_test.go
+++ b/test/e2e/canary_test.go
@@ -477,6 +477,7 @@ spec:
         annotations:
           rev: two`). // update to revision 2
 		WaitForRolloutStatus("Healthy").
+		Sleep(2 * time.Second). // sleep is necessary since the scale-down delay annotation happens in a subsequent reconciliation
 		Then().
Assert(func(t *fixtures.Then) { rs1 := t.GetReplicaSetByRevision("1") diff --git a/test/e2e/functional/alb-bluegreen-rollout.yaml b/test/e2e/functional/alb-bluegreen-rollout.yaml new file mode 100644 index 0000000000..e06c2b5bea --- /dev/null +++ b/test/e2e/functional/alb-bluegreen-rollout.yaml @@ -0,0 +1,71 @@ +apiVersion: v1 +kind: Service +metadata: + name: alb-bluegreen-desired +spec: + type: NodePort + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + selector: + app: alb-bluegreen +--- +apiVersion: v1 +kind: Service +metadata: + name: alb-bluegreen-stable +spec: + type: NodePort + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + selector: + app: alb-bluegreen +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: alb-bluegreen-ingress + annotations: + kubernetes.io/ingress.class: alb +spec: + rules: + - http: + paths: + - path: /* + backend: + serviceName: alb-bluegreen-stable + servicePort: 80 +--- +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: alb-bluegreen +spec: + selector: + matchLabels: + app: alb-bluegreen + template: + metadata: + labels: + app: alb-bluegreen + spec: + containers: + - name: alb-bluegreen + image: nginx:1.19-alpine + ports: + - name: http + containerPort: 80 + protocol: TCP + resources: + requests: + memory: 16Mi + cpu: 5m + strategy: + blueGreen: + previewService: alb-bluegreen-desired + activeService: alb-bluegreen-stable diff --git a/test/e2e/functional/alb-rollout.yaml b/test/e2e/functional/alb-canary-rollout.yaml similarity index 71% rename from test/e2e/functional/alb-rollout.yaml rename to test/e2e/functional/alb-canary-rollout.yaml index ad8157b3c7..df90ecc3cd 100644 --- a/test/e2e/functional/alb-rollout.yaml +++ b/test/e2e/functional/alb-canary-rollout.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Service metadata: - name: alb-rollout-root + name: alb-canary-root spec: type: NodePort ports: @@ -10,12 +10,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: v1 kind: Service metadata: - name: alb-rollout-canary + name: alb-canary-desired spec: type: NodePort ports: @@ -24,12 +24,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: v1 kind: Service metadata: - name: alb-rollout-stable + name: alb-canary-stable spec: type: NodePort ports: @@ -38,12 +38,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: networking.k8s.io/v1beta1 kind: Ingress metadata: - name: alb-rollout-ingress + name: alb-canary-ingress annotations: kubernetes.io/ingress.class: alb spec: @@ -52,24 +52,24 @@ spec: paths: - path: /* backend: - serviceName: alb-rollout-root + serviceName: alb-canary-root servicePort: use-annotation --- apiVersion: argoproj.io/v1alpha1 kind: Rollout metadata: - name: alb-rollout + name: alb-canary spec: selector: matchLabels: - app: alb-rollout + app: alb-canary template: metadata: labels: - app: alb-rollout + app: alb-canary spec: containers: - - name: alb-rollout + - name: alb-canary image: nginx:1.19-alpine ports: - name: http @@ -81,15 +81,15 @@ spec: cpu: 5m strategy: canary: - canaryService: alb-rollout-canary - stableService: alb-rollout-stable + canaryService: alb-canary-desired + stableService: alb-canary-stable trafficRouting: alb: - ingress: alb-rollout-ingress - rootService: alb-rollout-root + ingress: alb-canary-ingress + rootService: alb-canary-root servicePort: 80 steps: - setWeight: 10 - pause: {duration: 5s} - setWeight: 
20
-  - pause: {duration: 5s}
+  - pause: {duration: 5s}
\ No newline at end of file
diff --git a/test/fixtures/e2e_suite.go b/test/fixtures/e2e_suite.go
index 71589276cf..e7ac739ab6 100644
--- a/test/fixtures/e2e_suite.go
+++ b/test/fixtures/e2e_suite.go
@@ -39,6 +39,8 @@ const (
 	// E2E_POD_DELAY slows down pod startup and shutdown by the value in seconds (default: 0)
 	// Used by humans to slow down rollout activity during a test
 	EnvVarE2EPodDelay = "E2E_POD_DELAY"
+	// EnvVarE2EImagePrefix is a prefix prepended to images used by the e2e tests
+	EnvVarE2EImagePrefix = "E2E_IMAGE_PREFIX"
 	// E2E_DEBUG makes e2e testing easier to debug by not tearing down the suite
 	EnvVarE2EDebug = "E2E_DEBUG"
 	// E2E_ALB_INGESS_ANNOTATIONS is a map of annotations to apply to ingress for AWS Load Balancer Controller
diff --git a/test/fixtures/when.go b/test/fixtures/when.go
index 81d44628e9..3820a6afd7 100644
--- a/test/fixtures/when.go
+++ b/test/fixtures/when.go
@@ -53,8 +53,9 @@ func (w *When) ApplyManifests(yaml ...string) *When {
 		objects = w.parseTextToObjects(yaml[0])
 	}
 	for _, obj := range objects {
-		if obj.GetKind() == "Rollout" && E2EPodDelay > 0 {
+		if obj.GetKind() == "Rollout" {
 			w.injectDelays(obj)
+			w.injectImagePrefix(obj)
 		}
 		if obj.GetKind() == "Ingress" {
 			w.injectIngressAnnotations(obj)
@@ -72,6 +73,9 @@ func (w *When) DeleteObject(kind, name string) *When {
 // injectDelays adds postStart/preStop handlers to slow down readiness/termination by adding
 // handlers which sleep for the specified duration.
 func (w *When) injectDelays(un *unstructured.Unstructured) {
+	if E2EPodDelay == 0 {
+		return
+	}
 	sleepHandler := corev1.Handler{
 		Exec: &corev1.ExecAction{
 			Command: []string{"sleep", strconv.Itoa(E2EPodDelay)},
@@ -92,6 +96,21 @@
 	w.CheckError(err)
 }
 
+// injectImagePrefix prepends a prefix to the images used in tests. Useful if default container registries are blocked
+func (w *When) injectImagePrefix(un *unstructured.Unstructured) {
+	imagePrefix := os.Getenv(EnvVarE2EImagePrefix)
+	if imagePrefix == "" {
+		return
+	}
+	containersIf, _, err := unstructured.NestedSlice(un.Object, "spec", "template", "spec", "containers")
+	w.CheckError(err)
+	container := containersIf[0].(map[string]interface{})
+	container["image"] = imagePrefix + container["image"].(string)
+	containersIf[0] = container
+	err = unstructured.SetNestedSlice(un.Object, containersIf, "spec", "template", "spec", "containers")
+	w.CheckError(err)
+}
+
 // injectIngressAnnotations injects ingress annotations defined in environment variables. 
Currently // E2E_ALB_INGESS_ANNOTATIONS func (w *When) injectIngressAnnotations(un *unstructured.Unstructured) { diff --git a/test/util/util.go b/test/util/util.go index 234e006d00..a783107ebf 100644 --- a/test/util/util.go +++ b/test/util/util.go @@ -40,15 +40,21 @@ func NewFakeDynamicClient(objects ...runtime.Object) *dynamicfake.FakeDynamicCli scheme := runtime.NewScheme() vsvcGVR := istioutil.GetIstioVirtualServiceGVR() druleGVR := istioutil.GetIstioDestinationRuleGVR() + tgbGVR := schema.GroupVersionResource{ + Group: "elbv2.k8s.aws", + Version: "v1beta1", + Resource: "targetgroupbindings", + } listMapping := map[schema.GroupVersionResource]string{ - vsvcGVR: vsvcGVR.Resource + "List", - druleGVR: druleGVR.Resource + "List", + vsvcGVR: "VirtualServiceList", + druleGVR: "DestinationRuleList", v1alpha1.RolloutGVR: rollouts.RolloutKind + "List", v1alpha1.AnalysisTemplateGVR: rollouts.AnalysisTemplateKind + "List", v1alpha1.AnalysisRunGVR: rollouts.AnalysisRunKind + "List", v1alpha1.ExperimentGVR: rollouts.ExperimentKind + "List", v1alpha1.ClusterAnalysisTemplateGVR: rollouts.ClusterAnalysisTemplateKind + "List", + tgbGVR: "TargetGroupBindingList", } return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, listMapping, objects...) } diff --git a/utils/aws/aws.go b/utils/aws/aws.go index 84f659bd86..ad59b41985 100644 --- a/utils/aws/aws.go +++ b/utils/aws/aws.go @@ -38,7 +38,7 @@ const ( ) type Client interface { - GetTargetGroupTargets(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) + GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) } @@ -53,7 +53,8 @@ type ELBv2APIClient interface { DescribeTags(ctx context.Context, params *elbv2.DescribeTagsInput, optFns ...func(*elbv2.Options)) (*elbv2.DescribeTagsOutput, error) } -type client struct { +// ClientAdapter implements the Client interface +type ClientAdapter struct { ELBV2 ELBv2APIClient // loadBalancerDNStoARN is a cache that maps a LoadBalancer DNSName to an ARN @@ -100,19 +101,32 @@ type ServiceReference struct { Port intstr.IntOrString `json:"port"` } -func NewClient() (Client, error) { +// NewClient instantiates a new AWS Client. 
It is declared as a variable to allow mocking +var NewClient = DefaultNewClientFunc + +func DefaultNewClientFunc() (Client, error) { cfg, err := config.LoadDefaultConfig(context.TODO()) if err != nil { return nil, err } - c := client{ + c := ClientAdapter{ ELBV2: elbv2.NewFromConfig(cfg), loadBalancerDNStoARN: make(map[string]string), } return &c, nil } -func (c *client) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) { +func FakeNewClientFunc(elbClient ELBv2APIClient) func() (Client, error) { + return func() (Client, error) { + c := ClientAdapter{ + ELBV2: elbClient, + loadBalancerDNStoARN: make(map[string]string), + } + return &c, nil + } +} + +func (c *ClientAdapter) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) { lbOutput, err := c.ELBV2.DescribeLoadBalancers(ctx, &elbv2.DescribeLoadBalancersInput{}) if err != nil { return nil, err @@ -127,7 +141,7 @@ func (c *client) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) // GetTargetGroupMetadata is a convenience to retrieve the target groups of a load balancer along // with relevant metadata (tags, and traffic weights). -func (c *client) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) { +func (c *ClientAdapter) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) { // Get target groups associated with LoadBalancer tgIn := elbv2.DescribeTargetGroupsInput{ LoadBalancerArn: &loadBalancerARN, @@ -206,8 +220,9 @@ func (c *client) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN str return tgMeta, nil } -func (c *client) GetTargetGroupTargets(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { - // Get target groups associated with LoadBalancer +// GetTargetGroupHealth returns health descriptions of registered targets in a target group. +// A TargetHealthDescription is an IP:port pair, along with its health status. +func (c *ClientAdapter) GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { thIn := elbv2.DescribeTargetHealthInput{ TargetGroupArn: &targetGroupARN, } @@ -296,18 +311,29 @@ func getNumericPort(tgb TargetGroupBinding, svc corev1.Service) int32 { return 0 } -// VerifyTargetGroupBinding verifies if the underlying AWS TargetGroup: -// 1. targets all the Pod IPs and port in the given service -// 2. those targets are in a healthy state -func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Client, tgb TargetGroupBinding, endpoints *corev1.Endpoints, svc *corev1.Service) (bool, error) { +// TargetGroupVerifyResult returns metadata when a target group is verified. +type TargetGroupVerifyResult struct { + Service string + Verified bool + EndpointsRegistered int + EndpointsTotal int +} + +// VerifyTargetGroupBinding verifies if the underlying AWS TargetGroup has all Pod IPs and ports +// from the given service (the K8s Endpoints list) registered to the TargetGroup. +// NOTE: a previous version of this method used to additionally verify that all registered targets +// were "healthy" (in addition to registered), but the health of registered targets is actually +// irrelevant for our purposes of verifying the service label change was reflected in the LB. +// Returns nil if the verification is not applicable (e.g. 
target type is not IP) +func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Client, tgb TargetGroupBinding, endpoints *corev1.Endpoints, svc *corev1.Service) (*TargetGroupVerifyResult, error) { if tgb.Spec.TargetType == nil || *tgb.Spec.TargetType != TargetTypeIP { // We only need to verify target groups using AWS CNI (spec.targetType: ip) - return true, nil + return nil, nil } port := getNumericPort(tgb, *svc) if port == 0 { logCtx.Warn("Unable to match TargetGroupBinding spec.serviceRef.port to Service spec.ports") - return false, nil + return nil, nil } logCtx = logCtx.WithFields(map[string]interface{}{ "service": svc.Name, @@ -315,12 +341,12 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl "tg": tgb.Spec.TargetGroupARN, "port": port, }) - targets, err := awsClnt.GetTargetGroupTargets(ctx, tgb.Spec.TargetGroupARN) + targets, err := awsClnt.GetTargetGroupHealth(ctx, tgb.Spec.TargetGroupARN) if err != nil { - return false, err + return nil, err } - // Remember all of the ip:port of the endpoints list + // Remember/initialize all of the ip:port of the endpoints list that we expect to see registered endpointIPs := make(map[string]bool) for _, subset := range endpoints.Subsets { for _, addr := range subset.Addresses { @@ -330,9 +356,9 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl logCtx.Infof("verifying %d endpoint addresses (of %d targets)", len(endpointIPs), len(targets)) - // Iterate all targets in AWS TargetGroup. Mark all endpoint IPs which are healthy + // Iterate all registered targets in AWS TargetGroup. Mark all endpoint IPs which we see registered for _, target := range targets { - if target.Target == nil || target.Target.Id == nil || target.Target.Port == nil || target.TargetHealth == nil { + if target.Target == nil || target.Target.Id == nil || target.Target.Port == nil { logCtx.Warnf("Invalid target in TargetGroup: %v", target) continue } @@ -342,17 +368,25 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl // this is a target for something not in the endpoint list (e.g. old endpoint entry). 
Ignore it continue } - // Mark the endpoint IP as healthy or not - endpointIPs[targetStr] = bool(target.TargetHealth.State == elbv2types.TargetHealthStateEnumHealthy) + // Verify we see the endpoint IP registered to the TargetGroup + // NOTE: we used to check health here, but health is not relevant for verifying service label change + endpointIPs[targetStr] = true + } + + tgvr := TargetGroupVerifyResult{ + Service: svc.Name, + EndpointsTotal: len(endpointIPs), + EndpointsRegistered: 0, } - // Check if any of our desired endpoints are not yet healthy - for epIP, healthy := range endpointIPs { - if !healthy { - logCtx.Infof("Service endpoint IP %s not yet targeted or healthy", epIP) - return false, nil + // Check if any of our desired endpoints are not yet registered + for epIP, seen := range endpointIPs { + if !seen { + logCtx.Infof("Service endpoint IP %s not yet registered", epIP) + } else { + tgvr.EndpointsRegistered++ } } - logCtx.Info("TargetGroupBinding verified") - return true, nil + tgvr.Verified = bool(tgvr.EndpointsRegistered == tgvr.EndpointsTotal) + return &tgvr, nil } diff --git a/utils/aws/aws_test.go b/utils/aws/aws_test.go index 1e9904e083..94836f25bb 100644 --- a/utils/aws/aws_test.go +++ b/utils/aws/aws_test.go @@ -5,21 +5,24 @@ import ( "testing" elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" - "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" + elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" + log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/pointer" + testutil "github.com/argoproj/argo-rollouts/test/util" "github.com/argoproj/argo-rollouts/utils/aws/mocks" + unstructuredutil "github.com/argoproj/argo-rollouts/utils/unstructured" ) func newFakeClient() (*mocks.ELBv2APIClient, Client) { fakeELB := mocks.ELBv2APIClient{} - c := client{ - ELBV2: &fakeELB, - loadBalancerDNStoARN: make(map[string]string), - } - return &fakeELB, &c + awsClient, _ := FakeNewClientFunc(&fakeELB)() + return &fakeELB, awsClient } func TestFindLoadBalancerByDNSName(t *testing.T) { @@ -36,12 +39,12 @@ func TestFindLoadBalancerByDNSName(t *testing.T) { { fakeELB, c := newFakeClient() // Mock output - expectedLB := types.LoadBalancer{ + expectedLB := elbv2types.LoadBalancer{ LoadBalancerArn: pointer.StringPtr("lb-abc123"), DNSName: pointer.StringPtr("find-loadbalancer-test-abc-123.us-west-2.elb.amazonaws.com"), } lbOut := elbv2.DescribeLoadBalancersOutput{ - LoadBalancers: []types.LoadBalancer{ + LoadBalancers: []elbv2types.LoadBalancer{ expectedLB, }, } @@ -59,7 +62,7 @@ func TestGetTargetGroupMetadata(t *testing.T) { // mock the output tgOut := elbv2.DescribeTargetGroupsOutput{ - TargetGroups: []types.TargetGroup{ + TargetGroups: []elbv2types.TargetGroup{ { TargetGroupArn: pointer.StringPtr("tg-abc123"), }, @@ -71,10 +74,10 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeTargetGroups", mock.Anything, mock.Anything).Return(&tgOut, nil) tagsOut := elbv2.DescribeTagsOutput{ - TagDescriptions: []types.TagDescription{ + TagDescriptions: []elbv2types.TagDescription{ { ResourceArn: pointer.StringPtr("tg-abc123"), - Tags: []types.Tag{ + Tags: []elbv2types.Tag{ { Key: pointer.StringPtr("foo"), Value: pointer.StringPtr("bar"), @@ -86,7 +89,7 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeTags", mock.Anything, 
mock.Anything).Return(&tagsOut, nil) listenersOut := elbv2.DescribeListenersOutput{ - Listeners: []types.Listener{ + Listeners: []elbv2types.Listener{ { ListenerArn: pointer.StringPtr("lst-abc123"), LoadBalancerArn: pointer.StringPtr("lb-abc123"), @@ -96,12 +99,12 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeListeners", mock.Anything, mock.Anything).Return(&listenersOut, nil) rulesOut := elbv2.DescribeRulesOutput{ - Rules: []types.Rule{ + Rules: []elbv2types.Rule{ { - Actions: []types.Action{ + Actions: []elbv2types.Action{ { - ForwardConfig: &types.ForwardActionConfig{ - TargetGroups: []types.TargetGroupTuple{ + ForwardConfig: &elbv2types.ForwardActionConfig{ + TargetGroups: []elbv2types.TargetGroupTuple{ { TargetGroupArn: pointer.StringPtr("tg-abc123"), Weight: pointer.Int32Ptr(10), @@ -128,6 +131,203 @@ func TestGetTargetGroupMetadata(t *testing.T) { assert.Nil(t, tgMeta[1].Weight) } -func TestBuildV2TargetGroupID(t *testing.T) { - assert.Equal(t, "default/ingress-svc:80", BuildV2TargetGroupID("default", "ingress", "svc", 80)) +func TestBuildTargetGroupResourceID(t *testing.T) { + assert.Equal(t, "default/ingress-svc:80", BuildTargetGroupResourceID("default", "ingress", "svc", 80)) +} + +func TestGetTargetGroupHealth(t *testing.T) { + fakeELB, c := newFakeClient() + expectedHealth := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + HealthCheckPort: pointer.StringPtr("80"), + Target: &elbv2types.TargetDescription{}, + TargetHealth: &elbv2types.TargetHealth{ + State: elbv2types.TargetHealthStateEnumHealthy, + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&expectedHealth, nil) + + // Test + health, err := c.GetTargetGroupHealth(context.TODO(), "tg-abc123") + assert.NoError(t, err) + assert.Equal(t, expectedHealth.TargetHealthDescriptions, health) +} + +var testTargetGroupBinding = ` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: active + namespace: default +spec: + serviceRef: + name: active + port: 80 + targetGroupARN: arn::1234 + targetType: ip +` + +func TestGetTargetGroupBindingsByService(t *testing.T) { + { + svc1 := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Port: int32(80), + TargetPort: intstr.FromInt(80), + }}, + }, + } + obj := unstructuredutil.StrToUnstructuredUnsafe(testTargetGroupBinding) + dynamicClientSet := testutil.NewFakeDynamicClient(obj) + tgbs, err := GetTargetGroupBindingsByService(context.TODO(), dynamicClientSet, svc1) + assert.NoError(t, err) + assert.Equal(t, 80, tgbs[0].Spec.ServiceRef.Port.IntValue()) + assert.Equal(t, "arn::1234", tgbs[0].Spec.TargetGroupARN) + assert.Equal(t, "ip", string(*tgbs[0].Spec.TargetType)) + } + { + svc2 := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Name: "foo", + TargetPort: intstr.FromInt(80), + }}, + }, + } + obj := unstructuredutil.StrToUnstructuredUnsafe(` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: active + namespace: default +spec: + serviceRef: + name: active + port: foo + targetGroupARN: arn::1234 + targetType: instance +`) + dynamicClientSet := 
testutil.NewFakeDynamicClient(obj) + tgbs, err := GetTargetGroupBindingsByService(context.TODO(), dynamicClientSet, svc2) + assert.NoError(t, err) + assert.Equal(t, "foo", tgbs[0].Spec.ServiceRef.Port.StrVal) + assert.Equal(t, "arn::1234", tgbs[0].Spec.TargetGroupARN) + assert.Equal(t, "instance", string(*tgbs[0].Spec.TargetType)) + } +} + +func TestVerifyTargetGroupBindingIgnoreInstanceMode(t *testing.T) { + logCtx := log.NewEntry(log.New()) + _, awsClnt := newFakeClient() + tgb := TargetGroupBinding{ + Spec: TargetGroupBindingSpec{ + TargetType: (*TargetType)(pointer.StringPtr("instance")), + }, + } + res, err := VerifyTargetGroupBinding(context.TODO(), logCtx, awsClnt, tgb, nil, nil) + assert.Nil(t, res) + assert.NoError(t, err) +} + +func TestVerifyTargetGroupBinding(t *testing.T) { + logCtx := log.NewEntry(log.New()) + tgb := TargetGroupBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: TargetGroupBindingSpec{ + TargetType: (*TargetType)(pointer.StringPtr("ip")), + TargetGroupARN: "arn::1234", + }, + } + ep := corev1.Endpoints{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Subsets: []corev1.EndpointSubset{ + { + Addresses: []corev1.EndpointAddress{ + { + IP: "1.2.3.4", // registered + }, + { + IP: "5.6.7.8", // registered + }, + { + IP: "2.4.6.8", // not registered + }, + }, + }, + }, + } + svc := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Port: int32(80), + TargetPort: intstr.FromInt(80), + }}, + }, + } + fakeELB, awsClnt := newFakeClient() + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + res, err := VerifyTargetGroupBinding(context.TODO(), logCtx, awsClnt, tgb, &ep, &svc) + expectedRes := TargetGroupVerifyResult{ + Service: "active", + Verified: false, + EndpointsRegistered: 2, + EndpointsTotal: 3, + } + assert.Equal(t, expectedRes, *res) + assert.NoError(t, err) } diff --git a/utils/aws/mocks/ELBv2APIClient.go b/utils/aws/mocks/ELBv2APIClient.go index 8dae8549e5..7f621b0563 100644 --- a/utils/aws/mocks/ELBv2APIClient.go +++ b/utils/aws/mocks/ELBv2APIClient.go @@ -163,3 +163,33 @@ func (_m *ELBv2APIClient) DescribeTargetGroups(_a0 context.Context, _a1 *elastic return r0, r1 } + +// DescribeTargetHealth provides a mock function with given fields: ctx, params, optFns +func (_m *ELBv2APIClient) DescribeTargetHealth(ctx context.Context, params *elasticloadbalancingv2.DescribeTargetHealthInput, optFns ...func(*elasticloadbalancingv2.Options)) (*elasticloadbalancingv2.DescribeTargetHealthOutput, error) { + _va := make([]interface{}, len(optFns)) + for _i := range optFns { + _va[_i] = optFns[_i] + } + var _ca []interface{} + _ca 
= append(_ca, ctx, params)
+	_ca = append(_ca, _va...)
+	ret := _m.Called(_ca...)
+
+	var r0 *elasticloadbalancingv2.DescribeTargetHealthOutput
+	if rf, ok := ret.Get(0).(func(context.Context, *elasticloadbalancingv2.DescribeTargetHealthInput, ...func(*elasticloadbalancingv2.Options)) *elasticloadbalancingv2.DescribeTargetHealthOutput); ok {
+		r0 = rf(ctx, params, optFns...)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(*elasticloadbalancingv2.DescribeTargetHealthOutput)
+		}
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(context.Context, *elasticloadbalancingv2.DescribeTargetHealthInput, ...func(*elasticloadbalancingv2.Options)) error); ok {
+		r1 = rf(ctx, params, optFns...)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
diff --git a/utils/conditions/conditions.go b/utils/conditions/conditions.go
index 0715c19fd0..40d173374a 100644
--- a/utils/conditions/conditions.go
+++ b/utils/conditions/conditions.go
@@ -135,6 +135,18 @@ const (
 	ServiceReferenceReason = "ServiceReferenceError"
 	// ServiceReferencingManagedService is added in a rollout when the multiple rollouts reference a Rollout
 	ServiceReferencingManagedService = "Service %q is managed by another Rollout"
+
+	// TargetGroupVerifiedReason is emitted when a target group has been verified
+	TargetGroupVerifiedReason              = "TargetGroupVerified"
+	TargetGroupVerifiedRegistrationMessage = "Service %s (TargetGroup %s) verified: %d endpoints registered"
+	TargetGroupVerifiedWeightsMessage      = "Service %s (TargetGroup %s) verified: canary weight %d set"
+	// TargetGroupUnverifiedReason is emitted when a target group has not been verified
+	TargetGroupUnverifiedReason              = "TargetGroupUnverified"
+	TargetGroupUnverifiedRegistrationMessage = "Service %s (TargetGroup %s) not verified: %d/%d endpoints registered"
+	TargetGroupUnverifiedWeightsMessage      = "Service %s (TargetGroup %s) not verified: canary weight %d not yet set (current: %d)"
+	// TargetGroupVerifyErrorReason is emitted when we fail to verify the health of a target group due to an error
+	TargetGroupVerifyErrorReason  = "TargetGroupVerifyError"
+	TargetGroupVerifyErrorMessage = "Failed to verify Service %s (TargetGroup %s): %s"
 )
 
 // NewRolloutCondition creates a new rollout condition. 
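
The printf-style messages above are rendered by the `Eventf`/`Warnf` recorder calls added in alb.go and the service reconciler. A hedged sketch of how a verification result might be formatted into one of these events (`emitVerifyEvent` is a hypothetical helper for illustration, not part of this patch):

```go
package main

import "fmt"

// Constants mirroring the additions to utils/conditions/conditions.go above.
const (
	targetGroupVerifiedRegistrationMessage   = "Service %s (TargetGroup %s) verified: %d endpoints registered"
	targetGroupUnverifiedRegistrationMessage = "Service %s (TargetGroup %s) not verified: %d/%d endpoints registered"
)

// verifyResult mirrors aws.TargetGroupVerifyResult from this patch.
type verifyResult struct {
	Service             string
	Verified            bool
	EndpointsRegistered int
	EndpointsTotal      int
}

// emitVerifyEvent is a hypothetical helper; the real code passes the same
// arguments to Recorder.Eventf (verified) or Recorder.Warnf (unverified).
func emitVerifyEvent(res verifyResult, tgARN string) string {
	if res.Verified {
		return fmt.Sprintf(targetGroupVerifiedRegistrationMessage, res.Service, tgARN, res.EndpointsRegistered)
	}
	return fmt.Sprintf(targetGroupUnverifiedRegistrationMessage, res.Service, tgARN, res.EndpointsRegistered, res.EndpointsTotal)
}

func main() {
	res := verifyResult{Service: "active", Verified: false, EndpointsRegistered: 2, EndpointsTotal: 3}
	fmt.Println(emitVerifyEvent(res, "arn::1234"))
	// Service active (TargetGroup arn::1234) not verified: 2/3 endpoints registered
}
```
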
diff --git a/utils/defaults/defaults_test.go b/utils/defaults/defaults_test.go index a3e5322c3e..129fd1c959 100644 --- a/utils/defaults/defaults_test.go +++ b/utils/defaults/defaults_test.go @@ -355,3 +355,30 @@ func TestGetConsecutiveErrorLimitOrDefault(t *testing.T) { metricDefaultValue := &v1alpha1.Metric{} assert.Equal(t, DefaultConsecutiveErrorLimit, GetConsecutiveErrorLimitOrDefault(metricDefaultValue)) } + +func TestSetDefaults(t *testing.T) { + SetVerifyTargetGroup(true) + assert.True(t, VerifyTargetGroup()) + SetVerifyTargetGroup(false) + assert.False(t, VerifyTargetGroup()) + + SetIstioAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetIstioAPIVersion()) + SetIstioAPIVersion(DefaultIstioVersion) + assert.Equal(t, DefaultIstioVersion, GetIstioAPIVersion()) + + SetAmbassadorAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetAmbassadorAPIVersion()) + SetAmbassadorAPIVersion(DefaultAmbassadorVersion) + assert.Equal(t, DefaultAmbassadorVersion, GetAmbassadorAPIVersion()) + + SetSMIAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetSMIAPIVersion()) + SetSMIAPIVersion(DefaultSMITrafficSplitVersion) + assert.Equal(t, DefaultSMITrafficSplitVersion, GetSMIAPIVersion()) + + SetTargetGroupBindingAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetTargetGroupBindingAPIVersion()) + SetTargetGroupBindingAPIVersion(DefaultTargetGroupBindingAPIVersion) + assert.Equal(t, DefaultTargetGroupBindingAPIVersion, GetTargetGroupBindingAPIVersion()) +} diff --git a/utils/istio/istio_test.go b/utils/istio/istio_test.go index 6eb78f0188..e6f8b78e71 100644 --- a/utils/istio/istio_test.go +++ b/utils/istio/istio_test.go @@ -7,6 +7,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/runtime" dynamicfake "k8s.io/client-go/dynamic/fake" @@ -39,12 +40,12 @@ func TestGetIstioVirtualServiceGVR(t *testing.T) { } func TestGetIstioDestinationRuleGVR(t *testing.T) { - SetIstioAPIVersion("v1alpha4") + defaults.SetIstioAPIVersion("v1alpha4") gvr := GetIstioDestinationRuleGVR() assert.Equal(t, "networking.istio.io", gvr.Group) assert.Equal(t, "v1alpha4", gvr.Version) assert.Equal(t, "destinationrules", gvr.Resource) - SetIstioAPIVersion("v1alpha3") + defaults.SetIstioAPIVersion("v1alpha3") } func TestGetRolloutVirtualServiceKeys(t *testing.T) { diff --git a/utils/rollout/rolloututil.go b/utils/rollout/rolloututil.go index e3eaab68eb..c2e997aa00 100644 --- a/utils/rollout/rolloututil.go +++ b/utils/rollout/rolloututil.go @@ -10,6 +10,12 @@ import ( "github.com/argoproj/argo-rollouts/utils/defaults" ) +// IsFullyPromoted returns whether or not the given rollout is in a fully promoted state. +// (versus being in the middle of an update). This is determined by checking if stable hash == desired hash +func IsFullyPromoted(ro *v1alpha1.Rollout) bool { + return ro.Status.StableRS == ro.Status.CurrentPodHash +} + // GetRolloutPhase returns a status and message for a rollout. 
Takes into consideration whether
 // or not metadata.generation was observed in status.observedGeneration
 // use this instead of CalculateRolloutPhase
@@ -95,8 +101,15 @@ func CalculateRolloutPhase(spec v1alpha1.RolloutSpec, status v1alpha1.RolloutSta
 		if ro.Status.BlueGreen.ActiveSelector == "" || ro.Status.BlueGreen.ActiveSelector != ro.Status.CurrentPodHash {
 			return v1alpha1.RolloutPhaseProgressing, "active service cutover pending"
 		}
-		if ro.Status.StableRS == "" || ro.Status.StableRS != ro.Status.CurrentPodHash {
-			return v1alpha1.RolloutPhaseProgressing, "waiting for analysis to complete"
+		if ro.Status.StableRS == "" || !IsFullyPromoted(&ro) {
+			// we switched the active selector to the desired ReplicaSet, but we have yet to mark it
+			// as stable. This could be caused by one of two things:
+			// 1. post-promotion analysis has yet to complete successfully
+			// 2. post-promotion verification (i.e. target group verification) has yet to complete
+			if waitingForBlueGreenPostPromotionAnalysis(&ro) {
+				return v1alpha1.RolloutPhaseProgressing, "waiting for analysis to complete"
+			}
+			return v1alpha1.RolloutPhaseProgressing, "waiting for post-promotion verification to complete"
 		}
 	} else if ro.Spec.Strategy.Canary != nil {
 		if ro.Spec.Strategy.Canary.TrafficRouting == nil {
@@ -107,13 +120,23 @@ func CalculateRolloutPhase(spec v1alpha1.RolloutSpec, status v1alpha1.RolloutSta
 				return v1alpha1.RolloutPhaseProgressing, "old replicas are pending termination"
 			}
 		}
-		if ro.Status.StableRS == "" || ro.Status.StableRS != ro.Status.CurrentPodHash {
+		if ro.Status.StableRS == "" || !IsFullyPromoted(&ro) {
 			return v1alpha1.RolloutPhaseProgressing, "waiting for all steps to complete"
 		}
 	}
 	return v1alpha1.RolloutPhaseHealthy, ""
 }
 
+// waitingForBlueGreenPostPromotionAnalysis returns whether we are waiting for blue-green post-promotion analysis to complete
+func waitingForBlueGreenPostPromotionAnalysis(ro *v1alpha1.Rollout) bool {
+	if ro.Spec.Strategy.BlueGreen.PostPromotionAnalysis != nil {
+		if ro.Status.BlueGreen.PostPromotionAnalysisRunStatus == nil || !ro.Status.BlueGreen.PostPromotionAnalysisRunStatus.Status.Completed() {
+			return true
+		}
+	}
+	return false
+}
+
 // CanaryStepString returns a string representation of a canary step
 func CanaryStepString(c v1alpha1.CanaryStep) string {
 	if c.SetWeight != nil {
diff --git a/utils/rollout/rolloututil_test.go b/utils/rollout/rolloututil_test.go
index 0a2071e5a9..4fc15a4c63 100644
--- a/utils/rollout/rolloututil_test.go
+++ b/utils/rollout/rolloututil_test.go
@@ -71,6 +71,27 @@ func newBlueGreenRollout() *v1alpha1.Rollout {
 	}
 }
 
+func TestIsFullyPromoted(t *testing.T) {
+	{
+		ro := &v1alpha1.Rollout{
+			Status: v1alpha1.RolloutStatus{
+				StableRS:       "abc123",
+				CurrentPodHash: "abc123",
+			},
+		}
+		assert.True(t, IsFullyPromoted(ro))
+	}
+	{
+		ro := &v1alpha1.Rollout{
+			Status: v1alpha1.RolloutStatus{
+				StableRS:       "abc123",
+				CurrentPodHash: "def456",
+			},
+		}
+		assert.False(t, IsFullyPromoted(ro))
+	}
+}
+
 func TestRolloutStatusDegraded(t *testing.T) {
 	ro := newCanaryRollout()
 	ro.Status.Conditions = append(ro.Status.Conditions, v1alpha1.RolloutCondition{
@@ -137,6 +158,7 @@ func TestRolloutStatusProgressing(t *testing.T) {
 	}
 	{
 		ro := newBlueGreenRollout()
+		ro.Spec.Strategy.BlueGreen.PostPromotionAnalysis = &v1alpha1.RolloutAnalysis{}
 		ro.Status.BlueGreen.ActiveSelector = "def5678"
 		ro.Status.StableRS = "abc1234"
 		ro.Status.CurrentPodHash = "def5678"
@@ -148,6 +170,19 @@
 		assert.Equal(t, v1alpha1.RolloutPhaseProgressing, status)
 		assert.Equal(t, "waiting for 
analysis to complete", message) } + { + ro := newBlueGreenRollout() + ro.Status.BlueGreen.ActiveSelector = "def5678" + ro.Status.StableRS = "abc1234" + ro.Status.CurrentPodHash = "def5678" + ro.Spec.Replicas = pointer.Int32Ptr(5) + ro.Status.Replicas = 5 + ro.Status.UpdatedReplicas = 5 + ro.Status.AvailableReplicas = 5 + status, message := GetRolloutPhase(ro) + assert.Equal(t, v1alpha1.RolloutPhaseProgressing, status) + assert.Equal(t, "waiting for post-promotion verification to complete", message) + } { // Scenario when a newly created rollout has partially filled in status (with hashes) // but no updated replica count