feat: ability to verify AWS TargetGroup after canary stable promotion
Signed-off-by: Jesse Suen <[email protected]>
jessesuen committed Aug 24, 2021
1 parent 682dada commit 27d4fd3
Showing 28 changed files with 578 additions and 174 deletions.
106 changes: 92 additions & 14 deletions docs/features/traffic-management/alb.md
@@ -158,24 +158,100 @@ spec:
...
```

### Weight verification
### Zero-Downtime Updates with AWS TargetGroup Verification

Argo Rollouts contains two features to help ensure zero-downtime updates when used with the AWS
LoadBalancer controller: TargetGroup IP verification and TargetGroup weight verification. Both
features involve the Rollout controller performing additional safety checks against AWS to verify
that the changes made to the Ingress and Service objects are reflected in the underlying AWS TargetGroups.

#### TargetGroup IP Verification

!!! note

TargetGroup IP verification is available since Argo Rollouts v1.1

The AWS LoadBalancer controller can run in one of two modes:
* [Instance mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#instance-mode)
* [IP mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#ip-mode)

When using the AWS LoadBalancer controller in IP mode (e.g. using the AWS CNI), the ALB LoadBalancer
targets individual Pod IPs, as opposed to K8s node instances. Targeting Pod IPs comes with an
increased risk of downtime during an update, because the Pod IPs behind the underlying AWS TargetGroup
can more easily fall out of sync with the _actual_ availability and status of the Pods, causing HTTP 502 errors
when the TargetGroup points to Pods which have already been scaled down.

To mitigate this risk, AWS recommends the use of
[pod readiness gate injection](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/deploy/pod_readiness_gate/)
when running the AWS LoadBalancer controller in IP mode. Readiness gates allow the AWS LoadBalancer
controller to verify that TargetGroups are accurate before marking newly created Pods as "ready",
preventing premature scale-down of the older ReplicaSet.

Pod readiness gate injection uses a mutating webhook which decides to inject readiness gates when a
pod is created based on the following conditions:
* There exists a service matching the pod labels in the same namespace
* There exists at least one target group binding that refers to the matching service

Another way to describe this is: the AWS LoadBalancer controller injects readiness gates onto Pods
only if they are "reachable" from an ALB Ingress at the time of pod creation. A pod is considered
reachable if an (ALB) Ingress references a Service which matches the pod labels. It ignores all other Pods.
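
As a sketch of how this injection is commonly enabled (an assumption for illustration — the exact
label and supported controller versions should be checked against the AWS LoadBalancer controller
documentation), the application's namespace is labeled so that the mutating webhook injects
readiness gates into Pods created in it:

```yaml
apiVersion: v1
kind: Namespace
metadata:
  name: my-app                     # hypothetical namespace running the Rollout and its Services
  labels:
    # opt this namespace into pod readiness gate injection by the
    # AWS LoadBalancer controller's mutating webhook
    elbv2.k8s.aws/pod-readiness-gate-inject: enabled
```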

One challenge with this manner of pod readiness gate injection is that modifications to the Service
selector labels (`spec.selector`) do not allow the AWS LoadBalancer controller to inject the
readiness gates, because by that time the Pods were already created (and readiness gates are immutable).
As a consequence, Argo Rollouts' blue-green update strategy, which modifies the active service
selector to the new ReplicaSet labels during promotion, does not allow the readiness gate for the
`spec.strategy.blueGreen.activeService` to be injected. This means there is a possibility of downtime
in the following problematic scenario during an update from V1 to V2:

1. V2 ReplicaSet stack is scaled up
2. V2 ReplicaSet pods become fully available
3. Rollout updates the label selectors of the active service to point to the V2 stack (from V1)
4. Due to unknown issues (e.g. AWS load balancer controller downtime, AWS rate limiting), the
re-targeting of Pod IPs in the TargetGroup does not happen or is delayed.
5. V1 ReplicaSet is scaled down (to complete the update)

At step 5, when the V1 ReplicaSet is scaled down, the outdated TargetGroup would still be pointing
to the V1 Pod IPs which no longer exist, causing downtime.

To eliminate the possibility of downtime, Argo Rollouts has the ability to perform TargetGroup IP
verification as an additional safety measure during an update. When this feature is enabled, whenever
a service selector modification is made, the Rollout controller makes a series of AWS API calls
and blocks until it can verify that the TargetGroup is accurately targeting the new Pod IPs of the
`blueGreen.activeService`, before proceeding with the rest of the update. This verification happens
before running postPromotionAnalysis or scaling down the old ReplicaSet.
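
For illustration, a minimal blue-green Rollout sketch (names and image are hypothetical) showing
the `activeService` whose TargetGroup is verified during promotion:

```yaml
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: example-rollout            # hypothetical name
spec:
  replicas: 3
  selector:
    matchLabels:
      app: example
  template:
    metadata:
      labels:
        app: example
    spec:
      containers:
      - name: app
        image: example/app:v2      # hypothetical image
  strategy:
    blueGreen:
      # Service referenced by the ALB Ingress. After promotion, its selector is
      # switched to the new ReplicaSet; with TargetGroup IP verification enabled,
      # the controller blocks until the TargetGroup targets the new Pod IPs.
      activeService: example-active
      previewService: example-preview
```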

Similarly for the canary strategy, after updating the `canary.stableService` to point to the new
ReplicaSet, the TargetGroup IP verification feature allows the controller to block the scale down
of the old ReplicaSet until it verifies that the Pod IPs behind the `stableService` TargetGroup are accurate.
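
A corresponding canary sketch (hypothetical names; pod template omitted for brevity) with ALB
traffic routing, where the `stableService` TargetGroup is verified before the old ReplicaSet is
scaled down:

```yaml
apiVersion: argoproj.io/v1alpha1
kind: Rollout
metadata:
  name: example-canary             # hypothetical name
spec:
  # selector/template omitted for brevity
  strategy:
    canary:
      canaryService: canary-svc    # hypothetical Service names
      stableService: stable-svc
      trafficRouting:
        alb:
          ingress: example-ingress # ALB Ingress managed by the AWS LoadBalancer controller
          servicePort: 443
      steps:
      - setWeight: 10
      - pause: {}
```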

#### TargetGroup Weight Verification

!!! note

Since Argo Rollouts v1.0
TargetGroup weight verification is available since Argo Rollouts v1.0

TargetGroup weight verification addresses a similar problem to TargetGroup IP verification, but
instead of verifying that the Pod IPs of a service are reflected accurately in the TargetGroup, the
controller verifies that the traffic _weights_ match what was set in the Ingress
annotations. Unlike IP verification, weight verification is applicable to AWS LoadBalancer
controllers which are running either in IP mode or Instance mode.

After Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it moves on to the
next step. However, due to external factors (e.g. AWS rate limiting, AWS load balancer controller
downtime) it is possible that the weight modifications made to the Ingress did not take effect in
the underlying TargetGroup. This is potentially dangerous, as the controller will believe it is safe
to scale down the old stable stack when, in reality, the outdated TargetGroup may still be pointing
to it.

Using the TargetGroup weight verification feature, the rollout controller will additionally *verify*
the canary weight after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer
APIs directly to confirm that the Rules, Actions, and TargetGroups reflect the desired state of the Ingress
annotation.
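
As a rough illustration (service names and port are hypothetical), the verification compares the
ALB Listener Rules/Actions against the forward action annotation that Argo Rollouts writes to the
Ingress, similar to:

```yaml
metadata:
  annotations:
    # weights written by Argo Rollouts after a setWeight step; weight verification
    # confirms the ALB's actual forward action matches this desired state
    alb.ingress.kubernetes.io/actions.stable-svc: |
      {
        "Type": "forward",
        "ForwardConfig": {
          "TargetGroups": [
            { "ServiceName": "canary-svc", "ServicePort": "443", "Weight": 10 },
            { "ServiceName": "stable-svc", "ServicePort": "443", "Weight": 90 }
          ]
        }
      }
```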

When Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it assumes that
the new weight immediately takes effect and moves on to the next step. However, due to external
factors (e.g. AWS rate limiting, AWS load balancer controller downtime) it is possible that the
ingress modification may take a long time to take effect (or may never be made at all). This is
potentially dangerous because when the rollout completes its steps, it will scale down the old stack. If
the ALB Rules/Actions were still directing traffic to the old stack (because the weights never took
effect), then this would cause downtime to the service when the old stack was scaled down.
#### Usage

To mitigate this, the rollout controller has a feature to additionally *verify* the canary weight
after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer APIs directly,
to confirm that the Rules, Actions, and TargetGroups reflect the desired state of the Ingress annotation.
To enable ALB weight verification, add the `--alb-verify-weight` flag to the rollout-controller flags:
To enable AWS target group verification, add the `--aws-verify-target-group` flag to the rollout-controller flags:

```yaml
apiVersion: apps/v1
@@ -187,7 +263,8 @@ spec:
spec:
containers:
- name: argo-rollouts
args: [--alb-verify-weight]
args: [--aws-verify-target-group]
# NOTE: in v1.0, the --alb-verify-weight flag should be used instead
```

For this feature to work, the argo-rollouts deployment requires the following AWS API permissions
@@ -198,6 +275,7 @@ under the [Elastic Load Balancing API](https://docs.aws.amazon.com/elasticloadba
* `DescribeListeners`
* `DescribeRules`
* `DescribeTags`
* `DescribeTargetHealth`
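
One possible (illustrative, not prescriptive) way to grant these read-only actions is IAM Roles for
Service Accounts via eksctl; the cluster name, region, and namespace below are placeholder
assumptions:

```yaml
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: my-cluster                 # placeholder cluster name
  region: us-west-2                # placeholder region
iam:
  withOIDC: true
  serviceAccounts:
  - metadata:
      name: argo-rollouts
      namespace: argo-rollouts     # namespace where the controller is installed
    attachPolicy:
      Version: "2012-10-17"
      Statement:
      - Effect: Allow
        Action:
        - elasticloadbalancing:DescribeTargetGroups
        - elasticloadbalancing:DescribeLoadBalancers
        - elasticloadbalancing:DescribeListeners
        - elasticloadbalancing:DescribeRules
        - elasticloadbalancing:DescribeTags
        - elasticloadbalancing:DescribeTargetHealth
        Resource: "*"
```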

There are various ways of granting AWS privileges to the argo-rollouts pods, which is highly
dependent on your cluster's AWS environment and out of scope of this documentation. Some solutions
2 changes: 1 addition & 1 deletion ingress/alb.go
@@ -40,7 +40,7 @@ func (c *Controller) syncALBIngress(ingress *extensionsv1beta1.Ingress, rollouts
delete(managedActions, roName)
resetALBAction, err := getResetALBActionStr(ingress, actionKey)
if err != nil {
log.WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err)
log.WithField(logutil.RolloutKey, roName).WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err)
return nil
}
newIngress.Annotations[actionKey] = resetALBAction
16 changes: 10 additions & 6 deletions manifests/install.yaml
@@ -12726,8 +12726,10 @@ rules:
- patch
- apiGroups:
- getambassador.io
- x.getambassador.io
resources:
- mappings
- ambassadormappings
verbs:
- create
- watch
@@ -12736,16 +12738,18 @@ rules:
- list
- delete
- apiGroups:
- x.getambassador.io
- ""
resources:
- ambassadormappings
- endpoints
verbs:
- create
- watch
- get
- update
- apiGroups:
- elbv2.k8s.aws
resources:
- targetgroupbindings
verbs:
- list
- delete
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
16 changes: 10 additions & 6 deletions manifests/namespace-install.yaml
@@ -12726,8 +12726,10 @@ rules:
- patch
- apiGroups:
- getambassador.io
- x.getambassador.io
resources:
- mappings
- ambassadormappings
verbs:
- create
- watch
@@ -12736,16 +12738,18 @@ rules:
- list
- delete
- apiGroups:
- x.getambassador.io
- ""
resources:
- ambassadormappings
- endpoints
verbs:
- create
- watch
- get
- update
- apiGroups:
- elbv2.k8s.aws
resources:
- targetgroupbindings
verbs:
- list
- delete
- get
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
5 changes: 3 additions & 2 deletions rollout/analysis.go
@@ -21,6 +21,7 @@ import (
logutil "github.com/argoproj/argo-rollouts/utils/log"
"github.com/argoproj/argo-rollouts/utils/record"
replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset"
rolloututil "github.com/argoproj/argo-rollouts/utils/rollout"
)

const (
@@ -176,7 +177,7 @@ func needsNewAnalysisRun(currentAr *v1alpha1.AnalysisRun, rollout *v1alpha1.Roll

// emitAnalysisRunStatusChanges emits a Kubernetes event if the analysis run of that type has changed status
func (c *rolloutContext) emitAnalysisRunStatusChanges(prevStatus *v1alpha1.RolloutAnalysisRunStatus, ar *v1alpha1.AnalysisRun, arType string) {
if ar != nil {
if ar != nil && ar.Status.Phase != "" {
if prevStatus == nil || prevStatus.Name == ar.Name && prevStatus.Status != ar.Status.Phase {
prevStatusStr := "NoPreviousStatus"
if prevStatus != nil {
@@ -318,7 +319,7 @@ func (c *rolloutContext) reconcileBackgroundAnalysisRun() (*v1alpha1.AnalysisRun
}

// Do not create a background run if the rollout is completely rolled out, just created, before the starting step
if c.rollout.Status.StableRS == c.rollout.Status.CurrentPodHash || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) {
if rolloututil.IsFullyPromoted(c.rollout) || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) {
return nil, nil
}

42 changes: 15 additions & 27 deletions rollout/bluegreen.go
@@ -225,11 +225,23 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re
continue
}

if !replicasetutil.HasScaleDownDeadline(targetRS) {
// This replicaSet is scaled up but does not have a scale down deadline. Add one.
scaleDownDelaySeconds := defaults.GetScaleDownDelaySecondsOrDefault(c.rollout)
err := c.addScaleDownDelay(targetRS, scaleDownDelaySeconds)
if err != nil {
return hasScaled, err
}
c.enqueueRolloutAfter(c.rollout, scaleDownDelaySeconds)
annotationedRSs += 1
continue
}

var desiredReplicaCount int32
annotationedRSs, desiredReplicaCount = c.ScaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas)
annotationedRSs, desiredReplicaCount = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas)

if *(targetRS.Spec.Replicas) == desiredReplicaCount {
// at desired account
if *targetRS.Spec.Replicas == desiredReplicaCount {
// already at desired count, nothing to do
continue
}
// Scale down.
Expand All @@ -243,30 +255,6 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re
return hasScaled, nil
}

func (c *rolloutContext) ScaleDownDelayHelper(rs *appsv1.ReplicaSet, annotationedRSs int32, rolloutReplicas int32) (int32, int32) {
desiredReplicaCount := int32(0)
scaleDownRevisionLimit := GetScaleDownRevisionLimit(c.rollout)
if replicasetutil.HasScaleDownDeadline(rs) {
annotationedRSs++
if annotationedRSs > scaleDownRevisionLimit {
c.log.Infof("At ScaleDownDelayRevisionLimit (%d) and scaling down the rest", scaleDownRevisionLimit)
} else {
remainingTime, err := replicasetutil.GetTimeRemainingBeforeScaleDownDeadline(rs)
if err != nil {
c.log.Warnf("%v", err)
} else if remainingTime != nil {
c.log.Infof("RS '%s' has not reached the scaleDownTime", rs.Name)
if *remainingTime < c.resyncPeriod {
c.enqueueRolloutAfter(c.rollout, *remainingTime)
}
desiredReplicaCount = rolloutReplicas
}
}
}

return annotationedRSs, desiredReplicaCount
}

func GetScaleDownRevisionLimit(ro *v1alpha1.Rollout) int32 {
if ro.Spec.Strategy.BlueGreen != nil {
if ro.Spec.Strategy.BlueGreen.ScaleDownDelayRevisionLimit != nil {
Expand Down
