From a5239bfe15acfa76346d6097a92f669a95ad27d5 Mon Sep 17 00:00:00 2001
From: Jesse Suen
Date: Wed, 14 Jul 2021 00:31:56 -0700
Subject: [PATCH] feat: ability to verify AWS TargetGroup after canary stable
 promotion

Signed-off-by: Jesse Suen
---
 docs/features/traffic-management/alb.md       | 113 ++++-
 experiments/replicaset.go                     |   2 +-
 ingress/alb.go                                |   2 +-
 manifests/install.yaml                        |  16 +-
 manifests/namespace-install.yaml              |  16 +-
 rollout/analysis.go                           |   5 +-
 rollout/analysis_test.go                      |   2 -
 rollout/bluegreen.go                          |  41 +-
 rollout/bluegreen_test.go                     |  44 +-
 rollout/canary.go                             |  63 ++-
 rollout/controller_test.go                    |  30 +-
 rollout/ephemeralmetadata_test.go             |   1 -
 rollout/replicaset.go                         |  38 +-
 rollout/replicaset_test.go                    |   2 +
 rollout/service.go                            |  53 ++-
 rollout/service_test.go                       | 432 ++++++++++++++++++
 rollout/sync.go                               |   9 -
 rollout/sync_test.go                          |   1 -
 rollout/trafficrouting.go                     |  15 +-
 rollout/trafficrouting/alb/alb.go             |  11 +-
 rollout/trafficrouting/alb/alb_test.go        |   9 +-
 .../ambassador/ambassador_test.go             |  11 +-
 rollout/trafficrouting/smi/smi_test.go        |  44 +-
 rollout/trafficrouting_test.go                |  23 +-
 test/e2e/aws_test.go                          |  19 +-
 test/e2e/canary_test.go                       |   1 +
 .../e2e/functional/alb-bluegreen-rollout.yaml |  71 +++
 ...b-rollout.yaml => alb-canary-rollout.yaml} |  34 +-
 test/fixtures/e2e_suite.go                    |   2 +
 test/fixtures/when.go                         |  21 +-
 test/util/util.go                             |  10 +-
 utils/aws/aws.go                              |  90 ++--
 utils/aws/aws_test.go                         | 236 +++++++++-
 utils/aws/mocks/ELBv2APIClient.go             |  30 ++
 utils/conditions/conditions.go                |  12 +
 utils/defaults/defaults_test.go               |  27 ++
 utils/istio/istio_test.go                     |   5 +-
 utils/rollout/rolloututil.go                  |  29 +-
 utils/rollout/rolloututil_test.go             |  35 ++
 39 files changed, 1376 insertions(+), 229 deletions(-)
 create mode 100644 test/e2e/functional/alb-bluegreen-rollout.yaml
 rename test/e2e/functional/{alb-rollout.yaml => alb-canary-rollout.yaml} (71%)

diff --git a/docs/features/traffic-management/alb.md b/docs/features/traffic-management/alb.md
index ef614573c7..7f9e8e031e 100644
--- a/docs/features/traffic-management/alb.md
+++ b/docs/features/traffic-management/alb.md
@@ -158,24 +158,107 @@ spec:
 ...
 ```
 
-### Weight verification
+### Zero-Downtime Updates with AWS TargetGroup Verification
+
+Argo Rollouts contains two features to help ensure zero-downtime updates when used with the AWS
+LoadBalancer controller: TargetGroup IP verification and TargetGroup weight verification. Both
+features involve the Rollout controller performing additional safety checks against AWS, to verify
+that the changes made to the Ingress and Service objects are reflected in the underlying AWS
+TargetGroups.
+
+#### TargetGroup IP Verification
+
+!!! note
+
+    Target Group IP verification available since Argo Rollouts v1.1
+
+The AWS LoadBalancer controller can run in one of two modes:
+* [Instance mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#instance-mode)
+* [IP mode](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/how-it-works/#ip-mode)
+
+TargetGroup IP verification is only applicable when the AWS LoadBalancer controller is running in
+IP mode. When using the AWS LoadBalancer controller in IP mode (e.g. using the AWS CNI), the ALB
+LoadBalancer targets individual Pod IPs, as opposed to K8s node instances. Targeting Pod IPs comes
+with an increased risk of downtime during an update, because the Pod IPs behind the underlying AWS
+TargetGroup can more easily fall out of sync with the _actual_ availability and status of pods,
+causing HTTP 502 errors when the TargetGroup points to pods which have already been scaled down.
+
+To mitigate this risk, AWS recommends the use of
+[pod readiness gate injection](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.2/deploy/pod_readiness_gate/)
+when running the AWS LoadBalancer controller in IP mode. Readiness gates allow the AWS LoadBalancer
+controller to verify that TargetGroups are accurate before marking newly created Pods as "ready",
+preventing premature scale down of the older ReplicaSet.
+
+Pod readiness gate injection uses a mutating webhook which decides to inject readiness gates when a
+pod is created, based on the following conditions:
+* There exists a service matching the pod labels in the same namespace
+* There exists at least one target group binding that refers to the matching service
+
+Another way to describe this is: the AWS LoadBalancer controller injects readiness gates onto Pods
+only if they are "reachable" from an ALB Ingress at the time of pod creation. A pod is considered
+reachable if an (ALB) Ingress references a Service which matches the pod labels. All other Pods are
+ignored.
+
+One challenge with this manner of pod readiness gate injection is that modifications to the Service
+selector labels (`spec.selector`) give the AWS LoadBalancer controller no opportunity to inject the
+readiness gates, because by that time the Pods were already created (and readiness gates are
+immutable). Note that this is an issue when you change the Service selectors of _any_ ALB Service,
+not just ones involved in Argo Rollouts.
+
+Because Argo Rollouts' blue-green strategy works by modifying the activeService selector to the new
+ReplicaSet labels during promotion, it suffers from the issue where readiness gates for the
+`spec.strategy.blueGreen.activeService` fail to be injected. This means there is a possibility of
+downtime in the following problematic scenario during an update from V1 to V2:
+
+1. Update is triggered and the V2 ReplicaSet stack is scaled up
+2. V2 ReplicaSet pods become fully available and ready to be promoted
+3. Rollout promotes V2 by updating the label selectors of the active service to point to the V2 stack (from V1)
+4. Due to unknown issues (e.g. AWS load balancer controller downtime, AWS rate limiting), registration
+   of the V2 Pod IPs to the TargetGroup does not happen or is delayed
+5. V1 ReplicaSet is scaled down to complete the update
+
+After step 5, when the V1 ReplicaSet is scaled down, the outdated TargetGroup would still be pointing
+to the V1 Pod IPs which no longer exist, causing downtime.
+
+To allow for zero-downtime updates, Argo Rollouts has the ability to perform TargetGroup IP
+verification as an additional safety measure during an update. When this feature is enabled, whenever
+a service selector modification is made, the Rollout controller blocks progression of the update
+until it can verify the TargetGroup is accurately targeting the new Pod IPs of the
+`bluegreen.activeService`. Verification is achieved by querying AWS APIs to describe the underlying
+TargetGroup, iterating its registered IPs, and ensuring all Pod IPs of the activeService's
+`Endpoints` list are registered in the TargetGroup. Verification must succeed before running
+postPromotionAnalysis or scaling down the old ReplicaSet.
+
+Similarly, for the canary strategy, after updating the `canary.stableService` selector labels to
+point to the new ReplicaSet, the TargetGroup IP verification feature allows the controller to block
+the scale down of the old ReplicaSet until it verifies the Pod IPs behind the stableService
+TargetGroup are accurate.
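+
+The following is a minimal sketch of what this registration check amounts to, using the
+aws-sdk-go-v2 ELBv2 client. The function name and signature are illustrative, not the
+controller's actual implementation:
+
+```go
+import (
+	"context"
+	"fmt"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2"
+)
+
+// verifyTargetGroupRegistration reports whether every Pod IP from a Service's
+// Endpoints is registered as a target of the given TargetGroup.
+func verifyTargetGroupRegistration(ctx context.Context, client *elbv2.Client,
+	tgARN string, podIPs []string, port int32) (bool, error) {
+
+	out, err := client.DescribeTargetHealth(ctx, &elbv2.DescribeTargetHealthInput{
+		TargetGroupArn: aws.String(tgARN),
+	})
+	if err != nil {
+		return false, err
+	}
+	// collect the "ip:port" pairs currently registered in the TargetGroup
+	registered := make(map[string]bool)
+	for _, desc := range out.TargetHealthDescriptions {
+		t := desc.Target
+		if t != nil && t.Id != nil && t.Port != nil {
+			registered[fmt.Sprintf("%s:%d", *t.Id, *t.Port)] = true
+		}
+	}
+	// every Pod IP behind the Service must already be a registered target
+	for _, ip := range podIPs {
+		if !registered[fmt.Sprintf("%s:%d", ip, port)] {
+			return false, nil // not yet registered; re-verify on a later reconciliation
+		}
+	}
+	return true, nil
+}
+```
+
+When verification fails, the controller does not treat it as an error; it requeues the Rollout
+and re-checks on a later reconciliation, rather than proceeding with the scale down.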
+ +#### TargetGroup Weight Verification !!! note - Since Argo Rollouts v1.0 + TargetGroup weight verification available since Argo Rollouts v1.0 + +TargetGroup weight verification addresses a similar problem to TargetGroup IP verification, but +instead of verifying that the Pod IPs of a service are reflected accurately in the TargetGroup, the +controller verifies that the traffic *_weights_* are accurate from what was set in the ingress +annotations. Weight verification is applicable to AWS LoadBalancer controllers which are running +either in IP mode or Instance mode. + +After Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it moves on to the +next step. However, due to external factors (e.g. AWS rate limiting, AWS load balancer controller +downtime) it is possible that the weight modifications made to the Ingress, did not take effect in +the underlying TargetGroup. This is potentially dangerous as the controller will believe it is safe +to scale down the old stable stack when in reality, the outdated TargetGroup may still be pointing +to it. + +Using the TargetGroup weight verification feature, the rollout controller will additionally *verify* +the canary weight after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer +APIs directly, to confirm that the Rules, Actions, and TargetGroups reflect the desire of Ingress +annotation. -When Argo Rollouts adjusts a canary weight by updating the Ingress annotation, it assumes that -the new weight immediately takes effect and moves on to the next step. However, due to external -factors (e.g. AWS rate limiting, AWS load balancer controller downtime) it is possible that the -ingress modification may take a long time to take effect (or possibly never even made). This is -potentially dangerous when the rollout completes its steps, it will scale down the old stack. If -the ALB Rules/Actions were still directing traffic to the old stack (because the weights never took -effect), then this would cause downtime to the service when the old stack was scaled down. +#### Usage -To mitigate this, the rollout controller has a feature to additionally *verify* the canary weight -after a `setWeight` canary step. It accomplishes this by querying AWS LoadBalancer APIs directly, -to confirm that the Rules, Actions, and TargetGroups reflect the desire of Ingress annotation. -To enable ALB weight verification, add `--alb-verify-weight` flag to the rollout-controller flags: +To enable AWS target group verification, add `--aws-verify-target-group` flag to the rollout-controller flags: ```yaml apiVersion: apps/v1 @@ -187,7 +270,8 @@ spec: spec: containers: - name: argo-rollouts - args: [--alb-verify-weight] + args: [--aws-verify-target-group] + # NOTE: in v1.0, the --alb-verify-weight flag should be used instead ``` For this feature to work, the argo-rollouts deployment requires the following AWS API permissions @@ -198,6 +282,7 @@ under the [Elastic Load Balancing API](https://docs.aws.amazon.com/elasticloadba * `DescribeListeners` * `DescribeRules` * `DescribeTags` +* `DescribeTargetHealth` There are various ways of granting AWS privileges to the argo-rollouts pods, which is highly dependent to your cluster's AWS environment, and out-of-scope of this documentation. 
Some solutions diff --git a/experiments/replicaset.go b/experiments/replicaset.go index 698d9cfe03..5d97eb15c1 100644 --- a/experiments/replicaset.go +++ b/experiments/replicaset.go @@ -231,7 +231,7 @@ func (ec *experimentContext) addScaleDownDelay(rs *appsv1.ReplicaSet) (bool, err patch := fmt.Sprintf(addScaleDownAtAnnotationsPatch, v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, deadline) _, err := ec.kubeclientset.AppsV1().ReplicaSets(rs.Namespace).Patch(ctx, rs.Name, patchtypes.JSONPatchType, []byte(patch), metav1.PatchOptions{}) if err == nil { - ec.log.Infof("Set '%s' annotation on '%s' to %s (%ds)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) + ec.log.Infof("Set '%s' annotation on '%s' to %s (%s)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) rsIsUpdated = true } return rsIsUpdated, err diff --git a/ingress/alb.go b/ingress/alb.go index 1fb9a33839..b3454d3d8d 100644 --- a/ingress/alb.go +++ b/ingress/alb.go @@ -40,7 +40,7 @@ func (c *Controller) syncALBIngress(ingress *extensionsv1beta1.Ingress, rollouts delete(managedActions, roName) resetALBAction, err := getResetALBActionStr(ingress, actionKey) if err != nil { - log.WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err) + log.WithField(logutil.RolloutKey, roName).WithField(logutil.IngressKey, ingress.Name).WithField(logutil.NamespaceKey, ingress.Namespace).Error(err) return nil } newIngress.Annotations[actionKey] = resetALBAction diff --git a/manifests/install.yaml b/manifests/install.yaml index 9fe2b1648b..450bc75af7 100644 --- a/manifests/install.yaml +++ b/manifests/install.yaml @@ -12726,8 +12726,10 @@ rules: - patch - apiGroups: - getambassador.io + - x.getambassador.io resources: - mappings + - ambassadormappings verbs: - create - watch @@ -12736,16 +12738,18 @@ rules: - list - delete - apiGroups: - - x.getambassador.io + - "" resources: - - ambassadormappings + - endpoints verbs: - - create - - watch - get - - update +- apiGroups: + - elbv2.k8s.aws + resources: + - targetgroupbindings + verbs: - list - - delete + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/manifests/namespace-install.yaml b/manifests/namespace-install.yaml index c971be8405..a0d92de442 100644 --- a/manifests/namespace-install.yaml +++ b/manifests/namespace-install.yaml @@ -12726,8 +12726,10 @@ rules: - patch - apiGroups: - getambassador.io + - x.getambassador.io resources: - mappings + - ambassadormappings verbs: - create - watch @@ -12736,16 +12738,18 @@ rules: - list - delete - apiGroups: - - x.getambassador.io + - "" resources: - - ambassadormappings + - endpoints verbs: - - create - - watch - get - - update +- apiGroups: + - elbv2.k8s.aws + resources: + - targetgroupbindings + verbs: - list - - delete + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole diff --git a/rollout/analysis.go b/rollout/analysis.go index bb132941d5..e495b07656 100644 --- a/rollout/analysis.go +++ b/rollout/analysis.go @@ -21,6 +21,7 @@ import ( logutil "github.com/argoproj/argo-rollouts/utils/log" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" + rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" ) const ( @@ -176,7 +177,7 @@ func needsNewAnalysisRun(currentAr *v1alpha1.AnalysisRun, rollout *v1alpha1.Roll // emitAnalysisRunStatusChanges emits a Kubernetes event if the 
analysis run of that type has changed status func (c *rolloutContext) emitAnalysisRunStatusChanges(prevStatus *v1alpha1.RolloutAnalysisRunStatus, ar *v1alpha1.AnalysisRun, arType string) { - if ar != nil { + if ar != nil && ar.Status.Phase != "" { if prevStatus == nil || prevStatus.Name == ar.Name && prevStatus.Status != ar.Status.Phase { prevStatusStr := "NoPreviousStatus" if prevStatus != nil { @@ -318,7 +319,7 @@ func (c *rolloutContext) reconcileBackgroundAnalysisRun() (*v1alpha1.AnalysisRun } // Do not create a background run if the rollout is completely rolled out, just created, before the starting step - if c.rollout.Status.StableRS == c.rollout.Status.CurrentPodHash || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) { + if rolloututil.IsFullyPromoted(c.rollout) || c.rollout.Status.StableRS == "" || c.rollout.Status.CurrentPodHash == "" || replicasetutil.BeforeStartingStep(c.rollout) { return nil, nil } diff --git a/rollout/analysis_test.go b/rollout/analysis_test.go index e32aa5e982..2678c26b1a 100644 --- a/rollout/analysis_test.go +++ b/rollout/analysis_test.go @@ -1847,7 +1847,6 @@ func TestRolloutPrePromotionAnalysisSwitchServiceAfterSuccess(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutActionWithPatch(r2, OnlyObservedGenerationPatch) f.run(getKey(r2, t)) patch := f.getPatchedRolloutWithoutConditions(patchIndex) @@ -1915,7 +1914,6 @@ func TestRolloutPrePromotionAnalysisHonorAutoPromotionSeconds(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutActionWithPatch(r2, OnlyObservedGenerationPatch) f.run(getKey(r2, t)) patch := f.getPatchedRolloutWithoutConditions(patchIndex) diff --git a/rollout/bluegreen.go b/rollout/bluegreen.go index 672a52ee17..ef6df170ee 100644 --- a/rollout/bluegreen.go +++ b/rollout/bluegreen.go @@ -224,16 +224,23 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re c.log.Warnf("Prevented inadvertent scaleDown of RS '%s'", targetRS.Name) continue } - + if *targetRS.Spec.Replicas == 0 { + // cannot scale down this ReplicaSet. + continue + } var desiredReplicaCount int32 - annotationedRSs, desiredReplicaCount = c.ScaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + var err error + annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + if err != nil { + return false, err + } - if *(targetRS.Spec.Replicas) == desiredReplicaCount { - // at desired account + if *targetRS.Spec.Replicas == desiredReplicaCount { + // already at desired account, nothing to do continue } // Scale down. 
- _, _, err := c.scaleReplicaSetAndRecordEvent(targetRS, desiredReplicaCount) + _, _, err = c.scaleReplicaSetAndRecordEvent(targetRS, desiredReplicaCount) if err != nil { return false, err } @@ -243,30 +250,6 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForBlueGreen(oldRSs []*appsv1.Re return hasScaled, nil } -func (c *rolloutContext) ScaleDownDelayHelper(rs *appsv1.ReplicaSet, annotationedRSs int32, rolloutReplicas int32) (int32, int32) { - desiredReplicaCount := int32(0) - scaleDownRevisionLimit := GetScaleDownRevisionLimit(c.rollout) - if replicasetutil.HasScaleDownDeadline(rs) { - annotationedRSs++ - if annotationedRSs > scaleDownRevisionLimit { - c.log.Infof("At ScaleDownDelayRevisionLimit (%d) and scaling down the rest", scaleDownRevisionLimit) - } else { - remainingTime, err := replicasetutil.GetTimeRemainingBeforeScaleDownDeadline(rs) - if err != nil { - c.log.Warnf("%v", err) - } else if remainingTime != nil { - c.log.Infof("RS '%s' has not reached the scaleDownTime", rs.Name) - if *remainingTime < c.resyncPeriod { - c.enqueueRolloutAfter(c.rollout, *remainingTime) - } - desiredReplicaCount = rolloutReplicas - } - } - } - - return annotationedRSs, desiredReplicaCount -} - func GetScaleDownRevisionLimit(ro *v1alpha1.Rollout) int32 { if ro.Spec.Strategy.BlueGreen != nil { if ro.Spec.Strategy.BlueGreen.ScaleDownDelayRevisionLimit != nil { diff --git a/rollout/bluegreen_test.go b/rollout/bluegreen_test.go index a016e8ec89..a83b9071c2 100644 --- a/rollout/bluegreen_test.go +++ b/rollout/bluegreen_test.go @@ -500,7 +500,6 @@ func TestBlueGreenHandlePause(t *testing.T) { }` expectedPatch := calculatePatch(r2, fmt.Sprintf(expectedPatchWithoutSubs, rs2PodHash, rs2PodHash, rs2PodHash)) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchRolloutIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) @@ -580,7 +579,6 @@ func TestBlueGreenHandlePause(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc) servicePatchIndex := f.expectPatchServiceAction(activeSvc, rs2PodHash) - patchedRSIndex := f.expectPatchReplicaSetAction(rs1) generatedConditions := generateConditionsPatch(true, conditions.ReplicaSetUpdatedReason, rs2, true, "") newSelector := metav1.FormatLabelSelector(rs2.Spec.Selector) @@ -600,7 +598,6 @@ func TestBlueGreenHandlePause(t *testing.T) { patchIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) f.verifyPatchedService(servicePatchIndex, rs2PodHash, "") - f.verifyPatchedReplicaSet(patchedRSIndex, 10) rolloutPatch := f.getPatchedRollout(patchIndex) assert.Equal(t, expectedPatch, rolloutPatch) @@ -730,13 +727,11 @@ func TestBlueGreenHandlePause(t *testing.T) { f.serviceLister = append(f.serviceLister, activeSvc, previewSvc) servicePatchIndex := f.expectPatchServiceAction(activeSvc, rs2PodHash) - patchedRSIndex := f.expectPatchReplicaSetAction(rs1) unpausePatchIndex := f.expectPatchRolloutAction(r2) patchRolloutIndex := f.expectPatchRolloutAction(r2) f.run(getKey(r2, t)) f.verifyPatchedService(servicePatchIndex, rs2PodHash, "") - f.verifyPatchedReplicaSet(patchedRSIndex, 10) unpausePatch := f.getPatchedRollout(unpausePatchIndex) unpauseConditions := generateConditionsPatch(true, conditions.RolloutResumedReason, rs2, true, "") expectedUnpausePatch := `{ @@ -791,10 +786,8 @@ func TestBlueGreenAddScaleDownDelayToPreviousActiveReplicaSet(t *testing.T) { f.serviceLister = append(f.serviceLister, s) f.expectPatchServiceAction(s, rs2PodHash) - patchedRSIndex := 
f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutAction(r2) f.run(getKey(r2, t)) - f.verifyPatchedReplicaSet(patchedRSIndex, 10) patch := f.getPatchedRollout(patchIndex) expectedPatchWithoutSubs := `{ @@ -997,7 +990,6 @@ func TestPreviewReplicaCountHandleScaleUpPreviewCheckPoint(t *testing.T) { f.kubeobjects = append(f.kubeobjects, activeSvc) f.serviceLister = append(f.serviceLister, activeSvc) - f.expectPatchReplicaSetAction(rs1) patchIndex := f.expectPatchRolloutAction(r1) f.run(getKey(r2, t)) @@ -1452,10 +1444,44 @@ func TestBlueGreenHandlePauseAutoPromoteWithConditions(t *testing.T) { assert.Nil(t, err) expectedPatch := calculatePatch(r2, fmt.Sprintf(expectedPatchWithoutSubs, rs2PodHash, string(availableCondBytes), string(pausedCondBytes), string(progressingCondBytes), rs2PodHash, rs2PodHash)) f.expectPatchServiceAction(activeSvc, rs2PodHash) - f.expectPatchReplicaSetAction(rs1) patchRolloutIndex := f.expectPatchRolloutActionWithPatch(r2, expectedPatch) f.run(getKey(r2, t)) rolloutPatch := f.getPatchedRollout(patchRolloutIndex) assert.Equal(t, expectedPatch, rolloutPatch) } + +// Verifies with blue-green, we add a scaledown delay to the old ReplicaSet after promoting desired +// ReplicaSet to stable. +// NOTE: As of v1.1, scale down delays are added to ReplicaSets on *subsequent* reconciliations +// after the desired RS has been promoted to stable +func TestBlueGreenAddScaleDownDelay(t *testing.T) { + f := newFixture(t) + defer f.Close() + + r1 := newBlueGreenRollout("foo", 1, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 1, 1) + rs2 := newReplicaSetWithStatus(r2, 1, 1) + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs2PodHash, 1, 1, 2, 1, false, true) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + activeSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} + activeSvc := newService("active", 80, activeSelector, r2) + + f.kubeobjects = append(f.kubeobjects, rs1, rs2, activeSvc) + f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2) + + rs1Patch := f.expectPatchReplicaSetAction(rs1) // set scale-down-deadline annotation + f.run(getKey(r2, t)) + + f.verifyPatchedReplicaSet(rs1Patch, 30) +} diff --git a/rollout/canary.go b/rollout/canary.go index 6f3062b004..11813ed44b 100644 --- a/rollout/canary.go +++ b/rollout/canary.go @@ -157,6 +157,10 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli sort.Sort(sort.Reverse(replicasetutil.ReplicaSetsByRevisionNumber(oldRSs))) + if canProceed, err := c.canProceedWithScaleDownAnnotation(oldRSs); !canProceed || err != nil { + return 0, err + } + annotationedRSs := int32(0) rolloutReplicas := defaults.GetReplicasOrDefault(c.rollout.Spec.Replicas) for _, targetRS := range oldRSs { @@ -169,7 +173,7 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli if maxScaleDown <= 0 { break } - if *(targetRS.Spec.Replicas) == 0 { + if *targetRS.Spec.Replicas == 0 { // cannot scale down this ReplicaSet. 
continue } @@ -177,15 +181,18 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli if c.rollout.Spec.Strategy.Canary.TrafficRouting == nil { // For basic canary, we must scale down all other ReplicaSets because existence of // those pods will cause traffic to be served by them - if *(targetRS.Spec.Replicas) > maxScaleDown { - desiredReplicaCount = *(targetRS.Spec.Replicas) - maxScaleDown + if *targetRS.Spec.Replicas > maxScaleDown { + desiredReplicaCount = *targetRS.Spec.Replicas - maxScaleDown } } else { // For traffic shaped canary, we leave the old ReplicaSets up until scaleDownDelaySeconds - annotationedRSs, desiredReplicaCount = c.ScaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + annotationedRSs, desiredReplicaCount, err = c.scaleDownDelayHelper(targetRS, annotationedRSs, rolloutReplicas) + if err != nil { + return totalScaledDown, err + } } - if *(targetRS.Spec.Replicas) == desiredReplicaCount { - // at desired account + if *targetRS.Spec.Replicas == desiredReplicaCount { + // already at desired account, nothing to do continue } // Scale down. @@ -201,6 +208,50 @@ func (c *rolloutContext) scaleDownOldReplicaSetsForCanary(oldRSs []*appsv1.Repli return totalScaledDown, nil } +// canProceedWithScaleDownAnnotation returns whether or not it is safe to proceed with annotating +// old replicasets with the scale-down-deadline in the traffic-routed canary strategy. +// This method only matters with ALB canary + the target group verification feature. +// The safety guarantees we provide are that we will not scale down *anything* unless we can verify +// stable target group endpoints are registered properly. +// NOTE: this method was written in a way which avoids AWS API calls. +func (c *rolloutContext) canProceedWithScaleDownAnnotation(oldRSs []*appsv1.ReplicaSet) (bool, error) { + isALBCanary := c.rollout.Spec.Strategy.Canary != nil && c.rollout.Spec.Strategy.Canary.TrafficRouting != nil && c.rollout.Spec.Strategy.Canary.TrafficRouting.ALB != nil + if !isALBCanary { + // Only ALB + return true, nil + } + + needToVerifyTargetGroups := false + for _, targetRS := range oldRSs { + if *targetRS.Spec.Replicas > 0 && !replicasetutil.HasScaleDownDeadline(targetRS) { + // We encountered an old ReplicaSet that is not yet scaled down, and is not annotated + // We only verify target groups if there is something to scale down. + needToVerifyTargetGroups = true + break + } + } + if !needToVerifyTargetGroups { + // All ReplicaSets are either scaled down, or have a scale-down-deadline annotation. + // The presence of the scale-down-deadline on all oldRSs, implies we can proceed with + // scale down, because we only add that annotation when target groups have been verified. + // Therefore, we return true to avoid performing verification again and making unnecessary + // AWS API calls. 
+ return true, nil + } + stableSvc, err := c.servicesLister.Services(c.rollout.Namespace).Get(c.rollout.Spec.Strategy.Canary.StableService) + if err != nil { + return false, err + } + err = c.awsVerifyTargetGroups(stableSvc) + if err != nil { + return false, err + } + + canProceed := c.areTargetsVerified() + c.log.Infof("Proceed with scaledown: %v", canProceed) + return canProceed, nil +} + func (c *rolloutContext) completedCurrentCanaryStep() bool { if c.rollout.Spec.Paused { return false diff --git a/rollout/controller_test.go b/rollout/controller_test.go index 8f9b7d346c..8fa0bd56cb 100644 --- a/rollout/controller_test.go +++ b/rollout/controller_test.go @@ -10,8 +10,6 @@ import ( "testing" "time" - "github.com/argoproj/argo-rollouts/utils/queue" - "github.com/ghodss/yaml" log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" @@ -51,6 +49,7 @@ import ( "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" istioutil "github.com/argoproj/argo-rollouts/utils/istio" + "github.com/argoproj/argo-rollouts/utils/queue" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" @@ -102,6 +101,8 @@ type fixture struct { enqueuedObjects map[string]int unfreezeTime func() error + // events holds all the K8s Event Reasons emitted during the run + events []string fakeTrafficRouting *mocks.TrafficRoutingReconciler } @@ -480,7 +481,17 @@ func (f *fixture) newController(resync resyncFunc) (*Controller, informers.Share k8sI := kubeinformers.NewSharedInformerFactory(f.kubeclient, resync()) // Pass in objects to to dynamicClient - dynamicClient := dynamicfake.NewSimpleDynamicClient(runtime.NewScheme()) + scheme := runtime.NewScheme() + v1alpha1.AddToScheme(scheme) + tgbGVR := schema.GroupVersionResource{ + Group: "elbv2.k8s.aws", + Version: "v1beta1", + Resource: "targetgroupbindings", + } + listMapping := map[schema.GroupVersionResource]string{ + tgbGVR: "TargetGroupBindingList", + } + dynamicClient := dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, listMapping, f.objects...) 
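+	// The explicit list kind mapping above lets the fake dynamic client serve
+	// List calls for the non-core TargetGroupBinding resource, whose list kind
+	// it cannot infer from the scheme on its own.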
dynamicInformerFactory := dynamicinformer.NewDynamicSharedInformerFactory(dynamicClient, 0) istioVirtualServiceInformer := dynamicInformerFactory.ForResource(istioutil.GetIstioVirtualServiceGVR()).Informer() istioDestinationRuleInformer := dynamicInformerFactory.ForResource(istioutil.GetIstioDestinationRuleGVR()).Informer() @@ -632,6 +643,8 @@ func (f *fixture) runController(rolloutName string, startInformers bool, expectE if len(f.kubeactions) > len(k8sActions) { f.t.Errorf("%d expected actions did not happen:%+v", len(f.kubeactions)-len(k8sActions), f.kubeactions[len(k8sActions):]) } + fakeRecorder := c.recorder.(*record.FakeEventRecorder) + f.events = fakeRecorder.Events return c } @@ -821,6 +834,12 @@ func (f *fixture) expectPatchRolloutActionWithPatch(rollout *v1alpha1.Rollout, p return len } +func (f *fixture) expectGetEndpointsAction(ep *corev1.Endpoints) int { + len := len(f.kubeactions) + f.kubeactions = append(f.kubeactions, core.NewGetAction(schema.GroupVersionResource{Resource: "endpoints"}, ep.Namespace, ep.Name)) + return len +} + func (f *fixture) getCreatedReplicaSet(index int) *appsv1.ReplicaSet { action := filterInformerActions(f.kubeclient.Actions())[index] createAction, ok := action.(core.CreateAction) @@ -1072,6 +1091,11 @@ func (f *fixture) getUpdatedPod(index int) *corev1.Pod { return pod } +func (f *fixture) assertEvents(events []string) { + f.t.Helper() + assert.Equal(f.t, events, f.events) +} + func TestDontSyncRolloutsWithEmptyPodSelector(t *testing.T) { f := newFixture(t) defer f.Close() diff --git a/rollout/ephemeralmetadata_test.go b/rollout/ephemeralmetadata_test.go index a68e4f39d2..24f76d6bac 100644 --- a/rollout/ephemeralmetadata_test.go +++ b/rollout/ephemeralmetadata_test.go @@ -215,7 +215,6 @@ func TestSyncBlueGreenEphemeralMetadataSecondRevision(t *testing.T) { f.expectListPodAction(r1.Namespace) // list pods to patch ephemeral data on revision 1 ReplicaSets pods` podIdx := f.expectUpdatePodAction(&pod) // Update pod with ephemeral data rs1idx := f.expectUpdateReplicaSetAction(rs1) // update stable replicaset with stable metadata - f.expectPatchReplicaSetAction(rs1) // scale revision 1 ReplicaSet down f.expectPatchRolloutAction(r2) // Patch Rollout status f.run(getKey(r2, t)) diff --git a/rollout/replicaset.go b/rollout/replicaset.go index 32e63a589e..e2ff4ebaa3 100644 --- a/rollout/replicaset.go +++ b/rollout/replicaset.go @@ -57,7 +57,7 @@ func (c *rolloutContext) addScaleDownDelay(rs *appsv1.ReplicaSet, scaleDownDelay patch := fmt.Sprintf(addScaleDownAtAnnotationsPatch, v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, deadline) _, err := c.kubeclientset.AppsV1().ReplicaSets(rs.Namespace).Patch(ctx, rs.Name, patchtypes.JSONPatchType, []byte(patch), metav1.PatchOptions{}) if err == nil { - c.log.Infof("Set '%s' annotation on '%s' to %s (%ds)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) + c.log.Infof("Set '%s' annotation on '%s' to %s (%s)", v1alpha1.DefaultReplicaSetScaleDownDeadlineAnnotationKey, rs.Name, deadline, scaleDownDelaySeconds) } return err } @@ -230,3 +230,39 @@ func (c *rolloutContext) cleanupUnhealthyReplicas(oldRSs []*appsv1.ReplicaSet) ( } return oldRSs, totalScaledDown, nil } + +func (c *rolloutContext) scaleDownDelayHelper(rs *appsv1.ReplicaSet, annotationedRSs int32, rolloutReplicas int32) (int32, int32, error) { + desiredReplicaCount := int32(0) + scaleDownRevisionLimit := GetScaleDownRevisionLimit(c.rollout) + if !replicasetutil.HasScaleDownDeadline(rs) && 
*rs.Spec.Replicas > 0 { + // This ReplicaSet is scaled up but does not have a scale down deadline. Add one. + if annotationedRSs < scaleDownRevisionLimit { + annotationedRSs++ + desiredReplicaCount = *rs.Spec.Replicas + scaleDownDelaySeconds := defaults.GetScaleDownDelaySecondsOrDefault(c.rollout) + err := c.addScaleDownDelay(rs, scaleDownDelaySeconds) + if err != nil { + return annotationedRSs, desiredReplicaCount, err + } + c.enqueueRolloutAfter(c.rollout, scaleDownDelaySeconds) + } + } else if replicasetutil.HasScaleDownDeadline(rs) { + annotationedRSs++ + if annotationedRSs > scaleDownRevisionLimit { + c.log.Infof("At ScaleDownDelayRevisionLimit (%d) and scaling down the rest", scaleDownRevisionLimit) + } else { + remainingTime, err := replicasetutil.GetTimeRemainingBeforeScaleDownDeadline(rs) + if err != nil { + c.log.Warnf("%v", err) + } else if remainingTime != nil { + c.log.Infof("RS '%s' has not reached the scaleDownTime", rs.Name) + if *remainingTime < c.resyncPeriod { + c.enqueueRolloutAfter(c.rollout, *remainingTime) + } + desiredReplicaCount = rolloutReplicas + } + } + } + + return annotationedRSs, desiredReplicaCount, nil +} diff --git a/rollout/replicaset_test.go b/rollout/replicaset_test.go index 5bf7ac86ed..481454d843 100644 --- a/rollout/replicaset_test.go +++ b/rollout/replicaset_test.go @@ -10,6 +10,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" k8sfake "k8s.io/client-go/kubernetes/fake" core "k8s.io/client-go/testing" + "k8s.io/utils/pointer" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/pkg/client/clientset/versioned/fake" @@ -307,6 +308,7 @@ func TestReconcileOldReplicaSet(t *testing.T) { oldRS.Annotations = map[string]string{annotations.DesiredReplicasAnnotation: strconv.Itoa(test.oldReplicas)} oldRS.Status.AvailableReplicas = int32(test.readyPodsFromOldRS) rollout := newBlueGreenRollout("foo", test.rolloutReplicas, nil, "", "") + rollout.Spec.Strategy.BlueGreen.ScaleDownDelayRevisionLimit = pointer.Int32Ptr(0) rollout.Spec.Selector = &metav1.LabelSelector{MatchLabels: newSelector} f := newFixture(t) defer f.Close() diff --git a/rollout/service.go b/rollout/service.go index 4ce7878c85..12f285b782 100644 --- a/rollout/service.go +++ b/rollout/service.go @@ -14,7 +14,9 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/utils/annotations" "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" + logutil "github.com/argoproj/argo-rollouts/utils/log" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" serviceutil "github.com/argoproj/argo-rollouts/utils/service" @@ -115,14 +117,17 @@ func (c *rolloutContext) areTargetsVerified() bool { return c.targetsVerified == nil || *c.targetsVerified } -// awsVerifyTargetGroups examines a Service and verifies that underlying AWS TargetGroup is properly -// targetting the Service's Endpoint IPs and port. Only valid for services which are reachable +// awsVerifyTargetGroups examines a Service and verifies that the underlying AWS TargetGroup has all +// of the Service's Endpoint IPs and ports registered. 
Only valid for services which are reachable // by an ALB Ingress, which can be determined if there exists a TargetGroupBinding object in the // namespace that references the given service func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { if !c.shouldVerifyTargetGroup(svc) { return nil } + logCtx := c.log.WithField(logutil.ServiceKey, svc.Name) + logCtx.Infof("Verifying target group") + ctx := context.TODO() // find all TargetGroupBindings in the namespace which reference the service name + port tgBindings, err := aws.GetTargetGroupBindingsByService(ctx, c.dynamicclientset, *svc) @@ -134,7 +139,7 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { return nil } - c.targetsVerified = pointer.BoolPtr(true) + c.targetsVerified = pointer.BoolPtr(false) // get endpoints of service endpoints, err := c.kubeclientset.CoreV1().Endpoints(svc.Namespace).Get(ctx, svc.Name, metav1.GetOptions{}) @@ -148,30 +153,40 @@ func (c *rolloutContext) awsVerifyTargetGroups(svc *corev1.Service) error { } for _, tgb := range tgBindings { - verified, err := aws.VerifyTargetGroupBinding(ctx, c.log, awsClnt, tgb, endpoints, svc) + verifyRes, err := aws.VerifyTargetGroupBinding(ctx, c.log, awsClnt, tgb, endpoints, svc) if err != nil { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, svc.Name, tgb.Spec.TargetGroupARN, err) return err } - if !verified { - c.targetsVerified = pointer.BoolPtr(false) + if verifyRes == nil { + // verification not applicable + continue + } + if !verifyRes.Verified { + c.recorder.Warnf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupUnverifiedReason}, conditions.TargetGroupUnverifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered, verifyRes.EndpointsTotal) c.enqueueRolloutAfter(c.rollout, 10*time.Second) return nil } + c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedRegistrationMessage, svc.Name, tgb.Spec.TargetGroupARN, verifyRes.EndpointsRegistered) } + c.targetsVerified = pointer.BoolPtr(true) return nil } // shouldVerifyTargetGroup returns whether or not we should verify the target group func (c *rolloutContext) shouldVerifyTargetGroup(svc *corev1.Service) bool { if !defaults.VerifyTargetGroup() { + // feature is disabled return false } desiredPodHash := c.newRS.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] - if c.rollout.Status.StableRS == desiredPodHash { - // we are fully promoted - return false - } if c.rollout.Spec.Strategy.BlueGreen != nil { + if c.rollout.Status.StableRS == desiredPodHash { + // for blue-green, we only verify targets right after switching active service. So if + // we are fully promoted, then there is no need to verify targets. 
+ // NOTE: this is the opposite of canary, where we only verify targets if stable == desired + return false + } svcPodHash := svc.Spec.Selector[v1alpha1.DefaultRolloutUniqueLabelKey] if svcPodHash != desiredPodHash { // we have not yet switched service selector @@ -181,17 +196,23 @@ func (c *rolloutContext) shouldVerifyTargetGroup(svc *corev1.Service) bool { // we already started post-promotion analysis, so verification already occurred return false } + return true } else if c.rollout.Spec.Strategy.Canary != nil { if c.rollout.Spec.Strategy.Canary.TrafficRouting == nil || c.rollout.Spec.Strategy.Canary.TrafficRouting.ALB == nil { + // not ALB canary, so no need to verify targets return false } - // we don't support target group verification on ALB canary yet - return false - } else { - // should not get here - return false + if c.rollout.Status.StableRS != desiredPodHash { + // for canary, we only verify targets right after switching stable service, which happens + // after the update. So if stable != desired, we are still in the middle of an update + // and there is no need to verify targets. + // NOTE: this is the opposite of blue-green, where we only verify targets if stable != active + return false + } + return true } - return true + // should not get here + return false } func (c *rolloutContext) getPreviewAndActiveServices() (*corev1.Service, *corev1.Service, error) { diff --git a/rollout/service_test.go b/rollout/service_test.go index b4a4d5f059..b3d7f63c87 100644 --- a/rollout/service_test.go +++ b/rollout/service_test.go @@ -2,17 +2,28 @@ package rollout import ( "fmt" + "strconv" "strings" "testing" + elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" + elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/mock" corev1 "k8s.io/api/core/v1" + extensionsv1beta1 "k8s.io/api/extensions/v1beta1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/pointer" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/aws/mocks" "github.com/argoproj/argo-rollouts/utils/conditions" + "github.com/argoproj/argo-rollouts/utils/defaults" + unstructuredutil "github.com/argoproj/argo-rollouts/utils/unstructured" ) func newService(name string, port int, selector map[string]string, ro *v1alpha1.Rollout) *corev1.Service { @@ -162,3 +173,424 @@ func TestPreviewServiceNotFound(t *testing.T) { assert.Equal(t, calculatePatch(r, fmt.Sprintf(expectedPatch, pausedCondition, conditions.InvalidSpecReason, strings.ReplaceAll(errmsg, "\"", "\\\""))), patch) } + +func newEndpoints(name string, ips ...string) *corev1.Endpoints { + ep := corev1.Endpoints{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + Subsets: []corev1.EndpointSubset{ + { + Addresses: []corev1.EndpointAddress{}, + }, + }, + } + for _, ip := range ips { + address := corev1.EndpointAddress{ + IP: ip, + } + ep.Subsets[0].Addresses = append(ep.Subsets[0].Addresses, address) + } + return &ep +} + +func newTargetGroupBinding(name string) *unstructured.Unstructured { + return unstructuredutil.StrToUnstructuredUnsafe(fmt.Sprintf(` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: %s + namespace: default +spec: + serviceRef: + name: %s + port: 
80 + targetGroupARN: arn::1234 + targetType: ip +`, name, name)) +} + +func newIngress(name string, canary, stable *corev1.Service) *extensionsv1beta1.Ingress { + ingress := extensionsv1beta1.Ingress{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: metav1.NamespaceDefault, + }, + Spec: extensionsv1beta1.IngressSpec{ + Rules: []extensionsv1beta1.IngressRule{ + { + Host: "fakehost.example.com", + IngressRuleValue: extensionsv1beta1.IngressRuleValue{ + HTTP: &extensionsv1beta1.HTTPIngressRuleValue{ + Paths: []extensionsv1beta1.HTTPIngressPath{ + { + Path: "/foo", + Backend: extensionsv1beta1.IngressBackend{ + ServiceName: "root", + ServicePort: intstr.FromString("use-annotations"), + }, + }, + }, + }, + }, + }, + }, + }, + } + return &ingress +} + +// TestBlueGreenAWSVerifyTargetGroupsNotYetReady verifies we don't proceed with setting stable with +// the blue-green strategy until target group verification is successful +func TestBlueGreenAWSVerifyTargetGroupsNotYetReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("active") + ep := newEndpoints("active", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newBlueGreenRollout("foo", 3, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + svc := newService("active", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs1PodHash, 3, 3, 6, 3, false, true) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, svc, ep) + f.serviceLister = append(f.serviceLister, svc) + + f.expectGetEndpointsAction(ep) + patchIndex := f.expectPatchRolloutAction(r2) // update status message + f.run(getKey(r2, t)) + + patch := f.getPatchedRollout(patchIndex) + expectedPatch := `{"status":{"message":"waiting for post-promotion verification to complete"}}` + assert.Equal(t, expectedPatch, 
patch) + f.assertEvents([]string{ + conditions.TargetGroupUnverifiedReason, + }) +} + +// TestBlueGreenAWSVerifyTargetGroupsReady verifies we proceed with setting stable with +// the blue-green strategy when target group verification is successful +func TestBlueGreenAWSVerifyTargetGroupsReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("active") + ep := newEndpoints("active", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newBlueGreenRollout("foo", 3, nil, "active", "") + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + svc := newService("active", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + r2 = updateBlueGreenRolloutStatus(r2, "", rs2PodHash, rs1PodHash, 3, 3, 6, 3, false, true) + r2.Status.Message = "waiting for post-promotion verification to complete" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + completedCondition, _ := newCompletedCondition(true) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, svc, ep) + f.serviceLister = append(f.serviceLister, svc) + + f.expectGetEndpointsAction(ep) + patchIndex := f.expectPatchRolloutAction(r2) // update status message + f.run(getKey(r2, t)) + + patch := f.getPatchedRollout(patchIndex) + expectedPatch := fmt.Sprintf(`{"status":{"message":null,"phase":"Healthy","stableRS":"%s"}}`, rs2PodHash) + assert.Equal(t, expectedPatch, patch) + f.assertEvents([]string{ + conditions.TargetGroupVerifiedReason, + conditions.RolloutCompletedReason, + }) +} + +// TestCanaryAWSVerifyTargetGroupsNotYetReady verifies we don't proceed with scale down of old +// ReplicaSets in the canary strategy until target group verification is successful +func TestCanaryAWSVerifyTargetGroupsNotYetReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + 
defer f.Close() + + tgb := newTargetGroupBinding("stable") + ep := newEndpoints("stable", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromString("25%"), intstr.FromString("25%")) + r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + ALB: &v1alpha1.ALBTrafficRouting{ + Ingress: "ingress", + RootService: "root", + }, + } + r1.Spec.Strategy.Canary.CanaryService = "canary" + r1.Spec.Strategy.Canary.StableService = "stable" + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + rootSvc := newService("root", 80, map[string]string{"app": "foo"}, nil) + stableSvc := newService("canary", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + canarySvc := newService("stable", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + ing := newIngress("ingress", canarySvc, stableSvc) + + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 6, 3, 6, false) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.StableRS = rs2PodHash + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) + completedCondition, _ := newCompletedCondition(false) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, ing, rootSvc, canarySvc, stableSvc, ep) + f.serviceLister = append(f.serviceLister, rootSvc, canarySvc, stableSvc) + + f.expectGetEndpointsAction(ep) + f.run(getKey(r2, t)) + f.assertEvents([]string{ + conditions.TargetGroupUnverifiedReason, + }) +} + +// TestCanaryAWSVerifyTargetGroupsReady verifies we proceed with scale down of old +// ReplicaSets in the canary strategy after target group verification is successful +func TestCanaryAWSVerifyTargetGroupsReady(t *testing.T) { + defaults.SetVerifyTargetGroup(true) + defer defaults.SetVerifyTargetGroup(false) + + // Allow us to fake out the AWS API + fakeELB := mocks.ELBv2APIClient{} + aws.NewClient = aws.FakeNewClientFunc(&fakeELB) + defer func() { + aws.NewClient = aws.DefaultNewClientFunc + }() + + f := newFixture(t) + defer f.Close() + + tgb := newTargetGroupBinding("stable") + ep := newEndpoints("stable", "1.2.3.4", "5.6.7.8", "2.4.6.8") + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: 
&elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(80), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + + r1 := newCanaryRollout("foo", 3, nil, nil, nil, intstr.FromString("25%"), intstr.FromString("25%")) + r1.Spec.Strategy.Canary.TrafficRouting = &v1alpha1.RolloutTrafficRouting{ + ALB: &v1alpha1.ALBTrafficRouting{ + Ingress: "ingress", + RootService: "root", + }, + } + r1.Spec.Strategy.Canary.CanaryService = "canary" + r1.Spec.Strategy.Canary.StableService = "stable" + r2 := bumpVersion(r1) + + rs1 := newReplicaSetWithStatus(r1, 3, 3) + rs2 := newReplicaSetWithStatus(r2, 3, 3) + + rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + + rootSvc := newService("root", 80, map[string]string{"app": "foo"}, nil) + stableSvc := newService("canary", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + canarySvc := newService("stable", 80, map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash}, r2) + ing := newIngress("ingress", canarySvc, stableSvc) + + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 6, 3, 6, false) + r2.Status.Message = "" + r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.StableRS = rs2PodHash + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) + completedCondition, _ := newCompletedCondition(false) + conditions.SetRolloutCondition(&r2.Status, completedCondition) + progressingCondition, _ := newProgressingCondition(conditions.NewRSAvailableReason, rs2, "") + conditions.SetRolloutCondition(&r2.Status, progressingCondition) + + f.rolloutLister = append(f.rolloutLister, r2) + f.objects = append(f.objects, r2, tgb) + f.kubeobjects = append(f.kubeobjects, rs1, rs2, ing, rootSvc, canarySvc, stableSvc, ep) + f.serviceLister = append(f.serviceLister, rootSvc, canarySvc, stableSvc) + + f.expectGetEndpointsAction(ep) + scaleDownRSIndex := f.expectPatchReplicaSetAction(rs1) + f.run(getKey(r2, t)) + f.verifyPatchedReplicaSet(scaleDownRSIndex, 30) + f.assertEvents([]string{ + conditions.TargetGroupVerifiedReason, + }) + +} diff --git a/rollout/sync.go b/rollout/sync.go index 747c82eb09..e54f3375e6 100644 --- a/rollout/sync.go +++ b/rollout/sync.go @@ -936,15 +936,6 @@ func (c *rolloutContext) promoteStable(newStatus *v1alpha1.RolloutStatus, reason revision, _ := replicasetutil.Revision(c.rollout) c.recorder.Eventf(c.rollout, record.EventOptions{EventReason: conditions.RolloutCompletedReason}, conditions.RolloutCompletedMessage, revision, newStatus.CurrentPodHash, reason) - // Now that we've marked the desired RS as stable, start the scale-down countdown on the previous stable RS - previousStableRS, _ := replicasetutil.GetReplicaSetByTemplateHash(c.olderRSs, previousStableHash) - if replicasetutil.GetReplicaCountForReplicaSets([]*appsv1.ReplicaSet{previousStableRS}) > 0 { - scaleDownDelaySeconds := defaults.GetScaleDownDelaySecondsOrDefault(c.rollout) - err := c.addScaleDownDelay(previousStableRS, scaleDownDelaySeconds) - if err != nil { - return 
err - } - } } return nil } diff --git a/rollout/sync_test.go b/rollout/sync_test.go index ba3eb6f2c5..2687d55fce 100644 --- a/rollout/sync_test.go +++ b/rollout/sync_test.go @@ -365,7 +365,6 @@ func TestBlueGreenPromoteFull(t *testing.T) { f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) f.expectPatchServiceAction(activeSvc, rs2PodHash) // update active to rs2 - f.expectPatchReplicaSetAction(rs1) // set scaledown delay on rs1 patchRolloutIdx := f.expectPatchRolloutAction(r2) // update rollout status f.run(getKey(r2, t)) diff --git a/rollout/trafficrouting.go b/rollout/trafficrouting.go index 27d6740727..4cffd358e5 100644 --- a/rollout/trafficrouting.go +++ b/rollout/trafficrouting.go @@ -13,6 +13,7 @@ import ( "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/record" replicasetutil "github.com/argoproj/argo-rollouts/utils/replicaset" + rolloututil "github.com/argoproj/argo-rollouts/utils/rollout" ) // NewTrafficRoutingReconciler identifies return the TrafficRouting Plugin that the rollout wants to modify @@ -86,7 +87,7 @@ func (c *rolloutContext) reconcileTrafficRouting() error { currentStep, index := replicasetutil.GetCurrentCanaryStep(c.rollout) desiredWeight := int32(0) weightDestinations := make([]trafficrouting.WeightDestination, 0) - if c.rollout.Status.StableRS == c.rollout.Status.CurrentPodHash { + if rolloututil.IsFullyPromoted(c.rollout) { // when we are fully promoted. desired canary weight should be 0 } else if c.pauseContext.IsAborted() { // when promote aborted. desired canary weight should be 0 @@ -142,10 +143,14 @@ func (c *rolloutContext) reconcileTrafficRouting() error { return err } - // If we are at a setWeight step, also perform weight verification. Note that we don't do this - // every reconciliation because weight verification typically involves API calls to the cloud - // provider which could incur rate limiting - if currentStep != nil && currentStep.SetWeight != nil { + // If we are in the middle of an update at a setWeight step, also perform weight verification. 
+ // Note that we don't do this every reconciliation because weight verification typically involves + // API calls to the cloud provider which could incur rate limiting + shouldVerifyWeight := c.rollout.Status.StableRS != "" && + c.rollout.Status.CurrentPodHash != c.rollout.Status.StableRS && + currentStep != nil && currentStep.SetWeight != nil + + if shouldVerifyWeight { weightVerified, err := reconciler.VerifyWeight(desiredWeight) if err != nil { return err diff --git a/rollout/trafficrouting/alb/alb.go b/rollout/trafficrouting/alb/alb.go index 32152f62e1..8f660c3fe9 100644 --- a/rollout/trafficrouting/alb/alb.go +++ b/rollout/trafficrouting/alb/alb.go @@ -17,6 +17,7 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting" "github.com/argoproj/argo-rollouts/utils/aws" + "github.com/argoproj/argo-rollouts/utils/conditions" "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/argoproj/argo-rollouts/utils/diff" ingressutil "github.com/argoproj/argo-rollouts/utils/ingress" @@ -137,6 +138,7 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { } lb, err := r.aws.FindLoadBalancerByDNSName(ctx, lbIngress.Hostname) if err != nil { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, canaryService, "unknown", err.Error()) return false, err } if lb == nil || lb.LoadBalancerArn == nil { @@ -145,6 +147,7 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { } lbTargetGroups, err := r.aws.GetTargetGroupMetadata(ctx, *lb.LoadBalancerArn) if err != nil { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifyErrorReason}, conditions.TargetGroupVerifyErrorMessage, canaryService, "unknown", err.Error()) return false, err } logCtx := r.log.WithField("lb", *lb.LoadBalancerArn) @@ -153,7 +156,13 @@ func (r *Reconciler) VerifyWeight(desiredWeight int32) (bool, error) { if tg.Weight != nil { logCtx := logCtx.WithField("tg", *tg.TargetGroupArn) logCtx.Infof("canary weight of %s (desired: %d, current: %d)", resourceID, desiredWeight, *tg.Weight) - return *tg.Weight == desiredWeight, nil + verified := *tg.Weight == desiredWeight + if verified { + r.cfg.Recorder.Eventf(rollout, record.EventOptions{EventReason: conditions.TargetGroupVerifiedReason}, conditions.TargetGroupVerifiedWeightsMessage, canaryService, *tg.TargetGroupArn, desiredWeight) + } else { + r.cfg.Recorder.Warnf(rollout, record.EventOptions{EventReason: conditions.TargetGroupUnverifiedReason}, conditions.TargetGroupUnverifiedWeightsMessage, canaryService, *tg.TargetGroupArn, desiredWeight, *tg.Weight) + } + return verified, nil } } } diff --git a/rollout/trafficrouting/alb/alb_test.go b/rollout/trafficrouting/alb/alb_test.go index 949a73df38..c8146e6b45 100644 --- a/rollout/trafficrouting/alb/alb_test.go +++ b/rollout/trafficrouting/alb/alb_test.go @@ -271,8 +271,9 @@ func TestErrorPatching(t *testing.T) { } type fakeAWSClient struct { - targetGroups []aws.TargetGroupMeta - loadBalancer *elbv2types.LoadBalancer + targetGroups []aws.TargetGroupMeta + loadBalancer *elbv2types.LoadBalancer + targetHealthDescriptions []elbv2types.TargetHealthDescription } func (f *fakeAWSClient) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]aws.TargetGroupMeta, error) { @@ -283,6 +284,10 @@ func (f *fakeAWSClient) FindLoadBalancerByDNSName(ctx context.Context, dnsName s return 
f.loadBalancer, nil } +func (f *fakeAWSClient) GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { + return f.targetHealthDescriptions, nil +} + func TestVerifyWeight(t *testing.T) { newFakeReconciler := func() (*Reconciler, *fakeAWSClient) { ro := fakeRollout("stable-svc", "canary-svc", "ingress", 443) diff --git a/rollout/trafficrouting/ambassador/ambassador_test.go b/rollout/trafficrouting/ambassador/ambassador_test.go index 4777d109a5..39bebf4d4a 100644 --- a/rollout/trafficrouting/ambassador/ambassador_test.go +++ b/rollout/trafficrouting/ambassador/ambassador_test.go @@ -17,6 +17,7 @@ import ( "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" "github.com/argoproj/argo-rollouts/rollout/trafficrouting/ambassador" + "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/argoproj/argo-rollouts/utils/record" ) @@ -544,7 +545,7 @@ func TestGetMappingGVR(t *testing.T) { }) t.Run("will get gvr successfully", func(t *testing.T) { // given - ambassador.SetAPIVersion("v1") + defaults.SetAmbassadorAPIVersion("v1") // when gvr := ambassador.GetMappingGVR() @@ -557,7 +558,7 @@ func TestGetMappingGVR(t *testing.T) { t.Run("will get valid gvr even if apiVersion has the wrong domain", func(t *testing.T) { // given apiVersion := "invalid.com/v1alpha1" - ambassador.SetAPIVersion(apiVersion) + defaults.SetAmbassadorAPIVersion(apiVersion) // when gvr := ambassador.GetMappingGVR() @@ -566,12 +567,12 @@ func TestGetMappingGVR(t *testing.T) { assert.Equal(t, "invalid.com", gvr.Group) assert.Equal(t, "v1alpha1", gvr.Version) assert.Equal(t, "mappings", gvr.Resource) - assert.Equal(t, apiVersion, ambassador.GetAPIVersion()) + assert.Equal(t, apiVersion, defaults.GetAmbassadorAPIVersion()) }) t.Run("will get correct gvr for x.getambassador.io api group", func(t *testing.T) { // given apiVersion := "x.getambassador.io/v3alpha1" - ambassador.SetAPIVersion(apiVersion) + defaults.SetAmbassadorAPIVersion(apiVersion) // when gvr := ambassador.GetMappingGVR() @@ -580,7 +581,7 @@ func TestGetMappingGVR(t *testing.T) { assert.Equal(t, "x.getambassador.io", gvr.Group) assert.Equal(t, "v3alpha1", gvr.Version) assert.Equal(t, "ambassadormappings", gvr.Resource) - assert.Equal(t, apiVersion, ambassador.GetAPIVersion()) + assert.Equal(t, apiVersion, defaults.GetAmbassadorAPIVersion()) }) } diff --git a/rollout/trafficrouting/smi/smi_test.go b/rollout/trafficrouting/smi/smi_test.go index ba40903dcb..038c4a7e77 100644 --- a/rollout/trafficrouting/smi/smi_test.go +++ b/rollout/trafficrouting/smi/smi_test.go @@ -65,8 +65,8 @@ func TestType(t *testing.T) { func TestUnsupportedTrafficSplitApiVersionError(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("does-not-exist") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("does-not-exist") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) _, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -114,8 +114,8 @@ func TestReconcileCreateNewTrafficSplit(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer 
defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -145,8 +145,8 @@ func TestReconcileCreateNewTrafficSplit(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ro := fakeRollout("stable-service", "canary-service", "root-service", "traffic-split-name") client := fake.NewSimpleClientset() - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -215,8 +215,8 @@ func TestReconcilePatchExistingTrafficSplit(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ts2 := trafficSplitV1Alpha2(ro, objectMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts2) - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -247,8 +247,8 @@ func TestReconcilePatchExistingTrafficSplit(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ts3 := trafficSplitV1Alpha3(ro, objectMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts3) - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -307,8 +307,8 @@ func TestReconcilePatchExistingTrafficSplitNoChange(t *testing.T) { objMeta := objectMeta("traffic-split-v1alpha2", ro, schema.GroupVersionKind{}) ts2 := trafficSplitV1Alpha2(ro, objMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts2) - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -332,8 +332,8 @@ func TestReconcilePatchExistingTrafficSplitNoChange(t *testing.T) { objMeta := objectMeta("traffic-split-v1alpha3", ro, schema.GroupVersionKind{}) ts3 := trafficSplitV1Alpha3(ro, objMeta, "root-service", int32(10)) client := fake.NewSimpleClientset(ts3) - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) r, err := NewReconciler(ReconcilerConfig{ Rollout: ro, Client: client, @@ -397,8 +397,8 @@ func TestReconcileRolloutDoesNotOwnTrafficSplitError(t *testing.T) { t.Run("v1alpha2", func(t *testing.T) { ts2 := trafficSplitV1Alpha2(ro, objMeta, "root-service", int32(10)) ts2.OwnerReferences = nil - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset(ts2) r, err := NewReconciler(ReconcilerConfig{ @@ -416,8 +416,8 @@ func TestReconcileRolloutDoesNotOwnTrafficSplitError(t *testing.T) { t.Run("v1alpha3", func(t *testing.T) { ts3 := trafficSplitV1Alpha3(ro, objMeta, "root-service", int32(10)) ts3.OwnerReferences = nil - 
SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset(ts3) r, err := NewReconciler(ReconcilerConfig{ @@ -490,8 +490,8 @@ func TestCreateTrafficSplitForMultipleBackends(t *testing.T) { }) t.Run("v1alpha2", func(t *testing.T) { - SetSMIAPIVersion("v1alpha2") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha2") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset() r, err := NewReconciler(ReconcilerConfig{ @@ -534,8 +534,8 @@ func TestCreateTrafficSplitForMultipleBackends(t *testing.T) { }) t.Run("v1alpha3", func(t *testing.T) { - SetSMIAPIVersion("v1alpha3") - defer SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) + defaults.SetSMIAPIVersion("v1alpha3") + defer defaults.SetSMIAPIVersion(defaults.DefaultSMITrafficSplitVersion) client := fake.NewSimpleClientset() r, err := NewReconciler(ReconcilerConfig{ diff --git a/rollout/trafficrouting_test.go b/rollout/trafficrouting_test.go index 57fb408f27..e5214ccf99 100644 --- a/rollout/trafficrouting_test.go +++ b/rollout/trafficrouting_test.go @@ -470,7 +470,9 @@ func TestNewTrafficRoutingReconciler(t *testing.T) { } // Verifies with a canary using traffic routing, we add a scaledown delay to the old ReplicaSet -// after promoting desired ReplicaSet to stable +// after promoting desired ReplicaSet to stable. +// NOTE: As of v1.1, scale down delays are added to ReplicaSets on *subsequent* reconciliations +// after the desired RS has been promoted to stable func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { f := newFixture(t) defer f.Close() @@ -483,30 +485,27 @@ func TestCanaryWithTrafficRoutingAddScaleDownDelay(t *testing.T) { } r2 := bumpVersion(r1) rs1 := newReplicaSetWithStatus(r1, 1, 1) - rs1PodHash := rs1.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] rs2 := newReplicaSetWithStatus(r2, 1, 1) - r2 = updateCanaryRolloutStatus(r2, rs1PodHash, 2, 2, 2, false) rs2PodHash := rs2.Labels[v1alpha1.DefaultRolloutUniqueLabelKey] + r2 = updateCanaryRolloutStatus(r2, rs2PodHash, 2, 1, 2, false) r2.Status.ObservedGeneration = strconv.Itoa(int(r2.Generation)) + r2.Status.CurrentStepIndex = nil + availableCondition, _ := newAvailableCondition(true) + conditions.SetRolloutCondition(&r2.Status, availableCondition) - canarySelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} - stableSelector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs1PodHash} - canarySvc := newService("canary", 80, canarySelector, r2) - stableSvc := newService("stable", 80, stableSelector, r2) + selector := map[string]string{v1alpha1.DefaultRolloutUniqueLabelKey: rs2PodHash} + canarySvc := newService("canary", 80, selector, r2) + stableSvc := newService("stable", 80, selector, r2) f.kubeobjects = append(f.kubeobjects, rs1, rs2, canarySvc, stableSvc) f.replicaSetLister = append(f.replicaSetLister, rs1, rs2) f.rolloutLister = append(f.rolloutLister, r2) f.objects = append(f.objects, r2) - rs1Patch := f.expectPatchReplicaSetAction(rs1) // adds the annotation - patchIndex := f.expectPatchRolloutAction(r2) // updates the rollout status + rs1Patch := f.expectPatchReplicaSetAction(rs1) // set scale-down-deadline annotation f.run(getKey(r2, t)) f.verifyPatchedReplicaSet(rs1Patch, 30) - roPatchObj := f.getPatchedRolloutAsObject(patchIndex) - 
assert.Equal(t, rs2PodHash, roPatchObj.Status.StableRS)
-	assert.Nil(t, roPatchObj.Status.CurrentStepIndex)
 }
 
 // Verifies with a canary using traffic routing, we scale down old ReplicaSets which exceed our limit
diff --git a/test/e2e/aws_test.go b/test/e2e/aws_test.go
index 762a5ecc03..4190449aef 100644
--- a/test/e2e/aws_test.go
+++ b/test/e2e/aws_test.go
@@ -20,17 +20,28 @@ func TestAWSSuite(t *testing.T) {
 }
 
 // TestALBUpdate is a simple integration test which verifies the controller can work in a real AWS
-// environment. It is intended to be run with the `--alb-verify-weight` controller flag. Success of
+// environment. It is intended to be run with the `--aws-verify-target-group` controller flag. Success of
 // this test against a controller using that flag, indicates that the controller was able to perform
 // weight verification using AWS APIs.
 // This test will be skipped unless E2E_ALB_INGESS_ANNOTATIONS is set (can be an empty struct). e.g.:
-// make test-e2e E2E_INSTANCE_ID= E2E_TEST_OPTIONS="-testify.m TestALBUpdate$" E2E_ALB_INGESS_ANNOTATIONS='{"kubernetes.io/ingress.class": "aws-alb", "alb.ingress.kubernetes.io/security-groups": "iks-intuit-cidr-ingress-tcp-443"}'
-func (s *AWSSuite) TestALBUpdate() {
+// make test-e2e E2E_TEST_OPTIONS="-testify.m TestALBCanaryUpdate$" E2E_IMAGE_PREFIX="docker.intuit.com/docker-rmt/" E2E_INSTANCE_ID= E2E_ALB_INGESS_ANNOTATIONS='{"kubernetes.io/ingress.class": "aws-alb", "alb.ingress.kubernetes.io/security-groups": "iks-intuit-cidr-ingress-tcp-443"}'
+func (s *AWSSuite) TestALBCanaryUpdate() {
 	if val, _ := os.LookupEnv(fixtures.EnvVarE2EALBIngressAnnotations); val == "" {
 		s.T().SkipNow()
 	}
 	s.Given().
-		HealthyRollout(`@functional/alb-rollout.yaml`).
+		HealthyRollout(`@functional/alb-canary-rollout.yaml`).
+		When().
+		UpdateSpec().
+		WaitForRolloutStatus("Healthy")
+}
+
+func (s *AWSSuite) TestALBBlueGreenUpdate() {
+	if val, _ := os.LookupEnv(fixtures.EnvVarE2EALBIngressAnnotations); val == "" {
+		s.T().SkipNow()
+	}
+	s.Given().
+		HealthyRollout(`@functional/alb-bluegreen-rollout.yaml`).
 		When().
 		UpdateSpec().
 		WaitForRolloutStatus("Healthy")
diff --git a/test/e2e/canary_test.go b/test/e2e/canary_test.go
index 07fb2e523d..723fc9cddd 100644
--- a/test/e2e/canary_test.go
+++ b/test/e2e/canary_test.go
@@ -477,6 +477,7 @@ spec:
         annotations:
           rev: two`). // update to revision 2
 		WaitForRolloutStatus("Healthy").
+		Sleep(2 * time.Second). // sleep is necessary since the scale-down delay annotation happens in a subsequent reconciliation
 		Then().
Assert(func(t *fixtures.Then) { rs1 := t.GetReplicaSetByRevision("1") diff --git a/test/e2e/functional/alb-bluegreen-rollout.yaml b/test/e2e/functional/alb-bluegreen-rollout.yaml new file mode 100644 index 0000000000..e06c2b5bea --- /dev/null +++ b/test/e2e/functional/alb-bluegreen-rollout.yaml @@ -0,0 +1,71 @@ +apiVersion: v1 +kind: Service +metadata: + name: alb-bluegreen-desired +spec: + type: NodePort + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + selector: + app: alb-bluegreen +--- +apiVersion: v1 +kind: Service +metadata: + name: alb-bluegreen-stable +spec: + type: NodePort + ports: + - port: 80 + targetPort: http + protocol: TCP + name: http + selector: + app: alb-bluegreen +--- +apiVersion: networking.k8s.io/v1beta1 +kind: Ingress +metadata: + name: alb-bluegreen-ingress + annotations: + kubernetes.io/ingress.class: alb +spec: + rules: + - http: + paths: + - path: /* + backend: + serviceName: alb-bluegreen-stable + servicePort: 80 +--- +apiVersion: argoproj.io/v1alpha1 +kind: Rollout +metadata: + name: alb-bluegreen +spec: + selector: + matchLabels: + app: alb-bluegreen + template: + metadata: + labels: + app: alb-bluegreen + spec: + containers: + - name: alb-bluegreen + image: nginx:1.19-alpine + ports: + - name: http + containerPort: 80 + protocol: TCP + resources: + requests: + memory: 16Mi + cpu: 5m + strategy: + blueGreen: + previewService: alb-bluegreen-desired + activeService: alb-bluegreen-stable diff --git a/test/e2e/functional/alb-rollout.yaml b/test/e2e/functional/alb-canary-rollout.yaml similarity index 71% rename from test/e2e/functional/alb-rollout.yaml rename to test/e2e/functional/alb-canary-rollout.yaml index ad8157b3c7..df90ecc3cd 100644 --- a/test/e2e/functional/alb-rollout.yaml +++ b/test/e2e/functional/alb-canary-rollout.yaml @@ -1,7 +1,7 @@ apiVersion: v1 kind: Service metadata: - name: alb-rollout-root + name: alb-canary-root spec: type: NodePort ports: @@ -10,12 +10,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: v1 kind: Service metadata: - name: alb-rollout-canary + name: alb-canary-desired spec: type: NodePort ports: @@ -24,12 +24,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: v1 kind: Service metadata: - name: alb-rollout-stable + name: alb-canary-stable spec: type: NodePort ports: @@ -38,12 +38,12 @@ spec: protocol: TCP name: http selector: - app: alb-rollout + app: alb-canary --- apiVersion: networking.k8s.io/v1beta1 kind: Ingress metadata: - name: alb-rollout-ingress + name: alb-canary-ingress annotations: kubernetes.io/ingress.class: alb spec: @@ -52,24 +52,24 @@ spec: paths: - path: /* backend: - serviceName: alb-rollout-root + serviceName: alb-canary-root servicePort: use-annotation --- apiVersion: argoproj.io/v1alpha1 kind: Rollout metadata: - name: alb-rollout + name: alb-canary spec: selector: matchLabels: - app: alb-rollout + app: alb-canary template: metadata: labels: - app: alb-rollout + app: alb-canary spec: containers: - - name: alb-rollout + - name: alb-canary image: nginx:1.19-alpine ports: - name: http @@ -81,15 +81,15 @@ spec: cpu: 5m strategy: canary: - canaryService: alb-rollout-canary - stableService: alb-rollout-stable + canaryService: alb-canary-desired + stableService: alb-canary-stable trafficRouting: alb: - ingress: alb-rollout-ingress - rootService: alb-rollout-root + ingress: alb-canary-ingress + rootService: alb-canary-root servicePort: 80 steps: - setWeight: 10 - pause: {duration: 5s} - setWeight: 
20
-  - pause: {duration: 5s}
+  - pause: {duration: 5s}
\ No newline at end of file
diff --git a/test/fixtures/e2e_suite.go b/test/fixtures/e2e_suite.go
index 71589276cf..e7ac739ab6 100644
--- a/test/fixtures/e2e_suite.go
+++ b/test/fixtures/e2e_suite.go
@@ -39,6 +39,8 @@ const (
 	// E2E_POD_DELAY slows down pod startup and shutdown by the value in seconds (default: 0)
 	// Used by humans to slow down rollout activity during a test
 	EnvVarE2EPodDelay = "E2E_POD_DELAY"
+	// EnvVarE2EImagePrefix is a prefix prepended to images used by the e2e tests
+	EnvVarE2EImagePrefix = "E2E_IMAGE_PREFIX"
 	// E2E_DEBUG makes e2e testing easier to debug by not tearing down the suite
 	EnvVarE2EDebug = "E2E_DEBUG"
 	// E2E_ALB_INGESS_ANNOTATIONS is a map of annotations to apply to ingress for AWS Load Balancer Controller
diff --git a/test/fixtures/when.go b/test/fixtures/when.go
index 81d44628e9..3820a6afd7 100644
--- a/test/fixtures/when.go
+++ b/test/fixtures/when.go
@@ -53,8 +53,9 @@ func (w *When) ApplyManifests(yaml ...string) *When {
 		objects = w.parseTextToObjects(yaml[0])
 	}
 	for _, obj := range objects {
-		if obj.GetKind() == "Rollout" && E2EPodDelay > 0 {
+		if obj.GetKind() == "Rollout" {
 			w.injectDelays(obj)
+			w.injectImagePrefix(obj)
 		}
 		if obj.GetKind() == "Ingress" {
 			w.injectIngressAnnotations(obj)
@@ -72,6 +73,9 @@ func (w *When) DeleteObject(kind, name string) *When {
 // injectDelays adds postStart/preStop handlers to slow down readiness/termination by adding
 // handlers which sleep for the specified duration.
 func (w *When) injectDelays(un *unstructured.Unstructured) {
+	if E2EPodDelay == 0 {
+		return
+	}
 	sleepHandler := corev1.Handler{
 		Exec: &corev1.ExecAction{
 			Command: []string{"sleep", strconv.Itoa(E2EPodDelay)},
@@ -92,6 +96,21 @@
 	w.CheckError(err)
 }
 
+// injectImagePrefix prepends a prefix to the images used in tests. Useful if default container registries are blocked
+func (w *When) injectImagePrefix(un *unstructured.Unstructured) {
+	imagePrefix := os.Getenv(EnvVarE2EImagePrefix)
+	if imagePrefix == "" {
+		return
+	}
+	containersIf, _, err := unstructured.NestedSlice(un.Object, "spec", "template", "spec", "containers")
+	w.CheckError(err)
+	container := containersIf[0].(map[string]interface{})
+	container["image"] = imagePrefix + container["image"].(string)
+	containersIf[0] = container
+	err = unstructured.SetNestedSlice(un.Object, containersIf, "spec", "template", "spec", "containers")
+	w.CheckError(err)
+}
+
 // injectIngressAnnotations injects ingress annotations defined in environment variables. 
Currently // E2E_ALB_INGESS_ANNOTATIONS func (w *When) injectIngressAnnotations(un *unstructured.Unstructured) { diff --git a/test/util/util.go b/test/util/util.go index 234e006d00..a783107ebf 100644 --- a/test/util/util.go +++ b/test/util/util.go @@ -40,15 +40,21 @@ func NewFakeDynamicClient(objects ...runtime.Object) *dynamicfake.FakeDynamicCli scheme := runtime.NewScheme() vsvcGVR := istioutil.GetIstioVirtualServiceGVR() druleGVR := istioutil.GetIstioDestinationRuleGVR() + tgbGVR := schema.GroupVersionResource{ + Group: "elbv2.k8s.aws", + Version: "v1beta1", + Resource: "targetgroupbindings", + } listMapping := map[schema.GroupVersionResource]string{ - vsvcGVR: vsvcGVR.Resource + "List", - druleGVR: druleGVR.Resource + "List", + vsvcGVR: "VirtualServiceList", + druleGVR: "DestinationRuleList", v1alpha1.RolloutGVR: rollouts.RolloutKind + "List", v1alpha1.AnalysisTemplateGVR: rollouts.AnalysisTemplateKind + "List", v1alpha1.AnalysisRunGVR: rollouts.AnalysisRunKind + "List", v1alpha1.ExperimentGVR: rollouts.ExperimentKind + "List", v1alpha1.ClusterAnalysisTemplateGVR: rollouts.ClusterAnalysisTemplateKind + "List", + tgbGVR: "TargetGroupBindingList", } return dynamicfake.NewSimpleDynamicClientWithCustomListKinds(scheme, listMapping, objects...) } diff --git a/utils/aws/aws.go b/utils/aws/aws.go index 84f659bd86..ad59b41985 100644 --- a/utils/aws/aws.go +++ b/utils/aws/aws.go @@ -38,7 +38,7 @@ const ( ) type Client interface { - GetTargetGroupTargets(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) + GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) } @@ -53,7 +53,8 @@ type ELBv2APIClient interface { DescribeTags(ctx context.Context, params *elbv2.DescribeTagsInput, optFns ...func(*elbv2.Options)) (*elbv2.DescribeTagsOutput, error) } -type client struct { +// ClientAdapter implements the Client interface +type ClientAdapter struct { ELBV2 ELBv2APIClient // loadBalancerDNStoARN is a cache that maps a LoadBalancer DNSName to an ARN @@ -100,19 +101,32 @@ type ServiceReference struct { Port intstr.IntOrString `json:"port"` } -func NewClient() (Client, error) { +// NewClient instantiates a new AWS Client. 
It is declared as a variable to allow mocking +var NewClient = DefaultNewClientFunc + +func DefaultNewClientFunc() (Client, error) { cfg, err := config.LoadDefaultConfig(context.TODO()) if err != nil { return nil, err } - c := client{ + c := ClientAdapter{ ELBV2: elbv2.NewFromConfig(cfg), loadBalancerDNStoARN: make(map[string]string), } return &c, nil } -func (c *client) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) { +func FakeNewClientFunc(elbClient ELBv2APIClient) func() (Client, error) { + return func() (Client, error) { + c := ClientAdapter{ + ELBV2: elbClient, + loadBalancerDNStoARN: make(map[string]string), + } + return &c, nil + } +} + +func (c *ClientAdapter) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) (*elbv2types.LoadBalancer, error) { lbOutput, err := c.ELBV2.DescribeLoadBalancers(ctx, &elbv2.DescribeLoadBalancersInput{}) if err != nil { return nil, err @@ -127,7 +141,7 @@ func (c *client) FindLoadBalancerByDNSName(ctx context.Context, dnsName string) // GetTargetGroupMetadata is a convenience to retrieve the target groups of a load balancer along // with relevant metadata (tags, and traffic weights). -func (c *client) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) { +func (c *ClientAdapter) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN string) ([]TargetGroupMeta, error) { // Get target groups associated with LoadBalancer tgIn := elbv2.DescribeTargetGroupsInput{ LoadBalancerArn: &loadBalancerARN, @@ -206,8 +220,9 @@ func (c *client) GetTargetGroupMetadata(ctx context.Context, loadBalancerARN str return tgMeta, nil } -func (c *client) GetTargetGroupTargets(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { - // Get target groups associated with LoadBalancer +// GetTargetGroupHealth returns health descriptions of registered targets in a target group. +// A TargetHealthDescription is an IP:port pair, along with its health status. +func (c *ClientAdapter) GetTargetGroupHealth(ctx context.Context, targetGroupARN string) ([]elbv2types.TargetHealthDescription, error) { thIn := elbv2.DescribeTargetHealthInput{ TargetGroupArn: &targetGroupARN, } @@ -296,18 +311,29 @@ func getNumericPort(tgb TargetGroupBinding, svc corev1.Service) int32 { return 0 } -// VerifyTargetGroupBinding verifies if the underlying AWS TargetGroup: -// 1. targets all the Pod IPs and port in the given service -// 2. those targets are in a healthy state -func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Client, tgb TargetGroupBinding, endpoints *corev1.Endpoints, svc *corev1.Service) (bool, error) { +// TargetGroupVerifyResult returns metadata when a target group is verified. +type TargetGroupVerifyResult struct { + Service string + Verified bool + EndpointsRegistered int + EndpointsTotal int +} + +// VerifyTargetGroupBinding verifies if the underlying AWS TargetGroup has all Pod IPs and ports +// from the given service (the K8s Endpoints list) registered to the TargetGroup. +// NOTE: a previous version of this method used to additionally verify that all registered targets +// were "healthy" (in addition to registered), but the health of registered targets is actually +// irrelevant for our purposes of verifying the service label change was reflected in the LB. +// Returns nil if the verification is not applicable (e.g. 
target type is not IP) +func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Client, tgb TargetGroupBinding, endpoints *corev1.Endpoints, svc *corev1.Service) (*TargetGroupVerifyResult, error) { if tgb.Spec.TargetType == nil || *tgb.Spec.TargetType != TargetTypeIP { // We only need to verify target groups using AWS CNI (spec.targetType: ip) - return true, nil + return nil, nil } port := getNumericPort(tgb, *svc) if port == 0 { logCtx.Warn("Unable to match TargetGroupBinding spec.serviceRef.port to Service spec.ports") - return false, nil + return nil, nil } logCtx = logCtx.WithFields(map[string]interface{}{ "service": svc.Name, @@ -315,12 +341,12 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl "tg": tgb.Spec.TargetGroupARN, "port": port, }) - targets, err := awsClnt.GetTargetGroupTargets(ctx, tgb.Spec.TargetGroupARN) + targets, err := awsClnt.GetTargetGroupHealth(ctx, tgb.Spec.TargetGroupARN) if err != nil { - return false, err + return nil, err } - // Remember all of the ip:port of the endpoints list + // Remember/initialize all of the ip:port of the endpoints list that we expect to see registered endpointIPs := make(map[string]bool) for _, subset := range endpoints.Subsets { for _, addr := range subset.Addresses { @@ -330,9 +356,9 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl logCtx.Infof("verifying %d endpoint addresses (of %d targets)", len(endpointIPs), len(targets)) - // Iterate all targets in AWS TargetGroup. Mark all endpoint IPs which are healthy + // Iterate all registered targets in AWS TargetGroup. Mark all endpoint IPs which we see registered for _, target := range targets { - if target.Target == nil || target.Target.Id == nil || target.Target.Port == nil || target.TargetHealth == nil { + if target.Target == nil || target.Target.Id == nil || target.Target.Port == nil { logCtx.Warnf("Invalid target in TargetGroup: %v", target) continue } @@ -342,17 +368,25 @@ func VerifyTargetGroupBinding(ctx context.Context, logCtx *log.Entry, awsClnt Cl // this is a target for something not in the endpoint list (e.g. old endpoint entry). 
Ignore it continue } - // Mark the endpoint IP as healthy or not - endpointIPs[targetStr] = bool(target.TargetHealth.State == elbv2types.TargetHealthStateEnumHealthy) + // Verify we see the endpoint IP registered to the TargetGroup + // NOTE: we used to check health here, but health is not relevant for verifying service label change + endpointIPs[targetStr] = true + } + + tgvr := TargetGroupVerifyResult{ + Service: svc.Name, + EndpointsTotal: len(endpointIPs), + EndpointsRegistered: 0, } - // Check if any of our desired endpoints are not yet healthy - for epIP, healthy := range endpointIPs { - if !healthy { - logCtx.Infof("Service endpoint IP %s not yet targeted or healthy", epIP) - return false, nil + // Check if any of our desired endpoints are not yet registered + for epIP, seen := range endpointIPs { + if !seen { + logCtx.Infof("Service endpoint IP %s not yet registered", epIP) + } else { + tgvr.EndpointsRegistered++ } } - logCtx.Info("TargetGroupBinding verified") - return true, nil + tgvr.Verified = bool(tgvr.EndpointsRegistered == tgvr.EndpointsTotal) + return &tgvr, nil } diff --git a/utils/aws/aws_test.go b/utils/aws/aws_test.go index 1e9904e083..94836f25bb 100644 --- a/utils/aws/aws_test.go +++ b/utils/aws/aws_test.go @@ -5,21 +5,24 @@ import ( "testing" elbv2 "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2" - "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" + elbv2types "github.com/aws/aws-sdk-go-v2/service/elasticloadbalancingv2/types" + log "github.com/sirupsen/logrus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/mock" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/pointer" + testutil "github.com/argoproj/argo-rollouts/test/util" "github.com/argoproj/argo-rollouts/utils/aws/mocks" + unstructuredutil "github.com/argoproj/argo-rollouts/utils/unstructured" ) func newFakeClient() (*mocks.ELBv2APIClient, Client) { fakeELB := mocks.ELBv2APIClient{} - c := client{ - ELBV2: &fakeELB, - loadBalancerDNStoARN: make(map[string]string), - } - return &fakeELB, &c + awsClient, _ := FakeNewClientFunc(&fakeELB)() + return &fakeELB, awsClient } func TestFindLoadBalancerByDNSName(t *testing.T) { @@ -36,12 +39,12 @@ func TestFindLoadBalancerByDNSName(t *testing.T) { { fakeELB, c := newFakeClient() // Mock output - expectedLB := types.LoadBalancer{ + expectedLB := elbv2types.LoadBalancer{ LoadBalancerArn: pointer.StringPtr("lb-abc123"), DNSName: pointer.StringPtr("find-loadbalancer-test-abc-123.us-west-2.elb.amazonaws.com"), } lbOut := elbv2.DescribeLoadBalancersOutput{ - LoadBalancers: []types.LoadBalancer{ + LoadBalancers: []elbv2types.LoadBalancer{ expectedLB, }, } @@ -59,7 +62,7 @@ func TestGetTargetGroupMetadata(t *testing.T) { // mock the output tgOut := elbv2.DescribeTargetGroupsOutput{ - TargetGroups: []types.TargetGroup{ + TargetGroups: []elbv2types.TargetGroup{ { TargetGroupArn: pointer.StringPtr("tg-abc123"), }, @@ -71,10 +74,10 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeTargetGroups", mock.Anything, mock.Anything).Return(&tgOut, nil) tagsOut := elbv2.DescribeTagsOutput{ - TagDescriptions: []types.TagDescription{ + TagDescriptions: []elbv2types.TagDescription{ { ResourceArn: pointer.StringPtr("tg-abc123"), - Tags: []types.Tag{ + Tags: []elbv2types.Tag{ { Key: pointer.StringPtr("foo"), Value: pointer.StringPtr("bar"), @@ -86,7 +89,7 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeTags", mock.Anything, 
mock.Anything).Return(&tagsOut, nil) listenersOut := elbv2.DescribeListenersOutput{ - Listeners: []types.Listener{ + Listeners: []elbv2types.Listener{ { ListenerArn: pointer.StringPtr("lst-abc123"), LoadBalancerArn: pointer.StringPtr("lb-abc123"), @@ -96,12 +99,12 @@ func TestGetTargetGroupMetadata(t *testing.T) { fakeELB.On("DescribeListeners", mock.Anything, mock.Anything).Return(&listenersOut, nil) rulesOut := elbv2.DescribeRulesOutput{ - Rules: []types.Rule{ + Rules: []elbv2types.Rule{ { - Actions: []types.Action{ + Actions: []elbv2types.Action{ { - ForwardConfig: &types.ForwardActionConfig{ - TargetGroups: []types.TargetGroupTuple{ + ForwardConfig: &elbv2types.ForwardActionConfig{ + TargetGroups: []elbv2types.TargetGroupTuple{ { TargetGroupArn: pointer.StringPtr("tg-abc123"), Weight: pointer.Int32Ptr(10), @@ -128,6 +131,203 @@ func TestGetTargetGroupMetadata(t *testing.T) { assert.Nil(t, tgMeta[1].Weight) } -func TestBuildV2TargetGroupID(t *testing.T) { - assert.Equal(t, "default/ingress-svc:80", BuildV2TargetGroupID("default", "ingress", "svc", 80)) +func TestBuildTargetGroupResourceID(t *testing.T) { + assert.Equal(t, "default/ingress-svc:80", BuildTargetGroupResourceID("default", "ingress", "svc", 80)) +} + +func TestGetTargetGroupHealth(t *testing.T) { + fakeELB, c := newFakeClient() + expectedHealth := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + HealthCheckPort: pointer.StringPtr("80"), + Target: &elbv2types.TargetDescription{}, + TargetHealth: &elbv2types.TargetHealth{ + State: elbv2types.TargetHealthStateEnumHealthy, + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&expectedHealth, nil) + + // Test + health, err := c.GetTargetGroupHealth(context.TODO(), "tg-abc123") + assert.NoError(t, err) + assert.Equal(t, expectedHealth.TargetHealthDescriptions, health) +} + +var testTargetGroupBinding = ` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: active + namespace: default +spec: + serviceRef: + name: active + port: 80 + targetGroupARN: arn::1234 + targetType: ip +` + +func TestGetTargetGroupBindingsByService(t *testing.T) { + { + svc1 := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Port: int32(80), + TargetPort: intstr.FromInt(80), + }}, + }, + } + obj := unstructuredutil.StrToUnstructuredUnsafe(testTargetGroupBinding) + dynamicClientSet := testutil.NewFakeDynamicClient(obj) + tgbs, err := GetTargetGroupBindingsByService(context.TODO(), dynamicClientSet, svc1) + assert.NoError(t, err) + assert.Equal(t, 80, tgbs[0].Spec.ServiceRef.Port.IntValue()) + assert.Equal(t, "arn::1234", tgbs[0].Spec.TargetGroupARN) + assert.Equal(t, "ip", string(*tgbs[0].Spec.TargetType)) + } + { + svc2 := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Name: "foo", + TargetPort: intstr.FromInt(80), + }}, + }, + } + obj := unstructuredutil.StrToUnstructuredUnsafe(` +apiVersion: elbv2.k8s.aws/v1beta1 +kind: TargetGroupBinding +metadata: + name: active + namespace: default +spec: + serviceRef: + name: active + port: foo + targetGroupARN: arn::1234 + targetType: instance +`) + dynamicClientSet := 
testutil.NewFakeDynamicClient(obj) + tgbs, err := GetTargetGroupBindingsByService(context.TODO(), dynamicClientSet, svc2) + assert.NoError(t, err) + assert.Equal(t, "foo", tgbs[0].Spec.ServiceRef.Port.StrVal) + assert.Equal(t, "arn::1234", tgbs[0].Spec.TargetGroupARN) + assert.Equal(t, "instance", string(*tgbs[0].Spec.TargetType)) + } +} + +func TestVerifyTargetGroupBindingIgnoreInstanceMode(t *testing.T) { + logCtx := log.NewEntry(log.New()) + _, awsClnt := newFakeClient() + tgb := TargetGroupBinding{ + Spec: TargetGroupBindingSpec{ + TargetType: (*TargetType)(pointer.StringPtr("instance")), + }, + } + res, err := VerifyTargetGroupBinding(context.TODO(), logCtx, awsClnt, tgb, nil, nil) + assert.Nil(t, res) + assert.NoError(t, err) +} + +func TestVerifyTargetGroupBinding(t *testing.T) { + logCtx := log.NewEntry(log.New()) + tgb := TargetGroupBinding{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: TargetGroupBindingSpec{ + TargetType: (*TargetType)(pointer.StringPtr("ip")), + TargetGroupARN: "arn::1234", + }, + } + ep := corev1.Endpoints{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Subsets: []corev1.EndpointSubset{ + { + Addresses: []corev1.EndpointAddress{ + { + IP: "1.2.3.4", // registered + }, + { + IP: "5.6.7.8", // registered + }, + { + IP: "2.4.6.8", // not registered + }, + }, + }, + }, + } + svc := corev1.Service{ + ObjectMeta: metav1.ObjectMeta{ + Name: "active", + Namespace: metav1.NamespaceDefault, + }, + Spec: corev1.ServiceSpec{ + Selector: map[string]string{}, + Ports: []corev1.ServicePort{{ + Protocol: "TCP", + Port: int32(80), + TargetPort: intstr.FromInt(80), + }}, + }, + } + fakeELB, awsClnt := newFakeClient() + thOut := elbv2.DescribeTargetHealthOutput{ + TargetHealthDescriptions: []elbv2types.TargetHealthDescription{ + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("1.2.3.4"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("5.6.7.8"), + Port: pointer.Int32Ptr(80), + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("2.4.6.8"), // irrelevant + Port: pointer.Int32Ptr(81), // wrong port + }, + }, + { + Target: &elbv2types.TargetDescription{ + Id: pointer.StringPtr("9.8.7.6"), // irrelevant ip + Port: pointer.Int32Ptr(80), + }, + }, + }, + } + fakeELB.On("DescribeTargetHealth", mock.Anything, mock.Anything).Return(&thOut, nil) + res, err := VerifyTargetGroupBinding(context.TODO(), logCtx, awsClnt, tgb, &ep, &svc) + expectedRes := TargetGroupVerifyResult{ + Service: "active", + Verified: false, + EndpointsRegistered: 2, + EndpointsTotal: 3, + } + assert.Equal(t, expectedRes, *res) + assert.NoError(t, err) } diff --git a/utils/aws/mocks/ELBv2APIClient.go b/utils/aws/mocks/ELBv2APIClient.go index 8dae8549e5..7f621b0563 100644 --- a/utils/aws/mocks/ELBv2APIClient.go +++ b/utils/aws/mocks/ELBv2APIClient.go @@ -163,3 +163,33 @@ func (_m *ELBv2APIClient) DescribeTargetGroups(_a0 context.Context, _a1 *elastic return r0, r1 } + +// DescribeTargetHealth provides a mock function with given fields: ctx, params, optFns +func (_m *ELBv2APIClient) DescribeTargetHealth(ctx context.Context, params *elasticloadbalancingv2.DescribeTargetHealthInput, optFns ...func(*elasticloadbalancingv2.Options)) (*elasticloadbalancingv2.DescribeTargetHealthOutput, error) { + _va := make([]interface{}, len(optFns)) + for _i := range optFns { + _va[_i] = optFns[_i] + } + var _ca []interface{} + _ca 
= append(_ca, ctx, params)
+	_ca = append(_ca, _va...)
+	ret := _m.Called(_ca...)
+
+	var r0 *elasticloadbalancingv2.DescribeTargetHealthOutput
+	if rf, ok := ret.Get(0).(func(context.Context, *elasticloadbalancingv2.DescribeTargetHealthInput, ...func(*elasticloadbalancingv2.Options)) *elasticloadbalancingv2.DescribeTargetHealthOutput); ok {
+		r0 = rf(ctx, params, optFns...)
+	} else {
+		if ret.Get(0) != nil {
+			r0 = ret.Get(0).(*elasticloadbalancingv2.DescribeTargetHealthOutput)
+		}
+	}
+
+	var r1 error
+	if rf, ok := ret.Get(1).(func(context.Context, *elasticloadbalancingv2.DescribeTargetHealthInput, ...func(*elasticloadbalancingv2.Options)) error); ok {
+		r1 = rf(ctx, params, optFns...)
+	} else {
+		r1 = ret.Error(1)
+	}
+
+	return r0, r1
+}
diff --git a/utils/conditions/conditions.go b/utils/conditions/conditions.go
index 0715c19fd0..40d173374a 100644
--- a/utils/conditions/conditions.go
+++ b/utils/conditions/conditions.go
@@ -135,6 +135,18 @@ const (
 	ServiceReferenceReason = "ServiceReferenceError"
 	// ServiceReferencingManagedService is added in a rollout when the multiple rollouts reference a Rollout
 	ServiceReferencingManagedService = "Service %q is managed by another Rollout"
+
+	// TargetGroupVerifiedReason is emitted when a target group has been verified
+	TargetGroupVerifiedReason              = "TargetGroupVerified"
+	TargetGroupVerifiedRegistrationMessage = "Service %s (TargetGroup %s) verified: %d endpoints registered"
+	TargetGroupVerifiedWeightsMessage      = "Service %s (TargetGroup %s) verified: canary weight %d set"
+	// TargetGroupUnverifiedReason is emitted when a target group has not been verified
+	TargetGroupUnverifiedReason              = "TargetGroupUnverified"
+	TargetGroupUnverifiedRegistrationMessage = "Service %s (TargetGroup %s) not verified: %d/%d endpoints registered"
+	TargetGroupUnverifiedWeightsMessage      = "Service %s (TargetGroup %s) not verified: canary weight %d not yet set (current: %d)"
+	// TargetGroupVerifyErrorReason is emitted when we fail to verify the health of a target group due to an error
+	TargetGroupVerifyErrorReason  = "TargetGroupVerifyError"
+	TargetGroupVerifyErrorMessage = "Failed to verify Service %s (TargetGroup %s): %s"
 )
 
 // NewRolloutCondition creates a new rollout condition. 
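
The printf-style messages above are rendered by the `Eventf`/`Warnf` recorder calls added in alb.go and the service reconciler. A hedged sketch of how a verification result might be formatted into one of these events (`emitVerifyEvent` is a hypothetical helper for illustration, not part of this patch):

```go
package main

import "fmt"

// Constants mirroring the additions to utils/conditions/conditions.go above.
const (
	targetGroupVerifiedRegistrationMessage   = "Service %s (TargetGroup %s) verified: %d endpoints registered"
	targetGroupUnverifiedRegistrationMessage = "Service %s (TargetGroup %s) not verified: %d/%d endpoints registered"
)

// verifyResult mirrors aws.TargetGroupVerifyResult from this patch.
type verifyResult struct {
	Service             string
	Verified            bool
	EndpointsRegistered int
	EndpointsTotal      int
}

// emitVerifyEvent is a hypothetical helper; the real code passes the same
// arguments to Recorder.Eventf (verified) or Recorder.Warnf (unverified).
func emitVerifyEvent(res verifyResult, tgARN string) string {
	if res.Verified {
		return fmt.Sprintf(targetGroupVerifiedRegistrationMessage, res.Service, tgARN, res.EndpointsRegistered)
	}
	return fmt.Sprintf(targetGroupUnverifiedRegistrationMessage, res.Service, tgARN, res.EndpointsRegistered, res.EndpointsTotal)
}

func main() {
	res := verifyResult{Service: "active", Verified: false, EndpointsRegistered: 2, EndpointsTotal: 3}
	fmt.Println(emitVerifyEvent(res, "arn::1234"))
	// Service active (TargetGroup arn::1234) not verified: 2/3 endpoints registered
}
```
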
diff --git a/utils/defaults/defaults_test.go b/utils/defaults/defaults_test.go index a3e5322c3e..129fd1c959 100644 --- a/utils/defaults/defaults_test.go +++ b/utils/defaults/defaults_test.go @@ -355,3 +355,30 @@ func TestGetConsecutiveErrorLimitOrDefault(t *testing.T) { metricDefaultValue := &v1alpha1.Metric{} assert.Equal(t, DefaultConsecutiveErrorLimit, GetConsecutiveErrorLimitOrDefault(metricDefaultValue)) } + +func TestSetDefaults(t *testing.T) { + SetVerifyTargetGroup(true) + assert.True(t, VerifyTargetGroup()) + SetVerifyTargetGroup(false) + assert.False(t, VerifyTargetGroup()) + + SetIstioAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetIstioAPIVersion()) + SetIstioAPIVersion(DefaultIstioVersion) + assert.Equal(t, DefaultIstioVersion, GetIstioAPIVersion()) + + SetAmbassadorAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetAmbassadorAPIVersion()) + SetAmbassadorAPIVersion(DefaultAmbassadorVersion) + assert.Equal(t, DefaultAmbassadorVersion, GetAmbassadorAPIVersion()) + + SetSMIAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetSMIAPIVersion()) + SetSMIAPIVersion(DefaultSMITrafficSplitVersion) + assert.Equal(t, DefaultSMITrafficSplitVersion, GetSMIAPIVersion()) + + SetTargetGroupBindingAPIVersion("v1alpha9") + assert.Equal(t, "v1alpha9", GetTargetGroupBindingAPIVersion()) + SetTargetGroupBindingAPIVersion(DefaultTargetGroupBindingAPIVersion) + assert.Equal(t, DefaultTargetGroupBindingAPIVersion, GetTargetGroupBindingAPIVersion()) +} diff --git a/utils/istio/istio_test.go b/utils/istio/istio_test.go index 6eb78f0188..e6f8b78e71 100644 --- a/utils/istio/istio_test.go +++ b/utils/istio/istio_test.go @@ -7,6 +7,7 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" "github.com/argoproj/argo-rollouts/pkg/apis/rollouts/v1alpha1" + "github.com/argoproj/argo-rollouts/utils/defaults" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/runtime" dynamicfake "k8s.io/client-go/dynamic/fake" @@ -39,12 +40,12 @@ func TestGetIstioVirtualServiceGVR(t *testing.T) { } func TestGetIstioDestinationRuleGVR(t *testing.T) { - SetIstioAPIVersion("v1alpha4") + defaults.SetIstioAPIVersion("v1alpha4") gvr := GetIstioDestinationRuleGVR() assert.Equal(t, "networking.istio.io", gvr.Group) assert.Equal(t, "v1alpha4", gvr.Version) assert.Equal(t, "destinationrules", gvr.Resource) - SetIstioAPIVersion("v1alpha3") + defaults.SetIstioAPIVersion("v1alpha3") } func TestGetRolloutVirtualServiceKeys(t *testing.T) { diff --git a/utils/rollout/rolloututil.go b/utils/rollout/rolloututil.go index e3eaab68eb..c2e997aa00 100644 --- a/utils/rollout/rolloututil.go +++ b/utils/rollout/rolloututil.go @@ -10,6 +10,12 @@ import ( "github.com/argoproj/argo-rollouts/utils/defaults" ) +// IsFullyPromoted returns whether or not the given rollout is in a fully promoted state. +// (versus being in the middle of an update). This is determined by checking if stable hash == desired hash +func IsFullyPromoted(ro *v1alpha1.Rollout) bool { + return ro.Status.StableRS == ro.Status.CurrentPodHash +} + // GetRolloutPhase returns a status and message for a rollout. 
Takes into consideration whether
 // or not metadata.generation was observed in status.observedGeneration
 // use this instead of CalculateRolloutPhase
@@ -95,8 +101,15 @@ func CalculateRolloutPhase(spec v1alpha1.RolloutSpec, status v1alpha1.RolloutSta
 		if ro.Status.BlueGreen.ActiveSelector == "" || ro.Status.BlueGreen.ActiveSelector != ro.Status.CurrentPodHash {
 			return v1alpha1.RolloutPhaseProgressing, "active service cutover pending"
 		}
-		if ro.Status.StableRS == "" || ro.Status.StableRS != ro.Status.CurrentPodHash {
-			return v1alpha1.RolloutPhaseProgressing, "waiting for analysis to complete"
+		if ro.Status.StableRS == "" || !IsFullyPromoted(&ro) {
+			// we switched the active selector to the desired ReplicaSet, but we have yet to mark it
+			// as stable. This could be caused by one of two things:
+			// 1. post-promotion analysis has yet to complete successfully
+			// 2. post-promotion verification (i.e. target group verification) has yet to complete
+			if waitingForBlueGreenPostPromotionAnalysis(&ro) {
+				return v1alpha1.RolloutPhaseProgressing, "waiting for analysis to complete"
+			}
+			return v1alpha1.RolloutPhaseProgressing, "waiting for post-promotion verification to complete"
 		}
 	} else if ro.Spec.Strategy.Canary != nil {
 		if ro.Spec.Strategy.Canary.TrafficRouting == nil {
@@ -107,13 +120,23 @@ func CalculateRolloutPhase(spec v1alpha1.RolloutSpec, status v1alpha1.RolloutSta
 				return v1alpha1.RolloutPhaseProgressing, "old replicas are pending termination"
 			}
 		}
-		if ro.Status.StableRS == "" || ro.Status.StableRS != ro.Status.CurrentPodHash {
+		if ro.Status.StableRS == "" || !IsFullyPromoted(&ro) {
 			return v1alpha1.RolloutPhaseProgressing, "waiting for all steps to complete"
 		}
 	}
 	return v1alpha1.RolloutPhaseHealthy, ""
 }
 
+// waitingForBlueGreenPostPromotionAnalysis returns whether we are waiting for blue-green post-promotion analysis to complete
+func waitingForBlueGreenPostPromotionAnalysis(ro *v1alpha1.Rollout) bool {
+	if ro.Spec.Strategy.BlueGreen.PostPromotionAnalysis != nil {
+		if ro.Status.BlueGreen.PostPromotionAnalysisRunStatus == nil || !ro.Status.BlueGreen.PostPromotionAnalysisRunStatus.Status.Completed() {
+			return true
+		}
+	}
+	return false
+}
+
 // CanaryStepString returns a string representation of a canary step
 func CanaryStepString(c v1alpha1.CanaryStep) string {
 	if c.SetWeight != nil {
diff --git a/utils/rollout/rolloututil_test.go b/utils/rollout/rolloututil_test.go
index 0a2071e5a9..4fc15a4c63 100644
--- a/utils/rollout/rolloututil_test.go
+++ b/utils/rollout/rolloututil_test.go
@@ -71,6 +71,27 @@ func newBlueGreenRollout() *v1alpha1.Rollout {
 	}
 }
 
+func TestIsFullyPromoted(t *testing.T) {
+	{
+		ro := &v1alpha1.Rollout{
+			Status: v1alpha1.RolloutStatus{
+				StableRS:       "abc123",
+				CurrentPodHash: "abc123",
+			},
+		}
+		assert.True(t, IsFullyPromoted(ro))
+	}
+	{
+		ro := &v1alpha1.Rollout{
+			Status: v1alpha1.RolloutStatus{
+				StableRS:       "abc123",
+				CurrentPodHash: "def456",
+			},
+		}
+		assert.False(t, IsFullyPromoted(ro))
+	}
+}
+
 func TestRolloutStatusDegraded(t *testing.T) {
 	ro := newCanaryRollout()
 	ro.Status.Conditions = append(ro.Status.Conditions, v1alpha1.RolloutCondition{
@@ -137,6 +158,7 @@ func TestRolloutStatusProgressing(t *testing.T) {
 	}
 	{
 		ro := newBlueGreenRollout()
+		ro.Spec.Strategy.BlueGreen.PostPromotionAnalysis = &v1alpha1.RolloutAnalysis{}
 		ro.Status.BlueGreen.ActiveSelector = "def5678"
 		ro.Status.StableRS = "abc1234"
 		ro.Status.CurrentPodHash = "def5678"
@@ -148,6 +170,19 @@
 		assert.Equal(t, v1alpha1.RolloutPhaseProgressing, status)
 		assert.Equal(t, "waiting for 
analysis to complete", message) } + { + ro := newBlueGreenRollout() + ro.Status.BlueGreen.ActiveSelector = "def5678" + ro.Status.StableRS = "abc1234" + ro.Status.CurrentPodHash = "def5678" + ro.Spec.Replicas = pointer.Int32Ptr(5) + ro.Status.Replicas = 5 + ro.Status.UpdatedReplicas = 5 + ro.Status.AvailableReplicas = 5 + status, message := GetRolloutPhase(ro) + assert.Equal(t, v1alpha1.RolloutPhaseProgressing, status) + assert.Equal(t, "waiting for post-promotion verification to complete", message) + } { // Scenario when a newly created rollout has partially filled in status (with hashes) // but no updated replica count