From 7c6a0c519f226afa9638610837e75bf5adfb09b9 Mon Sep 17 00:00:00 2001
From: harikrongali <81331774+harikrongali@users.noreply.github.com>
Date: Mon, 23 Aug 2021 13:12:07 -0700
Subject: [PATCH] fix: canary scaledown event could violate maxUnavailable
 (#1429)

Signed-off-by: hari rongali
---
 test/e2e/canary_test.go         | 58 +++++++++++++++++++-
 utils/replicaset/canary.go      | 93 +++++++++++++++++++++++++--------
 utils/replicaset/canary_test.go | 16 ++++++
 3 files changed, 143 insertions(+), 24 deletions(-)

diff --git a/test/e2e/canary_test.go b/test/e2e/canary_test.go
index acc0f0d600..07fb2e523d 100644
--- a/test/e2e/canary_test.go
+++ b/test/e2e/canary_test.go
@@ -112,6 +112,58 @@ func (s *CanarySuite) TestRolloutScalingWhenPaused() {
 		ExpectCanaryStablePodCount(1, 3)
 }
 
+// TestRolloutWithMaxSurgeScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update and with maxSurge 100%
+func (s *CanarySuite) TestRolloutWithMaxSurgeScalingDuringUpdate() {
+	s.Given().
+		HealthyRollout(`
+apiVersion: argoproj.io/v1alpha1
+kind: Rollout
+metadata:
+  name: updatescaling
+spec:
+  replicas: 4
+  strategy:
+    canary:
+      maxSurge: 100%
+  selector:
+    matchLabels:
+      app: updatescaling
+  template:
+    metadata:
+      labels:
+        app: updatescaling
+    spec:
+      containers:
+      - name: updatescaling
+        image: nginx:1.19-alpine
+        resources:
+          requests:
+            memory: 16Mi
+            cpu: 1m`).
+		When().
+		PatchSpec(`
+spec:
+  template:
+    spec:
+      containers:
+      - name: updatescaling
+        command: [/bad-command]`).
+		WaitForRolloutReplicas(7).
+		Then().
+		ExpectCanaryStablePodCount(4, 3).
+		When().
+		ScaleRollout(8).
+		WaitForRolloutReplicas(11).
+		Then().
+		ExpectCanaryStablePodCount(8, 3).
+		When().
+		ScaleRollout(4).
+		WaitForRolloutReplicas(7).
+		Then().
+		ExpectCanaryStablePodCount(4, 3)
+}
+
 // TestRolloutScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update
 func (s *CanarySuite) TestRolloutScalingDuringUpdate() {
 	s.Given().
@@ -160,8 +212,10 @@ spec:
 		// See: https://github.com/argoproj/argo-rollouts/issues/738
 		ExpectCanaryStablePodCount(6, 4).
 		When().
-		ScaleRollout(4)
-	// WaitForRolloutReplicas(4) // this doesn't work yet (bug)
+		ScaleRollout(4).
+		WaitForRolloutReplicas(6).
+		Then().
+		ExpectCanaryStablePodCount(2, 4)
 }
 
 // TestReduceWeightAndHonorMaxUnavailable verifies we honor maxUnavailable when decreasing weight or aborting
diff --git a/utils/replicaset/canary.go b/utils/replicaset/canary.go
index e03b9bdeab..2068e4b983 100644
--- a/utils/replicaset/canary.go
+++ b/utils/replicaset/canary.go
@@ -175,12 +175,12 @@ func CalculateReplicaCountsForCanary(rollout *v1alpha1.Rollout, newRS *appsv1.Re
 	}
 	minAvailableReplicaCount := rolloutSpecReplica - MaxUnavailable(rollout)
+
 	// isIncreasing indicates if we are supposed to be increasing our canary replica count.
 	// If so, we can ignore pod availability of the stableRS. Otherwise, if we are reducing our
 	// weight (e.g. we are aborting), then we can ignore pod availability of the canaryRS.
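+	// For example (hypothetical counts): if the canary currently has 3 replicas, a desired count of 5 means we
+	// are increasing; a desired count of 2 (weight reduced or rollout aborted) means we are not.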
 	isIncreasing := newRS == nil || desiredNewRSReplicaCount >= *newRS.Spec.Replicas
 	replicasToScaleDown := GetReplicasForScaleDown(newRS, !isIncreasing) + GetReplicasForScaleDown(stableRS, isIncreasing)
-
 	if replicasToScaleDown <= minAvailableReplicaCount {
 		// Cannot scale down stableRS or newRS without going below min available replica count
 		return newRSReplicaCount, stableRSReplicaCount
@@ -188,31 +188,80 @@ func CalculateReplicaCountsForCanary(rollout *v1alpha1.Rollout, newRS *appsv1.Re
 	scaleDownCount := replicasToScaleDown - minAvailableReplicaCount
 
-	if newRS != nil && *newRS.Spec.Replicas > desiredNewRSReplicaCount {
-		// if the controller doesn't have to use every replica to achieve the desired count, it only scales down to the
-		// desired count.
-		if *newRS.Spec.Replicas-scaleDownCount < desiredNewRSReplicaCount {
-			newRSReplicaCount = desiredNewRSReplicaCount
-			// Calculating how many replicas were used to scale down to the desired count
-			scaleDownCount = scaleDownCount - (*newRS.Spec.Replicas - desiredNewRSReplicaCount)
-		} else {
-			// The controller is using every replica it can to get closer to desired state.
-			newRSReplicaCount = *newRS.Spec.Replicas - scaleDownCount
-			scaleDownCount = 0
-		}
+	if !isIncreasing {
+		// Skip scaling down the stable ReplicaSet, since canary availability is not taken into the scale-down calculation
+		newRSReplicaCount = calculateScaleDownReplicaCount(newRS, desiredNewRSReplicaCount, scaleDownCount, newRSReplicaCount)
+		newRSReplicaCount, stableRSReplicaCount = adjustReplicaWithinLimits(newRS, stableRS, newRSReplicaCount, stableRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
+	} else {
+		// Skip scaling down the canary ReplicaSet, since stable availability is not taken into the scale-down calculation
+		stableRSReplicaCount = calculateScaleDownReplicaCount(stableRS, desiredStableRSReplicaCount, scaleDownCount, stableRSReplicaCount)
+		stableRSReplicaCount, newRSReplicaCount = adjustReplicaWithinLimits(stableRS, newRS, stableRSReplicaCount, newRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
 	}
+	return newRSReplicaCount, stableRSReplicaCount
+}
 
-	if scaleStableRS && *stableRS.Spec.Replicas > desiredStableRSReplicaCount {
-		// This follows the same logic as scaling down the newRS except with the stableRS and it does not need to
-		// set the scaleDownCount again since it's not used again
-		if *stableRS.Spec.Replicas-scaleDownCount < desiredStableRSReplicaCount {
-			stableRSReplicaCount = desiredStableRSReplicaCount
-		} else {
-			stableRSReplicaCount = *stableRS.Spec.Replicas - scaleDownCount
-		}
+
+// calculateScaleDownReplicaCount calculates drainRSReplicaCount
+// drainRSReplicaCount can be either the stableRS count or the canaryRS count
+// drainRSReplicaCount corresponds to the RS whose availability is not considered when calculating replicasToScaleDown
+func calculateScaleDownReplicaCount(drainRS *appsv1.ReplicaSet, desireRSReplicaCount int32, scaleDownCount int32, drainRSReplicaCount int32) int32 {
+	if drainRS != nil && *drainRS.Spec.Replicas > desireRSReplicaCount {
+		// if the controller doesn't have to use every replica to achieve the desired count,
+		// it can scale down to the desired count, or otherwise get as close to the desired state as possible.
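+		// For example (hypothetical numbers): with desireRSReplicaCount=4, *drainRS.Spec.Replicas=10 and
+		// scaleDownCount=3, maxValue(4, 10-3) keeps 7 replicas: only 3 can be drained without dropping
+		// below the minimum available count, so the RS stops short of the desired count of 4.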
+		drainRSReplicaCount = maxValue(desireRSReplicaCount, *drainRS.Spec.Replicas-scaleDownCount)
 	}
+	return drainRSReplicaCount
+}
 
-	return newRSReplicaCount, stableRSReplicaCount
+// adjustReplicaWithinLimits adjusts the replica counters to stay within the maxSurge & maxUnavailable limits
+// drainRSReplicaCount corresponds to the RS whose availability is not considered when calculating replicasToScaleDown
+// adjustRSReplicaCount corresponds to the RS whose availability is taken into account while adjusting for the maxUnavailable limit
+func adjustReplicaWithinLimits(drainRS *appsv1.ReplicaSet, adjustRS *appsv1.ReplicaSet, drainRSReplicaCount int32, adjustRSReplicaCount int32, maxReplicaCountAllowed int32, minAvailableReplicaCount int32) (int32, int32) {
+	extraAvailableAdjustRS := int32(0)
+	totalAvailableReplicas := int32(0)
+	// calculates how far the current total is over the allowed value
+	overTheLimitVal := maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
+	if drainRS != nil {
+		totalAvailableReplicas = totalAvailableReplicas + minValue(drainRS.Status.AvailableReplicas, drainRSReplicaCount)
+	}
+	if adjustRS != nil {
+		// 1. adjust adjustRSReplicaCount to be within maxSurge
+		adjustRSReplicaCount = adjustRSReplicaCount - overTheLimitVal
+		// 2. Calculate availability corresponding to the adjusted count
+		totalAvailableReplicas = totalAvailableReplicas + minValue(adjustRS.Status.AvailableReplicas, adjustRSReplicaCount)
+		// 3. Calculate the decrease in availability of adjustRS caused by (1)
+		extraAvailableAdjustRS = maxValue(0, adjustRS.Status.AvailableReplicas-adjustRSReplicaCount)
+
+		// 4. Now calculate how far the count is from the maxUnavailable limit
+		moreToNeedAvailableReplicas := maxValue(0, minAvailableReplicaCount-totalAvailableReplicas)
+		// 5. From (3), we have the decrease in availability caused by (1);
+		// take the min of (3) & (4) and add it back to adjustRS.
+		// The remainder of moreToNeedAvailableReplicas can be ignored as it is part of drainRS;
+		// drainRS cannot cause a deviation from the maxUnavailable limit because, in that case,
+		// the scale-down calculation won't even occur: we already check
+		// replicasToScaleDown <= minAvailableReplicaCount in the caller function
+		adjustRSReplicaCount = adjustRSReplicaCount + minValue(extraAvailableAdjustRS, moreToNeedAvailableReplicas)
+		// 6. Calculate the final overTheLimitVal resulting from the adjustment
+		overTheLimitVal = maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
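+		// Worked example (hypothetical values): with drainRSReplicaCount=2, adjustRSReplicaCount=10,
+		// maxReplicaCountAllowed=11, minAvailableReplicaCount=10, 2 available pods in drainRS and 8 in adjustRS,
+		// steps (1)-(6) trim adjustRS to 9 and leave overTheLimitVal at 0, so the subtraction below keeps
+		// drainRS at 2, honoring both maxSurge (9+2 <= 11) and maxUnavailable (2+8 >= 10 available).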
+		// 7. we can safely subtract overTheLimitVal from drainRS; other cases, like deviating from the maxUnavailable limit,
+		// won't occur, as explained in (5)
+		drainRSReplicaCount = drainRSReplicaCount - overTheLimitVal
+	}
+
+	return drainRSReplicaCount, adjustRSReplicaCount
+}
+
+func minValue(countA int32, countB int32) int32 {
+	if countA > countB {
+		return countB
+	}
+	return countA
+}
+
+func maxValue(countA int32, countB int32) int32 {
+	if countA < countB {
+		return countB
+	}
+	return countA
 }
 
 // BeforeStartingStep checks if canary rollout is at the starting step
diff --git a/utils/replicaset/canary_test.go b/utils/replicaset/canary_test.go
index 65df64f2a6..28e3f3e3c3 100644
--- a/utils/replicaset/canary_test.go
+++ b/utils/replicaset/canary_test.go
@@ -339,6 +339,22 @@ func TestCalculateReplicaCountsForCanary(t *testing.T) {
 			expectedStableReplicaCount: 7,
 			expectedCanaryReplicaCount: 3,
 		},
+		{
+			name:                "Scale down stable and canary available",
+			rolloutSpecReplicas: 10,
+			setWeight:           100,
+			maxSurge:            intstr.FromInt(1),
+			maxUnavailable:      intstr.FromInt(0),
+
+			stableSpecReplica:      10,
+			stableAvailableReplica: 2,
+
+			canarySpecReplica:      10,
+			canaryAvailableReplica: 8,
+
+			expectedStableReplicaCount: 2,
+			expectedCanaryReplicaCount: 9,
+		},
 		{
 			name:                "Do not scale down newRS or stable when older RS count >= scaleDownCount",
 			rolloutSpecReplicas: 10,