From 7c6a0c519f226afa9638610837e75bf5adfb09b9 Mon Sep 17 00:00:00 2001
From: harikrongali <81331774+harikrongali@users.noreply.github.com>
Date: Mon, 23 Aug 2021 13:12:07 -0700
Subject: [PATCH] fix: canary scaledown event could violate maxUnavailable
 (#1429)

Signed-off-by: hari rongali
---
 test/e2e/canary_test.go         | 58 +++++++++++++++++++-
 utils/replicaset/canary.go      | 93 +++++++++++++++++++++++++--------
 utils/replicaset/canary_test.go | 16 ++++++
 3 files changed, 143 insertions(+), 24 deletions(-)

diff --git a/test/e2e/canary_test.go b/test/e2e/canary_test.go
index acc0f0d600..07fb2e523d 100644
--- a/test/e2e/canary_test.go
+++ b/test/e2e/canary_test.go
@@ -112,6 +112,58 @@ func (s *CanarySuite) TestRolloutScalingWhenPaused() {
 		ExpectCanaryStablePodCount(1, 3)
 }
 
+// TestRolloutWithMaxSurgeScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update and with maxSurge 100%
+func (s *CanarySuite) TestRolloutWithMaxSurgeScalingDuringUpdate() {
+	s.Given().
+		HealthyRollout(`
+apiVersion: argoproj.io/v1alpha1
+kind: Rollout
+metadata:
+  name: updatescaling
+spec:
+  replicas: 4
+  strategy:
+    canary:
+      maxSurge: 100%
+  selector:
+    matchLabels:
+      app: updatescaling
+  template:
+    metadata:
+      labels:
+        app: updatescaling
+    spec:
+      containers:
+      - name: updatescaling
+        image: nginx:1.19-alpine
+        resources:
+          requests:
+            memory: 16Mi
+            cpu: 1m`).
+		When().
+		PatchSpec(`
+spec:
+  template:
+    spec:
+      containers:
+      - name: updatescaling
+        command: [/bad-command]`).
+		WaitForRolloutReplicas(7).
+		Then().
+		ExpectCanaryStablePodCount(4, 3).
+		When().
+		ScaleRollout(8).
+		WaitForRolloutReplicas(11).
+		Then().
+		ExpectCanaryStablePodCount(8, 3).
+		When().
+		ScaleRollout(4).
+		WaitForRolloutReplicas(7).
+		Then().
+		ExpectCanaryStablePodCount(4, 3)
+}
+
 // TestRolloutScalingDuringUpdate verifies behavior when scaling a rollout up/down in middle of update
 func (s *CanarySuite) TestRolloutScalingDuringUpdate() {
 	s.Given().
@@ -160,8 +212,10 @@ spec:
 		// See: https://github.com/argoproj/argo-rollouts/issues/738
 		ExpectCanaryStablePodCount(6, 4).
 		When().
-		ScaleRollout(4)
-	// WaitForRolloutReplicas(4) // this doesn't work yet (bug)
+		ScaleRollout(4).
+		WaitForRolloutReplicas(6).
+		Then().
+		ExpectCanaryStablePodCount(2, 4)
 }
 
 // TestReduceWeightAndHonorMaxUnavailable verifies we honor maxUnavailable when decreasing weight or aborting
diff --git a/utils/replicaset/canary.go b/utils/replicaset/canary.go
index e03b9bdeab..2068e4b983 100644
--- a/utils/replicaset/canary.go
+++ b/utils/replicaset/canary.go
@@ -175,12 +175,12 @@ func CalculateReplicaCountsForCanary(rollout *v1alpha1.Rollout, newRS *appsv1.Re
 	}
 	minAvailableReplicaCount := rolloutSpecReplica - MaxUnavailable(rollout)
+
 	// isIncreasing indicates if we are supposed to be increasing our canary replica count.
 	// If so, we can ignore pod availability of the stableRS. Otherwise, if we are reducing our
 	// weight (e.g. we are aborting), then we can ignore pod availability of the canaryRS.
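+	// For example (hypothetical counts): if the canary currently has 3 replicas, a desired count of 5 means we
+	// are increasing; a desired count of 2 (weight reduced or rollout aborted) means we are not.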
 	isIncreasing := newRS == nil || desiredNewRSReplicaCount >= *newRS.Spec.Replicas
 	replicasToScaleDown := GetReplicasForScaleDown(newRS, !isIncreasing) + GetReplicasForScaleDown(stableRS, isIncreasing)
-
 	if replicasToScaleDown <= minAvailableReplicaCount {
 		// Cannot scale down stableRS or newRS without going below min available replica count
 		return newRSReplicaCount, stableRSReplicaCount
@@ -188,31 +188,80 @@ func CalculateReplicaCountsForCanary(rollout *v1alpha1.Rollout, newRS *appsv1.Re
 	scaleDownCount := replicasToScaleDown - minAvailableReplicaCount
 
-	if newRS != nil && *newRS.Spec.Replicas > desiredNewRSReplicaCount {
-		// if the controller doesn't have to use every replica to achieve the desired count, it only scales down to the
-		// desired count.
-		if *newRS.Spec.Replicas-scaleDownCount < desiredNewRSReplicaCount {
-			newRSReplicaCount = desiredNewRSReplicaCount
-			// Calculating how many replicas were used to scale down to the desired count
-			scaleDownCount = scaleDownCount - (*newRS.Spec.Replicas - desiredNewRSReplicaCount)
-		} else {
-			// The controller is using every replica it can to get closer to desired state.
-			newRSReplicaCount = *newRS.Spec.Replicas - scaleDownCount
-			scaleDownCount = 0
-		}
+	if !isIncreasing {
+		// Skip scaling down the stable ReplicaSet, since canary availability is not taken into the scale-down calculation
+		newRSReplicaCount = calculateScaleDownReplicaCount(newRS, desiredNewRSReplicaCount, scaleDownCount, newRSReplicaCount)
+		newRSReplicaCount, stableRSReplicaCount = adjustReplicaWithinLimits(newRS, stableRS, newRSReplicaCount, stableRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
+	} else {
+		// Skip scaling down the canary ReplicaSet, since stable availability is not taken into the scale-down calculation
+		stableRSReplicaCount = calculateScaleDownReplicaCount(stableRS, desiredStableRSReplicaCount, scaleDownCount, stableRSReplicaCount)
+		stableRSReplicaCount, newRSReplicaCount = adjustReplicaWithinLimits(stableRS, newRS, stableRSReplicaCount, newRSReplicaCount, maxReplicaCountAllowed, minAvailableReplicaCount)
 	}
+	return newRSReplicaCount, stableRSReplicaCount
+}
 
-	if scaleStableRS && *stableRS.Spec.Replicas > desiredStableRSReplicaCount {
-		// This follows the same logic as scaling down the newRS except with the stableRS and it does not need to
-		// set the scaleDownCount again since it's not used again
-		if *stableRS.Spec.Replicas-scaleDownCount < desiredStableRSReplicaCount {
-			stableRSReplicaCount = desiredStableRSReplicaCount
-		} else {
-			stableRSReplicaCount = *stableRS.Spec.Replicas - scaleDownCount
-		}
+
+// calculateScaleDownReplicaCount calculates drainRSReplicaCount
+// drainRSReplicaCount can be either the stableRS count or the canaryRS count
+// drainRSReplicaCount corresponds to the RS whose availability is not considered when calculating replicasToScaleDown
+func calculateScaleDownReplicaCount(drainRS *appsv1.ReplicaSet, desireRSReplicaCount int32, scaleDownCount int32, drainRSReplicaCount int32) int32 {
+	if drainRS != nil && *drainRS.Spec.Replicas > desireRSReplicaCount {
+		// if the controller doesn't have to use every replica to achieve the desired count,
+		// it can scale down to the desired count, or otherwise get as close to the desired state as possible.
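+		// For example (hypothetical numbers): with desireRSReplicaCount=4, *drainRS.Spec.Replicas=10 and
+		// scaleDownCount=3, maxValue(4, 10-3) keeps 7 replicas: only 3 can be drained without dropping
+		// below the minimum available count, so the RS stops short of the desired count of 4.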
+		drainRSReplicaCount = maxValue(desireRSReplicaCount, *drainRS.Spec.Replicas-scaleDownCount)
 	}
+	return drainRSReplicaCount
+}
 
-	return newRSReplicaCount, stableRSReplicaCount
+// adjustReplicaWithinLimits adjusts the replica counters to stay within the maxSurge & maxUnavailable limits
+// drainRSReplicaCount corresponds to the RS whose availability is not considered when calculating replicasToScaleDown
+// adjustRSReplicaCount corresponds to the RS whose availability is taken into account while adjusting for the maxUnavailable limit
+func adjustReplicaWithinLimits(drainRS *appsv1.ReplicaSet, adjustRS *appsv1.ReplicaSet, drainRSReplicaCount int32, adjustRSReplicaCount int32, maxReplicaCountAllowed int32, minAvailableReplicaCount int32) (int32, int32) {
+	extraAvailableAdjustRS := int32(0)
+	totalAvailableReplicas := int32(0)
+	// calculates how far the current total is over the allowed value
+	overTheLimitVal := maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
+	if drainRS != nil {
+		totalAvailableReplicas = totalAvailableReplicas + minValue(drainRS.Status.AvailableReplicas, drainRSReplicaCount)
+	}
+	if adjustRS != nil {
+		// 1. adjust adjustRSReplicaCount to be within maxSurge
+		adjustRSReplicaCount = adjustRSReplicaCount - overTheLimitVal
+		// 2. Calculate availability corresponding to the adjusted count
+		totalAvailableReplicas = totalAvailableReplicas + minValue(adjustRS.Status.AvailableReplicas, adjustRSReplicaCount)
+		// 3. Calculate the decrease in availability of adjustRS caused by (1)
+		extraAvailableAdjustRS = maxValue(0, adjustRS.Status.AvailableReplicas-adjustRSReplicaCount)
+
+		// 4. Now calculate how far the count is from the maxUnavailable limit
+		moreToNeedAvailableReplicas := maxValue(0, minAvailableReplicaCount-totalAvailableReplicas)
+		// 5. From (3), we have the decrease in availability caused by (1);
+		// take the min of (3) & (4) and add it back to adjustRS.
+		// The remainder of moreToNeedAvailableReplicas can be ignored as it is part of drainRS;
+		// drainRS cannot cause a deviation from the maxUnavailable limit because, in that case,
+		// the scale-down calculation won't even occur: we already check
+		// replicasToScaleDown <= minAvailableReplicaCount in the caller function
+		adjustRSReplicaCount = adjustRSReplicaCount + minValue(extraAvailableAdjustRS, moreToNeedAvailableReplicas)
+		// 6. Calculate the final overTheLimitVal resulting from the adjustment
+		overTheLimitVal = maxValue(0, adjustRSReplicaCount+drainRSReplicaCount-maxReplicaCountAllowed)
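+		// Worked example (hypothetical values): with drainRSReplicaCount=2, adjustRSReplicaCount=10,
+		// maxReplicaCountAllowed=11, minAvailableReplicaCount=10, 2 available pods in drainRS and 8 in adjustRS,
+		// steps (1)-(6) trim adjustRS to 9 and leave overTheLimitVal at 0, so the subtraction below keeps
+		// drainRS at 2, honoring both maxSurge (9+2 <= 11) and maxUnavailable (2+8 >= 10 available).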
+		// 7. we can safely subtract overTheLimitVal from drainRS; other cases, like deviating from the maxUnavailable limit,
+		// won't occur, as explained in (5)
+		drainRSReplicaCount = drainRSReplicaCount - overTheLimitVal
+	}
+
+	return drainRSReplicaCount, adjustRSReplicaCount
+}
+
+func minValue(countA int32, countB int32) int32 {
+	if countA > countB {
+		return countB
+	}
+	return countA
+}
+
+func maxValue(countA int32, countB int32) int32 {
+	if countA < countB {
+		return countB
+	}
+	return countA
 }
 
 // BeforeStartingStep checks if canary rollout is at the starting step
diff --git a/utils/replicaset/canary_test.go b/utils/replicaset/canary_test.go
index 65df64f2a6..28e3f3e3c3 100644
--- a/utils/replicaset/canary_test.go
+++ b/utils/replicaset/canary_test.go
@@ -339,6 +339,22 @@ func TestCalculateReplicaCountsForCanary(t *testing.T) {
 			expectedStableReplicaCount: 7,
 			expectedCanaryReplicaCount: 3,
 		},
+		{
+			name:                "Scale down stable and canary available",
+			rolloutSpecReplicas: 10,
+			setWeight:           100,
+			maxSurge:            intstr.FromInt(1),
+			maxUnavailable:      intstr.FromInt(0),
+
+			stableSpecReplica:      10,
+			stableAvailableReplica: 2,
+
+			canarySpecReplica:      10,
+			canaryAvailableReplica: 8,
+
+			expectedStableReplicaCount: 2,
+			expectedCanaryReplicaCount: 9,
+		},
 		{
 			name:                "Do not scale down newRS or stable when older RS count >= scaleDownCount",
 			rolloutSpecReplicas: 10,