Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

consider in-progress instances for UniformAcrossAZ update strategy #438

Merged
merged 13 commits into from
Mar 15, 2024
4 changes: 2 additions & 2 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ jobs:
uses: actions/checkout@v4

- name: Golangci-lint
uses: golangci/golangci-lint-action@v3
uses: golangci/golangci-lint-action@v4
with:
# Required: the version of golangci-lint is required and must be specified without patch version: we always use the latest patch version.
version: v1.53.2
version: v1.54
args: --timeout 3m

- name: Get kubebuilder
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ all: manager
# Run tests
ENVTEST_ASSETS_DIR=$(shell pwd)/testbin
test: manifests generate fmt vet envtest
go test ./controllers/... ./api/...
go test ./controllers/... ./api/... -coverprofile=coverage.txt
go tool cover -html=./coverage.txt -o cover.html

# Build manager binary
Expand Down
36 changes: 22 additions & 14 deletions controllers/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -192,15 +192,23 @@ type MockEC2 struct {

var _ ec2iface.EC2API = &MockEC2{}

func createASGInstance(instanceID string, launchConfigName string) *autoscaling.Instance {
func createASGInstance(instanceID string, launchConfigName string, az string) *autoscaling.Instance {
return &autoscaling.Instance{
InstanceId: &instanceID,
LaunchConfigurationName: &launchConfigName,
AvailabilityZone: aws.String("az-1"),
AvailabilityZone: aws.String(az),
LifecycleState: aws.String("InService"),
}
}

func createDriftedASGInstance(instanceID string, az string) *autoscaling.Instance {
return &autoscaling.Instance{
InstanceId: &instanceID,
AvailabilityZone: aws.String(az),
LifecycleState: aws.String("InService"),
}
}

func createASGInstanceWithLaunchTemplate(instanceID string, launchTemplateName string) *autoscaling.Instance {
return &autoscaling.Instance{
InstanceId: &instanceID,
Expand Down Expand Up @@ -231,9 +239,9 @@ func createASG(asgName string, launchConfigName string) *autoscaling.Group {
AutoScalingGroupName: &asgName,
LaunchConfigurationName: &launchConfigName,
Instances: []*autoscaling.Instance{
createASGInstance("mock-instance-1", launchConfigName),
createASGInstance("mock-instance-2", launchConfigName),
createASGInstance("mock-instance-3", launchConfigName),
createASGInstance("mock-instance-1", launchConfigName, "az-1"),
createASGInstance("mock-instance-2", launchConfigName, "az-2"),
createASGInstance("mock-instance-3", launchConfigName, "az-3"),
},
DesiredCapacity: func(x int) *int64 { i := int64(x); return &i }(3),
}
Expand All @@ -246,9 +254,9 @@ func createASGWithLaunchTemplate(asgName string, launchTemplate string) *autosca
LaunchTemplateName: &asgName,
},
Instances: []*autoscaling.Instance{
createASGInstance("mock-instance-1", launchTemplate),
createASGInstance("mock-instance-2", launchTemplate),
createASGInstance("mock-instance-3", launchTemplate),
createASGInstance("mock-instance-1", launchTemplate, "az-1"),
createASGInstance("mock-instance-2", launchTemplate, "az-2"),
createASGInstance("mock-instance-3", launchTemplate, "az-3"),
},
DesiredCapacity: func(x int) *int64 { i := int64(x); return &i }(3),
}
Expand All @@ -265,9 +273,9 @@ func createASGWithMixedInstanceLaunchTemplate(asgName string, launchTemplate str
},
},
Instances: []*autoscaling.Instance{
createASGInstance("mock-instance-1", launchTemplate),
createASGInstance("mock-instance-2", launchTemplate),
createASGInstance("mock-instance-3", launchTemplate),
createASGInstance("mock-instance-1", launchTemplate, "az-1"),
createASGInstance("mock-instance-2", launchTemplate, "az-2"),
createASGInstance("mock-instance-3", launchTemplate, "az-3"),
},
DesiredCapacity: func(x int) *int64 { i := int64(x); return &i }(3),
}
Expand All @@ -278,9 +286,9 @@ func createDriftedASG(asgName string, launchConfigName string) *autoscaling.Grou
AutoScalingGroupName: &asgName,
LaunchConfigurationName: &launchConfigName,
Instances: []*autoscaling.Instance{
createASGInstance("mock-instance-1", "different-launch-config"),
createASGInstance("mock-instance-2", "different-launch-config"),
createASGInstance("mock-instance-3", "different-launch-config"),
createDriftedASGInstance("mock-instance-1", "az-1"),
createDriftedASGInstance("mock-instance-2", "az-2"),
createDriftedASGInstance("mock-instance-3", "az-3"),
},
DesiredCapacity: func(x int) *int64 { i := int64(x); return &i }(3),
}
Expand Down
92 changes: 55 additions & 37 deletions controllers/upgrade.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,9 @@
inProcessingNodes = make(map[string]*v1alpha1.NodeInProcessing)
}

//Early-Cordon - Cordon all the nodes to avoid any further scheduling of new pods.
//Early-Cordon - Cordon all the nodes to prevent scheduling of new pods on older nodes.
if r.EarlyCordonNodes {
r.Info("early-cordon has been enabled, all the instances in the node group will be cordoned", "name", r.RollingUpgrade.NamespacedName())
if ok, err := r.CordonUncordonAllNodes(true); !ok {
return ok, err
}
Expand Down Expand Up @@ -443,9 +444,11 @@

func (r *RollingUpgradeContext) SelectTargets(scalingGroup *autoscaling.Group, excludedInstances []string) []*autoscaling.Instance {
var (
batchSize = r.RollingUpgrade.MaxUnavailable()
totalNodes = int(aws.Int64Value(scalingGroup.DesiredCapacity))
targets = make([]*autoscaling.Instance, 0)
batchSize = r.RollingUpgrade.MaxUnavailable()
totalNodes = int(aws.Int64Value(scalingGroup.DesiredCapacity))
inprogressTargets = make([]*autoscaling.Instance, 0)
unrpocessedTargets = make([]*autoscaling.Instance, 0)
finalTargets = make([]*autoscaling.Instance, 0)
)
unavailableInt := CalculateMaxUnavailable(batchSize, totalNodes)

Expand All @@ -458,64 +461,73 @@
if selectedInstance := awsprovider.SelectScalingGroupInstance(instance, scalingGroup); !reflect.DeepEqual(selectedInstance, &autoscaling.Instance{}) {
//In-progress instances shouldn't be considered if they are in terminating state.
if !common.ContainsEqualFold(awsprovider.TerminatingInstanceStates, aws.StringValue(selectedInstance.LifecycleState)) {
targets = append(targets, selectedInstance)
inprogressTargets = append(inprogressTargets, selectedInstance)
}
}
}

if len(targets) > 0 {
r.Info("found in-progress instances", "instances", awsprovider.GetInstanceIDs(targets))
if len(inprogressTargets) > 0 {
r.Info("found in-progress instances", "instances", awsprovider.GetInstanceIDs(inprogressTargets), "name", r.RollingUpgrade.NamespacedName())
}

// select via strategy if there are no in-progress instances
if r.RollingUpgrade.UpdateStrategyType() == v1alpha1.RandomUpdateStrategy {
for _, instance := range scalingGroup.Instances {
if r.IsInstanceDrifted(instance) && !common.ContainsEqualFold(awsprovider.GetInstanceIDs(targets), aws.StringValue(instance.InstanceId)) && !common.ContainsEqualFold(excludedInstances, aws.StringValue(instance.InstanceId)) {
targets = append(targets, instance)
// continue to select other instances, if any.
instances, err := r.Cloud.AmazonClientSet.DescribeInstancesWithoutTagValue(instanceStateTagKey, inProgressTagValue)
if err != nil {
r.Info("unable to select targets, will retry.", "name", r.RollingUpgrade.NamespacedName())
return nil
}

Check warning on line 478 in controllers/upgrade.go

View check run for this annotation

Codecov / codecov/patch

controllers/upgrade.go#L476-L478

Added lines #L476 - L478 were not covered by tests

for _, instance := range instances {
//don't consider instances when - terminating, empty, duplicates, excluded (errored our previously), not drifted.
if selectedInstance := awsprovider.SelectScalingGroupInstance(instance, scalingGroup); !reflect.DeepEqual(selectedInstance, &autoscaling.Instance{}) {
if r.IsInstanceDrifted(selectedInstance) && !common.ContainsEqualFold(awsprovider.TerminatingInstanceStates, aws.StringValue(selectedInstance.LifecycleState)) {
if !common.ContainsEqualFold(awsprovider.GetInstanceIDs(unrpocessedTargets), aws.StringValue(selectedInstance.InstanceId)) && !common.ContainsEqualFold(excludedInstances, aws.StringValue(selectedInstance.InstanceId)) {
unrpocessedTargets = append(unrpocessedTargets, selectedInstance)
}
}
}
if unavailableInt > len(targets) {
unavailableInt = len(targets)
}
return targets[:unavailableInt]
}

r.Info("found unprocessed instances", "instances", unrpocessedTargets, "name", r.RollingUpgrade.NamespacedName())

if r.RollingUpgrade.UpdateStrategyType() == v1alpha1.RandomUpdateStrategy {

finalTargets = append(inprogressTargets, unrpocessedTargets...)

} else if r.RollingUpgrade.UpdateStrategyType() == v1alpha1.UniformAcrossAzUpdateStrategy {
for _, instance := range scalingGroup.Instances {
if r.IsInstanceDrifted(instance) && !common.ContainsEqualFold(awsprovider.GetInstanceIDs(targets), aws.StringValue(instance.InstanceId)) && !common.ContainsEqualFold(excludedInstances, aws.StringValue(instance.InstanceId)) {
targets = append(targets, instance)
}
}

var AZtargets = make([]*autoscaling.Instance, 0)
var uniformAZTargets = make([]*autoscaling.Instance, 0)

// split targets into groups based on their AZ
targetsByAZ := map[string][]*autoscaling.Instance{}
for _, target := range targets {
targetsByAZMap := map[string][]*autoscaling.Instance{}
for _, target := range unrpocessedTargets {
az := aws.StringValue(target.AvailabilityZone)
targetsByAZ[az] = append(targetsByAZ[az], target)
targetsByAZMap[az] = append(targetsByAZMap[az], target)
}

// round-robin across the AZs with targets uniformly first and then best effort with remaining
for {
if len(AZtargets) == len(targets) {
if len(uniformAZTargets) == len(unrpocessedTargets) {
break
}

for az := range targetsByAZ {
targetsByAZGroupSize := len(targetsByAZ[az])
for az := range targetsByAZMap {
targetsByAZGroupSize := len(targetsByAZMap[az])
if targetsByAZGroupSize > 0 {
AZtargets = append(AZtargets, targetsByAZ[az][targetsByAZGroupSize-1]) // append last target
targetsByAZ[az] = targetsByAZ[az][:targetsByAZGroupSize-1] // pop last target
uniformAZTargets = append(uniformAZTargets, targetsByAZMap[az][targetsByAZGroupSize-1]) // append last target
targetsByAZMap[az] = targetsByAZMap[az][:targetsByAZGroupSize-1] // pop last target
}
}
}

if unavailableInt > len(AZtargets) {
unavailableInt = len(AZtargets)
}
return AZtargets[:unavailableInt]
finalTargets = append(inprogressTargets, uniformAZTargets...)
}

if unavailableInt > len(finalTargets) {
unavailableInt = len(finalTargets)
}
return targets

return finalTargets[:unavailableInt]
}

func (r *RollingUpgradeContext) IsInstanceDrifted(instance *autoscaling.Instance) bool {
Expand Down Expand Up @@ -559,6 +571,7 @@
}
} else if scalingGroup.LaunchTemplate != nil {
if instance.LaunchTemplate == nil {
r.Info("instance is drifted, instance launchtemplate is empty", "name", r.RollingUpgrade.NamespacedName())
return true
}

Expand All @@ -575,13 +588,16 @@
}

if !strings.EqualFold(launchTemplateName, instanceTemplateName) {
r.Info("instance is drifted, mismatch in launchtemplate name", "instanceID", instanceID, "instanceLT", instanceTemplateName, "asgLT", launchTemplateName, "name", r.RollingUpgrade.NamespacedName())
return true
} else if !strings.EqualFold(instanceTemplateVersion, templateVersion) {
r.Info("instance is drifted, mismatch in launchtemplate version", "instanceID", instanceID, "instanceLT-version", instanceTemplateVersion, "asgLT-version", templateVersion, "name", r.RollingUpgrade.NamespacedName())

Check warning on line 594 in controllers/upgrade.go

View check run for this annotation

Codecov / codecov/patch

controllers/upgrade.go#L594

Added line #L594 was not covered by tests
return true
}

} else if scalingGroup.MixedInstancesPolicy != nil {
if instance.LaunchTemplate == nil {
r.Info("instance is drifted, instance launchtemplate is empty", "name", r.RollingUpgrade.NamespacedName())
return true
}

Expand All @@ -598,8 +614,10 @@
}

if !strings.EqualFold(launchTemplateName, instanceTemplateName) {
r.Info("instance is drifted, mismatch in launchtemplate name", "instanceID", instanceID, "instanceLT", instanceTemplateName, "asgLT", launchTemplateName, "name", r.RollingUpgrade.NamespacedName())
return true
} else if !strings.EqualFold(instanceTemplateVersion, templateVersion) {
r.Info("instance is drifted, mismatch in launchtemplate version", "instanceID", instanceID, "instanceLT-version", instanceTemplateVersion, "asgLT-version", templateVersion, "name", r.RollingUpgrade.NamespacedName())

Check warning on line 620 in controllers/upgrade.go

View check run for this annotation

Codecov / codecov/patch

controllers/upgrade.go#L620

Added line #L620 was not covered by tests
return true
}
}
Expand Down Expand Up @@ -773,12 +791,12 @@
} else {
instanceIDs, err = r.Auth.DescribeTaggedInstanceIDs(instanceStateTagKey, earlyCordonedTagValue)
if err != nil {
r.Error(err, "failed to discover ec2 instances with early-cordoned tag", "name", r.RollingUpgrade.NamespacedName())
r.Info("failed to discover ec2 instances with early-cordoned tag", "name", r.RollingUpgrade.NamespacedName())

Check warning on line 794 in controllers/upgrade.go

View check run for this annotation

Codecov / codecov/patch

controllers/upgrade.go#L794

Added line #L794 was not covered by tests
}

r.Info("removing early-cordoning tag while uncordoning instances", "name", r.RollingUpgrade.NamespacedName())
if err := r.Auth.UntagEC2instances(instanceIDs, instanceStateTagKey, earlyCordonedTagValue); err != nil {
r.Error(err, "failed to delete early-cordoned tag for instances", "name", r.RollingUpgrade.NamespacedName())
r.Info("failed to delete early-cordoned tag for instances", "name", r.RollingUpgrade.NamespacedName())

Check warning on line 799 in controllers/upgrade.go

View check run for this annotation

Codecov / codecov/patch

controllers/upgrade.go#L799

Added line #L799 was not covered by tests
}
// add unit test as well.

Expand Down
Loading
Loading