
Commit

Change handling of scale up options for ZeroToMaxNodeScaling in orchestrator

* Started handling scale up options for ZeroToMaxNodeScaling with the existing estimator (sketched below)
* Skip setting similar node groups for node groups that use ZeroToMaxNodeScaling
* Renamed the autoscaling option from "AtomicScaleUp" to "AtomicScaling"
* Merged multiple tests into a single table-driven test
* Fixed some typos
hbostan committed Jun 30, 2023
1 parent 79c611c commit 8ba34ea
Showing 3 changed files with 26 additions and 38 deletions.
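For orientation before the diffs, here is a minimal standalone sketch of the two sizing rules this change introduces, as referenced in the commit message above. It is not the cluster-autoscaler code itself: nodeGroup, scaleUpNodeCount, and resourceDelta are illustrative stand-ins for cloudprovider.NodeGroup and the orchestrator logic shown in the diffs below.

package main

import "fmt"

// nodeGroup is a simplified stand-in for cloudprovider.NodeGroup.
type nodeGroup struct {
	maxSize       int
	atomicScaling bool
}

// scaleUpNodeCount mirrors the new ComputeExpansionOption behaviour: keep the
// binpacking estimate as-is for regular groups, but replace any non-zero
// estimate that differs from the group's maximum with the maximum when the
// group scales atomically.
func scaleUpNodeCount(ng nodeGroup, estimated int) int {
	if ng.atomicScaling && estimated > 0 && estimated != ng.maxSize {
		return ng.maxSize
	}
	return estimated
}

// resourceDelta mirrors the reworked IsNodeGroupResourceExceeded: the per-node
// resource delta is multiplied by the number of nodes being added (numNodes),
// which is 1 for regular groups and maxSize minus the current target size for
// atomic ones.
func resourceDelta(perNode map[string]int64, numNodes int) map[string]int64 {
	total := make(map[string]int64, len(perNode))
	for r, d := range perNode {
		total[r] = d * int64(numNodes)
	}
	return total
}

func main() {
	ng := nodeGroup{maxSize: 5, atomicScaling: true}
	nodes := scaleUpNodeCount(ng, 2) // a 2-node estimate becomes 5 for an atomic group
	fmt.Println("nodes to add:", nodes)
	fmt.Println("resource delta:", resourceDelta(map[string]int64{"cpu": 4, "memory": 16}, nodes))
}

Running the sketch prints 5 nodes and a cpu/memory delta scaled by 5, which is the all-or-nothing behaviour ZeroToMaxNodeScaling is meant to enforce.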
cluster-autoscaler/config/autoscaling_options.go (4 changes: 2 additions & 2 deletions)
@@ -46,8 +46,8 @@ type NodeGroupAutoscalingOptions struct {
ScaleDownUnreadyTime time.Duration
// Maximum time CA waits for node to be provisioned
MaxNodeProvisionTime time.Duration
// AtomicScaleUp means that a node group should be scaled up to maximum size in a scale up.
AtomicScaleUp bool
// AtomicScaling means that a node group should be scaled up or down all at once instead of one-by-one
AtomicScaling bool
}

// GCEOptions contain autoscaling options specific to GCE cloud provider.
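The orchestrator changes below read this option through the node group's GetOptions and, when AtomicScaling is set, skip balancing across similar node groups. A standalone sketch of that read-side pattern follows; only the GetOptions and AtomicScaling names are taken from the diff, and the types are simplified stand-ins rather than the real NodeGroupAutoscalingOptions and cloudprovider.NodeGroup.

package main

import "fmt"

// autoscalingOptions and nodeGroup are simplified stand-ins for
// NodeGroupAutoscalingOptions and cloudprovider.NodeGroup.
type autoscalingOptions struct {
	AtomicScaling bool
}

type nodeGroup struct {
	id   string
	opts *autoscalingOptions
}

// GetOptions mimics the per-group options lookup used in the orchestrator:
// a group without its own options falls back to the cluster-wide defaults.
func (ng nodeGroup) GetOptions(defaults autoscalingOptions) *autoscalingOptions {
	if ng.opts == nil {
		return &defaults
	}
	return ng.opts
}

// similarNodeGroups returns the candidates eligible for balancing; groups with
// AtomicScaling opt out, as the new check in ComputeSimilarNodeGroups does.
func similarNodeGroups(ng nodeGroup, defaults autoscalingOptions, candidates []string) []string {
	if opts := ng.GetOptions(defaults); opts != nil && opts.AtomicScaling {
		return nil
	}
	return candidates
}

func main() {
	defaults := autoscalingOptions{}
	atomic := nodeGroup{id: "ng-atomic", opts: &autoscalingOptions{AtomicScaling: true}}
	regular := nodeGroup{id: "ng-regular"}
	fmt.Println(similarNodeGroups(atomic, defaults, []string{"ng-b", "ng-c"}))  // []
	fmt.Println(similarNodeGroups(regular, defaults, []string{"ng-b", "ng-c"})) // [ng-b ng-c]
}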
cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go (58 changes: 23 additions & 35 deletions)
@@ -32,7 +32,6 @@ import (
"k8s.io/autoscaler/cluster-autoscaler/core/scaleup/equivalence"
"k8s.io/autoscaler/cluster-autoscaler/core/scaleup/resource"
"k8s.io/autoscaler/cluster-autoscaler/core/utils"
"k8s.io/autoscaler/cluster-autoscaler/estimator"
"k8s.io/autoscaler/cluster-autoscaler/expander"
"k8s.io/autoscaler/cluster-autoscaler/metrics"
ca_processors "k8s.io/autoscaler/cluster-autoscaler/processors"
@@ -363,7 +362,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
continue
}

if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo); skipReason != nil {
if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo, 1); skipReason != nil {
klog.Warning("ScaleUpToNodeGroupMinSize: node group resource excceded: %v", skipReason)
continue
}
@@ -446,8 +445,10 @@ func (o *ScaleUpOrchestrator) filterValidScaleUpNodeGroups(
if err != nil {
klog.Errorf("Couldn't get autoscaling options for ng: %v", nodeGroup.Id())
}
if autoscalingOptions != nil && autoscalingOptions.AtomicScaleUp {
if o.autoscalingContext.MaxNodesTotal != 0 && currentNodeCount+(nodeGroup.MaxSize()-currentTargetSize) > o.autoscalingContext.MaxNodesTotal {
numNodes := 1
if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
numNodes = nodeGroup.MaxSize() - currentTargetSize
if o.autoscalingContext.MaxNodesTotal != 0 && currentNodeCount+numNodes > o.autoscalingContext.MaxNodesTotal {
klog.V(4).Infof("Skipping node group %s - atomic scale-up exceeds cluster node count limit", nodeGroup.Id())
skippedNodeGroups[nodeGroup.Id()] = NewSkippedReasons("atomic scale-up exceeds cluster node count limit")
continue
@@ -460,7 +461,7 @@
skippedNodeGroups[nodeGroup.Id()] = NotReadyReason
continue
}
if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo); skipReason != nil {
if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo, numNodes); skipReason != nil {
skippedNodeGroups[nodeGroup.Id()] = skipReason
continue
}
@@ -486,35 +487,19 @@ func (o *ScaleUpOrchestrator) ComputeExpansionOption(
return option
}

estimator := o.autoscalingContext.EstimatorBuilder(o.autoscalingContext.PredicateChecker, o.autoscalingContext.ClusterSnapshot)
option.NodeCount, option.Pods = estimator.Estimate(pods, nodeInfo, nodeGroup)
option.SimilarNodeGroups = o.ComputeSimilarNodeGroups(nodeGroup, nodeInfos, schedulablePods, now)

autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
if err != nil {
klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
}
if autoscalingOptions != nil && autoscalingOptions.AtomicScaleUp {
// If atomic, there should be no similar node groups for balancing
// Also the number of pods should be capped according to the node count.

// Limit the number of nodes we estimate to the max scale up possible.
currentTargetSize, err := nodeGroup.TargetSize()
if err != nil {
klog.Errorf("Failed to get node group size: %v", err)
return option
}
// When the node group has the atomic scale up option, the number of nodes
// that can be added to the node group should be capped according to the
// maximum size of the node group and not the 'MaxNodesPerScaleUp' option.
nodeEstimationCap := nodeGroup.MaxSize() - currentTargetSize
limiter := estimator.NewThresholdBasedEstimationLimiter(nodeEstimationCap, 0)
estimator := estimator.NewBinpackingNodeEstimator(o.autoscalingContext.PredicateChecker, o.autoscalingContext.ClusterSnapshot, limiter, estimator.NewDecreasingPodOrderer())
option.NodeCount, option.Pods = estimator.Estimate(pods, nodeInfo, nodeGroup)
if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
if option.NodeCount > 0 && option.NodeCount != nodeGroup.MaxSize() {
option.NodeCount = nodeGroup.MaxSize()
}
return option
}
estimator := o.autoscalingContext.EstimatorBuilder(o.autoscalingContext.PredicateChecker, o.autoscalingContext.ClusterSnapshot)
option.NodeCount, option.Pods = estimator.Estimate(pods, nodeInfo, nodeGroup)
option.SimilarNodeGroups = o.ComputeSimilarNodeGroups(nodeGroup, nodeInfos, schedulablePods, now)
return option
}

@@ -590,20 +575,15 @@ func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.
}

// IsNodeGroupResourceExceeded returns nil if node group resource limits are not exceeded, otherwise a reason is provided.
func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) status.Reasons {
func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, numNodes int) status.Reasons {
resourcesDelta, err := o.resourceManager.DeltaForNode(o.autoscalingContext, nodeInfo, nodeGroup)
if err != nil {
klog.Errorf("Skipping node group %s; error getting node group resources: %v", nodeGroup.Id(), err)
return NotReadyReason
}
autoscalingOptions, aErr := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
if aErr != nil {
klog.Errorf("Couldn't get autoscaling options for ng: %v", nodeGroup.Id())
}
if autoscalingOptions != nil && autoscalingOptions.AtomicScaleUp {
for resource, delta := range resourcesDelta {
resourcesDelta[resource] = delta * int64(nodeGroup.MaxSize())
}

for resource, delta := range resourcesDelta {
resourcesDelta[resource] = delta * int64(numNodes)
}

checkResult := resource.CheckDeltaWithinLimits(resourcesLeft, resourcesDelta)
Expand Down Expand Up @@ -649,6 +629,14 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
return nil
}

autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
if err != nil {
klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
}
if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
return nil
}

groupSchedulablePods, found := schedulablePods[nodeGroup.Id()]
if !found || len(groupSchedulablePods) == 0 {
return nil
@@ -126,7 +126,7 @@ func TestMixedScaleUp(t *testing.T) {

func TestAtomicScaleUp(t *testing.T) {
options := defaultOptions
options.NodeGroupDefaults.AtomicScaleUp = true
options.NodeGroupDefaults.AtomicScaling = true

optionsWithLimitedMaxCores := options
optionsWithLimitedMaxCores.MaxCoresTotal = 3