Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce noRetry Parameter for checkcapacity ProvisioningRequest #7496

Merged
merged 4 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,14 @@ import (
schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
	// NoRetryParameterKey is a key for ProvReq's Parameters that describes
	// if ProvisioningRequest should be retried in case CA cannot provision it.
	// Supported values are "true" and "false" - by default ProvisioningRequests are always retried.
	// Currently supported only for checkcapacity class.
	NoRetryParameterKey = "noRetry"
)

type checkCapacityProvClass struct {
context *context.AutoscalingContext
client *provreqclient.ProvisioningRequestClient
Expand Down Expand Up @@ -139,7 +147,16 @@ func (o *checkCapacityProvClass) checkcapacity(unschedulablePods []*apiv1.Pod, p
err, cleanupErr := clustersnapshot.WithForkedSnapshot(o.context.ClusterSnapshot, func() (bool, error) {
st, _, err := o.schedulingSimulator.TrySchedulePods(o.context.ClusterSnapshot, unschedulablePods, scheduling.ScheduleAnywhere, true)
if len(st) < len(unschedulablePods) || err != nil {
conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry == "true" {
// Failed=true condition triggers retry in Kueue. Otherwise ProvisioningRequest with Provisioned=Failed
// condition block capacity in Kueue even if it's in the middle of backoff waiting time.
conditions.AddOrUpdateCondition(provReq, v1.Failed, metav1.ConditionTrue, conditions.CapacityIsNotFoundReason, "CA could not find requested capacity", metav1.Now())
} else {
if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry != "false" {
klog.Errorf("Ignoring Parameter %v with invalid value: %v in ProvisioningRequest: %v. Supported values are: \"true\", \"false\"", NoRetryParameterKey, noRetry, provReq.Name)
}
conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
}
capacityAvailable = false
} else {
conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionTrue, conditions.CapacityIsFoundReason, conditions.CapacityIsFoundMsg, metav1.Now())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,7 @@ func TestScaleUp(t *testing.T) {
batchTimebox time.Duration
numProvisionedTrue int
numProvisionedFalse int
numFailedTrue int
}{
{
name: "no ProvisioningRequests",
Expand Down Expand Up @@ -242,6 +243,15 @@ func TestScaleUp(t *testing.T) {
provReqToScaleUp: newCheckCapacityCpuProvReq,
scaleUpResult: status.ScaleUpSuccessful,
},
{
name: "impossible check-capacity, with noRetry parameter",
provReqs: []*provreqwrapper.ProvisioningRequest{
impossibleCheckCapacityReq.CopyWithParameters(map[string]v1.Parameter{"noRetry": "true"}),
},
provReqToScaleUp: impossibleCheckCapacityReq,
scaleUpResult: status.ScaleUpNoOptionsAvailable,
numFailedTrue: 1,
},
{
name: "some capacity is pre-booked, atomic scale-up not needed",
provReqs: []*provreqwrapper.ProvisioningRequest{bookedCapacityProvReq, atomicScaleUpProvReq},
Expand Down Expand Up @@ -438,6 +448,7 @@ func TestScaleUp(t *testing.T) {
provReqsAfterScaleUp, err := client.ProvisioningRequestsNoCache()
assert.NoError(t, err)
assert.Equal(t, len(tc.provReqs), len(provReqsAfterScaleUp))
assert.Equal(t, tc.numFailedTrue, NumProvisioningRequestsWithCondition(provReqsAfterScaleUp, v1.Failed, metav1.ConditionTrue))

if tc.batchProcessing {
// Since batch processing returns aggregated result, we need to check the number of provisioned requests which have the provisioned condition.
Expand Down
12 changes: 12 additions & 0 deletions cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,15 @@ func BuildTestPods(namespace, name string, podCount int) []*apiv1.Pod {
}
return pods
}

// CopyWithParameters makes a deep copy of embedded ProvReq and sets its CopyWithParameters
// CopyWithParameters returns a deep copy of the embedded ProvisioningRequest
// with the given entries merged into its Spec.Parameters (existing keys are
// overwritten). The PodTemplates slice is shared with the receiver, not copied.
func (pr *ProvisioningRequest) CopyWithParameters(params map[string]v1.Parameter) *ProvisioningRequest {
	prCopy := pr.DeepCopy()
	if prCopy.Spec.Parameters == nil {
		// Pre-size for the entries we are about to insert.
		prCopy.Spec.Parameters = make(map[string]v1.Parameter, len(params))
	}
	for key, val := range params {
		prCopy.Spec.Parameters[key] = val
	}
	return &ProvisioningRequest{prCopy, pr.PodTemplates}
}
Loading