From 6e956b8854896c14f4778f7aadd0a92069441b0c Mon Sep 17 00:00:00 2001
From: Patryk Bundyra
Date: Thu, 14 Nov 2024 13:50:49 +0000
Subject: [PATCH 1/4] Introduce noRetry Parameter for checkcapacity ProvisioningRequest

---
 .../checkcapacity/provisioningclass.go | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
index 1a3b45cbbaf5..8e42db979f6d 100644
--- a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
+++ b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
@@ -44,6 +44,10 @@ import (
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
 )
 
+const (
+	NoRetryParameter = "noRetry"
+)
+
 type checkCapacityProvClass struct {
 	context *context.AutoscalingContext
 	client  *provreqclient.ProvisioningRequestClient
@@ -139,7 +143,11 @@ func (o *checkCapacityProvClass) checkcapacity(unschedulablePods []*apiv1.Pod, p
 	err, cleanupErr := clustersnapshot.WithForkedSnapshot(o.context.ClusterSnapshot, func() (bool, error) {
 		st, _, err := o.schedulingSimulator.TrySchedulePods(o.context.ClusterSnapshot, unschedulablePods, scheduling.ScheduleAnywhere, true)
 		if len(st) < len(unschedulablePods) || err != nil {
-			conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
+			if _, ok := provReq.Spec.Parameters[NoRetryParameter]; ok {
+				conditions.AddOrUpdateCondition(provReq, v1.Failed, metav1.ConditionTrue, conditions.CapacityIsNotFoundReason, "CA could not find requested capacity", metav1.Now())
+			} else {
+				conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
+			}
 			capacityAvailable = false
 		} else {
 			conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionTrue, conditions.CapacityIsFoundReason, conditions.CapacityIsFoundMsg, metav1.Now())

From 3b371f7179e46d4f1abb762a013c7a6bd48d209e Mon Sep 17 00:00:00 2001
From: Patryk Bundyra
Date: Mon, 18 Nov 2024 10:37:09 +0000
Subject: [PATCH 2/4] Add unit test

---
 .../checkcapacity/provisioningclass.go            |  4 +++-
 .../orchestrator/orchestrator_test.go             | 11 +++++++++++
 .../provisioningrequest/provreqwrapper/wrapper.go | 12 ++++++++++++
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
index 8e42db979f6d..544923077261 100644
--- a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
+++ b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
@@ -143,7 +143,9 @@ func (o *checkCapacityProvClass) checkcapacity(unschedulablePods []*apiv1.Pod, p
 	err, cleanupErr := clustersnapshot.WithForkedSnapshot(o.context.ClusterSnapshot, func() (bool, error) {
 		st, _, err := o.schedulingSimulator.TrySchedulePods(o.context.ClusterSnapshot, unschedulablePods, scheduling.ScheduleAnywhere, true)
 		if len(st) < len(unschedulablePods) || err != nil {
-			if _, ok := provReq.Spec.Parameters[NoRetryParameter]; ok {
+			if noRetry, ok := provReq.Spec.Parameters[NoRetryParameter]; ok && noRetry == "true" {
+				// Failed=true condition triggers retry in Kueue. Otherwise ProvisioningRequest with Provisioned=Failed
+				// condition block capacity in Kueue even it's in the middle of backoff waiting time.
 				conditions.AddOrUpdateCondition(provReq, v1.Failed, metav1.ConditionTrue, conditions.CapacityIsNotFoundReason, "CA could not find requested capacity", metav1.Now())
 			} else {
 				conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
diff --git a/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go b/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
index 0b527625f589..d2ee5a3f700f 100644
--- a/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
+++ b/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
@@ -206,6 +206,7 @@ func TestScaleUp(t *testing.T) {
 		batchTimebox        time.Duration
 		numProvisionedTrue  int
 		numProvisionedFalse int
+		numFailedTrue       int
 	}{
 		{
 			name: "no ProvisioningRequests",
@@ -242,6 +243,15 @@ func TestScaleUp(t *testing.T) {
 			provReqToScaleUp: newCheckCapacityCpuProvReq,
 			scaleUpResult:    status.ScaleUpSuccessful,
 		},
+		{
+			name: "impossible check-capacity, with noRetry parameter",
+			provReqs: []*provreqwrapper.ProvisioningRequest{
+				impossibleCheckCapacityReq.Parameters(map[string]v1.Parameter{"noRetry": "true"}),
+			},
+			provReqToScaleUp: impossibleCheckCapacityReq,
+			scaleUpResult:    status.ScaleUpNoOptionsAvailable,
+			numFailedTrue:    1,
+		},
 		{
 			name:     "some capacity is pre-booked, atomic scale-up not needed",
 			provReqs: []*provreqwrapper.ProvisioningRequest{bookedCapacityProvReq, atomicScaleUpProvReq},
@@ -438,6 +448,7 @@ func TestScaleUp(t *testing.T) {
 			provReqsAfterScaleUp, err := client.ProvisioningRequestsNoCache()
 			assert.NoError(t, err)
 			assert.Equal(t, len(tc.provReqs), len(provReqsAfterScaleUp))
+			assert.Equal(t, tc.numFailedTrue, NumProvisioningRequestsWithCondition(provReqsAfterScaleUp, v1.Failed, metav1.ConditionTrue))
 
 			if tc.batchProcessing {
 				// Since batch processing returns aggregated result, we need to check the number of provisioned requests which have the provisioned condition.
diff --git a/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go b/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
index c4213385b4fc..bd5e1cf29e56 100644
--- a/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
+++ b/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
@@ -81,3 +81,15 @@ func errMissingPodTemplates(podSets []v1.PodSet, podTemplates []*apiv1.PodTempla
 	}
 	return fmt.Errorf("missing pod templates, %d pod templates were referenced, %d templates were missing: %s", len(podSets), len(missingTemplates), strings.Join(missingTemplates, ","))
 }
+
+// Parameters makes a deep copy of embedded ProvReq and sets its Parameters
+func (pr *ProvisioningRequest) Parameters(params map[string]v1.Parameter) *ProvisioningRequest {
+	prCopy := pr.DeepCopy()
+	if prCopy.Spec.Parameters == nil {
+		prCopy.Spec.Parameters = make(map[string]v1.Parameter, len(params))
+	}
+	for key, val := range params {
+		prCopy.Spec.Parameters[key] = val
+	}
+	return &ProvisioningRequest{prCopy, pr.PodTemplates}
+}

From ddcb59dd23f97b42335a1d9eff9bd28caae2e0f8 Mon Sep 17 00:00:00 2001
From: Patryk Bundyra
Date: Wed, 20 Nov 2024 15:29:27 +0000
Subject: [PATCH 3/4] Add logging, add comment, move test util function

---
 .../checkcapacity/provisioningclass.go              | 11 +++++++++--
 .../provisioningrequest/provreqwrapper/testutils.go | 12 ++++++++++++
 .../provisioningrequest/provreqwrapper/wrapper.go   | 12 ------------
 3 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
index 544923077261..b31f567dacf5 100644
--- a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
+++ b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
@@ -45,7 +45,11 @@ import (
 )
 
 const (
-	NoRetryParameter = "noRetry"
+	// NoRetryParameterKey is a key for ProvReq's Parameters that describes
+	// whether ProvisioningRequest should be retried in case CA cannot provision it.
+	// Supported values are "true" and "false" - by default ProvisioningRequests are always retried.
+	// Currently supported only for checkcapacity class.
+	NoRetryParameterKey = "noRetry"
 )
 
 type checkCapacityProvClass struct {
@@ -143,11 +147,14 @@ func (o *checkCapacityProvClass) checkcapacity(unschedulablePods []*apiv1.Pod, p
 	err, cleanupErr := clustersnapshot.WithForkedSnapshot(o.context.ClusterSnapshot, func() (bool, error) {
 		st, _, err := o.schedulingSimulator.TrySchedulePods(o.context.ClusterSnapshot, unschedulablePods, scheduling.ScheduleAnywhere, true)
 		if len(st) < len(unschedulablePods) || err != nil {
-			if noRetry, ok := provReq.Spec.Parameters[NoRetryParameter]; ok && noRetry == "true" {
+			if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry == "true" {
 				// Failed=true condition triggers retry in Kueue. Otherwise ProvisioningRequest with Provisioned=Failed
 				// condition block capacity in Kueue even it's in the middle of backoff waiting time.
 				conditions.AddOrUpdateCondition(provReq, v1.Failed, metav1.ConditionTrue, conditions.CapacityIsNotFoundReason, "CA could not find requested capacity", metav1.Now())
 			} else {
+				if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry != "false" {
+					klog.Errorf("Invalid value: %v for %v Parameter in %v ProvisioningRequest. Supported values are: \"true\", \"false\"", noRetry, NoRetryParameterKey, provReq.Name)
+				}
 				conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
 			}
 			capacityAvailable = false
diff --git a/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go b/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
index 6529fd968d5c..69c8695c510c 100644
--- a/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
+++ b/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
@@ -159,3 +159,15 @@ func BuildTestPods(namespace, name string, podCount int) []*apiv1.Pod {
 	}
 	return pods
 }
+
+// Parameters makes a deep copy of embedded ProvReq and sets its Parameters
+func (pr *ProvisioningRequest) Parameters(params map[string]v1.Parameter) *ProvisioningRequest {
+	prCopy := pr.DeepCopy()
+	if prCopy.Spec.Parameters == nil {
+		prCopy.Spec.Parameters = make(map[string]v1.Parameter, len(params))
+	}
+	for key, val := range params {
+		prCopy.Spec.Parameters[key] = val
+	}
+	return &ProvisioningRequest{prCopy, pr.PodTemplates}
+}
diff --git a/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go b/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
index bd5e1cf29e56..c4213385b4fc 100644
--- a/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
+++ b/cluster-autoscaler/provisioningrequest/provreqwrapper/wrapper.go
@@ -81,15 +81,3 @@ func errMissingPodTemplates(podSets []v1.PodSet, podTemplates []*apiv1.PodTempla
 	}
 	return fmt.Errorf("missing pod templates, %d pod templates were referenced, %d templates were missing: %s", len(podSets), len(missingTemplates), strings.Join(missingTemplates, ","))
 }
-
-// Parameters makes a deep copy of embedded ProvReq and sets its Parameters
-func (pr *ProvisioningRequest) Parameters(params map[string]v1.Parameter) *ProvisioningRequest {
-	prCopy := pr.DeepCopy()
-	if prCopy.Spec.Parameters == nil {
-		prCopy.Spec.Parameters = make(map[string]v1.Parameter, len(params))
-	}
-	for key, val := range params {
-		prCopy.Spec.Parameters[key] = val
-	}
-	return &ProvisioningRequest{prCopy, pr.PodTemplates}
-}

From 14067b766c74bd56ffb12175093642d790c3490a Mon Sep 17 00:00:00 2001
From: Patryk Bundyra
Date: Thu, 21 Nov 2024 13:37:13 +0000
Subject: [PATCH 4/4] Address nits

---
 .../provisioningrequest/checkcapacity/provisioningclass.go | 4 ++--
 .../provisioningrequest/orchestrator/orchestrator_test.go  | 2 +-
 .../provisioningrequest/provreqwrapper/testutils.go        | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
index b31f567dacf5..a7b793768b35 100644
--- a/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
+++ b/cluster-autoscaler/provisioningrequest/checkcapacity/provisioningclass.go
@@ -149,11 +149,11 @@ func (o *checkCapacityProvClass) checkcapacity(unschedulablePods []*apiv1.Pod, p
 		if len(st) < len(unschedulablePods) || err != nil {
 			if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry == "true" {
 				// Failed=true condition triggers retry in Kueue. Otherwise ProvisioningRequest with Provisioned=Failed
-				// condition block capacity in Kueue even it's in the middle of backoff waiting time.
+				// condition blocks capacity in Kueue even if it's in the middle of backoff waiting time.
 				conditions.AddOrUpdateCondition(provReq, v1.Failed, metav1.ConditionTrue, conditions.CapacityIsNotFoundReason, "CA could not find requested capacity", metav1.Now())
 			} else {
 				if noRetry, ok := provReq.Spec.Parameters[NoRetryParameterKey]; ok && noRetry != "false" {
-					klog.Errorf("Invalid value: %v for %v Parameter in %v ProvisioningRequest. Supported values are: \"true\", \"false\"", noRetry, NoRetryParameterKey, provReq.Name)
+					klog.Errorf("Ignoring Parameter %v with invalid value: %v in ProvisioningRequest: %v. Supported values are: \"true\", \"false\"", NoRetryParameterKey, noRetry, provReq.Name)
 				}
 				conditions.AddOrUpdateCondition(provReq, v1.Provisioned, metav1.ConditionFalse, conditions.CapacityIsNotFoundReason, "Capacity is not found, CA will try to find it later.", metav1.Now())
 			}
diff --git a/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go b/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
index d2ee5a3f700f..8b9c682cd70b 100644
--- a/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
+++ b/cluster-autoscaler/provisioningrequest/orchestrator/orchestrator_test.go
@@ -246,7 +246,7 @@ func TestScaleUp(t *testing.T) {
 		{
 			name: "impossible check-capacity, with noRetry parameter",
 			provReqs: []*provreqwrapper.ProvisioningRequest{
-				impossibleCheckCapacityReq.Parameters(map[string]v1.Parameter{"noRetry": "true"}),
+				impossibleCheckCapacityReq.CopyWithParameters(map[string]v1.Parameter{"noRetry": "true"}),
 			},
 			provReqToScaleUp: impossibleCheckCapacityReq,
 			scaleUpResult:    status.ScaleUpNoOptionsAvailable,
diff --git a/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go b/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
index 69c8695c510c..37d3ce01bc5f 100644
--- a/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
+++ b/cluster-autoscaler/provisioningrequest/provreqwrapper/testutils.go
@@ -160,8 +160,8 @@ func BuildTestPods(namespace, name string, podCount int) []*apiv1.Pod {
 	return pods
 }
 
-// Parameters makes a deep copy of embedded ProvReq and sets its Parameters
-func (pr *ProvisioningRequest) Parameters(params map[string]v1.Parameter) *ProvisioningRequest {
+// CopyWithParameters makes a deep copy of embedded ProvReq and sets its Parameters
+func (pr *ProvisioningRequest) CopyWithParameters(params map[string]v1.Parameter) *ProvisioningRequest {
 	prCopy := pr.DeepCopy()
 	if prCopy.Spec.Parameters == nil {
 		prCopy.Spec.Parameters = make(map[string]v1.Parameter, len(params))
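
For reference, the noRetry parameter introduced by this series is read from ProvisioningRequest.Spec.Parameters. A minimal manifest opting into the fail-fast behavior might look like the sketch below; it assumes the autoscaling.x-k8s.io/v1 API version matching the v1 types imported in these patches and the well-known check-capacity.autoscaling.x-k8s.io class name, while the object name and PodTemplate reference are hypothetical:

apiVersion: autoscaling.x-k8s.io/v1
kind: ProvisioningRequest
metadata:
  name: capacity-check               # hypothetical name
  namespace: default
spec:
  provisioningClassName: check-capacity.autoscaling.x-k8s.io
  parameters:
    # Read via provReq.Spec.Parameters[NoRetryParameterKey]. On "true", a failed
    # capacity check sets Failed=true instead of Provisioned=false; any value
    # other than "true"/"false" is logged and ignored.
    noRetry: "true"
  podSets:
  - count: 2
    podTemplateRef:
      name: capacity-check-template  # hypothetical PodTemplate reference

With noRetry absent or set to "false", behavior is unchanged: the request gets Provisioned=false with conditions.CapacityIsNotFoundReason and CA retries the capacity check on a later loop.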