From 03a7d6095ce9cdea03a514235a23717481759433 Mon Sep 17 00:00:00 2001
From: jigisha620
Date: Thu, 6 Feb 2025 09:52:52 -0800
Subject: [PATCH] chore: add NodeRegistrationHealthy status condition to nodepool

---
 kwok/charts/crds/karpenter.sh_nodepools.yaml  |   6 +
 pkg/apis/crds/karpenter.sh_nodepools.yaml     |   6 +
 pkg/apis/v1/nodepool_status.go                |   6 +
 pkg/controllers/controllers.go                |   2 +
 .../nodeclaim/lifecycle/initialization.go     |  62 ++++++++--
 .../lifecycle/initialization_test.go          |   2 +
 .../nodeclaim/lifecycle/liveness.go           |  30 ++++-
 .../nodeclaim/lifecycle/liveness_test.go      |  44 +++++++
 .../nodepool/registrationhealth/controller.go | 101 +++++++++++++++
 .../nodepool/registrationhealth/suite_test.go | 115 ++++++++++++++++++
 10 files changed, 361 insertions(+), 13 deletions(-)
 create mode 100644 pkg/controllers/nodepool/registrationhealth/controller.go
 create mode 100644 pkg/controllers/nodepool/registrationhealth/suite_test.go

diff --git a/kwok/charts/crds/karpenter.sh_nodepools.yaml b/kwok/charts/crds/karpenter.sh_nodepools.yaml
index bd608d655f..16eae68fa8 100644
--- a/kwok/charts/crds/karpenter.sh_nodepools.yaml
+++ b/kwok/charts/crds/karpenter.sh_nodepools.yaml
@@ -498,6 +498,12 @@ spec:
                   - type
                   type: object
                 type: array
+              nodeClassObservedGeneration:
+                description: |-
+                  NodeClassObservedGeneration represents the generation of the referenced nodeClass. If this does not match
+                  the actual NodeClass generation, the NodeRegistrationHealthy status condition on the NodePool will be reset.
+                format: int64
+                type: integer
               resources:
                 additionalProperties:
                   anyOf:
diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml
index 36ecac075d..3622496ba3 100644
--- a/pkg/apis/crds/karpenter.sh_nodepools.yaml
+++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -496,6 +496,12 @@ spec:
                   - type
                   type: object
                 type: array
+              nodeClassObservedGeneration:
+                description: |-
+                  NodeClassObservedGeneration represents the generation of the referenced nodeClass. If this does not match
+                  the actual NodeClass generation, the NodeRegistrationHealthy status condition on the NodePool will be reset.
+                format: int64
+                type: integer
               resources:
                 additionalProperties:
                   anyOf:
diff --git a/pkg/apis/v1/nodepool_status.go b/pkg/apis/v1/nodepool_status.go
index 1b3f974694..0ce19c5fa2 100644
--- a/pkg/apis/v1/nodepool_status.go
+++ b/pkg/apis/v1/nodepool_status.go
@@ -27,6 +27,8 @@ const (
 	ConditionTypeValidationSucceeded = "ValidationSucceeded"
 	// ConditionTypeNodeClassReady = "NodeClassReady" condition indicates that underlying nodeClass was resolved and is reporting as Ready
 	ConditionTypeNodeClassReady = "NodeClassReady"
+	// ConditionTypeNodeRegistrationHealthy = "NodeRegistrationHealthy" condition indicates whether a misconfiguration exists that is preventing successful node launch/registration and requires manual investigation
+	ConditionTypeNodeRegistrationHealthy = "NodeRegistrationHealthy"
 )
 
 // NodePoolStatus defines the observed state of NodePool
@@ -34,6 +36,10 @@ type NodePoolStatus struct {
 	// Resources is the list of resources that have been provisioned.
 	// +optional
 	Resources v1.ResourceList `json:"resources,omitempty"`
+	// NodeClassObservedGeneration represents the generation of the referenced nodeClass. If this does not match
+	// the actual NodeClass generation, the NodeRegistrationHealthy status condition on the NodePool will be reset.
+	// +optional
+	NodeClassObservedGeneration int64 `json:"nodeClassObservedGeneration,omitempty"`
 	// Conditions contains signals for health and readiness
 	// +optional
 	Conditions []status.Condition `json:"conditions,omitempty"`
diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go
index 682bf172fd..544def960e 100644
--- a/pkg/controllers/controllers.go
+++ b/pkg/controllers/controllers.go
@@ -50,6 +50,7 @@ import (
 	nodepoolcounter "sigs.k8s.io/karpenter/pkg/controllers/nodepool/counter"
 	nodepoolhash "sigs.k8s.io/karpenter/pkg/controllers/nodepool/hash"
 	nodepoolreadiness "sigs.k8s.io/karpenter/pkg/controllers/nodepool/readiness"
+	nodepoolregistrationhealth "sigs.k8s.io/karpenter/pkg/controllers/nodepool/registrationhealth"
 	nodepoolvalidation "sigs.k8s.io/karpenter/pkg/controllers/nodepool/validation"
 	"sigs.k8s.io/karpenter/pkg/controllers/provisioning"
 	"sigs.k8s.io/karpenter/pkg/controllers/state"
@@ -88,6 +89,7 @@ func NewControllers(
 		metricsnodepool.NewController(kubeClient, cloudProvider),
 		metricsnode.NewController(cluster),
 		nodepoolreadiness.NewController(kubeClient, cloudProvider),
+		nodepoolregistrationhealth.NewController(kubeClient, cloudProvider),
 		nodepoolcounter.NewController(kubeClient, cloudProvider, cluster),
 		nodepoolvalidation.NewController(kubeClient, cloudProvider),
 		podevents.NewController(clock, kubeClient, cloudProvider),
diff --git a/pkg/controllers/nodeclaim/lifecycle/initialization.go b/pkg/controllers/nodeclaim/lifecycle/initialization.go
index 4a4405fbf3..ba6f8aa2c2 100644
--- a/pkg/controllers/nodeclaim/lifecycle/initialization.go
+++ b/pkg/controllers/nodeclaim/lifecycle/initialization.go
@@ -20,6 +20,9 @@ import (
 	"context"
 	"fmt"
 
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/types"
+
 	"github.com/samber/lo"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/equality"
@@ -54,38 +57,73 @@ func (i *Initialization) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim)
 		return reconcile.Result{}, nil
 	}
 	ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("provider-id", nodeClaim.Status.ProviderID))
+	requeue, err := i.updateNodePoolRegistrationHealth(ctx, nodeClaim)
+	if requeue {
+		return reconcile.Result{Requeue: true}, nil
+	}
+	if err != nil {
+		return reconcile.Result{}, err
+	}
 	node, err := nodeclaimutils.NodeForNodeClaim(ctx, i.kubeClient, nodeClaim)
 	if err != nil {
 		nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotFound", "Node not registered with cluster")
 		return reconcile.Result{}, nil //nolint:nilerr
 	}
 	ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Node", klog.KRef("", node.Name)))
+	if updateNodeClaimInitializedUnknown(node, nodeClaim) {
+		return reconcile.Result{}, nil
+	}
+	stored := node.DeepCopy()
+	node.Labels = lo.Assign(node.Labels, map[string]string{v1.NodeInitializedLabelKey: "true"})
+	if !equality.Semantic.DeepEqual(stored, node) {
+		if err = i.kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
+			return reconcile.Result{}, err
+		}
+	}
+	log.FromContext(ctx).WithValues("allocatable", node.Status.Allocatable).Info("initialized nodeclaim")
+	nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInitialized)
+	return reconcile.Result{}, nil
+}
+
+func updateNodeClaimInitializedUnknown(node *corev1.Node, nodeClaim *v1.NodeClaim) bool {
 	if nodeutils.GetCondition(node, corev1.NodeReady).Status != corev1.ConditionTrue {
 		nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "NodeNotReady", "Node status is NotReady")
-		return reconcile.Result{}, nil
+		return true
 	}
 	if taint, ok := StartupTaintsRemoved(node, nodeClaim); !ok {
 		nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "StartupTaintsExist", fmt.Sprintf("StartupTaint %q still exists", formatTaint(taint)))
-		return reconcile.Result{}, nil
+		return true
 	}
 	if taint, ok := KnownEphemeralTaintsRemoved(node); !ok {
 		nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "KnownEphemeralTaintsExist", fmt.Sprintf("KnownEphemeralTaint %q still exists", formatTaint(taint)))
-		return reconcile.Result{}, nil
+		return true
 	}
 	if name, ok := RequestedResourcesRegistered(node, nodeClaim); !ok {
 		nodeClaim.StatusConditions().SetUnknownWithReason(v1.ConditionTypeInitialized, "ResourceNotRegistered", fmt.Sprintf("Resource %q was requested but not registered", name))
-		return reconcile.Result{}, nil
+		return true
 	}
-	stored := node.DeepCopy()
-	node.Labels = lo.Assign(node.Labels, map[string]string{v1.NodeInitializedLabelKey: "true"})
-	if !equality.Semantic.DeepEqual(stored, node) {
-		if err = i.kubeClient.Patch(ctx, node, client.MergeFrom(stored)); err != nil {
-			return reconcile.Result{}, err
+	return false
+}
+
+func (i *Initialization) updateNodePoolRegistrationHealth(ctx context.Context, nodeClaim *v1.NodeClaim) (bool, error) {
+	nodePool := &v1.NodePool{}
+	if err := i.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Labels[v1.NodePoolLabelKey]}, nodePool); err != nil {
+		return false, err
+	}
+	storedNodePool := nodePool.DeepCopy()
+	nodePool.StatusConditions().SetTrue(v1.ConditionTypeNodeRegistrationHealthy)
+	if !equality.Semantic.DeepEqual(storedNodePool, nodePool) {
+		// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
+		// can cause races due to the fact that it fully replaces the list on a change
+		// Here, we are updating the status condition list
+		if err := i.kubeClient.Status().Patch(ctx, nodePool, client.MergeFromWithOptions(storedNodePool, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil {
+			if errors.IsConflict(err) {
+				return true, nil
+			}
+			return false, err
 		}
 	}
-	log.FromContext(ctx).WithValues("allocatable", node.Status.Allocatable).Info("initialized nodeclaim")
-	nodeClaim.StatusConditions().SetTrue(v1.ConditionTypeInitialized)
-	return reconcile.Result{}, nil
+	return false, nil
 }
 
 // KnownEphemeralTaintsRemoved validates whether all the ephemeral taints are removed
diff --git a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go
index ade6be99f5..1db6a5b2a0 100644
--- a/pkg/controllers/nodeclaim/lifecycle/initialization_test.go
+++ b/pkg/controllers/nodeclaim/lifecycle/initialization_test.go
@@ -202,6 +202,8 @@ var _ = Describe("Initialization", func() {
 		ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
 		ExpectMakeNodesReady(ctx, env.Client, node) // Remove the not-ready taint
 		ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		Expect(ExpectStatusConditionExists(nodePool, v1.ConditionTypeNodeRegistrationHealthy).Status).To(Equal(metav1.ConditionTrue))
 		node = ExpectExists(ctx, env.Client, node)
 		Expect(node.Labels).To(HaveKeyWithValue(v1.NodeInitializedLabelKey, "true"))
 
diff --git a/pkg/controllers/nodeclaim/lifecycle/liveness.go b/pkg/controllers/nodeclaim/lifecycle/liveness.go
index fc1a272752..4a87d068b1 100644
--- a/pkg/controllers/nodeclaim/lifecycle/liveness.go
+++ b/pkg/controllers/nodeclaim/lifecycle/liveness.go
@@ -20,9 +20,13 @@ import (
 	"context"
 	"time"
 
+	"k8s.io/apimachinery/pkg/api/errors"
+
+	"k8s.io/apimachinery/pkg/types"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+
 	"k8s.io/utils/clock"
 	"sigs.k8s.io/controller-runtime/pkg/client"
-	"sigs.k8s.io/controller-runtime/pkg/log"
 	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
 	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
@@ -51,6 +55,30 @@ func (l *Liveness) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reco
 	if ttl := registrationTTL - l.clock.Since(registered.LastTransitionTime.Time); ttl > 0 {
 		return reconcile.Result{RequeueAfter: ttl}, nil
 	}
+	nodePool := &v1.NodePool{}
+	if err := l.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Labels[v1.NodePoolLabelKey]}, nodePool); err != nil {
+		return reconcile.Result{}, err
+	}
+	if nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy).IsUnknown() {
+		stored := nodePool.DeepCopy()
+		// If the nodeClaim failed to launch/register during the TTL, set the NodeRegistrationHealthy status condition on
+		// the NodePool to False. If the launch failed, get the launch failure reason and message from the nodeClaim.
+		if nodeClaim.StatusConditions().IsTrue(v1.ConditionTypeLaunched) {
+			nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, "Unhealthy", "Failed to register node")
+		} else {
+			launchFailure := nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched)
+			nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, launchFailure.Reason, launchFailure.Message)
+		}
+		// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
+		// can cause races due to the fact that it fully replaces the list on a change
+		// Here, we are updating the status condition list
+		if err := l.kubeClient.Status().Patch(ctx, nodePool, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil {
+			if errors.IsConflict(err) {
+				return reconcile.Result{Requeue: true}, nil
+			}
+			return reconcile.Result{}, err
+		}
+	}
 	// Delete the NodeClaim if we believe the NodeClaim won't register since we haven't seen the node
 	if err := l.kubeClient.Delete(ctx, nodeClaim); err != nil {
 		return reconcile.Result{}, client.IgnoreNotFound(err)
diff --git a/pkg/controllers/nodeclaim/lifecycle/liveness_test.go b/pkg/controllers/nodeclaim/lifecycle/liveness_test.go
index 8fe3421782..f75b0d4adf 100644
--- a/pkg/controllers/nodeclaim/lifecycle/liveness_test.go
+++ b/pkg/controllers/nodeclaim/lifecycle/liveness_test.go
@@ -20,6 +20,7 @@ import (
 	"time"
 
 	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -78,6 +79,11 @@ var _ = Describe("Liveness", func() {
 			ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
 			if isManagedNodeClaim {
 				ExpectNotFound(ctx, env.Client, nodeClaim)
+				nodePool = ExpectExists(ctx, env.Client, nodePool)
+				nodeRegistrationHealthySC := ExpectStatusConditionExists(nodePool, v1.ConditionTypeNodeRegistrationHealthy)
+				Expect(nodeRegistrationHealthySC.Status).To(Equal(metav1.ConditionFalse))
+				Expect(nodeRegistrationHealthySC.Reason).To(Equal("Unhealthy"))
+				Expect(nodeRegistrationHealthySC.Message).To(Equal("Failed to register node"))
 			} else {
 				ExpectExists(ctx, env.Client, nodeClaim)
 			}
@@ -141,6 +147,44 @@ var _ = Describe("Liveness", func() {
 		// If the node hasn't registered in the registration timeframe, then we deprovision the nodeClaim
 		fakeClock.Step(time.Minute * 20)
 		_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		nodeRegistrationHealthySC := ExpectStatusConditionExists(nodePool, v1.ConditionTypeNodeRegistrationHealthy)
+		Expect(nodeRegistrationHealthySC.Status).To(Equal(metav1.ConditionFalse))
+		Expect(nodeRegistrationHealthySC.Reason).To(Equal(nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).Reason))
+		Expect(nodeRegistrationHealthySC.Message).To(Equal(nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).Message))
+		ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
+		ExpectNotFound(ctx, env.Client, nodeClaim)
+	})
+	It("should not update NodeRegistrationHealthy status condition if it is already set to True", func() {
+		nodePool.StatusConditions().SetTrue(v1.ConditionTypeNodeRegistrationHealthy)
+		nodeClaim := test.NodeClaim(v1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{
+					v1.NodePoolLabelKey: nodePool.Name,
+				},
+			},
+			Spec: v1.NodeClaimSpec{
+				Resources: v1.ResourceRequirements{
+					Requests: corev1.ResourceList{
+						corev1.ResourceCPU:      resource.MustParse("2"),
+						corev1.ResourceMemory:   resource.MustParse("50Mi"),
+						corev1.ResourcePods:     resource.MustParse("5"),
+						fake.ResourceGPUVendorA: resource.MustParse("1"),
+					},
+				},
+			},
+		})
+		cloudProvider.AllowedCreateCalls = 0 // Don't allow Create() calls to succeed
+		ExpectApplied(ctx, env.Client, nodePool, nodeClaim)
+		_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+
+		// If the node hasn't registered in the registration timeframe, then we deprovision the nodeClaim
+		fakeClock.Step(time.Minute * 20)
+		_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		// NodeClaim registration failed, but we should not update the NodeRegistrationHealthy status condition if it is already True
+		Expect(ExpectStatusConditionExists(nodePool, v1.ConditionTypeNodeRegistrationHealthy).Status).To(Equal(metav1.ConditionTrue))
 		ExpectFinalizersRemoved(ctx, env.Client, nodeClaim)
 		ExpectNotFound(ctx, env.Client, nodeClaim)
 	})
diff --git a/pkg/controllers/nodepool/registrationhealth/controller.go b/pkg/controllers/nodepool/registrationhealth/controller.go
new file mode 100644
index 0000000000..c1a9ca5beb
--- /dev/null
+++ b/pkg/controllers/nodepool/registrationhealth/controller.go
@@ -0,0 +1,101 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package registrationhealth
+
+import (
+	"context"
+
+	"github.com/awslabs/operatorpkg/object"
+	"github.com/awslabs/operatorpkg/status"
+	"github.com/samber/lo"
+	"k8s.io/apimachinery/pkg/api/equality"
+	"k8s.io/apimachinery/pkg/api/errors"
+
+	"sigs.k8s.io/karpenter/pkg/operator/injection"
+
+	controllerruntime "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/builder"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/controller"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/cloudprovider"
+	nodepoolutils "sigs.k8s.io/karpenter/pkg/utils/nodepool"
+)
+
+// Controller for the resource
+type Controller struct {
+	kubeClient    client.Client
+	cloudProvider cloudprovider.CloudProvider
+}
+
+// NewController is a constructor
+func NewController(kubeClient client.Client, cloudProvider cloudprovider.CloudProvider) *Controller {
+	return &Controller{
+		kubeClient:    kubeClient,
+		cloudProvider: cloudProvider,
+	}
+}
+
+func (c *Controller) Reconcile(ctx context.Context, nodePool *v1.NodePool) (reconcile.Result, error) {
+	ctx = injection.WithControllerName(ctx, "nodepool.healthyregistration")
+	stored := nodePool.DeepCopy()
+
+	nodeClass, ok := lo.Find(c.cloudProvider.GetSupportedNodeClasses(), func(nc status.Object) bool {
+		return object.GVK(nc).GroupKind() == nodePool.Spec.Template.Spec.NodeClassRef.GroupKind()
+	})
+	if !ok {
+		// Ignore NodePools which aren't using a supported NodeClass.
+		return reconcile.Result{}, nil
+	}
+	if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: nodePool.Spec.Template.Spec.NodeClassRef.Name}, nodeClass); err != nil {
+		return reconcile.Result{}, client.IgnoreNotFound(err)
+	}
+
+	// If NodeClass/NodePool have been updated then NodeRegistrationHealthy = Unknown
+	if (nodePool.Status.NodeClassObservedGeneration != nodeClass.GetGeneration()) ||
+		(nodePool.Generation != nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy).ObservedGeneration) {
+		nodePool.StatusConditions().SetUnknown(v1.ConditionTypeNodeRegistrationHealthy)
+		nodePool.Status.NodeClassObservedGeneration = nodeClass.GetGeneration()
+	}
+
+	if !equality.Semantic.DeepEqual(stored, nodePool) {
+		// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch
+		// can cause races due to the fact that it fully replaces the list on a change
+		// Here, we are updating the status condition list
+		if err := c.kubeClient.Status().Patch(ctx, nodePool, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil {
+			if errors.IsConflict(err) {
+				return reconcile.Result{Requeue: true}, nil
+			}
+			return reconcile.Result{}, err
+		}
+	}
+	return reconcile.Result{}, nil
+}
+
+func (c *Controller) Register(_ context.Context, m manager.Manager) error {
+	b := controllerruntime.NewControllerManagedBy(m).
+		Named("nodepool.healthyregistration").
+		For(&v1.NodePool{}, builder.WithPredicates(nodepoolutils.IsManagedPredicateFuncs(c.cloudProvider))).
+		WithOptions(controller.Options{MaxConcurrentReconciles: 10})
+	for _, nodeClass := range c.cloudProvider.GetSupportedNodeClasses() {
+		b.Watches(nodeClass, nodepoolutils.NodeClassEventHandler(c.kubeClient))
+	}
+	return b.Complete(reconcile.AsReconciler(m.GetClient(), c))
+}
diff --git a/pkg/controllers/nodepool/registrationhealth/suite_test.go b/pkg/controllers/nodepool/registrationhealth/suite_test.go
new file mode 100644
index 0000000000..5b90816de9
--- /dev/null
+++ b/pkg/controllers/nodepool/registrationhealth/suite_test.go
@@ -0,0 +1,115 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package registrationhealth_test
+
+import (
+	"context"
+	"testing"
+
+	"sigs.k8s.io/karpenter/pkg/controllers/nodepool/registrationhealth"
+
+	"github.com/awslabs/operatorpkg/object"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+
+	"sigs.k8s.io/karpenter/pkg/apis"
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/cloudprovider/fake"
+	"sigs.k8s.io/karpenter/pkg/test"
+	. "sigs.k8s.io/karpenter/pkg/test/expectations"
+	"sigs.k8s.io/karpenter/pkg/test/v1alpha1"
+	. "sigs.k8s.io/karpenter/pkg/utils/testing"
+)
+
+var (
+	controller    *registrationhealth.Controller
+	ctx           context.Context
+	env           *test.Environment
+	cloudProvider *fake.CloudProvider
+	nodePool      *v1.NodePool
+	nodeClass     *v1alpha1.TestNodeClass
+)
+
+func TestAPIs(t *testing.T) {
+	ctx = TestContextWithLogger(t)
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "RegistrationHealth")
+}
+
+var _ = BeforeSuite(func() {
+	cloudProvider = fake.NewCloudProvider()
+	env = test.NewEnvironment(test.WithCRDs(apis.CRDs...), test.WithCRDs(v1alpha1.CRDs...))
+	controller = registrationhealth.NewController(env.Client, cloudProvider)
+})
+var _ = AfterEach(func() {
+	ExpectCleanedUp(ctx, env.Client)
+})
+
+var _ = AfterSuite(func() {
+	Expect(env.Stop()).To(Succeed(), "Failed to stop environment")
+})
+
+var _ = Describe("RegistrationHealth", func() {
+	BeforeEach(func() {
+		nodePool = test.NodePool()
+		nodeClass = test.NodeClass(v1alpha1.TestNodeClass{
+			ObjectMeta: metav1.ObjectMeta{Name: nodePool.Spec.Template.Spec.NodeClassRef.Name},
+		})
+		nodePool.Spec.Template.Spec.NodeClassRef.Group = object.GVK(nodeClass).Group
+		nodePool.Spec.Template.Spec.NodeClassRef.Kind = object.GVK(nodeClass).Kind
+		_ = nodePool.StatusConditions().Clear(v1.ConditionTypeNodeRegistrationHealthy)
+	})
+	It("should ignore setting NodeRegistrationHealthy status condition on NodePools which aren't managed by this instance of Karpenter", func() {
+		nodePool.Spec.Template.Spec.NodeClassRef = &v1.NodeClassReference{
+			Group: "karpenter.test.sh",
+			Kind:  "UnmanagedNodeClass",
+			Name:  "default",
+		}
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		_ = ExpectObjectReconciled(ctx, env.Client, controller, nodePool)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		Expect(nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy)).To(BeNil())
+	})
+	It("should not set NodeRegistrationHealthy status condition on nodePool when nodeClass does not exist", func() {
+		ExpectApplied(ctx, env.Client, nodePool)
+		ExpectObjectReconciled(ctx, env.Client, controller, nodePool)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		Expect(nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy)).To(BeNil())
+	})
+	It("should set NodeRegistrationHealthy status condition on nodePool as Unknown if the nodeClass observed generation doesn't match with that on nodePool", func() {
+		nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, "unhealthy", "unhealthy")
+		nodePool.Status.NodeClassObservedGeneration = int64(1)
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+
+		nodePool.Spec.Limits = map[corev1.ResourceName]resource.Quantity{corev1.ResourceCPU: resource.MustParse("14")}
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		_ = ExpectObjectReconciled(ctx, env.Client, controller, nodePool)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		Expect(nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy).IsUnknown()).To(BeTrue())
+		Expect(nodePool.Status.NodeClassObservedGeneration).To(Equal(int64(1)))
+	})
+	It("should set NodeRegistrationHealthy status condition on nodePool as Unknown if the nodePool is updated", func() {
+		ExpectApplied(ctx, env.Client, nodePool, nodeClass)
+		_ = ExpectObjectReconciled(ctx, env.Client, controller, nodePool)
+		nodePool = ExpectExists(ctx, env.Client, nodePool)
+		Expect(nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy).IsUnknown()).To(BeTrue())
+		Expect(nodePool.Status.NodeClassObservedGeneration).To(Equal(int64(1)))
+	})
+})