-
Notifications
You must be signed in to change notification settings - Fork 236
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: add NodeRegistrationHealthy status condition to nodepool #1969
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,9 +20,14 @@ import ( | |
"context" | ||
"time" | ||
|
||
"k8s.io/apimachinery/pkg/api/errors" | ||
|
||
"k8s.io/apimachinery/pkg/types" | ||
|
||
"sigs.k8s.io/controller-runtime/pkg/log" | ||
|
||
"k8s.io/utils/clock" | ||
"sigs.k8s.io/controller-runtime/pkg/client" | ||
"sigs.k8s.io/controller-runtime/pkg/log" | ||
"sigs.k8s.io/controller-runtime/pkg/reconcile" | ||
|
||
v1 "sigs.k8s.io/karpenter/pkg/apis/v1" | ||
|
@@ -51,6 +56,12 @@ func (l *Liveness) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reco | |
if ttl := registrationTTL - l.clock.Since(registered.LastTransitionTime.Time); ttl > 0 { | ||
return reconcile.Result{RequeueAfter: ttl}, nil | ||
} | ||
if err := l.updateNodePoolRegistrationHealth(ctx, nodeClaim); err != nil { | ||
if errors.IsConflict(err) { | ||
return reconcile.Result{Requeue: true}, nil | ||
} | ||
return reconcile.Result{}, err | ||
} | ||
// Delete the NodeClaim if we believe the NodeClaim won't register since we haven't seen the node | ||
if err := l.kubeClient.Delete(ctx, nodeClaim); err != nil { | ||
return reconcile.Result{}, client.IgnoreNotFound(err) | ||
|
@@ -61,6 +72,34 @@ func (l *Liveness) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reco | |
metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], | ||
metrics.CapacityTypeLabel: nodeClaim.Labels[v1.CapacityTypeLabelKey], | ||
}) | ||
|
||
return reconcile.Result{}, nil | ||
} | ||
|
||
// updateNodePoolRegistrationHealth sets the NodeRegistrationHealthy=False | ||
// on the NodePool if the nodeClaim fails to launch/register | ||
func (l *Liveness) updateNodePoolRegistrationHealth(ctx context.Context, nodeClaim *v1.NodeClaim) error { | ||
nodePoolName, ok := nodeClaim.Labels[v1.NodePoolLabelKey] | ||
if ok && len(nodePoolName) != 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: Just check |
||
nodePool := &v1.NodePool{} | ||
if err := l.kubeClient.Get(ctx, types.NamespacedName{Name: nodePoolName}, nodePool); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you properly handle the NodePool NotFound error? |
||
return err | ||
} | ||
if nodePool.StatusConditions().Get(v1.ConditionTypeNodeRegistrationHealthy).IsUnknown() { | ||
stored := nodePool.DeepCopy() | ||
// If the nodeClaim failed to register during the TTL set NodeRegistrationHealthy status condition on | ||
// NodePool to False. If the launch failed get the launch failure reason and message from nodeClaim. | ||
if launchCondition := nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched); launchCondition.IsTrue() { | ||
nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, "RegistrationFailed", "Failed to register node") | ||
} else { | ||
nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, launchCondition.Reason, launchCondition.Message) | ||
} | ||
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch | ||
// can cause races due to the fact that it fully replaces the list on a change | ||
// Here, we are updating the status condition list | ||
if err := l.kubeClient.Status().Patch(ctx, nodePool, client.MergeFromWithOptions(stored, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { | ||
return err | ||
} | ||
} | ||
} | ||
return nil | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,9 @@ package lifecycle_test | |
import ( | ||
"time" | ||
|
||
"github.com/awslabs/operatorpkg/status" | ||
|
||
operatorpkg "github.com/awslabs/operatorpkg/test/expectations" | ||
. "github.com/onsi/ginkgo/v2" | ||
corev1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/api/resource" | ||
|
@@ -78,6 +81,12 @@ var _ = Describe("Liveness", func() { | |
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should validate this: do we have a test showing that reconciliation succeeds when the NodeClaim has no owning NodePool? |
||
if isManagedNodeClaim { | ||
ExpectNotFound(ctx, env.Client, nodeClaim) | ||
operatorpkg.ExpectStatusConditions(ctx, env.Client, 1*time.Minute, nodePool, status.Condition{ | ||
Type: v1.ConditionTypeNodeRegistrationHealthy, | ||
Status: metav1.ConditionFalse, | ||
Reason: "RegistrationFailed", | ||
Message: "Failed to register node", | ||
}) | ||
} else { | ||
ExpectExists(ctx, env.Client, nodeClaim) | ||
} | ||
|
@@ -141,6 +150,45 @@ var _ = Describe("Liveness", func() { | |
// If the node hasn't registered in the registration timeframe, then we deprovision the nodeClaim | ||
fakeClock.Step(time.Minute * 20) | ||
_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) | ||
operatorpkg.ExpectStatusConditions(ctx, env.Client, 1*time.Minute, nodePool, status.Condition{ | ||
Type: v1.ConditionTypeNodeRegistrationHealthy, | ||
Status: metav1.ConditionFalse, | ||
Reason: nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).Reason, | ||
Message: nodeClaim.StatusConditions().Get(v1.ConditionTypeLaunched).Message, | ||
}) | ||
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) | ||
ExpectNotFound(ctx, env.Client, nodeClaim) | ||
}) | ||
It("should not update NodeRegistrationHealthy status condition if it is already set to True", func() { | ||
nodePool.StatusConditions().SetTrue(v1.ConditionTypeNodeRegistrationHealthy) | ||
nodeClaim := test.NodeClaim(v1.NodeClaim{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Labels: map[string]string{ | ||
v1.NodePoolLabelKey: nodePool.Name, | ||
}, | ||
}, | ||
Spec: v1.NodeClaimSpec{ | ||
Resources: v1.ResourceRequirements{ | ||
Requests: corev1.ResourceList{ | ||
corev1.ResourceCPU: resource.MustParse("2"), | ||
corev1.ResourceMemory: resource.MustParse("50Mi"), | ||
corev1.ResourcePods: resource.MustParse("5"), | ||
fake.ResourceGPUVendorA: resource.MustParse("1"), | ||
}, | ||
}, | ||
}, | ||
}) | ||
cloudProvider.AllowedCreateCalls = 0 // Don't allow Create() calls to succeed | ||
ExpectApplied(ctx, env.Client, nodePool, nodeClaim) | ||
_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) | ||
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) | ||
|
||
// If the node hasn't registered in the registration timeframe, then we deprovision the nodeClaim | ||
fakeClock.Step(time.Minute * 20) | ||
_ = ExpectObjectReconcileFailed(ctx, env.Client, nodeClaimController, nodeClaim) | ||
|
||
// NodeClaim registration failed, but we should not update the NodeRegistrationHealthy status condition if it is already True | ||
operatorpkg.ExpectStatusConditions(ctx, env.Client, 1*time.Minute, nodePool, status.Condition{Type: v1.ConditionTypeNodeRegistrationHealthy, Status: metav1.ConditionTrue}) | ||
ExpectFinalizersRemoved(ctx, env.Client, nodeClaim) | ||
ExpectNotFound(ctx, env.Client, nodeClaim) | ||
}) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,8 @@ import ( | |
"context" | ||
"fmt" | ||
|
||
"k8s.io/apimachinery/pkg/types" | ||
|
||
"github.com/samber/lo" | ||
corev1 "k8s.io/api/core/v1" | ||
"k8s.io/apimachinery/pkg/api/equality" | ||
|
@@ -82,9 +84,37 @@ func (r *Registration) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) ( | |
metrics.NodesCreatedTotal.Inc(map[string]string{ | ||
metrics.NodePoolLabel: nodeClaim.Labels[v1.NodePoolLabelKey], | ||
}) | ||
if err := r.updateNodePoolRegistrationHealth(ctx, nodeClaim); err != nil { | ||
if errors.IsConflict(err) { | ||
return reconcile.Result{Requeue: true}, nil | ||
} | ||
return reconcile.Result{}, err | ||
} | ||
return reconcile.Result{}, nil | ||
} | ||
|
||
// updateNodePoolRegistrationHealth sets the NodeRegistrationHealthy=True | ||
// on the NodePool if the nodeClaim that registered is owned by a NodePool | ||
func (r *Registration) updateNodePoolRegistrationHealth(ctx context.Context, nodeClaim *v1.NodeClaim) error { | ||
jigisha620 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
nodePoolName, ok := nodeClaim.Labels[v1.NodePoolLabelKey] | ||
if ok && len(nodePoolName) != 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same comment as in the liveness controller -- I would just check if the value is not equal to |
||
nodePool := &v1.NodePool{} | ||
if err := r.kubeClient.Get(ctx, types.NamespacedName{Name: nodePoolName}, nodePool); err != nil { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Handle NotFound error here too |
||
return err | ||
} | ||
storedNodePool := nodePool.DeepCopy() | ||
if nodePool.StatusConditions().SetTrue(v1.ConditionTypeNodeRegistrationHealthy) { | ||
// We use client.MergeFromWithOptimisticLock because patching a list with a JSON merge patch | ||
// can cause races due to the fact that it fully replaces the list on a change | ||
// Here, we are updating the status condition list | ||
if err := r.kubeClient.Status().Patch(ctx, nodePool, client.MergeFromWithOptions(storedNodePool, client.MergeFromWithOptimisticLock{})); client.IgnoreNotFound(err) != nil { | ||
return err | ||
} | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
func (r *Registration) syncNode(ctx context.Context, nodeClaim *v1.NodeClaim, node *corev1.Node) error { | ||
stored := node.DeepCopy() | ||
controllerutil.AddFinalizer(node, v1.TerminationFinalizer) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,10 @@ limitations under the License. | |
package lifecycle_test | ||
|
||
import ( | ||
"time" | ||
|
||
"github.com/awslabs/operatorpkg/status" | ||
operatorpkg "github.com/awslabs/operatorpkg/test/expectations" | ||
. "github.com/onsi/ginkgo/v2" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We should validate this: do we have a test showing that reconciliation succeeds when the NodeClaim has no owning NodePool? |
||
. "github.com/onsi/gomega" | ||
corev1 "k8s.io/api/core/v1" | ||
|
@@ -54,6 +58,7 @@ var _ = Describe("Registration", func() { | |
}) | ||
} | ||
nodeClaim := test.NodeClaim(nodeClaimOpts...) | ||
nodePool.StatusConditions().SetUnknown(v1.ConditionTypeNodeRegistrationHealthy) | ||
ExpectApplied(ctx, env.Client, nodePool, nodeClaim) | ||
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) | ||
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) | ||
|
@@ -66,6 +71,10 @@ var _ = Describe("Registration", func() { | |
if isManagedNodeClaim { | ||
jigisha620 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue()).To(BeTrue()) | ||
Expect(nodeClaim.Status.NodeName).To(Equal(node.Name)) | ||
operatorpkg.ExpectStatusConditions(ctx, env.Client, 1*time.Minute, nodePool, status.Condition{ | ||
Type: v1.ConditionTypeNodeRegistrationHealthy, | ||
Status: metav1.ConditionTrue, | ||
}) | ||
} else { | ||
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsUnknown()).To(BeTrue()) | ||
Expect(nodeClaim.Status.NodeName).To(Equal("")) | ||
|
@@ -368,4 +377,30 @@ var _ = Describe("Registration", func() { | |
node = ExpectExists(ctx, env.Client, node) | ||
Expect(node.Spec.Taints).To(HaveLen(0)) | ||
}) | ||
It("should add NodeRegistrationHealthy=true on the nodePool if registration succeeds and if it was previously false", func() { | ||
nodeClaimOpts := []v1.NodeClaim{{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Labels: map[string]string{ | ||
v1.NodePoolLabelKey: nodePool.Name, | ||
}, | ||
}, | ||
}} | ||
nodeClaim := test.NodeClaim(nodeClaimOpts...) | ||
nodePool.StatusConditions().SetFalse(v1.ConditionTypeNodeRegistrationHealthy, "unhealthy", "unhealthy") | ||
ExpectApplied(ctx, env.Client, nodePool, nodeClaim) | ||
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) | ||
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) | ||
|
||
node := test.Node(test.NodeOptions{ProviderID: nodeClaim.Status.ProviderID, Taints: []corev1.Taint{v1.UnregisteredNoExecuteTaint}}) | ||
ExpectApplied(ctx, env.Client, node) | ||
ExpectObjectReconciled(ctx, env.Client, nodeClaimController, nodeClaim) | ||
|
||
nodeClaim = ExpectExists(ctx, env.Client, nodeClaim) | ||
Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeRegistered).IsTrue()).To(BeTrue()) | ||
Expect(nodeClaim.Status.NodeName).To(Equal(node.Name)) | ||
operatorpkg.ExpectStatusConditions(ctx, env.Client, 1*time.Minute, nodePool, status.Condition{ | ||
Type: v1.ConditionTypeNodeRegistrationHealthy, | ||
Status: metav1.ConditionTrue, | ||
}) | ||
}) | ||
}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we just create a separate function to access the
PodSchedulingNodeRegistrationHealthySuccessTime
or something like this -- I think a boolean here is a bit hard to reason about