From 39aed08c5b96f5441543a81e46665cd55542cad3 Mon Sep 17 00:00:00 2001 From: Todd Neal Date: Thu, 10 Mar 2022 09:27:56 -0600 Subject: [PATCH] add a namespace selector to the provisioner spec This is modeled after the NamespaceSelector on pod affinity terms and works the same way. Fixes #1493 --- Makefile | 1 + .../crds/karpenter.sh_provisioners.yaml | 57 +++++++ charts/karpenter/templates/clusterrole.yaml | 3 + pkg/apis/provisioning/v1alpha5/provisioner.go | 13 ++ .../v1alpha5/zz_generated.deepcopy.go | 11 ++ pkg/controllers/selection/controller.go | 55 ++++++- pkg/controllers/selection/suite_test.go | 145 ++++++++++++++++++ website/content/en/preview/provisioner.md | 21 +++ 8 files changed, 303 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 438a9ae49677..39e2d320eae8 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,7 @@ apply: ## Deploy the controller into your ~/.kube/config cluster $(HELM_OPTS) \ --set controller.image=$(shell $(WITH_GOFLAGS) ko build -B github.com/aws/karpenter/cmd/controller) \ --set webhook.image=$(shell $(WITH_GOFLAGS) ko build -B github.com/aws/karpenter/cmd/webhook) + kubectl apply -f charts/karpenter/crds/ delete: ## Delete the controller from your ~/.kube/config cluster helm uninstall karpenter --namespace karpenter diff --git a/charts/karpenter/crds/karpenter.sh_provisioners.yaml b/charts/karpenter/crds/karpenter.sh_provisioners.yaml index ff6dac202d4d..5c64c78d4ed6 100644 --- a/charts/karpenter/crds/karpenter.sh_provisioners.yaml +++ b/charts/karpenter/crds/karpenter.sh_provisioners.yaml @@ -71,6 +71,63 @@ spec: that Karpenter supports for limiting. type: object type: object + namespaceSelector: + description: A label query over the set of namespaces that the provisioner + applies to. The provisioner is applied to the union of the namespaces + selected by this field and the ones listed in the namespaces field. 
+ null selector and null or empty namespaces list means provision + for all namespaces An empty selector ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. + If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object + namespaces: + description: namespaces specifies a static list of namespace names + that the provisioner applies to. The provisioner is applied to the + union of the namespaces listed in this field and the ones selected + by namespaceSelector. null or empty namespaces list and null namespaceSelector + means provision for all namespaces + items: + type: string + type: array provider: description: Provider contains fields specific to your cloudprovider. 
type: object diff --git a/charts/karpenter/templates/clusterrole.yaml b/charts/karpenter/templates/clusterrole.yaml index 152e3b94e8b1..4e39c4228e2d 100644 --- a/charts/karpenter/templates/clusterrole.yaml +++ b/charts/karpenter/templates/clusterrole.yaml @@ -30,6 +30,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["create"] + - apiGroups: [ "" ] + resources: [ "namespaces" ] + verbs: [ "get", "list", "watch" ] - apiGroups: [""] resources: ["pods/binding", "pods/eviction"] verbs: ["create"] diff --git a/pkg/apis/provisioning/v1alpha5/provisioner.go b/pkg/apis/provisioning/v1alpha5/provisioner.go index 899ee7ff7149..b73bf761bd93 100644 --- a/pkg/apis/provisioning/v1alpha5/provisioner.go +++ b/pkg/apis/provisioning/v1alpha5/provisioner.go @@ -25,6 +25,19 @@ import ( type ProvisionerSpec struct { // Constraints are applied to all nodes launched by this provisioner. Constraints `json:",inline"` + // namespaces specifies a static list of namespace names that the provisioner applies to. + // The provisioner is applied to the union of the namespaces listed in this field + // and the ones selected by namespaceSelector. + // null or empty namespaces list and null namespaceSelector means provision for all namespaces + // +optional + Namespaces []string `json:"namespaces,omitempty" protobuf:"bytes,2,rep,name=namespaces"` + // A label query over the set of namespaces that the provisioner applies to. + // The provisioner is applied to the union of the namespaces selected by this field + // and the ones listed in the namespaces field. + // null selector and null or empty namespaces list means provision for all namespaces + // An empty selector ({}) matches all namespaces. 
+ // +optional + NamespaceSelector *metav1.LabelSelector `json:"namespaceSelector,omitempty" protobuf:"bytes,4,opt,name=namespaceSelector"` // TTLSecondsAfterEmpty is the number of seconds the controller will wait // before attempting to delete a node, measured from when the node is // detected to be empty. A Node is considered to be empty when it does not diff --git a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go index 9d40280ebfdd..55ae8c29bb9c 100644 --- a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go +++ b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ package v1alpha5 import ( "github.com/aws/karpenter/pkg/utils/sets" "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "knative.dev/pkg/apis" ) @@ -167,6 +168,16 @@ func (in *ProvisionerList) DeepCopyObject() runtime.Object { func (in *ProvisionerSpec) DeepCopyInto(out *ProvisionerSpec) { *out = *in in.Constraints.DeepCopyInto(&out.Constraints) + if in.Namespaces != nil { + in, out := &in.Namespaces, &out.Namespaces + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.NamespaceSelector != nil { + in, out := &in.NamespaceSelector, &out.NamespaceSelector + *out = new(metav1.LabelSelector) + (*in).DeepCopyInto(*out) + } if in.TTLSecondsAfterEmpty != nil { in, out := &in.TTLSecondsAfterEmpty, &out.TTLSecondsAfterEmpty *out = new(int64) diff --git a/pkg/controllers/selection/controller.go b/pkg/controllers/selection/controller.go index eb4a918c5e01..d7e909639046 100644 --- a/pkg/controllers/selection/controller.go +++ b/pkg/controllers/selection/controller.go @@ -19,6 +19,9 @@ import ( "fmt" "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "github.com/go-logr/zapr" "go.uber.org/multierr" "go.uber.org/zap" @@ -75,6 +78,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco 
logging.FromContext(ctx).Errorf("Ignoring pod, %s", err) return reconcile.Result{}, nil } + // Select a provisioner, wait for it to bind the pod, and verify scheduling succeeded in the next loop if err := c.selectProvisioner(ctx, pod); err != nil { logging.FromContext(ctx).Debugf("Could not schedule pod, %s", err) @@ -96,13 +100,29 @@ func (c *Controller) selectProvisioner(ctx context.Context, pod *v1.Pod) (errs e if len(provisioners) == 0 { return nil } + + // lookup the pod namespace for matching against the provisioner + var podNamespace v1.Namespace + if err := c.kubeClient.Get(ctx, client.ObjectKey{Name: pod.Namespace}, &podNamespace); err != nil { + return err + } + for _, candidate := range c.provisioners.List(ctx) { + // check if the provisioner is allowed to provision pods in this namespace + if err := validateNamespace(candidate, podNamespace); err != nil { + errs = multierr.Append(errs, fmt.Errorf("tried provisioner/%s: %w", candidate.Name, err)) + continue + } + + // ValidatePod is on Constraints, which is embedded in ProvisionerSpec. 
If that gets reworked, consider moving + // validateNamespace to there as well if err := candidate.Spec.DeepCopy().ValidatePod(pod); err != nil { errs = multierr.Append(errs, fmt.Errorf("tried provisioner/%s: %w", candidate.Name, err)) - } else { - provisioner = candidate - break + continue } + + provisioner = candidate + break } if provisioner == nil { return fmt.Errorf("matched 0/%d provisioners, %w", len(multierr.Errors(errs)), errs) @@ -114,6 +134,35 @@ func (c *Controller) selectProvisioner(ctx context.Context, pod *v1.Pod) (errs e return nil } +// validateNamespace returns nil if the candidate provisioner is configured to provision pods in the provided +// namespace +func validateNamespace(candidate *provisioning.Provisioner, namespace v1.Namespace) error { + // no namespace list or label selector provided, so everything passes + if len(candidate.Spec.Namespaces) == 0 && candidate.Spec.NamespaceSelector == nil { + return nil + } + + // the namespace of the pod must match one of the list of namespaces or the selector + for _, ns := range candidate.Spec.Namespaces { + if ns == namespace.Name { + return nil + } + } + + // For an undefined namespace selector, the selector itself matches nothing. 
This + // provides the desired semantics here as we know there is either a namespace list + // or a namespace label selector and the namespace has already failed to match the + // possibly empty list + selector, err := metav1.LabelSelectorAsSelector(candidate.Spec.NamespaceSelector) + if err != nil { + return err + } + if !selector.Matches(labels.Set(namespace.Labels)) { + return fmt.Errorf("doesn't match namespaces being provisioned") + } + return nil +} + func isProvisionable(p *v1.Pod) bool { return !pod.IsScheduled(p) && !pod.IsPreempting(p) && diff --git a/pkg/controllers/selection/suite_test.go b/pkg/controllers/selection/suite_test.go index c59f4c6a7696..24e4375a6237 100644 --- a/pkg/controllers/selection/suite_test.go +++ b/pkg/controllers/selection/suite_test.go @@ -76,6 +76,151 @@ var _ = AfterEach(func() { ExpectProvisioningCleanedUp(ctx, env.Client, provisioners) }) +var _ = Describe("Namespace Selector", func() { + It("should schedule if there is no namespace selector", func() { + provisioner.Spec.NamespaceSelector = nil + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if there is an empty namespace selector", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{}, + MatchExpressions: []metav1.LabelSelectorRequirement{}, + } + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should not schedule if the pod isn't in a matching namespace, namespace list", func() { + provisioner.Spec.Namespaces = []string{"foo"} + ExpectProvisioned(ctx, env.Client, selectionController, 
provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectNotScheduled(ctx, env.Client, pod) + }) + It("should not schedule if the pod isn't in a matching namespace, MatchLabels", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "foo": "bar", + }, + } + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectNotScheduled(ctx, env.Client, pod) + }) + It("should not schedule if the pod isn't in a matching namespace, MatchExpressions", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"bar"}, + }, + }, + } + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectNotScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace, namespace list", func() { + ns := randomdata.Noun() + randomdata.Adjective() + ExpectCreated(ctx, env.Client, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + provisioner.Spec.Namespaces = []string{ns} + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace, MatchLabels", func() { + ns := randomdata.Noun() + randomdata.Adjective() // need a lowercase name here + 
ExpectCreated(ctx, env.Client, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Labels: map[string]string{"foo": "bar"}, + }, + }) + + // select for namespaces with the label foo=bar + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{"foo": "bar"}, + } + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace, MatchExpressions", func() { + ns := randomdata.Noun() + randomdata.Adjective() + ExpectCreated(ctx, env.Client, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Labels: map[string]string{"foo": "bar"}, + }, + }) + + // select for namespaces with the label foo in ["bar"] + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"bar"}, + }, + }, + } + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace list but fails selector ", func() { + ns := randomdata.Noun() + randomdata.Adjective() + ExpectCreated(ctx, env.Client, &v1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: ns}}) + provisioner.Spec.Namespaces = []string{ns} + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{"foo": "bar"}, + } + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + 
)[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is not in a matching namespace list but passes selector", func() { + ns := randomdata.Noun() + randomdata.Adjective() // need a lowercase name here + ExpectCreated(ctx, env.Client, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Labels: map[string]string{"foo": "bar"}, + }, + }) + + // will fail the namespace list match, but pass the selector + provisioner.Spec.Namespaces = []string{"somethingelse"} + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{"foo": "bar"}, + } + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) +}) + var _ = Describe("Volume Topology Requirements", func() { var storageClass *storagev1.StorageClass BeforeEach(func() { diff --git a/website/content/en/preview/provisioner.md b/website/content/en/preview/provisioner.md index 97e949e1988a..d5d29bb4cdd4 100644 --- a/website/content/en/preview/provisioner.md +++ b/website/content/en/preview/provisioner.md @@ -22,6 +22,19 @@ spec: # If omitted, the feature is disabled, nodes will never scale down due to low utilization ttlSecondsAfterEmpty: 30 + # If both namespaces and namespaceSelector are omitted, Karpenter will provision nodes for unschedulable pods in + # any namespace. If both are provided, Karpenter will provision for the union of the namespaces selected.
+ + # Provision for unschedulable pods in these specific namespaces + namespaces: + - namespace1 + - namespace2 + + # Provision for unschedulable pods in namespaces whose labels match this selector + namespaceSelector: + matchLabels: + karpenter: "yes" + # Provisioned nodes will have these taints # Taints may prevent pods from scheduling if they are not tolerated taints: @@ -69,6 +82,14 @@ spec: If neither of these values are set, Karpenter will *not* delete instances. It is recommended to set the `ttlSecondsAfterEmpty` value, to enable scale down of the cluster. +### spec.namespaces + +Setting a value here causes Karpenter to only respond to unschedulable pods within the namespaces that are matched by the union of `namespaces` and `namespaceSelector`. This is a list of namespace names. + +### spec.namespaceSelector + +Setting a value here causes Karpenter to only respond to unschedulable pods within the namespaces that are matched by the union of `namespaces` and `namespaceSelector`. This is a label selector that is matched against the labels on each namespace. + ### spec.ttlSecondsAfterEmpty Setting a value here enables Karpenter to delete empty/unnecessary instances. DaemonSets are excluded from considering a node "empty". This value is in seconds.