diff --git a/Makefile b/Makefile index 438a9ae49677..f6b702d56480 100644 --- a/Makefile +++ b/Makefile @@ -52,6 +52,8 @@ apply: ## Deploy the controller into your ~/.kube/config cluster $(HELM_OPTS) \ --set controller.image=$(shell $(WITH_GOFLAGS) ko build -B github.com/aws/karpenter/cmd/controller) \ --set webhook.image=$(shell $(WITH_GOFLAGS) ko build -B github.com/aws/karpenter/cmd/webhook) + @# apply any modified/added CRDs to the server + @git status charts/karpenter/crds -s | grep "^ [MA]" | sed 's/^ [MA] //' | xargs -n 1 kubectl apply -f delete: ## Delete the controller from your ~/.kube/config cluster helm uninstall karpenter --namespace karpenter diff --git a/charts/karpenter/crds/karpenter.sh_provisioners.yaml b/charts/karpenter/crds/karpenter.sh_provisioners.yaml index ff6dac202d4d..9d07d99e1b94 100644 --- a/charts/karpenter/crds/karpenter.sh_provisioners.yaml +++ b/charts/karpenter/crds/karpenter.sh_provisioners.yaml @@ -71,6 +71,53 @@ spec: that Karpenter supports for limiting. type: object type: object + namespaceSelector: + description: A label query over the set of namespaces that the provisioner + applies to. The provisioner is applied to the pods in the namespaces + selected by this field. An omitted selector or an empty selector + ({}) matches all namespaces. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that + contains values, a key, and an operator that relates the key + and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to + a set of values. Valid operators are In, NotIn, Exists + and DoesNotExist. + type: string + values: + description: values is an array of string values. If the + operator is In or NotIn, the values array must be non-empty. 
+ If the operator is Exists or DoesNotExist, the values + array must be empty. This array is replaced during a strategic + merge patch. + items: + type: string + type: array + required: + - key + - operator + type: object + type: array + matchLabels: + additionalProperties: + type: string + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator + is "In", and the values array contains only "value". The requirements + are ANDed. + type: object + type: object provider: description: Provider contains fields specific to your cloudprovider. type: object diff --git a/charts/karpenter/templates/clusterrole.yaml b/charts/karpenter/templates/clusterrole.yaml index 152e3b94e8b1..4e39c4228e2d 100644 --- a/charts/karpenter/templates/clusterrole.yaml +++ b/charts/karpenter/templates/clusterrole.yaml @@ -30,6 +30,9 @@ rules: - apiGroups: [""] resources: ["nodes"] verbs: ["create"] + - apiGroups: [ "" ] + resources: [ "namespaces" ] + verbs: [ "get", "list", "watch" ] - apiGroups: [""] resources: ["pods/binding", "pods/eviction"] verbs: ["create"] diff --git a/pkg/apis/provisioning/v1alpha5/provisioner.go b/pkg/apis/provisioning/v1alpha5/provisioner.go index 899ee7ff7149..d2827aabd849 100644 --- a/pkg/apis/provisioning/v1alpha5/provisioner.go +++ b/pkg/apis/provisioning/v1alpha5/provisioner.go @@ -25,6 +25,11 @@ import ( type ProvisionerSpec struct { // Constraints are applied to all nodes launched by this provisioner. Constraints `json:",inline"` + // A label query over the set of namespaces that the provisioner applies to. + // The provisioner is applied to the pods in the namespaces selected by this field. + // An omitted selector or an empty selector ({}) matches all namespaces. 
+ // +optional + NamespaceSelector *metav1.LabelSelector `json:"namespaceSelector,omitempty" protobuf:"bytes,4,opt,name=namespaceSelector"` // TTLSecondsAfterEmpty is the number of seconds the controller will wait // before attempting to delete a node, measured from when the node is // detected to be empty. A Node is considered to be empty when it does not diff --git a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go index 9d40280ebfdd..6d0edf5d46cf 100644 --- a/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go +++ b/pkg/apis/provisioning/v1alpha5/zz_generated.deepcopy.go @@ -22,6 +22,7 @@ package v1alpha5 import ( "github.com/aws/karpenter/pkg/utils/sets" "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "knative.dev/pkg/apis" ) @@ -167,6 +168,11 @@ func (in *ProvisionerList) DeepCopyObject() runtime.Object { func (in *ProvisionerSpec) DeepCopyInto(out *ProvisionerSpec) { *out = *in in.Constraints.DeepCopyInto(&out.Constraints) + if in.NamespaceSelector != nil { + in, out := &in.NamespaceSelector, &out.NamespaceSelector + *out = new(metav1.LabelSelector) + (*in).DeepCopyInto(*out) + } if in.TTLSecondsAfterEmpty != nil { in, out := &in.TTLSecondsAfterEmpty, &out.TTLSecondsAfterEmpty *out = new(int64) diff --git a/pkg/controllers/manager.go b/pkg/controllers/manager.go index 7e519fafc774..2ff0114a1596 100644 --- a/pkg/controllers/manager.go +++ b/pkg/controllers/manager.go @@ -41,6 +41,13 @@ func NewManagerOrDie(ctx context.Context, config *rest.Config, options controlle }); err != nil { panic(fmt.Sprintf("Failed to setup pod indexer, %s", err)) } + // metadata.name normally works as a field selector against the API server, however the client is cached and this is + // required to enable matching against the cached fields + if err := newManager.GetFieldIndexer().IndexField(ctx, &v1.Namespace{}, "metadata.name", func(o client.Object) []string { 
+ return []string{o.(*v1.Namespace).Name} + }); err != nil { + panic(fmt.Sprintf("Failed to setup namespace indexer, %s", err)) + } return &GenericControllerManager{Manager: newManager} } diff --git a/pkg/controllers/selection/controller.go b/pkg/controllers/selection/controller.go index eb4a918c5e01..bd03178fbff2 100644 --- a/pkg/controllers/selection/controller.go +++ b/pkg/controllers/selection/controller.go @@ -19,6 +19,10 @@ import ( "fmt" "time" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + "k8s.io/apimachinery/pkg/labels" + "github.com/go-logr/zapr" "go.uber.org/multierr" "go.uber.org/zap" @@ -75,6 +79,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco logging.FromContext(ctx).Errorf("Ignoring pod, %s", err) return reconcile.Result{}, nil } + // Select a provisioner, wait for it to bind the pod, and verify scheduling succeeded in the next loop if err := c.selectProvisioner(ctx, pod); err != nil { logging.FromContext(ctx).Debugf("Could not schedule pod, %s", err) @@ -96,13 +101,30 @@ func (c *Controller) selectProvisioner(ctx context.Context, pod *v1.Pod) (errs e if len(provisioners) == 0 { return nil } + + // lookup the pod namespace for matching against the provisioner + podNamespace, err := c.getPodNamespace(ctx, pod) + if err != nil { + return err + } + for _, candidate := range c.provisioners.List(ctx) { + // check if the provisioner is allowed to provision pods in this namespace + if err := provisionerCanProvision(candidate, podNamespace); err != nil { + errs = multierr.Append(errs, fmt.Errorf("tried provisioner/%s: %w", candidate.Name, err)) + continue + } + + // ValidatePod is on Constraints, which is embedded in ProvisionerSpec. 
If that gets reworked, consider moving
+		// provisionerCanProvision to there as well
 		if err := candidate.Spec.DeepCopy().ValidatePod(pod); err != nil {
 			errs = multierr.Append(errs, fmt.Errorf("tried provisioner/%s: %w", candidate.Name, err))
-		} else {
-			provisioner = candidate
-			break
+			continue
 		}
+
+		// found a matching provisioner
+		provisioner = candidate
+		break
 	}
 	if provisioner == nil {
 		return fmt.Errorf("matched 0/%d provisioners, %w", len(multierr.Errors(errs)), errs)
@@ -114,6 +136,41 @@ func (c *Controller) selectProvisioner(ctx context.Context, pod *v1.Pod) (errs e
 	return nil
 }
 
+// provisionerCanProvision returns nil if the candidate provisioner is configured to provision pods in the provided
+// namespace, and an error describing the mismatch (or an unparsable selector) otherwise
+func provisionerCanProvision(candidate *provisioning.Provisioner, podNamespace v1.Namespace) error {
+	// no namespace selector, so we accept everything
+	if candidate.Spec.NamespaceSelector == nil {
+		return nil
+	}
+	// validate that the pod is in a namespace that the provisioner provisions for
+	selector, err := metav1.LabelSelectorAsSelector(candidate.Spec.NamespaceSelector)
+	if err != nil {
+		return fmt.Errorf("parsing namespace selector, %w", err)
+	}
+	if !selector.Matches(labels.Set(podNamespace.Labels)) {
+		return fmt.Errorf("namespace %q doesn't match namespace label selector", podNamespace.Name)
+	}
+	return nil
+}
+
+// getPodNamespace returns the Namespace object named by pod.Namespace so that its labels can be matched against a
+// provisioner's namespace selector. The lookup is a List filtered by a "metadata.name" field selector; an error is
+// returned if the list call fails or if it yields no items.
+// NOTE(review): a Get by name would likely be simpler and avoid the field selector - confirm and consider.
+func (c *Controller) getPodNamespace(ctx context.Context, pod *v1.Pod) (v1.Namespace, error) {
+	nsList := &v1.NamespaceList{}
+	nameSelector := fields.OneTermEqualSelector("metadata.name", pod.Namespace)
+	if err := c.kubeClient.List(ctx, nsList, &client.ListOptions{FieldSelector: nameSelector}); err != nil {
+		return v1.Namespace{}, fmt.Errorf("listing namespaces, %w", err)
+	}
+	if len(nsList.Items) == 0 {
+		// shouldn't happen; surface it as an error instead of indexing an empty list
+		return v1.Namespace{}, fmt.Errorf("namespace %s not found", pod.Namespace)
+	}
+	return nsList.Items[0], nil
+}
+
 func isProvisionable(p *v1.Pod) bool {
 	return !pod.IsScheduled(p) &&
 		!pod.IsPreempting(p) &&
diff --git a/pkg/controllers/selection/suite_test.go 
b/pkg/controllers/selection/suite_test.go index c59f4c6a7696..0242f6485f30 100644 --- a/pkg/controllers/selection/suite_test.go +++ b/pkg/controllers/selection/suite_test.go @@ -76,6 +76,99 @@ var _ = AfterEach(func() { ExpectProvisioningCleanedUp(ctx, env.Client, provisioners) }) +var _ = Describe("Namespace Selector", func() { + It("should schedule if there is no namespace selector", func() { + provisioner.Spec.NamespaceSelector = nil + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if there is an empty namespace selector", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{}, + MatchExpressions: []metav1.LabelSelectorRequirement{}, + } + + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should not schedule if the pod isn't in a matching namespace, MatchLabels", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "foo": "bar", + }, + } + ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectNotScheduled(ctx, env.Client, pod) + }) + It("should not schedule if the pod isn't in a matching namespace, MatchExpressions", func() { + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"bar"}, + }, + }, + } + ExpectProvisioned(ctx, 
env.Client, selectionController, provisioners, provisioner) + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(), + )[0] + ExpectNotScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace, MatchLabels", func() { + ns := randomdata.Noun() + randomdata.Adjective() // need a lowercase name here + ExpectCreated(ctx, env.Client, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Labels: map[string]string{"foo": "bar"}, + }, + }) + + // select for namespaces with the label foo=bar + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchLabels: map[string]string{"foo": "bar"}, + } + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) + It("should schedule if the pod is in a matching namespace, MatchExpressions", func() { + ns := randomdata.Noun() + randomdata.Adjective() + ExpectCreated(ctx, env.Client, &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Labels: map[string]string{"foo": "bar"}, + }, + }) + + // select for namespaces with the label foo in ["bar"] + provisioner.Spec.NamespaceSelector = &metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: "foo", + Operator: metav1.LabelSelectorOpIn, + Values: []string{"bar"}, + }, + }, + } + pod := ExpectProvisioned(ctx, env.Client, selectionController, provisioners, provisioner, + test.UnschedulablePod(test.PodOptions{ObjectMeta: metav1.ObjectMeta{Namespace: ns}}), + )[0] + ExpectScheduled(ctx, env.Client, pod) + }) +}) + var _ = Describe("Volume Topology Requirements", func() { var storageClass *storagev1.StorageClass BeforeEach(func() { diff --git a/website/content/en/preview/provisioner.md b/website/content/en/preview/provisioner.md index 97e949e1988a..3396f045b3ba 100644 
--- a/website/content/en/preview/provisioner.md +++ b/website/content/en/preview/provisioner.md @@ -22,6 +22,11 @@ spec: # If omitted, the feature is disabled, nodes will never scale down due to low utilization ttlSecondsAfterEmpty: 30 + # If omitted, Karpenter will create nodes for pods in any namespace + namespaceSelector: + matchLabels: + karpenter: "yes" + # Provisioned nodes will have these taints # Taints may prevent pods from scheduling if they are not tolerated taints: @@ -69,6 +74,10 @@ spec: If neither of these values are set, Karpenter will *not* delete instances. It is recommended to set the `ttlSecondsAfterEmpty` value, to enable scale down of the cluster. +### spec.namespaceSelector + +Setting a value here restricts the provisioner to unschedulable pods in namespaces whose labels match the `namespaceSelector`. If the selector is omitted or empty (`{}`), pods in all namespaces are considered. + ### spec.ttlSecondsAfterEmpty Setting a value here enables Karpenter to delete empty/unnecessary instances. DaemonSets are excluded from considering a node "empty". This value is in seconds.