diff --git a/examples/28-instance-selector.yaml b/examples/28-instance-selector.yaml index 8e2c0a2550..e854d30089 100644 --- a/examples/28-instance-selector.yaml +++ b/examples/28-instance-selector.yaml @@ -11,6 +11,7 @@ nodeGroups: instanceSelector: vCPUs: 2 memory: "4" # 4 GiB, unit defaults to GiB + gpus: 0 # when set to 0, will only select non-GPU instance types managedNodeGroups: - name: mng diff --git a/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go b/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go index 5976164e3b..8523bd9519 100644 --- a/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go +++ b/pkg/apis/eksctl.io/v1alpha5/gpu_validation_test.go @@ -1,8 +1,10 @@ package v1alpha5_test import ( + "bytes" "fmt" + "github.com/kris-nova/logger" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -15,8 +17,10 @@ var _ = Describe("GPU instance support", func() { gpuInstanceType string amiFamily string instanceTypeName string + instanceSelector *api.InstanceSelector expectUnsupportedErr bool + expectWarning bool } assertValidationError := func(e gpuInstanceEntry, err error) { @@ -127,6 +131,72 @@ var _ = Describe("GPU instance support", func() { }), ) + DescribeTable("GPU drivers", func(e gpuInstanceEntry) { + ng := api.NewNodeGroup() + ng.AMIFamily = e.amiFamily + ng.InstanceType = e.gpuInstanceType + ng.InstanceSelector = e.instanceSelector + + mng := api.NewManagedNodeGroup() + mng.AMIFamily = e.amiFamily + mng.InstanceType = e.gpuInstanceType + mng.InstanceSelector = e.instanceSelector + if mng.InstanceSelector == nil { + mng.InstanceSelector = &api.InstanceSelector{} + } + + output := &bytes.Buffer{} + logger.Writer = output + Expect(api.ValidateNodeGroup(0, ng, api.NewClusterConfig())).NotTo(HaveOccurred()) + if e.expectWarning { + Expect(output.String()).To(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily))) + } else { + Expect(output.String()).NotTo(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily))) + } + + output = &bytes.Buffer{} + 
logger.Writer = output + Expect(api.ValidateManagedNodeGroup(0, mng)).NotTo(HaveOccurred()) + if e.expectWarning { + Expect(output.String()).To(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily))) + } else { + Expect(output.String()).NotTo(ContainSubstring(api.GPUDriversWarning(mng.AMIFamily))) + } + }, + Entry("Windows without GPU instances", gpuInstanceEntry{ + amiFamily: api.NodeImageFamilyUbuntu2004, + instanceSelector: &api.InstanceSelector{ + VCPUs: 4, + GPUs: newInt(0), + }, + }), + Entry("Windows with explicit GPU instance", gpuInstanceEntry{ + amiFamily: api.NodeImageFamilyWindowsServer2019FullContainer, + gpuInstanceType: "g4dn.xlarge", + expectWarning: true, + }), + Entry("Windows with implicit GPU instance", gpuInstanceEntry{ + amiFamily: api.NodeImageFamilyWindowsServer2022CoreContainer, + instanceSelector: &api.InstanceSelector{ + VCPUs: 4, + }, + expectWarning: true, + }), + Entry("Ubuntu with explicit GPU instance", gpuInstanceEntry{ + amiFamily: api.NodeImageFamilyUbuntu1804, + gpuInstanceType: "g4dn.xlarge", + expectWarning: true, + }), + Entry("Ubuntu with implicit GPU instance", gpuInstanceEntry{ + amiFamily: api.NodeImageFamilyUbuntu2004, + instanceSelector: &api.InstanceSelector{ + VCPUs: 4, + GPUs: newInt(2), + }, + expectWarning: true, + }), + ) + DescribeTable("ARM-based GPU instance type support", func(amiFamily string, expectErr bool) { ng := api.NewNodeGroup() ng.InstanceType = "g5g.medium" diff --git a/pkg/apis/eksctl.io/v1alpha5/validation.go b/pkg/apis/eksctl.io/v1alpha5/validation.go index 127e96737a..f3b2a5c975 100644 --- a/pkg/apis/eksctl.io/v1alpha5/validation.go +++ b/pkg/apis/eksctl.io/v1alpha5/validation.go @@ -52,6 +52,10 @@ var ( ErrPodIdentityAgentNotInstalled = func(suggestion string) error { return fmt.Errorf("the %q addon must be installed to create pod identity associations; %s", PodIdentityAgentAddon, suggestion) } + + GPUDriversWarning = func(amiFamily string) string { + return fmt.Sprintf("%s does not ship with 
NVIDIA GPU drivers installed, hence won't support running GPU-accelerated workloads out of the box", amiFamily) + } ) // NOTE: we don't use k8s.io/apimachinery/pkg/util/sets here to keep API package free of dependencies @@ -637,9 +641,14 @@ func validateNodeGroupBase(np NodePool, path string, controlPlaneOnOutposts bool } } - if instanceutils.IsNvidiaInstanceType(SelectInstanceType(np)) && - (ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != NodeImageFamilyBottlerocket && ng.AMIFamily != "") { - logger.Warning("%s does not ship with NVIDIA GPU drivers installed, hence won't support running GPU-accelerated workloads out of the box", ng.AMIFamily) + if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != NodeImageFamilyBottlerocket && ng.AMIFamily != "" { + if instanceutils.IsNvidiaInstanceType(SelectInstanceType(np)) { + logger.Warning(GPUDriversWarning(ng.AMIFamily)) + } + if ng.InstanceSelector != nil && !ng.InstanceSelector.IsZero() && + (ng.InstanceSelector.GPUs == nil || *ng.InstanceSelector.GPUs != 0) { + logger.Warning("instance selector may/will select GPU instance types, " + GPUDriversWarning(ng.AMIFamily)) + } } if ng.AMIFamily != NodeImageFamilyAmazonLinux2 && ng.AMIFamily != "" { diff --git a/pkg/cfn/builder/managed_nodegroup.go b/pkg/cfn/builder/managed_nodegroup.go index 892668fc47..90bda7c9f5 100644 --- a/pkg/cfn/builder/managed_nodegroup.go +++ b/pkg/cfn/builder/managed_nodegroup.go @@ -33,12 +33,6 @@ type ManagedNodeGroupResourceSet struct { const ManagedNodeGroupResourceName = "ManagedNodeGroup" -// Windows AMI types are not in sdk-v2 yet, so the constants here are temporary; will remove after sdk is updated -const AMITypesWindows2019FullX8664 ekstypes.AMITypes = "WINDOWS_FULL_2019_x86_64" -const AMITypesWindows2019CoreX8664 ekstypes.AMITypes = "WINDOWS_CORE_2019_x86_64" -const AMITypesWindows2022FullX8664 ekstypes.AMITypes = "WINDOWS_FULL_2022_x86_64" -const AMITypesWindows2022CoreX8664 ekstypes.AMITypes = 
"WINDOWS_CORE_2022_x86_64" - // NewManagedNodeGroup creates a new ManagedNodeGroupResourceSet func NewManagedNodeGroup(ec2API awsapi.EC2, cluster *api.ClusterConfig, nodeGroup *api.ManagedNodeGroup, launchTemplateFetcher *LaunchTemplateFetcher, bootstrapper nodebootstrap.Bootstrapper, forceAddCNIPolicy bool, vpcImporter vpc.Importer) *ManagedNodeGroupResourceSet { return &ManagedNodeGroupResourceSet{ @@ -285,18 +279,21 @@ func getAMIType(ng *api.ManagedNodeGroup, instanceType string) ekstypes.AMITypes ARM: ekstypes.AMITypesBottlerocketArm64, ARMGPU: ekstypes.AMITypesBottlerocketArm64Nvidia, }, - // Windows AMI Types are not in sdk-v2 yet, so the constant here is temporary; will remove after sdk is updated api.NodeImageFamilyWindowsServer2019FullContainer: { - X86x64: AMITypesWindows2019FullX8664, + X86x64: ekstypes.AMITypesWindowsFull2019X8664, + X86GPU: ekstypes.AMITypesWindowsFull2019X8664, }, api.NodeImageFamilyWindowsServer2019CoreContainer: { - X86x64: AMITypesWindows2019CoreX8664, + X86x64: ekstypes.AMITypesWindowsCore2019X8664, + X86GPU: ekstypes.AMITypesWindowsCore2019X8664, }, api.NodeImageFamilyWindowsServer2022FullContainer: { - X86x64: AMITypesWindows2022FullX8664, + X86x64: ekstypes.AMITypesWindowsFull2022X8664, + X86GPU: ekstypes.AMITypesWindowsFull2022X8664, }, api.NodeImageFamilyWindowsServer2022CoreContainer: { - X86x64: AMITypesWindows2022CoreX8664, + X86x64: ekstypes.AMITypesWindowsCore2022X8664, + X86GPU: ekstypes.AMITypesWindowsCore2022X8664, }, } diff --git a/userdocs/src/usage/instance-selector.md b/userdocs/src/usage/instance-selector.md index 6b92e38e30..0ba096d10e 100644 --- a/userdocs/src/usage/instance-selector.md +++ b/userdocs/src/usage/instance-selector.md @@ -5,7 +5,7 @@ users have to spend time figuring out which instance types would be well suited when using Spot instances because you need to choose a set of instances that works together well with the Cluster Autoscaler. 
eksctl now integrates with the [EC2 instance selector](https://github.com/aws/amazon-ec2-instance-selector), -which addresses this problem by generating a list of instance types based on resource criteria such as vCPUs, memory, etc. +which addresses this problem by generating a list of instance types based on resource criteria: vCPUs, memory, number of GPUs, and CPU architecture. When the instance selector criteria is passed, eksctl creates a nodegroup with the instance types set to the instance types matching the supplied criteria. @@ -62,6 +62,9 @@ The following instance selector CLI options are supported by `eksctl create clus `--instance-selector-vcpus`, `--instance-selector-memory`, `--instance-selector-gpus` and `instance-selector-cpu-architecture` +???+ note + By default, GPU instance types are not filtered out. To exclude them (e.g. for cost effectiveness, when your applications don't benefit from GPU acceleration), explicitly set `gpus: 0` (via config file) or `--instance-selector-gpus=0` (via CLI flag). + An example file can be found [here](https://github.com/eksctl-io/eksctl/blob/main/examples/28-instance-selector.yaml). ### Dry Run