diff --git a/pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go b/pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go
index 04aa60f95..fc7c89441 100644
--- a/pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go
+++ b/pkg/apis/gcpprovider/v1beta1/gcpmachineproviderconfig_types.go
@@ -22,22 +22,29 @@ type GCPMachineProviderSpec struct {
 	// CredentialsSecret is a reference to the secret with GCP credentials.
 	CredentialsSecret *corev1.LocalObjectReference `json:"credentialsSecret,omitempty"`
 
-	CanIPForward       bool                   `json:"canIPForward"`
-	DeletionProtection bool                   `json:"deletionProtection"`
-	Disks              []*GCPDisk             `json:"disks,omitempty"`
-	Labels             map[string]string      `json:"labels,omitempty"`
-	Metadata           []*GCPMetadata         `json:"gcpMetadata,omitempty"`
-	NetworkInterfaces  []*GCPNetworkInterface `json:"networkInterfaces,omitempty"`
-	ServiceAccounts    []GCPServiceAccount    `json:"serviceAccounts"`
-	Tags               []string               `json:"tags,omitempty"`
-	TargetPools        []string               `json:"targetPools,omitempty"`
-	MachineType        string                 `json:"machineType"`
-	Region             string                 `json:"region"`
-	Zone               string                 `json:"zone"`
-	ProjectID          string                 `json:"projectID,omitempty"`
+	CanIPForward       bool                    `json:"canIPForward"`
+	DeletionProtection bool                    `json:"deletionProtection"`
+	Disks              []*GCPDisk              `json:"disks,omitempty"`
+	Labels             map[string]string       `json:"labels,omitempty"`
+	Metadata           []*GCPMetadata          `json:"gcpMetadata,omitempty"`
+	NetworkInterfaces  []*GCPNetworkInterface  `json:"networkInterfaces,omitempty"`
+	ServiceAccounts    []GCPServiceAccount     `json:"serviceAccounts"`
+	Tags               []string                `json:"tags,omitempty"`
+	TargetPools        []string                `json:"targetPools,omitempty"`
+	MachineType        string                  `json:"machineType"`
+	Region             string                  `json:"region"`
+	Zone               string                  `json:"zone"`
+	ProjectID          string                  `json:"projectID,omitempty"`
+	GuestAccelerators  []*GCPAcceleratorConfig `json:"guestAccelerators,omitempty"`
 
 	// Preemptible indicates if created instance is preemptible
 	Preemptible bool `json:"preemptible,omitempty"`
+
+	// OnHostMaintenance is copied into the instance scheduling options. GCP accepts "MIGRATE" or
+	// "TERMINATE" and requires "TERMINATE" for instances with attached GPUs.
+	OnHostMaintenance string `json:"onHostMaintenance,omitempty"`
+	// AutomaticRestart is copied into the instance scheduling options; nil leaves the option unset.
+	AutomaticRestart *bool `json:"automaticRestart,omitempty"`
 }
 
 // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object
@@ -104,3 +111,11 @@ type GCPKMSKeyReference struct {
 	// Location is the GCP location in which the Key Ring exists.
 	Location string `json:"location"`
 }
+
+// GCPAcceleratorConfig describes type and count of accelerator cards attached to the instance on GCP.
+type GCPAcceleratorConfig struct {
+	// AcceleratorCount is number of AcceleratorType accelerators (GPUs) to be attached to an instance
+	AcceleratorCount int64 `json:"acceleratorCount,omitempty"`
+	// AcceleratorType is the type of accelerator (GPU) to be attached to an instance
+	AcceleratorType string `json:"acceleratorType,omitempty"`
+}
diff --git a/pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go b/pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go
index 9f3d6d443..ce0a6c919 100644
--- a/pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go
+++ b/pkg/apis/gcpprovider/v1beta1/zz_generated.deepcopy.go
@@ -25,6 +25,21 @@ import (
 	"k8s.io/apimachinery/pkg/runtime"
 )
 
+// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
+func (in *GCPAcceleratorConfig) DeepCopyInto(out *GCPAcceleratorConfig) {
+	*out = *in
+}
+
+// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GCPAcceleratorConfig.
+func (in *GCPAcceleratorConfig) DeepCopy() *GCPAcceleratorConfig {
+	if in == nil {
+		return nil
+	}
+	out := new(GCPAcceleratorConfig)
+	in.DeepCopyInto(out)
+	return out
+}
+
 // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil.
 func (in *GCPDisk) DeepCopyInto(out *GCPDisk) {
 	*out = *in
@@ -176,6 +191,22 @@ func (in *GCPMachineProviderSpec) DeepCopyInto(out *GCPMachineProviderSpec) {
 		*out = make([]string, len(*in))
 		copy(*out, *in)
 	}
+	if in.GuestAccelerators != nil {
+		in, out := &in.GuestAccelerators, &out.GuestAccelerators
+		*out = make([]*GCPAcceleratorConfig, len(*in))
+		for i := range *in {
+			if (*in)[i] != nil {
+				in, out := &(*in)[i], &(*out)[i]
+				*out = new(GCPAcceleratorConfig)
+				**out = **in
+			}
+		}
+	}
+	if in.AutomaticRestart != nil {
+		in, out := &in.AutomaticRestart, &out.AutomaticRestart
+		*out = new(bool)
+		**out = **in
+	}
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new GCPMachineProviderSpec.
diff --git a/pkg/cloud/gcp/actuators/machine/reconciler.go b/pkg/cloud/gcp/actuators/machine/reconciler.go
index fb5531367..3a6171a2c 100644
--- a/pkg/cloud/gcp/actuators/machine/reconciler.go
+++ b/pkg/cloud/gcp/actuators/machine/reconciler.go
@@ -3,10 +3,9 @@ package machine
 import (
 	"context"
 	"fmt"
-	"time"
-
 	"strconv"
 	"strings"
+	"time"
 
 	"github.com/openshift/cluster-api-provider-gcp/pkg/apis/gcpprovider/v1beta1"
 	machinev1 "github.com/openshift/machine-api-operator/pkg/apis/machine/v1beta1"
@@ -25,6 +24,8 @@ const (
 	requeueAfterSeconds = 20
 	instanceLinkFmt     = "https://www.googleapis.com/compute/v1/projects/%s/zones/%s/instances/%s"
 	kmsKeyNameFmt       = "projects/%s/locations/%s/keyRings/%s/cryptoKeys/%s"
+	machineTypeFmt      = "zones/%s/machineTypes/%s"
+	acceleratorTypeFmt  = "zones/%s/acceleratorTypes/%s"
 )
 
 // Reconciler are list of services required by machine actuator, easy to create a fake
@@ -39,6 +40,106 @@ func newReconciler(scope *machineScope) *Reconciler {
 	}
 }
 
+var (
+	// supportedGpuTypes maps a GCP accelerator type name to the regional quota metric that limits it.
+	supportedGpuTypes = map[string]string{
+		"nvidia-tesla-k80":  "NVIDIA_K80_GPUS",
+		"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
+		"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
+		"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
+		"nvidia-tesla-p4":   "NVIDIA_P4_GPUS",
+		"nvidia-tesla-t4":   "NVIDIA_T4_GPUS",
+	}
+)
+
+// containsString reports whether str is an element of sli.
+func containsString(sli []string, str string) bool {
+	for _, elem := range sli {
+		if elem == str {
+			return true
+		}
+	}
+	return false
+}
+
+// checkQuota verifies that every accelerator to be attached is offered in the providerSpec zone and
+// that the regional quota for its type has room for the requested count.
+//
+// machineTypeAcceleratorCount represents nvidia-tesla-A100 GPUs which are only compatible with A2 machine family;
+// when it is 0 the providerSpec GuestAccelerators are checked instead.
+func (r *Reconciler) checkQuota(machineTypeAcceleratorCount int64) error {
+	region, err := r.computeService.RegionGet(r.projectID, r.providerSpec.Region)
+	if err != nil {
+		return fmt.Errorf("failed to get region via compute service: %v", err)
+	}
+	// When the machine type has associated accelerator instances (A2 machine family), accelerators will be nvidia-tesla-A100s.
+	// Additional guest accelerators are not allowed so ignore the providerSpec GuestAccelerators.
+	guestAccelerators := r.providerSpec.GuestAccelerators
+	if machineTypeAcceleratorCount != 0 {
+		guestAccelerators = []*v1beta1.GCPAcceleratorConfig{{AcceleratorType: "nvidia-tesla-a100", AcceleratorCount: machineTypeAcceleratorCount}}
+	}
+	// validate zone and then quota
+	for _, elem := range guestAccelerators {
+		if _, err := r.computeService.AcceleratorTypesList(r.projectID, r.providerSpec.Zone, elem.AcceleratorType); err != nil {
+			return fmt.Errorf("AcceleratorType not available in the zone: %v", err)
+		}
+		metric := supportedGpuTypes[elem.AcceleratorType]
+		if metric == "" {
+			return machinecontroller.InvalidMachineConfiguration("Unsupported accelerator type")
+		}
+		// preemptible instances have separate quota
+		if r.providerSpec.Preemptible {
+			metric = "PREEMPTIBLE_" + metric
+		}
+		// check quota for GA; found stays false when the metric is absent, including when the region reports no quotas at all
+		found := false
+		for _, q := range region.Quotas {
+			if q.Metric != metric {
+				continue
+			}
+			found = true
+			if int64(q.Usage)+elem.AcceleratorCount > int64(q.Limit) {
+				return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("Quota exceeded. Metric: %s. Usage: %v. Limit: %v.", metric, q.Usage, q.Limit))
+			}
+			break
+		}
+		if !found {
+			return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("No quota found. Metric: %s.", metric))
+		}
+	}
+	return nil
+}
+
+// validateGuestAccelerators checks the accelerator settings of the providerSpec. Nothing is checked unless
+// GuestAccelerators are requested or the machine type has the "a2-" prefix; in those cases the machine type
+// must be a known A2 or N1 type in the zone and the accelerators must pass checkQuota.
+func (r *Reconciler) validateGuestAccelerators() error {
+	// entries are pointers, so reject nil ones here before anything dereferences them
+	for _, ga := range r.providerSpec.GuestAccelerators {
+		if ga == nil {
+			return machinecontroller.InvalidMachineConfiguration("GuestAccelerators must not contain empty entries")
+		}
+	}
+	machineType := r.providerSpec.MachineType
+	if len(r.providerSpec.GuestAccelerators) == 0 && !strings.HasPrefix(machineType, "a2-") {
+		// no accelerators to validate so return nil
+		return nil
+	}
+	// checkQuota queries with r.projectID; use the same project here because providerSpec.ProjectID is optional
+	a2MachineFamily, n1MachineFamily := r.computeService.GPUCompatibleMachineTypesList(r.projectID, r.providerSpec.Zone, r.Context)
+	switch {
+	case a2MachineFamily[machineType] != 0:
+		// a2 family machine - has fixed type and count of GPUs
+		return r.checkQuota(a2MachineFamily[machineType])
+	case containsString(n1MachineFamily, machineType):
+		// n1 family machine
+		return r.checkQuota(0)
+	default:
+		// any other machine type
+		return machinecontroller.InvalidMachineConfiguration(fmt.Sprintf("MachineType %s does not support accelerators. Only A2 and N1 machine type families support guest acceleartors.", machineType))
+	}
+}
+
 // Create creates machine if and only if machine exists, handled by cluster-api
 func (r *Reconciler) create() error {
 	if err := validateMachine(*r.machine, *r.providerSpec); err != nil {
@@ -50,16 +151,32 @@ func (r *Reconciler) create() error {
 		CanIpForward:       r.providerSpec.CanIPForward,
 		DeletionProtection: r.providerSpec.DeletionProtection,
 		Labels:             r.providerSpec.Labels,
-		MachineType:        fmt.Sprintf("zones/%s/machineTypes/%s", zone, r.providerSpec.MachineType),
+		MachineType:        fmt.Sprintf(machineTypeFmt, zone, r.providerSpec.MachineType),
 		Name:               r.machine.Name,
 		Tags: &compute.Tags{
 			Items: r.providerSpec.Tags,
 		},
 		Scheduling: &compute.Scheduling{
-			Preemptible: r.providerSpec.Preemptible,
+			Preemptible:       r.providerSpec.Preemptible,
+			AutomaticRestart:  r.providerSpec.AutomaticRestart,
+			OnHostMaintenance: r.providerSpec.OnHostMaintenance,
 		},
 	}
 
+	// validate first: the loop below dereferences every GuestAccelerators entry
+	if err := r.validateGuestAccelerators(); err != nil {
+		return err
+	}
+
+	guestAccelerators := make([]*compute.AcceleratorConfig, 0, len(r.providerSpec.GuestAccelerators))
+	for _, ga := range r.providerSpec.GuestAccelerators {
+		guestAccelerators = append(guestAccelerators, &compute.AcceleratorConfig{
+			AcceleratorType:  fmt.Sprintf(acceleratorTypeFmt, zone, ga.AcceleratorType),
+			AcceleratorCount: ga.AcceleratorCount,
+		})
+	}
+	instance.GuestAccelerators = guestAccelerators
+
 	if instance.Labels == nil {
 		instance.Labels = map[string]string{}
 	}
diff --git a/pkg/cloud/gcp/actuators/services/compute/computeservice.go b/pkg/cloud/gcp/actuators/services/compute/computeservice.go
index 8abcbb130..3387657f4 100644
--- a/pkg/cloud/gcp/actuators/services/compute/computeservice.go
+++ b/pkg/cloud/gcp/actuators/services/compute/computeservice.go
@@ -1,6 +1,10 @@
 package computeservice
 
 import (
+	"context"
+	"log"
+	"strings"
+
 	"github.com/openshift/cluster-api-provider-gcp/pkg/cloud/gcp/actuators/util"
 	"github.com/openshift/cluster-api-provider-gcp/pkg/version"
 	"google.golang.org/api/compute/v1"
@@ -19,6 +23,9 @@ type GCPComputeService interface {
 	TargetPoolsAddInstance(project string, region string, name string, instance string) (*compute.Operation, error)
 	TargetPoolsRemoveInstance(project string, region string, name string, instance string) (*compute.Operation, error)
 	MachineTypesGet(project string, machineType string, zone string) (*compute.MachineType, error)
+	RegionGet(project string, region string) (*compute.Region, error)
+	GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string)
+	AcceleratorTypesList(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error)
 }
 
 type computeService struct {
@@ -101,3 +108,41 @@ func (c *computeService) TargetPoolsRemoveInstance(project string, region string
 func (c *computeService) MachineTypesGet(project string, zone string, machineType string) (*compute.MachineType, error) {
 	return c.service.MachineTypes.Get(project, zone, machineType).Do()
 }
+
+// GPUCompatibleMachineTypesList lists the machine types available in the zone and returns a map of
+// "a2"-prefixed machine type names to their accelerator count and a slice of "n1"-prefixed machine type names.
+func (c *computeService) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
+	req := c.service.MachineTypes.List(project, zone)
+	var (
+		a2MachineFamily = map[string]int64{}
+		n1MachineFamily []string
+	)
+	if err := req.Pages(ctx, func(page *compute.MachineTypeList) error {
+		for _, machineType := range page.Items {
+			if strings.HasPrefix(machineType.Name, "a2") {
+				// Accelerators comes from the API response; only index it when it is populated
+				if len(machineType.Accelerators) > 0 {
+					a2MachineFamily[machineType.Name] = machineType.Accelerators[0].GuestAcceleratorCount
+				}
+			} else if strings.HasPrefix(machineType.Name, "n1") {
+				n1MachineFamily = append(n1MachineFamily, machineType.Name)
+			}
+		}
+		return nil
+	}); err != nil {
+		// NOTE(review): log.Fatal exits the process; reporting the failure to the caller instead
+		// requires adding an error result to this method, the interface and its implementations.
+		log.Fatal(err)
+	}
+	return a2MachineFamily, n1MachineFamily
+}
+
+// AcceleratorTypesList fetches the named accelerator type in the zone with a single Get call.
+func (c *computeService) AcceleratorTypesList(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
+	return c.service.AcceleratorTypes.Get(project, zone, acceleratorType).Do()
+}
+
+// RegionGet fetches the named region of the project.
+func (c *computeService) RegionGet(project string, region string) (*compute.Region, error) {
+	return c.service.Regions.Get(project, region).Do()
+}
diff --git a/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go b/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
index a0993d826..a314b4cf7 100644
--- a/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
+++ b/pkg/cloud/gcp/actuators/services/compute/computeservice_mock.go
@@ -1,6 +1,8 @@
 package computeservice
 
 import (
+	"context"
+
 	compute "google.golang.org/api/compute/v1"
 	"google.golang.org/api/googleapi"
 )
@@ -129,3 +131,18 @@ func MockBuilderFuncTypeNotFound(serviceAccountJSON string) (GCPComputeService,
 	}
 	return computeSvc, nil
 }
+
+// RegionGet is a stub that returns no region and no error.
+func (c *GCPComputeServiceMock) RegionGet(project string, region string) (*compute.Region, error) {
+	return nil, nil
+}
+
+// GPUCompatibleMachineTypesList is a stub that reports no A2 and no N1 machine types.
+func (c *GCPComputeServiceMock) GPUCompatibleMachineTypesList(project string, zone string, ctx context.Context) (map[string]int64, []string) {
+	return nil, nil
+}
+
+// AcceleratorTypesList is a stub that returns no accelerator type and no error.
+func (c *GCPComputeServiceMock) AcceleratorTypesList(project string, zone string, acceleratorType string) (*compute.AcceleratorType, error) {
+	return nil, nil
+}