From 0ec3e079013998b1cbf90082b255d18d8cf4e6f5 Mon Sep 17 00:00:00 2001 From: Subhasmita Swain Date: Thu, 14 Jul 2022 12:41:49 +0530 Subject: [PATCH] add supported gpu types --- api/v1beta1/gcpmachine_types.go | 16 ++++++++-------- cloud/scope/machine.go | 10 ++++++++++ cloud/services/compute/instances/reconcile.go | 2 ++ cloud/services/compute/instances/service.go | 4 +++- controllers/gcpmachine_controller.go | 13 +++++++++++++ 5 files changed, 36 insertions(+), 9 deletions(-) diff --git a/api/v1beta1/gcpmachine_types.go b/api/v1beta1/gcpmachine_types.go index de1158b08a..e4d7d3c734 100644 --- a/api/v1beta1/gcpmachine_types.go +++ b/api/v1beta1/gcpmachine_types.go @@ -66,16 +66,16 @@ const ( IPForwardingDisabled IPForwarding = "Disabled" ) -// GCPAccleratorConfig represents the GPU accelerator configuration for the GCP machine. -type GCPAcceleratorConfig struct { - // AccerleratorType is the type of the GPU accelerator to be used for the GCP machine. +// AcceleratorConfig is the GPU accelerator configuration for the GCP machine. +type AcceleratorConfig struct { + // AccerleratorType is the type of the GPU accelerator to be used for the GCP machine. // +required AcceleratorType string `json:"acceleratorType"` // AcceleratorCount is the number of accelerators to be used for the GCP machine. // Defaults to 1. // +optional - AcceleratorCount int64 `json:"acceleratorCount,omitempty"` + AcceleratorCount int `json:"acceleratorCount,omitempty"` } // GCPMachineSpec defines the desired state of GCPMachine. @@ -102,7 +102,7 @@ type GCPMachineSpec struct { Image *string `json:"image,omitempty"` // AcceleratorConfig is the accelerator configuration for the GCP machine. - AcceleratorConfig *GCPAcceleratorConfig `json:"acceleratorConfig,omitempty"` + AcceleratorConfig *AcceleratorConfig `json:"acceleratorConfig,omitempty"` // OnHostMaintenance is the action to take when the host machine is being upgraded. // It is either "TERMINATE" or "MIGRATE" depending on the machine type and preemptibility. @@ -110,10 +110,10 @@ type GCPMachineSpec struct { // +optional OnHostMaintenance string `json:"onHostMaintenance,omitempty"` - // AutomaticRestart is whether the instance should be automatically restarted if it is terminated by GCP. + // AutomaticRestart is whether the instance should be automatically restarted if it is terminated by GCP. // Default=true // +optional - AutomaticRestart bool `json:"automaticRestart,omitempty"` + AutomaticRestart *bool `json:"automaticRestart,omitempty"` // AdditionalLabels is an optional set of tags to add to an instance, in addition to the ones added by default by the // GCP provider. If both the GCPCluster and the GCPMachine specify the same tag name with different values, the @@ -128,7 +128,7 @@ type GCPMachineSpec struct { // +optional AdditionalMetadata []MetadataItem `json:"additionalMetadata,omitempty"` - // BuildName is the name of the build to use for the GCP instance. + // BuildName is the name of the build to use for the GCP instance. // +optional BuildName string `json:"buildName,omitempty"` diff --git a/cloud/scope/machine.go b/cloud/scope/machine.go index 19aa4f27a5..4e9321a0ad 100644 --- a/cloud/scope/machine.go +++ b/cloud/scope/machine.go @@ -115,6 +115,16 @@ func (m *MachineScope) Name() string { return m.GCPMachine.Name } +// AcceleratorType returns the AcceleratorType from the GCPMachine. +func (m *MachineScope) AcceleratorType() string { + return m.GCPMachine.Spec.AcceleratorConfig.AcceleratorType +} + +// AcceleratorCount returns the number of accelerators for the machine. +func (m *MachineScope) AcceleratorCount() int { + return m.GCPMachine.Spec.AcceleratorConfig.AcceleratorCount +} + // Namespace returns the namespace name. func (m *MachineScope) Namespace() string { return m.GCPMachine.Namespace diff --git a/cloud/services/compute/instances/reconcile.go b/cloud/services/compute/instances/reconcile.go index e0a6bdd351..88832ed414 100644 --- a/cloud/services/compute/instances/reconcile.go +++ b/cloud/services/compute/instances/reconcile.go @@ -59,6 +59,8 @@ func (s *Service) Reconcile(ctx context.Context) error { machineName := s.scope.Name() zone := s.scope.Zone() project := s.scope.Project() + acceleratorType := s.scope.AcceleratorType() + acceleratorCount := s.scope.AcceleratorCount() // Since we don't know when the project was created, we must account for // both types of internal-dns: diff --git a/cloud/services/compute/instances/service.go b/cloud/services/compute/instances/service.go index 942bad46c7..0a6aadc6a6 100644 --- a/cloud/services/compute/instances/service.go +++ b/cloud/services/compute/instances/service.go @@ -43,6 +43,8 @@ type Scope interface { InstanceSpec() *compute.Instance InstanceImageSpec() *compute.AttachedDisk InstanceAdditionalDiskSpec() []*compute.AttachedDisk + AcceleratorType() string + AcceleratorCount() int } // Service implements instances reconciler. @@ -61,4 +63,4 @@ func New(scope Scope) *Service { instances: scope.Cloud().Instances(), instancegroups: scope.Cloud().InstanceGroups(), } -} +} \ No newline at end of file diff --git a/controllers/gcpmachine_controller.go b/controllers/gcpmachine_controller.go index 0b220bf726..2a0920fdec 100644 --- a/controllers/gcpmachine_controller.go +++ b/controllers/gcpmachine_controller.go @@ -47,8 +47,21 @@ type GCPMachineReconciler struct { client.Client ReconcileTimeout time.Duration WatchFilterValue string + AccleratorConfig *infrav1.AcceleratorConfig } +// add the supported gpu types to the machine type +var ( + gpuTypes = map[string]string{ + "nvidia-tesla-k80": "NVIDIA_K80_GPUS", + "nvidia-tesla-p100": "NVIDIA_P100_GPUS", + "nvidia-tesla-v100": "NVIDIA_V100_GPUS", + "nvidia-tesla-a100": "NVIDIA_A100_GPUS", + "nvidia-tesla-p4": "NVIDIA_P4_GPUS", + "nvidia-tesla-t4": "NVIDIA_T4_GPUS", + } +) + // +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch // +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch // +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch