Skip to content

Commit

Permalink
add supported gpu types
Browse files Browse the repository at this point in the history
  • Loading branch information
SubhasmitaSw committed Jul 15, 2022
1 parent 763acc0 commit 0ec3e07
Show file tree
Hide file tree
Showing 5 changed files with 36 additions and 9 deletions.
16 changes: 8 additions & 8 deletions api/v1beta1/gcpmachine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,16 +66,16 @@ const (
IPForwardingDisabled IPForwarding = "Disabled"
)

// AcceleratorConfig is the GPU accelerator configuration for the GCP machine.
type AcceleratorConfig struct {
	// AcceleratorType is the type of the GPU accelerator to be used for the GCP machine.
	// +required
	AcceleratorType string `json:"acceleratorType"`

	// AcceleratorCount is the number of accelerators to be used for the GCP machine.
	// Defaults to 1.
	// +optional
	AcceleratorCount int `json:"acceleratorCount,omitempty"`
}

// GCPMachineSpec defines the desired state of GCPMachine.
Expand All @@ -102,18 +102,18 @@ type GCPMachineSpec struct {
Image *string `json:"image,omitempty"`

// AcceleratorConfig is the accelerator configuration for the GCP machine.
AcceleratorConfig *GCPAcceleratorConfig `json:"acceleratorConfig,omitempty"`
AcceleratorConfig *AcceleratorConfig `json:"acceleratorConfig,omitempty"`

// OnHostMaintenance is the action to take when the host machine is being upgraded.
// It is either "TERMINATE" or "MIGRATE" depending on the machine type and preemptibility.
// Default=TERMINATE
// +optional
OnHostMaintenance string `json:"onHostMaintenance,omitempty"`

// AutomaticRestart is whether the instance should be automatically restarted if it is terminated by GCP.
// AutomaticRestart is whether the instance should be automatically restarted if it is terminated by GCP.
// Default=true
// +optional
AutomaticRestart bool `json:"automaticRestart,omitempty"`
AutomaticRestart *bool `json:"automaticRestart,omitempty"`

// AdditionalLabels is an optional set of tags to add to an instance, in addition to the ones added by default by the
// GCP provider. If both the GCPCluster and the GCPMachine specify the same tag name with different values, the
Expand All @@ -128,7 +128,7 @@ type GCPMachineSpec struct {
// +optional
AdditionalMetadata []MetadataItem `json:"additionalMetadata,omitempty"`

// BuildName is the name of the build to use for the GCP instance.
// BuildName is the name of the build to use for the GCP instance.
// +optional
BuildName string `json:"buildName,omitempty"`

Expand Down
10 changes: 10 additions & 0 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,16 @@ func (m *MachineScope) Name() string {
return m.GCPMachine.Name
}

// AcceleratorType returns the accelerator type configured on the GCPMachine.
// It returns the empty string when the machine spec carries no
// AcceleratorConfig, so it is safe to call for machines without accelerators.
func (m *MachineScope) AcceleratorType() string {
	cfg := m.GCPMachine.Spec.AcceleratorConfig
	if cfg == nil {
		// AcceleratorConfig is an optional pointer field; dereferencing it
		// unguarded would panic when no accelerator was requested.
		return ""
	}
	return cfg.AcceleratorType
}

// AcceleratorCount returns the number of accelerators configured on the
// GCPMachine. It returns 0 when the machine spec carries no
// AcceleratorConfig, so it is safe to call for machines without accelerators.
// The configured value is returned as-is; no default is applied here.
func (m *MachineScope) AcceleratorCount() int {
	cfg := m.GCPMachine.Spec.AcceleratorConfig
	if cfg == nil {
		// AcceleratorConfig is an optional pointer field; dereferencing it
		// unguarded would panic when no accelerator was requested.
		return 0
	}
	return cfg.AcceleratorCount
}

// Namespace returns the namespace name.
func (m *MachineScope) Namespace() string {
return m.GCPMachine.Namespace
Expand Down
2 changes: 2 additions & 0 deletions cloud/services/compute/instances/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ func (s *Service) Reconcile(ctx context.Context) error {
machineName := s.scope.Name()
zone := s.scope.Zone()
project := s.scope.Project()
acceleratorType := s.scope.AcceleratorType()
acceleratorCount := s.scope.AcceleratorCount()

// Since we don't know when the project was created, we must account for
// both types of internal-dns:
Expand Down
4 changes: 3 additions & 1 deletion cloud/services/compute/instances/service.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ type Scope interface {
InstanceSpec() *compute.Instance
InstanceImageSpec() *compute.AttachedDisk
InstanceAdditionalDiskSpec() []*compute.AttachedDisk
AcceleratorType() string
AcceleratorCount() int
}

// Service implements instances reconciler.
Expand All @@ -61,4 +63,4 @@ func New(scope Scope) *Service {
instances: scope.Cloud().Instances(),
instancegroups: scope.Cloud().InstanceGroups(),
}
}
}
13 changes: 13 additions & 0 deletions controllers/gcpmachine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,21 @@ type GCPMachineReconciler struct {
client.Client
ReconcileTimeout time.Duration
WatchFilterValue string
AccleratorConfig *infrav1.AcceleratorConfig
}

// gpuTypes enumerates the supported GPU accelerator types, keyed by the
// accelerator type name.
// NOTE(review): the values look like GCP quota metric names; confirm before
// relying on that interpretation.
var gpuTypes = map[string]string{
	"nvidia-tesla-a100": "NVIDIA_A100_GPUS",
	"nvidia-tesla-k80":  "NVIDIA_K80_GPUS",
	"nvidia-tesla-p100": "NVIDIA_P100_GPUS",
	"nvidia-tesla-p4":   "NVIDIA_P4_GPUS",
	"nvidia-tesla-t4":   "NVIDIA_T4_GPUS",
	"nvidia-tesla-v100": "NVIDIA_V100_GPUS",
}

// +kubebuilder:rbac:groups="",resources=events,verbs=get;list;watch;create;update;patch
// +kubebuilder:rbac:groups="",resources=secrets;,verbs=get;list;watch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch
Expand Down

0 comments on commit 0ec3e07

Please sign in to comment.