Add predicates gpu-number.
Signed-off-by: peiniliu <[email protected]>
peiniliu committed Jul 15, 2022
1 parent 9def57e commit 11b14a0
Showing 8 changed files with 313 additions and 38 deletions.
Binary file added docs/images/gpu-number.png
135 changes: 135 additions & 0 deletions docs/user-guide/how_to_use_gpu_number.md
@@ -0,0 +1,135 @@
# GPU Number User Guide

## Environment setup

### Install volcano


#### 1. Install from source

Refer to [Install Guide](../../installer/README.md) to install volcano.

After installation, update the scheduler configuration:

```shell script
kubectl edit cm -n volcano-system volcano-scheduler-configmap
```

```yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: volcano-scheduler-configmap
  namespace: volcano-system
data:
  volcano-scheduler.conf: |
    actions: "enqueue, allocate, backfill"
    tiers:
    - plugins:
      - name: priority
      - name: gang
      - name: conformance
    - plugins:
      - name: drf
      - name: predicates
        arguments:
          predicate.GPUNumberEnable: true # enable gpu number
      - name: proportion
      - name: nodeorder
      - name: binpack
```

#### 2. Install from release package.
Same as above: after installation, update the scheduler configuration in the `volcano-scheduler-configmap` ConfigMap.

### Install Volcano device plugin

Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start)

* Remember to configure the volcano device plugin to support gpu-number: start the device plugin with `--gpu-strategy=number`. For more information, see [volcano device plugin configuration](https://github.com/peiniliu/devices/blob/dev/doc/config.md)

### Verify environment is ready

Check the node status; the environment is ready if `volcano.sh/gpu-number` is included in the allocatable resources.

```shell script
$ kubectl get node {node name} -oyaml
...
Capacity:
  attachable-volumes-gce-pd:  127
  cpu:                        2
  ephemeral-storage:          98868448Ki
  hugepages-1Gi:              0
  hugepages-2Mi:              0
  memory:                     7632596Ki
  pods:                       110
  volcano.sh/gpu-memory:      0
  volcano.sh/gpu-number:      1
Allocatable:
  attachable-volumes-gce-pd:  127
  cpu:                        1930m
  ephemeral-storage:          47093746742
  hugepages-1Gi:              0
  hugepages-2Mi:              0
  memory:                     5752532Ki
  pods:                       110
  volcano.sh/gpu-memory:      0
  volcano.sh/gpu-number:      1
```
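
The same check can be done programmatically by reading the node's allocatable resources. Below is a minimal sketch using client-go; the kubeconfig path and node name are placeholders, not values from this guide:

```go
package main

import (
    "context"
    "fmt"

    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/client-go/kubernetes"
    "k8s.io/client-go/tools/clientcmd"
)

func main() {
    // Build a client from a local kubeconfig (placeholder path).
    config, err := clientcmd.BuildConfigFromFlags("", "/root/.kube/config")
    if err != nil {
        panic(err)
    }
    clientset := kubernetes.NewForConfigOrDie(config)

    // Read the node and print the allocatable GPU number reported by the device plugin.
    node, err := clientset.CoreV1().Nodes().Get(context.TODO(), "your-node-name", metav1.GetOptions{})
    if err != nil {
        panic(err)
    }
    gpuNumber := node.Status.Allocatable["volcano.sh/gpu-number"]
    fmt.Printf("allocatable volcano.sh/gpu-number: %s\n", gpuNumber.String())
}
```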

### Running Jobs With Multiple GPU Cards

Jobs can request multiple exclusive NVIDIA GPU cards by defining the container-level resource requirement `volcano.sh/gpu-number`:
```shell script
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod1
spec:
  containers:
    - name: cuda-container
      image: nvidia/cuda:9.0-devel
      command: ["sleep"]
      args: ["100000"]
      resources:
        limits:
          volcano.sh/gpu-number: 1 # requesting 1 GPU card
EOF
```
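
For reference, the scheduler derives the requested card count by summing the `volcano.sh/gpu-number` limits over all containers in the pod (the `GetGPUNumberOfPod` helper added in this commit). A minimal stand-alone sketch of the same calculation, with the resource name written out instead of the scheduler's internal constant:

```go
package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

// gpuNumberOfPod mirrors the scheduler's GetGPUNumberOfPod helper:
// it sums the volcano.sh/gpu-number limits of all containers.
func gpuNumberOfPod(pod *v1.Pod) int {
    var gpus int
    for _, c := range pod.Spec.Containers {
        if val, ok := c.Resources.Limits["volcano.sh/gpu-number"]; ok {
            gpus += int(val.Value())
        }
    }
    return gpus
}

func main() {
    pod := &v1.Pod{
        Spec: v1.PodSpec{
            Containers: []v1.Container{{
                Name: "cuda-container",
                Resources: v1.ResourceRequirements{
                    Limits: v1.ResourceList{
                        "volcano.sh/gpu-number": resource.MustParse("1"),
                    },
                },
            }},
        },
    }
    fmt.Println(gpuNumberOfPod(pod)) // 1
}
```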

If the above pod claims GPU cards, you can see that it has been granted exclusive access to the allocated cards:

```shell script
$ kubectl exec -ti gpu-pod1 env
...
NVIDIA_VISIBLE_DEVICES=0
VOLCANO_GPU_MEMORY_TOTAL=15109
VOLCANO_GPU_ALLOCATED=1
...
```
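
Workloads can read the same environment variables at startup to discover which cards they were given. A minimal sketch; the variable names are taken from the output above:

```go
package main

import (
    "fmt"
    "os"
    "strings"
)

func main() {
    // NVIDIA_VISIBLE_DEVICES holds the comma-separated indices of the
    // exclusively allocated GPU cards, e.g. "0" or "0,1".
    devices := strings.Split(os.Getenv("NVIDIA_VISIBLE_DEVICES"), ",")
    fmt.Printf("allocated %s card(s): %v\n", os.Getenv("VOLCANO_GPU_ALLOCATED"), devices)
    fmt.Printf("total memory of one card: %s\n", os.Getenv("VOLCANO_GPU_MEMORY_TOTAL"))
}
```
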
### Understanding How the Multiple GPU Cards Requirement Works

The overall architecture is similar to that of GPU sharing, but the gpu-index result for each pod is a list of GPU card indices rather than a single index.

![gpu_number](../images/gpu-number.png)

1. Create a pod with a `volcano.sh/gpu-number` resource request.

2. The volcano scheduler runs its predicates, allocates GPU cards to the pod, and adds the annotations below (a short parsing sketch follows this list).

```yaml
annotations:
  volcano.sh/gpu-index: "0"
  volcano.sh/predicate-time: "1593764466550835304"
```

3. The kubelet watches pods bound to its node and calls the device plugin's Allocate API to set the environment variables before starting the container.

```yaml
env:
  NVIDIA_VISIBLE_DEVICES: "0" # GPU card index
  VOLCANO_GPU_ALLOCATED: "1" # GPU number allocated
  VOLCANO_GPU_MEMORY_TOTAL: "15109" # GPU memory of the card
```
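
When a pod is allocated several cards, the `volcano.sh/gpu-index` annotation carries a comma-separated list such as "0,1". A minimal sketch of turning such a value back into card indices, mirroring the `GetGPUIndex` change in this commit:

```go
package main

import (
    "fmt"
    "strconv"
    "strings"
)

// parseGPUIndex converts a volcano.sh/gpu-index annotation value into a
// slice of card indices, e.g. "0,1" -> [0 1].
func parseGPUIndex(value string) ([]int, error) {
    parts := strings.Split(value, ",")
    ids := make([]int, 0, len(parts))
    for _, p := range parts {
        id, err := strconv.Atoi(p)
        if err != nil {
            return nil, fmt.Errorf("invalid gpu index %q: %w", p, err)
        }
        ids = append(ids, id)
    }
    return ids, nil
}

func main() {
    ids, err := parseGPUIndex("0,1")
    if err != nil {
        panic(err)
    }
    fmt.Println(ids) // [0 1]
}
```
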
8 changes: 5 additions & 3 deletions docs/user-guide/how_to_use_gpu_sharing.md
@@ -46,6 +46,8 @@ Same as above, after installed, update the scheduler configuration in `volcano-s

Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start)

* By default the volcano device plugin supports shared GPUs, so users do not need any extra configuration; the default is the same as setting `--gpu-strategy=number`. For more information, see [volcano device plugin configuration](https://github.com/peiniliu/devices/blob/dev/doc/config.md)

### Verify environment is ready

Check the node status, it is ok if `volcano.sh/gpu-memory` and `volcano.sh/gpu-number` are included in the allocatable resources.
@@ -124,14 +126,14 @@ If only the above pods are claiming gpu resource in a cluster, you can see the p
```shell script
$ kubectl exec -ti gpu-pod1 env
...
VOLCANO_GPU_TOTAL=11178
VOLCANO_GPU_MEMORY_TOTAL=11178
VOLCANO_GPU_ALLOCATED=1024
NVIDIA_VISIBLE_DEVICES=0
...
$ kubectl exec -ti gpu-pod1 env
...
VOLCANO_GPU_TOTAL=11178
VOLCANO_GPU_MEMORY_TOTAL=11178
VOLCANO_GPU_ALLOCATED=1024
NVIDIA_VISIBLE_DEVICES=0
...
@@ -159,5 +161,5 @@ annotations:
env:
NVIDIA_VISIBLE_DEVICES: "0" # GPU card index
VOLCANO_GPU_ALLOCATED: "1024" # GPU allocated
VOLCANO_GPU_TOTAL: "11178" # GPU memory of the card
VOLCANO_GPU_MEMORY_TOTAL: "11178" # GPU memory of the card
```
23 changes: 23 additions & 0 deletions pkg/scheduler/api/device_info.go
@@ -53,6 +53,11 @@ func (g *GPUDevice) getUsedGPUMemory() uint {
    return res
}

// isIdleGPU check if the device is idled.
func (g *GPUDevice) isIdleGPU() bool {
    return g.PodMap == nil || len(g.PodMap) == 0
}

// GetGPUResourceOfPod returns the GPU resource required by the pod.
func GetGPUResourceOfPod(pod *v1.Pod) uint {
    var mem uint
@@ -70,3 +75,21 @@ func getGPUResourceOfContainer(container *v1.Container) uint {
    }
    return mem
}

// GetGPUNumberOfPod returns the number of GPUs required by the pod.
func GetGPUNumberOfPod(pod *v1.Pod) int {
    var gpus int
    for _, container := range pod.Spec.Containers {
        gpus += getGPUNumberOfContainer(&container)
    }
    return gpus
}

// getGPUNumberOfContainer returns the number of GPUs required by the container.
func getGPUNumberOfContainer(container *v1.Container) int {
    var gpus int
    if val, ok := container.Resources.Limits[VolcanoGPUNumber]; ok {
        gpus = int(val.Value())
    }
    return gpus
}
27 changes: 21 additions & 6 deletions pkg/scheduler/api/node_info.go
@@ -565,13 +565,26 @@ func (ni *NodeInfo) getDevicesAllGPUMemory() map[int]uint {
    return res
}

// GetDevicesIdleGPU returns all the idle gpu card.
func (ni *NodeInfo) GetDevicesIdleGPUs() []int {
    res := []int{}
    for _, device := range ni.GPUDevices {
        if device.isIdleGPU() {
            res = append(res, device.ID)
        }
    }
    return res
}

// AddGPUResource adds the pod to GPU pool if it is assigned
func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
    gpuRes := GetGPUResourceOfPod(pod)
    if gpuRes > 0 {
        id := GetGPUIndex(pod)
        if dev := ni.GPUDevices[id]; dev != nil {
            dev.PodMap[string(pod.UID)] = pod
        ids := GetGPUIndex(pod)
        for _, id := range ids {
            if dev := ni.GPUDevices[id]; dev != nil {
                dev.PodMap[string(pod.UID)] = pod
            }
        }
    }
}
@@ -580,9 +593,11 @@ func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
    gpuRes := GetGPUResourceOfPod(pod)
    if gpuRes > 0 {
        id := GetGPUIndex(pod)
        if dev := ni.GPUDevices[id]; dev != nil {
            delete(dev.PodMap, string(pod.UID))
        ids := GetGPUIndex(pod)
        for _, id := range ids {
            if dev := ni.GPUDevices[id]; dev != nil {
                delete(dev.PodMap, string(pod.UID))
            }
        }
    }
}
30 changes: 20 additions & 10 deletions pkg/scheduler/api/pod_info.go
@@ -160,20 +160,29 @@ func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
}

// GetGPUIndex returns the ID of the GPU
func GetGPUIndex(pod *v1.Pod) int {
//return the gpu index list
func GetGPUIndex(pod *v1.Pod) []int {
    if len(pod.Annotations) > 0 {
        value, found := pod.Annotations[GPUIndex]
        if found {
            id, err := strconv.Atoi(value)
            if err != nil {
                klog.Errorf("invalid %s=%s", GPUIndex, value)
                return -1
            ids := strings.Split(value, ",")
            if len(ids) == 0 {
                klog.Errorf("invalid gpu index annotation %s=%s", GPUIndex, value)
            }
            idSlice := make([]int, len(ids))
            for idx, id := range ids {
                j, err := strconv.Atoi(id)
                if err != nil {
                    klog.Errorf("invalid %s=%s", GPUIndex, value)
                    return nil
                }
                idSlice[idx] = j
            }
            return id
            return idSlice
        }
    }

    return -1
    return nil
}

func escapeJSONPointer(p string) string {
@@ -184,11 +193,12 @@ func escapeJSONPointer(p string) string {
}

// AddGPUIndexPatch returns the patch adding GPU index
func AddGPUIndexPatch(id int) string {
func AddGPUIndexPatch(ids []int) string {
    idsstring := strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]")
    return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
        `{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
        `{"op": "add", "path": "/metadata/annotations/%s", "value": "%s"}]`,
        escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
        escapeJSONPointer(GPUIndex), id)
        escapeJSONPointer(GPUIndex), idsstring)
}

// RemoveGPUIndexPatch returns the patch removing GPU index
51 changes: 44 additions & 7 deletions pkg/scheduler/plugins/predicates/gpu.go
@@ -20,37 +20,74 @@ import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/klog"

    "volcano.sh/volcano/pkg/scheduler/api"
)

// checkNodeGPUSharingPredicate checks if a gpu sharing pod can be scheduled on a node.
// checkNodeGPUSharingPredicate checks if a pod with gpu requirement can be scheduled on a node.
func checkNodeGPUSharingPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
    // no gpu sharing request
    if api.GetGPUResourceOfPod(pod) <= 0 {
        return true, nil
    }
    ids := predicateGPUbyMemory(pod, nodeInfo)
    if ids == nil {
        return false, fmt.Errorf("no enough gpu memory on node %s", nodeInfo.Name)
    }
    return true, nil
}

    id := predicateGPU(pod, nodeInfo)
    if id < 0 {
        return false, fmt.Errorf("no enough gpu memory on single device of node %s", nodeInfo.Name)
func checkNodeGPUNumberPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
    //no gpu number request
    if api.GetGPUNumberOfPod(pod) <= 0 {
        return true, nil
    }
    ids := predicateGPUbyNumber(pod, nodeInfo)
    if ids == nil {
        return false, fmt.Errorf("no enough gpu number on node %s", nodeInfo.Name)
    }
    return true, nil
}

// predicateGPU returns the available GPU ID
func predicateGPU(pod *v1.Pod, node *api.NodeInfo) int {
func predicateGPUbyMemory(pod *v1.Pod, node *api.NodeInfo) []int {
    gpuRequest := api.GetGPUResourceOfPod(pod)
    allocatableGPUs := node.GetDevicesIdleGPUMemory()

    var devIDs []int

    for devID := 0; devID < len(allocatableGPUs); devID++ {
        availableGPU, ok := allocatableGPUs[devID]
        if ok {
            if availableGPU >= gpuRequest {
                return devID
                devIDs = append(devIDs, devID)
                return devIDs
            }
        }
    }

    return -1
    return nil
}

// predicateGPU returns the available GPU IDs
func predicateGPUbyNumber(pod *v1.Pod, node *api.NodeInfo) []int {
    gpuRequest := api.GetGPUNumberOfPod(pod)
    allocatableGPUs := node.GetDevicesIdleGPUs()

    var devIDs []int

    if len(allocatableGPUs) < gpuRequest {
        klog.Errorf("Not enough gpu cards")
        return nil
    }

    for devID := 0; devID < len(allocatableGPUs); devID++ {
        devIDs = append(devIDs, allocatableGPUs[devID])
        if len(devIDs) == gpuRequest {
            return devIDs
        }
    }

    return nil
}
