
Commit

Merge pull request #1692 from peiniliu/dev
Add GPU Numbers Predicates
volcano-sh-bot authored Aug 15, 2022
2 parents 3c3d4fc + 05382d1 commit aa84a7d
Showing 9 changed files with 325 additions and 52 deletions.
Binary file added docs/images/gpu-number.png
133 changes: 133 additions & 0 deletions docs/user-guide/how_to_use_gpu_number.md
@@ -0,0 +1,133 @@
# GPU Number User guide

## Environment setup

### Install volcano


#### 1. Install from source

Refer to [Install Guide](../../installer/README.md) to install volcano.

After installation, update the scheduler configuration:

```shell script
kubectl edit cm -n volcano-system volcano-scheduler-configmap
```

```yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: volcano-scheduler-configmap
  namespace: volcano-system
data:
  volcano-scheduler.conf: |
    actions: "enqueue, allocate, backfill"
    tiers:
    - plugins:
      - name: priority
      - name: gang
      - name: conformance
    - plugins:
      - name: drf
      - name: predicates
        arguments:
          predicate.GPUNumberEnable: true # enable gpu number
      - name: proportion
      - name: nodeorder
      - name: binpack
```
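
For illustration only, the sketch below shows how a scheduler plugin could interpret a boolean argument such as `predicate.GPUNumberEnable` from its arguments map. It is not the actual volcano predicates plugin code, and the `parseBoolArg` helper is hypothetical.

```go
package main

import (
	"fmt"
	"strconv"
)

// parseBoolArg is a hypothetical helper: it reads a boolean plugin argument
// (such as "predicate.GPUNumberEnable") from a generic arguments map and
// falls back to a default when the key is missing or malformed.
func parseBoolArg(args map[string]interface{}, key string, defaultValue bool) bool {
	raw, ok := args[key]
	if !ok {
		return defaultValue
	}
	switch v := raw.(type) {
	case bool:
		return v
	case string:
		parsed, err := strconv.ParseBool(v)
		if err != nil {
			return defaultValue
		}
		return parsed
	default:
		return defaultValue
	}
}

func main() {
	// Arguments as they might be decoded from the scheduler ConfigMap above.
	args := map[string]interface{}{
		"predicate.GPUNumberEnable": true,
	}
	gpuNumberEnable := parseBoolArg(args, "predicate.GPUNumberEnable", false)
	fmt.Println("GPU number predicate enabled:", gpuNumberEnable) // prints: true
}
```
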
#### 2. Install from release package

Same as above: after installation, update the scheduler configuration in the `volcano-scheduler-configmap` ConfigMap.

### Install Volcano device plugin

Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start)

* Remember to configure the volcano device plugin to support gpu-number: users need to start the device plugin with `--gpu-strategy=number`. For more information, see [volcano device plugin configuration](https://github.com/volcano-sh/devices/blob/master/doc/config.md).

### Verify environment is ready

Check the node status; the environment is ready if `volcano.sh/gpu-number` is included in the allocatable resources.

```shell script
$ kubectl get node {node name} -oyaml
...
Capacity:
  attachable-volumes-gce-pd:  127
  cpu:                        2
  ephemeral-storage:          98868448Ki
  hugepages-1Gi:              0
  hugepages-2Mi:              0
  memory:                     7632596Ki
  pods:                       110
  volcano.sh/gpu-memory:      0
  volcano.sh/gpu-number:      1
Allocatable:
  attachable-volumes-gce-pd:  127
  cpu:                        1930m
  ephemeral-storage:          47093746742
  hugepages-1Gi:              0
  hugepages-2Mi:              0
  memory:                     5752532Ki
  pods:                       110
  volcano.sh/gpu-memory:      0
  volcano.sh/gpu-number:      1
```

### Running Jobs With Multiple GPU Cards

Jobs can request multiple exclusive NVIDIA GPU cards by setting the container-level resource requirement `volcano.sh/gpu-number`:
```shell script
$ cat <<EOF | kubectl apply -f -
apiVersion: v1
kind: Pod
metadata:
  name: gpu-pod1
spec:
  containers:
    - name: cuda-container
      image: nvidia/cuda:9.0-devel
      command: ["sleep"]
      args: ["100000"]
      resources:
        limits:
          volcano.sh/gpu-number: 1 # requesting 1 GPU card
EOF
```

When pods claim GPU cards this way, you can see that each of them is granted exclusive GPU cards:

```shell script
$ kubectl exec -ti gpu-pod1 env
...
NVIDIA_VISIBLE_DEVICES=0
VOLCANO_GPU_ALLOCATED=1
...
```
### Understanding How the Multiple GPU Cards Requirement Works

The overall architecture is similar to the GPU sharing case, but the gpu-index result for each pod is a list of GPU card indices.

![gpu_number](../images/gpu-number.png)

1. Create a pod with a `volcano.sh/gpu-number` resource request.

2. The volcano scheduler runs its predicates and allocates GPU cards to the pod, adding annotations like the ones below:

```yaml
annotations:
  volcano.sh/gpu-index: "0"
  volcano.sh/predicate-time: "1593764466550835304"
```

3. The kubelet watches the pods bound to it and calls the device plugin's Allocate API to set the environment variables before running the container.

```yaml
env:
  NVIDIA_VISIBLE_DEVICES: "0" # GPU card index
  VOLCANO_GPU_ALLOCATED: "1" # GPU number allocated
```
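
To make steps 2 and 3 concrete, here is a minimal, self-contained Go sketch (an illustration, not the scheduler or device plugin code) that parses a hypothetical `volcano.sh/gpu-index` annotation value of `"0,1"` (a comma-separated list when more than one card is allocated) and prints the environment variables the container would see.

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

func main() {
	// Hypothetical annotation value for a pod that was allocated two GPU cards.
	gpuIndexAnnotation := "0,1"

	// Parse the comma-separated card indices, mirroring what GetGPUIndex does
	// in pkg/scheduler/api/pod_info.go.
	parts := strings.Split(gpuIndexAnnotation, ",")
	ids := make([]int, 0, len(parts))
	for _, p := range parts {
		id, err := strconv.Atoi(p)
		if err != nil {
			fmt.Printf("invalid gpu index %q: %v\n", p, err)
			return
		}
		ids = append(ids, id)
	}

	// The allocated cards are exposed to the container through environment
	// variables: NVIDIA_VISIBLE_DEVICES lists the card indices and
	// VOLCANO_GPU_ALLOCATED holds the number of cards.
	fmt.Println("NVIDIA_VISIBLE_DEVICES=" + gpuIndexAnnotation)
	fmt.Println("VOLCANO_GPU_ALLOCATED=" + strconv.Itoa(len(ids)))
}
```
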
8 changes: 5 additions & 3 deletions docs/user-guide/how_to_use_gpu_sharing.md
@@ -46,6 +46,8 @@ Same as above, after installed, update the scheduler configuration in `volcano-s

Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start)

* By default the volcano device plugin supports shared GPUs, so users do not need to configure it; the default setting is the same as setting `--gpu-strategy=number`. For more information, see [volcano device plugin configuration](https://github.com/volcano-sh/devices/blob/master/doc/config.md).

### Verify environment is ready

Check the node status, it is ok if `volcano.sh/gpu-memory` and `volcano.sh/gpu-number` are included in the allocatable resources.
@@ -124,14 +126,14 @@ If only the above pods are claiming gpu resource in a cluster, you can see the p
```shell script
$ kubectl exec -ti gpu-pod1 env
...
VOLCANO_GPU_TOTAL=11178
VOLCANO_GPU_MEMORY_TOTAL=11178
VOLCANO_GPU_ALLOCATED=1024
NVIDIA_VISIBLE_DEVICES=0
...
$ kubectl exec -ti gpu-pod1 env
...
VOLCANO_GPU_TOTAL=11178
VOLCANO_GPU_MEMORY_TOTAL=11178
VOLCANO_GPU_ALLOCATED=1024
NVIDIA_VISIBLE_DEVICES=0
...
@@ -159,5 +161,5 @@ annotations:
env:
NVIDIA_VISIBLE_DEVICES: "0" # GPU card index
VOLCANO_GPU_ALLOCATED: "1024" # GPU allocated
VOLCANO_GPU_TOTAL: "11178" # GPU memory of the card
VOLCANO_GPU_MEMORY_TOTAL: "11178" # GPU memory of the card
```
37 changes: 30 additions & 7 deletions pkg/scheduler/api/device_info.go
@@ -46,26 +46,31 @@ func (g *GPUDevice) getUsedGPUMemory() uint {
		if pod.Status.Phase == v1.PodSucceeded || pod.Status.Phase == v1.PodFailed {
			continue
		} else {
			gpuRequest := GetGPUResourceOfPod(pod)
			gpuRequest := GetGPUMemoryOfPod(pod)
			res += gpuRequest
		}
	}
	return res
}

// GetGPUResourceOfPod returns the GPU resource required by the pod.
func GetGPUResourceOfPod(pod *v1.Pod) uint {
// isIdleGPU checks whether the device is idle.
func (g *GPUDevice) isIdleGPU() bool {
	return g.PodMap == nil || len(g.PodMap) == 0
}

// GetGPUMemoryOfPod returns the GPU memory required by the pod.
func GetGPUMemoryOfPod(pod *v1.Pod) uint {
	var initMem uint
	for _, container := range pod.Spec.InitContainers {
		res := getGPUResourceOfContainer(container.Resources)
		res := getGPUMemoryOfContainer(container.Resources)
		if initMem < res {
			initMem = res
		}
	}

	var mem uint
	for _, container := range pod.Spec.Containers {
		mem += getGPUResourceOfContainer(container.Resources)
		mem += getGPUMemoryOfContainer(container.Resources)
	}

	if mem > initMem {
@@ -74,11 +79,29 @@ func GetGPUResourceOfPod(pod *v1.Pod) uint {
	return initMem
}

// getGPUResourceOfContainer returns the GPU resource required by the container.
func getGPUResourceOfContainer(resources v1.ResourceRequirements) uint {
// getGPUMemoryOfContainer returns the GPU memory required by the container.
func getGPUMemoryOfContainer(resources v1.ResourceRequirements) uint {
	var mem uint
	if val, ok := resources.Limits[VolcanoGPUResource]; ok {
		mem = uint(val.Value())
	}
	return mem
}

// GetGPUNumberOfPod returns the number of GPUs required by the pod.
func GetGPUNumberOfPod(pod *v1.Pod) int {
	var gpus int
	for _, container := range pod.Spec.Containers {
		gpus += getGPUNumberOfContainer(container.Resources)
	}
	return gpus
}

// getGPUNumberOfContainer returns the number of GPUs required by the container.
func getGPUNumberOfContainer(resources v1.ResourceRequirements) int {
	var gpus int
	if val, ok := resources.Limits[VolcanoGPUNumber]; ok {
		gpus = int(val.Value())
	}
	return gpus
}
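
As a usage illustration (a standalone sketch, not code from this PR), the snippet below mirrors what `GetGPUNumberOfPod` and `getGPUNumberOfContainer` compute: it sums the `volcano.sh/gpu-number` limits over a pod's containers.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// gpuNumberOfPod re-implements the idea of GetGPUNumberOfPod for illustration:
// it sums the "volcano.sh/gpu-number" limits over all (non-init) containers.
func gpuNumberOfPod(pod *v1.Pod) int {
	gpus := 0
	for _, c := range pod.Spec.Containers {
		if val, ok := c.Resources.Limits["volcano.sh/gpu-number"]; ok {
			gpus += int(val.Value())
		}
	}
	return gpus
}

func main() {
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name: "cuda-container",
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							"volcano.sh/gpu-number": resource.MustParse("2"),
						},
					},
				},
			},
		},
	}
	fmt.Println("requested GPU cards:", gpuNumberOfPod(pod)) // prints: 2
}
```
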
4 changes: 2 additions & 2 deletions pkg/scheduler/api/device_info_test.go
@@ -23,7 +23,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
)

func TestGetGPUResourceOfPod(t *testing.T) {
func TestGetGPUMemoryOfPod(t *testing.T) {
testCases := []struct {
name string
pod *v1.Pod
@@ -90,7 +90,7 @@ func TestGetGPUResourceOfPod(t *testing.T) {

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			got := GetGPUResourceOfPod(tc.pod)
			got := GetGPUMemoryOfPod(tc.pod)
			if tc.want != got {
				t.Errorf("unexpected result, want: %v, got: %v", tc.want, got)
			}
31 changes: 23 additions & 8 deletions pkg/scheduler/api/node_info.go
@@ -570,24 +570,39 @@ func (ni *NodeInfo) getDevicesAllGPUMemory() map[int]uint {
	return res
}

// GetDevicesIdleGPUs returns all the idle GPU cards.
func (ni *NodeInfo) GetDevicesIdleGPUs() []int {
	res := []int{}
	for _, device := range ni.GPUDevices {
		if device.isIdleGPU() {
			res = append(res, device.ID)
		}
	}
	return res
}

// AddGPUResource adds the pod to GPU pool if it is assigned
func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
	gpuRes := GetGPUResourceOfPod(pod)
	gpuRes := GetGPUMemoryOfPod(pod)
	if gpuRes > 0 {
		id := GetGPUIndex(pod)
		if dev := ni.GPUDevices[id]; dev != nil {
			dev.PodMap[string(pod.UID)] = pod
		ids := GetGPUIndex(pod)
		for _, id := range ids {
			if dev := ni.GPUDevices[id]; dev != nil {
				dev.PodMap[string(pod.UID)] = pod
			}
		}
	}
}

// SubGPUResource frees the gpu held by the pod
func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
	gpuRes := GetGPUResourceOfPod(pod)
	gpuRes := GetGPUMemoryOfPod(pod)
	if gpuRes > 0 {
		id := GetGPUIndex(pod)
		if dev := ni.GPUDevices[id]; dev != nil {
			delete(dev.PodMap, string(pod.UID))
		ids := GetGPUIndex(pod)
		for _, id := range ids {
			if dev := ni.GPUDevices[id]; dev != nil {
				delete(dev.PodMap, string(pod.UID))
			}
		}
	}
}
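
For context, here is a simplified, self-contained sketch of the idea behind the GPU number predicate: a pod fits on a node only if the node reports at least as many idle cards (as `GetDevicesIdleGPUs` would) as the pod requests (as `GetGPUNumberOfPod` would). The `allocateIdleGPUs` helper is illustrative only and is not the actual predicate plugin code.

```go
package main

import "fmt"

// allocateIdleGPUs sketches the core check a GPU-number predicate performs:
// the pod fits only if there are at least as many idle (unused) GPU cards as
// requested; the chosen card IDs would become the volcano.sh/gpu-index annotation.
func allocateIdleGPUs(idleGPUs []int, requested int) ([]int, error) {
	if requested > len(idleGPUs) {
		return nil, fmt.Errorf("insufficient GPUs: requested %d, idle %d", requested, len(idleGPUs))
	}
	return idleGPUs[:requested], nil
}

func main() {
	// Suppose the node reports cards 1 and 3 as idle, and the pod requests 2 cards.
	ids, err := allocateIdleGPUs([]int{1, 3}, 2)
	if err != nil {
		fmt.Println("predicate failed:", err)
		return
	}
	fmt.Println("allocated GPU indices:", ids) // prints: [1 3]
}
```
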
30 changes: 20 additions & 10 deletions pkg/scheduler/api/pod_info.go
@@ -160,20 +160,29 @@ func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
}

// GetGPUIndex returns the ID of the GPU
func GetGPUIndex(pod *v1.Pod) int {
//return the gpu index list
func GetGPUIndex(pod *v1.Pod) []int {
	if len(pod.Annotations) > 0 {
		value, found := pod.Annotations[GPUIndex]
		if found {
			id, err := strconv.Atoi(value)
			if err != nil {
				klog.Errorf("invalid %s=%s", GPUIndex, value)
				return -1
			ids := strings.Split(value, ",")
			if len(ids) == 0 {
				klog.Errorf("invalid gpu index annotation %s=%s", GPUIndex, value)
			}
			idSlice := make([]int, len(ids))
			for idx, id := range ids {
				j, err := strconv.Atoi(id)
				if err != nil {
					klog.Errorf("invalid %s=%s", GPUIndex, value)
					return nil
				}
				idSlice[idx] = j
			}
			return id
			return idSlice
		}
	}

	return -1
	return nil
}

func escapeJSONPointer(p string) string {
@@ -184,11 +193,12 @@ func escapeJSONPointer(p string) string {
}

// AddGPUIndexPatch returns the patch adding GPU index
func AddGPUIndexPatch(id int) string {
func AddGPUIndexPatch(ids []int) string {
	idsstring := strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]")
	return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
		`{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
		`{"op": "add", "path": "/metadata/annotations/%s", "value": "%s"}]`,
		escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
		escapeJSONPointer(GPUIndex), id)
		escapeJSONPointer(GPUIndex), idsstring)
}
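
To see what `AddGPUIndexPatch` produces when several cards are allocated, the standalone sketch below repeats the same string construction: `fmt.Sprint([]int{0, 1})` yields `[0 1]`, spaces become commas, and the brackets are trimmed, giving `0,1`. The annotation keys are written here with `/` escaped as `~1`, as JSON Pointer paths require (the role of `escapeJSONPointer`).

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

func main() {
	// Mirror the idsstring construction in AddGPUIndexPatch above.
	ids := []int{0, 1}
	idsstring := strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]")

	// The resulting JSON patch adds the predicate-time and gpu-index annotations.
	// The keys below are the pre-escaped forms of volcano.sh/predicate-time and
	// volcano.sh/gpu-index as used in the user guide above.
	patch := fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
		`{"op": "add", "path": "/metadata/annotations/%s", "value": "%s"}]`,
		"volcano.sh~1predicate-time", time.Now().UnixNano(),
		"volcano.sh~1gpu-index", idsstring)
	fmt.Println(patch)
}
```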

// RemoveGPUIndexPatch returns the patch removing GPU index