diff --git a/docs/images/gpu-number.png b/docs/images/gpu-number.png
new file mode 100644
index 00000000000..41b5c2881d7
Binary files /dev/null and b/docs/images/gpu-number.png differ
diff --git a/docs/user-guide/how_to_use_gpu_number.md b/docs/user-guide/how_to_use_gpu_number.md
new file mode 100644
index 00000000000..ab83fa5225a
--- /dev/null
+++ b/docs/user-guide/how_to_use_gpu_number.md
@@ -0,0 +1,135 @@
+# GPU Number User Guide
+
+## Environment setup
+
+### Install volcano
+
+#### 1. Install from source
+
+Refer to the [Install Guide](../../installer/README.md) to install volcano.
+
+After installation, update the scheduler configuration:
+
+```shell script
+kubectl edit cm -n volcano-system volcano-scheduler-configmap
+```
+
+```yaml
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: volcano-scheduler-configmap
+  namespace: volcano-system
+data:
+  volcano-scheduler.conf: |
+    actions: "enqueue, allocate, backfill"
+    tiers:
+    - plugins:
+      - name: priority
+      - name: gang
+      - name: conformance
+    - plugins:
+      - name: drf
+      - name: predicates
+        arguments:
+          predicate.GPUNumberEnable: true # enable gpu number
+      - name: proportion
+      - name: nodeorder
+      - name: binpack
+```
+
+#### 2. Install from release package
+
+Same as above: after installation, update the scheduler configuration in the `volcano-scheduler-configmap` ConfigMap.
+
+### Install Volcano device plugin
+
+Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start).
+
+* Remember to configure the volcano device plugin to support gpu-number by setting `--gpu-strategy=number`. For more information, see the [volcano device plugin configuration](https://github.com/volcano-sh/devices/blob/dev/doc/config.md).
+
+### Verify environment is ready
+
+Check the node status; the environment is ready if `volcano.sh/gpu-number` is included in the allocatable resources:
+
+```shell script
+$ kubectl get node {node name} -oyaml
+...
+Capacity:
+  attachable-volumes-gce-pd:  127
+  cpu:                        2
+  ephemeral-storage:          98868448Ki
+  hugepages-1Gi:              0
+  hugepages-2Mi:              0
+  memory:                     7632596Ki
+  pods:                       110
+  volcano.sh/gpu-memory:      0
+  volcano.sh/gpu-number:      1
+Allocatable:
+  attachable-volumes-gce-pd:  127
+  cpu:                        1930m
+  ephemeral-storage:          47093746742
+  hugepages-1Gi:              0
+  hugepages-2Mi:              0
+  memory:                     5752532Ki
+  pods:                       110
+  volcano.sh/gpu-memory:      0
+  volcano.sh/gpu-number:      1
+```
+
+### Running Jobs With Multiple GPU Cards
+
+Jobs can request multiple exclusive NVIDIA GPU cards by defining the container-level resource requirement `volcano.sh/gpu-number`:
+
+```shell script
+$ cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: gpu-pod
+spec:
+  containers:
+    - name: cuda-container
+      image: nvidia/cuda:9.0-devel
+      command: ["sleep"]
+      args: ["100000"]
+      resources:
+        limits:
+          volcano.sh/gpu-number: 1 # requesting 1 GPU card
+EOF
+```
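The scheduler changes below key off two pod-level helpers, `api.GetGPUMemoryOfPod` and `api.GetGPUNumberOfPod`, which this diff calls but does not define. As a reading aid, here is a minimal standalone sketch of what the gpu-number side plausibly computes; the `gpuNumberOfPod` name and the per-container summing are assumptions, not code from this PR:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// VolcanoGPUNumber is the resource name the guide above tells pods to request.
const VolcanoGPUNumber v1.ResourceName = "volcano.sh/gpu-number"

// gpuNumberOfPod sums the volcano.sh/gpu-number limits over all containers of
// a pod. It is a hypothetical stand-in for api.GetGPUNumberOfPod.
func gpuNumberOfPod(pod *v1.Pod) int {
	total := 0
	for _, c := range pod.Spec.Containers {
		if q, ok := c.Resources.Limits[VolcanoGPUNumber]; ok {
			total += int(q.Value())
		}
	}
	return total
}

func main() {
	pod := &v1.Pod{
		Spec: v1.PodSpec{
			Containers: []v1.Container{{
				Name: "cuda-container",
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						VolcanoGPUNumber: resource.MustParse("1"),
					},
				},
			}},
		},
	}
	fmt.Println(gpuNumberOfPod(pod)) // prints 1
}
```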
diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go
--- a/pkg/scheduler/api/node_info.go
+++ b/pkg/scheduler/api/node_info.go
@@ ... @@
 // AddGPUResource adds the pod to GPU pool if it is assigned
 func (ni *NodeInfo) AddGPUResource(pod *v1.Pod) {
-    gpuRes := GetGPUResourceOfPod(pod)
+    gpuRes := GetGPUMemoryOfPod(pod)
     if gpuRes > 0 {
-        id := GetGPUIndex(pod)
-        if dev := ni.GPUDevices[id]; dev != nil {
-            dev.PodMap[string(pod.UID)] = pod
+        ids := GetGPUIndex(pod)
+        for _, id := range ids {
+            if dev := ni.GPUDevices[id]; dev != nil {
+                dev.PodMap[string(pod.UID)] = pod
+            }
         }
     }
 }
 
 // SubGPUResource frees the gpu hold by the pod
 func (ni *NodeInfo) SubGPUResource(pod *v1.Pod) {
-    gpuRes := GetGPUResourceOfPod(pod)
+    gpuRes := GetGPUMemoryOfPod(pod)
     if gpuRes > 0 {
-        id := GetGPUIndex(pod)
-        if dev := ni.GPUDevices[id]; dev != nil {
-            delete(dev.PodMap, string(pod.UID))
+        ids := GetGPUIndex(pod)
+        for _, id := range ids {
+            if dev := ni.GPUDevices[id]; dev != nil {
+                delete(dev.PodMap, string(pod.UID))
+            }
         }
     }
 }
diff --git a/pkg/scheduler/api/pod_info.go b/pkg/scheduler/api/pod_info.go
index 0c72401de51..b4ade809dc9 100644
--- a/pkg/scheduler/api/pod_info.go
+++ b/pkg/scheduler/api/pod_info.go
@@ -160,20 +160,29 @@ func GetPodResourceWithoutInitContainers(pod *v1.Pod) *Resource {
 }
 
-// GetGPUIndex returns the ID of the GPU
-func GetGPUIndex(pod *v1.Pod) int {
+// GetGPUIndex returns the list of GPU indexes assigned to the pod,
+// parsed from its gpu-index annotation.
+func GetGPUIndex(pod *v1.Pod) []int {
     if len(pod.Annotations) > 0 {
         value, found := pod.Annotations[GPUIndex]
         if found {
-            id, err := strconv.Atoi(value)
-            if err != nil {
-                klog.Errorf("invalid %s=%s", GPUIndex, value)
-                return -1
+            ids := strings.Split(value, ",")
+            if len(ids) == 0 {
+                klog.Errorf("invalid gpu index annotation %s=%s", GPUIndex, value)
+            }
+            idSlice := make([]int, len(ids))
+            for idx, id := range ids {
+                j, err := strconv.Atoi(id)
+                if err != nil {
+                    klog.Errorf("invalid %s=%s", GPUIndex, value)
+                    return nil
+                }
+                idSlice[idx] = j
             }
-            return id
+            return idSlice
         }
     }
-    return -1
+    return nil
 }
 
 func escapeJSONPointer(p string) string {
@@ -184,11 +193,12 @@ func escapeJSONPointer(p string) string {
 }
 
 // AddGPUIndexPatch returns the patch adding GPU index
-func AddGPUIndexPatch(id int) string {
+func AddGPUIndexPatch(ids []int) string {
+    idsstring := strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]")
     return fmt.Sprintf(`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
-        `{"op": "add", "path": "/metadata/annotations/%s", "value": "%d"}]`,
+        `{"op": "add", "path": "/metadata/annotations/%s", "value": "%s"}]`,
         escapeJSONPointer(PredicateTime), time.Now().UnixNano(),
-        escapeJSONPointer(GPUIndex), id)
+        escapeJSONPointer(GPUIndex), idsstring)
 }
 
 // RemoveGPUIndexPatch returns the patch removing GPU index
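Between them, `AddGPUIndexPatch` and the new `GetGPUIndex` round-trip the chosen device IDs through a single comma-separated annotation value. A standalone sketch of that round trip (the helper names are illustrative; the string handling matches the code above):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// encodeGPUIndex reproduces the serialization in AddGPUIndexPatch:
// fmt.Sprint renders []int{0, 1, 3} as "[0 1 3]", so replacing spaces with
// commas and trimming the brackets yields the annotation value "0,1,3".
func encodeGPUIndex(ids []int) string {
	return strings.Trim(strings.Replace(fmt.Sprint(ids), " ", ",", -1), "[]")
}

// decodeGPUIndex mirrors the new GetGPUIndex: split on commas and parse each
// element, rejecting the whole value on the first bad entry.
func decodeGPUIndex(value string) ([]int, error) {
	parts := strings.Split(value, ",")
	ids := make([]int, len(parts))
	for i, p := range parts {
		n, err := strconv.Atoi(p)
		if err != nil {
			return nil, fmt.Errorf("invalid gpu index %q in %q", p, value)
		}
		ids[i] = n
	}
	return ids, nil
}

func main() {
	value := encodeGPUIndex([]int{0, 1, 3})
	fmt.Println(value) // 0,1,3

	ids, err := decodeGPUIndex(value)
	fmt.Println(ids, err) // [0 1 3] <nil>
}
```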
diff --git a/pkg/scheduler/plugins/predicates/gpu.go b/pkg/scheduler/plugins/predicates/gpu.go
index cd28ac0db40..16a5b38dd5f 100644
--- a/pkg/scheduler/plugins/predicates/gpu.go
+++ b/pkg/scheduler/plugins/predicates/gpu.go
@@ -20,37 +20,74 @@
 import (
     "fmt"
 
     v1 "k8s.io/api/core/v1"
+    "k8s.io/klog"
 
     "volcano.sh/volcano/pkg/scheduler/api"
 )
 
-// checkNodeGPUSharingPredicate checks if a gpu sharing pod can be scheduled on a node.
+// checkNodeGPUSharingPredicate checks if a pod with a gpu-memory request can be scheduled on a node.
 func checkNodeGPUSharingPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
     // no gpu sharing request
-    if api.GetGPUResourceOfPod(pod) <= 0 {
+    if api.GetGPUMemoryOfPod(pod) <= 0 {
         return true, nil
     }
+    ids := predicateGPUbyMemory(pod, nodeInfo)
+    if ids == nil {
+        return false, fmt.Errorf("not enough gpu memory on node %s", nodeInfo.Name)
+    }
+    return true, nil
+}
 
-    id := predicateGPU(pod, nodeInfo)
-    if id < 0 {
-        return false, fmt.Errorf("no enough gpu memory on single device of node %s", nodeInfo.Name)
+func checkNodeGPUNumberPredicate(pod *v1.Pod, nodeInfo *api.NodeInfo) (bool, error) {
+    // no gpu number request
+    if api.GetGPUNumberOfPod(pod) <= 0 {
+        return true, nil
+    }
+    ids := predicateGPUbyNumber(pod, nodeInfo)
+    if ids == nil {
+        return false, fmt.Errorf("not enough gpu cards on node %s", nodeInfo.Name)
     }
     return true, nil
 }
 
-// predicateGPU returns the available GPU ID
-func predicateGPU(pod *v1.Pod, node *api.NodeInfo) int {
-    gpuRequest := api.GetGPUResourceOfPod(pod)
+// predicateGPUbyMemory returns the first device with enough idle gpu memory
+func predicateGPUbyMemory(pod *v1.Pod, node *api.NodeInfo) []int {
+    gpuRequest := api.GetGPUMemoryOfPod(pod)
     allocatableGPUs := node.GetDevicesIdleGPUMemory()
+
+    var devIDs []int
+
     for devID := 0; devID < len(allocatableGPUs); devID++ {
         availableGPU, ok := allocatableGPUs[devID]
         if ok {
             if availableGPU >= gpuRequest {
-                return devID
+                devIDs = append(devIDs, devID)
+                return devIDs
             }
         }
     }
-    return -1
+    return nil
+}
+
+// predicateGPUbyNumber returns the IDs of enough idle gpu cards to satisfy the request
+func predicateGPUbyNumber(pod *v1.Pod, node *api.NodeInfo) []int {
+    gpuRequest := api.GetGPUNumberOfPod(pod)
+    allocatableGPUs := node.GetDevicesIdleGPUs()
+
+    var devIDs []int
+
+    if len(allocatableGPUs) < gpuRequest {
+        klog.Errorf("Not enough gpu cards")
+        return nil
+    }
+
+    for devID := 0; devID < len(allocatableGPUs); devID++ {
+        devIDs = append(devIDs, allocatableGPUs[devID])
+        if len(devIDs) == gpuRequest {
+            return devIDs
+        }
+    }
+
+    return nil
 }
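The two predicates select devices differently: `predicateGPUbyMemory` looks for a single card whose idle memory covers the whole request, while `predicateGPUbyNumber` takes the first N fully idle cards. A standalone sketch of the two selection rules, under hypothetical helper names:

```go
package main

import "fmt"

// firstDeviceWithMemory mirrors predicateGPUbyMemory: return the first single
// device whose idle memory covers the request, or nil if none does.
func firstDeviceWithMemory(idleMemory map[int]uint, request uint) []int {
	for devID := 0; devID < len(idleMemory); devID++ {
		if free, ok := idleMemory[devID]; ok && free >= request {
			return []int{devID}
		}
	}
	return nil
}

// firstNIdleCards mirrors predicateGPUbyNumber: take the first n fully idle
// cards, or nil when fewer than n are idle.
func firstNIdleCards(idleCards []int, n int) []int {
	if len(idleCards) < n {
		return nil
	}
	return idleCards[:n]
}

func main() {
	fmt.Println(firstDeviceWithMemory(map[int]uint{0: 2048, 1: 8192}, 4096)) // [1]
	fmt.Println(firstNIdleCards([]int{0, 2, 3}, 2))                          // [0 2]
	fmt.Println(firstNIdleCards([]int{3}, 2))                                // []
}
```

Note that the by-memory loop indexes device IDs `0..len(map)-1`, which assumes the idle-memory map is keyed by contiguous, zero-based IDs; a sparse ID space would be partially skipped.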
diff --git a/pkg/scheduler/plugins/predicates/predicates.go b/pkg/scheduler/plugins/predicates/predicates.go
index 3d013484f71..f816a053a27 100644
--- a/pkg/scheduler/plugins/predicates/predicates.go
+++ b/pkg/scheduler/plugins/predicates/predicates.go
@@ -46,6 +46,7 @@ const (
 
     // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML
     GPUSharingPredicate = "predicate.GPUSharingEnable"
+    GPUNumberPredicate  = "predicate.GPUNumberEnable"
 
     // CachePredicate control cache predicate feature
     CachePredicate = "predicate.CacheEnable"
@@ -79,6 +80,7 @@ type baseResource struct {
 }
 
 type predicateEnable struct {
     gpuSharingEnable   bool
+    gpuNumberEnable    bool
     cacheEnable        bool
     proportionalEnable bool
     proportional       map[v1.ResourceName]baseResource
@@ -100,6 +102,7 @@ func enablePredicate(args framework.Arguments) predicateEnable {
        - name: predicates
          arguments:
            predicate.GPUSharingEnable: true
+           predicate.GPUNumberEnable: true
            predicate.CacheEnable: true
            predicate.ProportionalEnable: true
            predicate.resources: nvidia.com/gpu
@@ -111,12 +114,19 @@
     predicate := predicateEnable{
         gpuSharingEnable:   false,
+        gpuNumberEnable:    false,
         cacheEnable:        false,
         proportionalEnable: false,
     }
 
     // Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct.
     args.GetBool(&predicate.gpuSharingEnable, GPUSharingPredicate)
+    args.GetBool(&predicate.gpuNumberEnable, GPUNumberPredicate)
+
+    if predicate.gpuSharingEnable && predicate.gpuNumberEnable {
+        klog.Fatal("predicate.GPUSharingEnable and predicate.GPUNumberEnable cannot both be true")
+    }
+
     args.GetBool(&predicate.cacheEnable, CachePredicate)
     // Checks whether predicate.ProportionalEnable is provided or not, if given, modifies the value in predicateEnable struct.
     args.GetBool(&predicate.proportionalEnable, ProportionalPredicate)
@@ -175,31 +185,62 @@ func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
             return
         }
 
-        if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
+        // predicate gpu sharing
+        if predicate.gpuSharingEnable && api.GetGPUMemoryOfPod(pod) > 0 {
             nodeInfo, ok := ssn.Nodes[nodeName]
             if !ok {
                 klog.Errorf("Failed to get node %s info from cache", nodeName)
                 return
             }
-
-            id := predicateGPU(pod, nodeInfo)
-            if id < 0 {
+            ids := predicateGPUbyMemory(pod, nodeInfo)
+            if ids == nil {
                 klog.Errorf("The node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace)
                 return
             }
-            dev, ok := nodeInfo.GPUDevices[id]
+            patch := api.AddGPUIndexPatch(ids)
+            pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
+            if err != nil {
+                klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
+                return
+            }
+            for _, id := range ids {
+                dev, ok := nodeInfo.GPUDevices[id]
+                if !ok {
+                    klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
+                    return
+                }
+                dev.PodMap[string(pod.UID)] = pod
+            }
+            klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
+        }
+
+        // predicate gpu number
+        if predicate.gpuNumberEnable && api.GetGPUNumberOfPod(pod) > 0 {
+            nodeInfo, ok := ssn.Nodes[nodeName]
             if !ok {
-                klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
+                klog.Errorf("Failed to get node %s info from cache", nodeName)
                 return
             }
-            patch := api.AddGPUIndexPatch(id)
+            ids := predicateGPUbyNumber(pod, nodeInfo)
+            if ids == nil {
+                klog.Errorf("The node %s can't place the pod %s in ns %s", pod.Spec.NodeName, pod.Name, pod.Namespace)
+                return
+            }
+            patch := api.AddGPUIndexPatch(ids)
             pod, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name, types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
             if err != nil {
                 klog.Errorf("Patch pod %s failed with patch %s: %v", pod.Name, patch, err)
                 return
             }
-            dev.PodMap[string(pod.UID)] = pod
-            klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
+            for _, id := range ids {
+                dev, ok := nodeInfo.GPUDevices[id]
+                if !ok {
+                    klog.Errorf("Failed to get GPU %d from node %s", id, nodeName)
+                    return
+                }
+                dev.PodMap[string(pod.UID)] = pod
+            }
+            klog.V(4).Infof("predicates with gpu number, update pod %s/%s allocate to node [%s]", pod.Namespace, pod.Name, nodeName)
         }
 
         node.AddPod(pod)
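Both allocate branches record the decision by patching the pod before updating the cached devices. A sketch of the JSON patch that `AddGPUIndexPatch([]int{0, 1})` would emit, assuming `GPUIndex` and `PredicateTime` resolve to `volcano.sh/gpu-index` and `volcano.sh/predicate-time` (those constants are not shown in this diff):

```go
package main

import (
	"fmt"
	"strings"
	"time"
)

// escapeJSONPointerSegment applies RFC 6901 escaping, as escapeJSONPointer
// does above: "~" becomes "~0", then "/" becomes "~1".
func escapeJSONPointerSegment(p string) string {
	p = strings.ReplaceAll(p, "~", "~0")
	return strings.ReplaceAll(p, "/", "~1")
}

func main() {
	ids := "0,1" // output of the comma-join shown in AddGPUIndexPatch
	patch := fmt.Sprintf(
		`[{"op": "add", "path": "/metadata/annotations/%s", "value":"%d"},`+
			`{"op": "add", "path": "/metadata/annotations/%s", "value": "%s"}]`,
		escapeJSONPointerSegment("volcano.sh/predicate-time"), time.Now().UnixNano(),
		escapeJSONPointerSegment("volcano.sh/gpu-index"), ids)
	fmt.Println(patch)
	// [{"op": "add", "path": "/metadata/annotations/volcano.sh~1predicate-time", "value":"..."},
	//  {"op": "add", "path": "/metadata/annotations/volcano.sh~1gpu-index", "value": "0,1"}]
}
```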
@@ -214,9 +255,9 @@ func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
             return
         }
 
-        if predicate.gpuSharingEnable && api.GetGPUResourceOfPod(pod) > 0 {
+        if (predicate.gpuSharingEnable && api.GetGPUMemoryOfPod(pod) > 0) || (predicate.gpuNumberEnable && api.GetGPUNumberOfPod(pod) > 0) {
             // deallocate pod gpu id
-            id := api.GetGPUIndex(pod)
+            ids := api.GetGPUIndex(pod)
             patch := api.RemoveGPUIndexPatch()
             _, err := kubeClient.CoreV1().Pods(pod.Namespace).Patch(context.TODO(), pod.Name,
                 types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
             if err != nil {
@@ -229,8 +270,10 @@ func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
                 klog.Errorf("Failed to get node %s info from cache", nodeName)
                 return
             }
-            if dev, ok := nodeInfo.GPUDevices[id]; ok {
-                delete(dev.PodMap, string(pod.UID))
+            for _, id := range ids {
+                if dev, ok := nodeInfo.GPUDevices[id]; ok {
+                    delete(dev.PodMap, string(pod.UID))
+                }
             }
 
             klog.V(4).Infof("predicates with gpu sharing, update pod %s/%s deallocate from node [%s]",
                 pod.Namespace, pod.Name, nodeName)
@@ -362,6 +405,16 @@ func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) {
             klog.V(4).Infof("checkNodeResourceIsProportional predicates Task <%s/%s> on Node <%s>: fit %v",
                 task.Namespace, task.Name, node.Name, fit)
         }
+        if predicate.gpuNumberEnable {
+            // check gpu number predicate
+            fit, err := checkNodeGPUNumberPredicate(task.Pod, node)
+            if err != nil {
+                return err
+            }
+
+            klog.V(4).Infof("checkNodeGPUNumberPredicate predicates Task <%s/%s> on Node <%s>: fit %v",
+                task.Namespace, task.Name, node.Name, fit)
+        }
         return nil
     })
 }
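The deallocate handler reverses the patch: strip the index annotation, then drop the pod from each device's `PodMap`. `RemoveGPUIndexPatch` is not shown in this diff; the sketch below submits an assumed equivalent JSON `remove` patch with client-go. Note that a `remove` op fails if the annotation is already absent:

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
)

// removeGPUIndex submits the same kind of JSON patch the deallocate handler
// above uses. The patch body is an assumption standing in for
// api.RemoveGPUIndexPatch, whose exact output this diff does not show.
func removeGPUIndex(kube kubernetes.Interface, ns, name string) error {
	patch := `[{"op": "remove", "path": "/metadata/annotations/volcano.sh~1gpu-index"}]`
	_, err := kube.CoreV1().Pods(ns).Patch(context.TODO(), name,
		types.JSONPatchType, []byte(patch), metav1.PatchOptions{})
	return err
}

func main() {
	cfg, err := rest.InClusterConfig()
	if err != nil {
		fmt.Println("not running in a cluster:", err)
		return
	}
	kube := kubernetes.NewForConfigOrDie(cfg)
	fmt.Println(removeGPUIndex(kube, "default", "gpu-pod"))
}
```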