From 91d68a691792f523e41f1668866f360f8c735d64 Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Wed, 13 Jul 2022 14:45:05 -0700 Subject: [PATCH 1/6] mpl/gpu: add a new API MPL_gpu_query_pointer_is_dev Add MPL_gpu_query_pointer_is_dev for checking if a pointer should be treated as a device allocation. For ZE, even host-registered memory should be treated as a device allocation because of how it is implemented by the driver. --- src/mpl/include/mpl_gpu.h | 5 +++++ src/mpl/src/gpu/mpl_gpu_cuda.c | 11 +++++++++++ src/mpl/src/gpu/mpl_gpu_hip.c | 11 +++++++++++ src/mpl/src/gpu/mpl_gpu_ze.c | 28 ++++++++++++++++++++++++++++ 4 files changed, 55 insertions(+) diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index 91349c08fda..e4b22ef2728 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -77,10 +77,15 @@ static inline int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t return MPL_SUCCESS; } +static inline int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) +{ + return 0; +} #endif /* ! 
MPL_HAVE_GPU */ int MPL_gpu_query_support(MPL_gpu_type_t * type); int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr); +int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr); int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle); diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c index 9bf8c427bbe..c0e66fb3179 100644 --- a/src/mpl/src/gpu/mpl_gpu_cuda.c +++ b/src/mpl/src/gpu/mpl_gpu_cuda.c @@ -131,6 +131,17 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr) goto fn_exit; } +int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) +{ + MPL_pointer_attr_t a; + + if (attr == NULL) { + MPL_gpu_query_pointer_attr(ptr, &a); + attr = &a; + } + return attr->type == MPL_GPU_POINTER_DEV; +} + int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * ipc_handle) { diff --git a/src/mpl/src/gpu/mpl_gpu_hip.c b/src/mpl/src/gpu/mpl_gpu_hip.c index c0eb60af4f7..059affe9f8e 100644 --- a/src/mpl/src/gpu/mpl_gpu_hip.c +++ b/src/mpl/src/gpu/mpl_gpu_hip.c @@ -123,6 +123,17 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr) goto fn_exit; } +int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) +{ + MPL_pointer_attr_t a; + + if (attr == NULL) { + MPL_gpu_query_pointer_attr(ptr, &a); + attr = &a; + } + return attr->type == MPL_GPU_POINTER_DEV; +} + int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * ipc_handle) { diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c index 530ba9e60a7..0ce58d86071 100644 --- a/src/mpl/src/gpu/mpl_gpu_ze.c +++ b/src/mpl/src/gpu/mpl_gpu_ze.c @@ -1596,6 +1596,34 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr) goto fn_exit; } +int MPL_gpu_query_pointer_is_dev(const void *ptr, 
MPL_pointer_attr_t * attr) +{ + ze_result_t ret ATTRIBUTE((unused)); + ze_memory_type_t type; + + if (attr == NULL) { + ze_memory_allocation_properties_t prop = { + .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES, + .pNext = NULL, + .type = 0, + .id = 0, + .pageSize = 0, + }; + ze_device_handle_t device = NULL; + + ret = zeMemGetAllocProperties(ze_context, ptr, &prop, &device); + assert(ret == ZE_RESULT_SUCCESS); + type = prop.type; + } else { + type = attr->device_attr.prop.type; + } + + /* Treat all ZE allocations as device objects. This is because even host-registered memory + * is implemented as device objects in the driver. As such, these allocations don't work + * properly with XPMEM. */ + return type != ZE_MEMORY_TYPE_UNKNOWN; +} + int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device) { int mpl_err = MPL_SUCCESS; From ea390b56f4c78ff84f8a227602157df0b7011015 Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Tue, 12 Jul 2022 06:53:32 -0700 Subject: [PATCH 2/6] mpir/gpu: change MPIR_GPU_query_pointer_is_dev to call MPL API --- src/include/mpir_gpu.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/include/mpir_gpu.h b/src/include/mpir_gpu.h index 3e176046be5..5ca5a84448f 100644 --- a/src/include/mpir_gpu.h +++ b/src/include/mpir_gpu.h @@ -99,10 +99,7 @@ MPL_STATIC_INLINE_PREFIX int MPIR_GPU_query_pointer_attr(const void *ptr, MPL_po MPL_STATIC_INLINE_PREFIX bool MPIR_GPU_query_pointer_is_dev(const void *ptr) { if (ENABLE_GPU && ptr != NULL) { - MPL_pointer_attr_t attr; - MPL_gpu_query_pointer_attr(ptr, &attr); - - return attr.type == MPL_GPU_POINTER_DEV; + return MPL_gpu_query_pointer_is_dev(ptr, NULL); } return false; From c23100da617737209591ca6ab69e090b7ed1a4f1 Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Thu, 7 Jul 2022 12:41:09 -0700 Subject: [PATCH 3/6] ch4/shm: ensure xpmem can handle the provided buffer Utilize MPL_gpu_query_pointer_is_dev to ensure the provided buffer can be passed to 
XPMEM and work as intended. --- src/mpid/ch4/shm/ipc/src/ipc_send.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mpid/ch4/shm/ipc/src/ipc_send.h b/src/mpid/ch4/shm/ipc/src/ipc_send.h index 2b176343347..b7333fc7c13 100644 --- a/src/mpid/ch4/shm/ipc/src/ipc_send.h +++ b/src/mpid/ch4/shm/ipc/src/ipc_send.h @@ -118,7 +118,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_try_lmt_isend(const void *buf, MPI_Aint if (ipc_attr.gpu_attr.type == MPL_GPU_POINTER_DEV) { mpi_errno = MPIDI_GPU_get_ipc_attr(mem_addr, rank, comm, &ipc_attr); MPIR_ERR_CHECK(mpi_errno); - } else { + } else if (!MPL_gpu_query_pointer_is_dev(buf, &ipc_attr.gpu_attr)) { + /* The result of MPL_gpu_query_pointer_is_dev is not necessarily equivalent to + * (gpu_attr.type == MPL_GPU_POINTER_DEV) depending on the backend. This explicit check + * ensures the pointer can be accepted by XPMEM and work as intended. */ mpi_errno = MPIDI_XPMEM_get_ipc_attr(mem_addr, mem_size, &ipc_attr); MPIR_ERR_CHECK(mpi_errno); } From e60f0a9bde25416077c5d42e1c24926597b4ae2b Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Thu, 14 Jul 2022 08:42:10 -0700 Subject: [PATCH 4/6] mpl/gpu: add MPL function MPL_gpu_query_is_same_dev A new function MPL_gpu_query_is_same_dev to detect if two devices/subdevices are actually one device. --- src/mpl/include/mpl_gpu.h | 6 ++++ src/mpl/src/gpu/mpl_gpu_cuda.c | 5 ++++ src/mpl/src/gpu/mpl_gpu_hip.c | 5 ++++ src/mpl/src/gpu/mpl_gpu_ze.c | 51 ++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+) diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h index e4b22ef2728..c9a34244127 100644 --- a/src/mpl/include/mpl_gpu.h +++ b/src/mpl/include/mpl_gpu.h @@ -81,11 +81,17 @@ static inline int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr { return 0; } + +static inline int MPL_gpu_query_is_same_dev(int dev1, int dev2) +{ + return dev1 == dev2; +} #endif /* ! 
MPL_HAVE_GPU */ int MPL_gpu_query_support(MPL_gpu_type_t * type); int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr); int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr); +int MPL_gpu_query_is_same_dev(int dev1, int dev2); int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle); diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c index c0e66fb3179..b3c1a650483 100644 --- a/src/mpl/src/gpu/mpl_gpu_cuda.c +++ b/src/mpl/src/gpu/mpl_gpu_cuda.c @@ -142,6 +142,11 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) return attr->type == MPL_GPU_POINTER_DEV; } +int MPL_gpu_query_is_same_dev(int dev1, int dev2) +{ + return dev1 == dev2; +} + int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * ipc_handle) { diff --git a/src/mpl/src/gpu/mpl_gpu_hip.c b/src/mpl/src/gpu/mpl_gpu_hip.c index 059affe9f8e..93598409a93 100644 --- a/src/mpl/src/gpu/mpl_gpu_hip.c +++ b/src/mpl/src/gpu/mpl_gpu_hip.c @@ -134,6 +134,11 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) return attr->type == MPL_GPU_POINTER_DEV; } +int MPL_gpu_query_is_same_dev(int dev1, int dev2) +{ + return dev1 == dev2; +} + int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr, MPL_gpu_ipc_mem_handle_t * ipc_handle) { diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c index 0ce58d86071..272b587f462 100644 --- a/src/mpl/src/gpu/mpl_gpu_ze.c +++ b/src/mpl/src/gpu/mpl_gpu_ze.c @@ -63,6 +63,10 @@ typedef struct { unsigned int numQueueGroups; ze_event_handle_t prev_event; /* for imemcpy */ MPL_cmdlist_pool_t *last_cmdList_entry; /* for imemcpy */ +#ifdef ZE_PCI_PROPERTIES_EXT_NAME + ze_pci_address_ext_t pci; + int pci_avail; +#endif } MPL_ze_device_entry_t; static MPL_ze_device_entry_t *device_states; @@ -956,6 +960,19 @@ static int 
gpu_ze_init_driver(void) } } } +#ifdef ZE_PCI_PROPERTIES_EXT_NAME + ze_pci_ext_properties_t pci_property = { + .stype = ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES, + .pNext = NULL, + }; + ret = zeDevicePciGetPropertiesExt(ze_devices_handle[d], &pci_property); + if (ret == ZE_RESULT_SUCCESS) { + device_state->pci_avail = 1; + device_state->pci = pci_property.address; + } else { + device_state->pci_avail = 0; + } +#endif MPL_free(queueProperties); } @@ -1624,6 +1641,38 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr) return type != ZE_MEMORY_TYPE_UNKNOWN; } +int MPL_gpu_query_is_same_dev(int global_dev1, int global_dev2) +{ + MPL_ze_device_entry_t *device_state1, *device_state2; + int local_dev1, local_dev2; + + assert(global_dev1 >= 0 && global_dev1 < global_ze_device_count); + assert(global_dev2 >= 0 && global_dev2 < global_ze_device_count); + + local_dev1 = MPL_gpu_global_to_local_dev_id(global_dev1); + local_dev2 = MPL_gpu_global_to_local_dev_id(global_dev2); + /* check if invisible devices */ + if (local_dev1 == -1 || local_dev2 == -1) + return 0; + +#ifdef ZE_PCI_PROPERTIES_EXT_NAME + if (MPL_gpu_get_root_device(local_dev1) == MPL_gpu_get_root_device(local_dev2)) + return 1; + + device_state1 = device_states + local_dev1; + device_state2 = device_states + local_dev2; + if (device_state1->pci_avail && device_state2->pci_avail) + return device_state1->pci.domain == device_state2->pci.domain && + device_state1->pci.bus == device_state2->pci.bus && + device_state1->pci.device == device_state2->pci.device && + device_state1->pci.function == device_state2->pci.function; + else + return 0; +#else + return MPL_gpu_get_root_device(local_dev1) == MPL_gpu_get_root_device(local_dev2); +#endif +} + int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device) { int mpl_err = MPL_SUCCESS; @@ -1825,6 +1874,8 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev, int orig_dev = dev; int engine = 
engine_conversion[engine_type]; + assert(dev >= 0 && dev < local_ze_device_count); + if (dest_ptr && src_ptr) { ret = get_next_event(&event); ZE_ERR_CHECK(ret); From 4365b610857dc21c09a8ca6e87ed52667780b10d Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Thu, 14 Jul 2022 08:44:06 -0700 Subject: [PATCH 5/6] shm/ipc: add a strategy to choose engine type in IPC path When two devices/subdevices are one device, a high bandwidth engine is preferred; the rest of the cases use a low latency engine. --- src/mpid/ch4/shm/ipc/src/ipc_p2p.h | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h index a0225c43a7b..1cc48e8191d 100644 --- a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h +++ b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h @@ -151,6 +151,18 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_copy_data(MPIDI_IPC_hdr * ipc_hdr, MPIR_ goto fn_exit; } +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_IPCI_choose_engine(int dev1, int dev2) +{ + MPL_gpu_engine_type_t engine = MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + if (dev1 == -1 || dev2 == -1) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } + assert(dev1 != -1 && dev2 != -1); + if (MPL_gpu_query_is_same_dev(dev1, dev2)) + engine = MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + return engine; +} + MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr, size_t src_data_sz, MPIR_Request * sreq_ptr, @@ -200,11 +212,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr, MPIR_ERR_CHECK(mpi_errno); /* copy */ if (ipc_hdr->is_contig && dt_contig) { - mpi_errno = MPIR_Localcopy_gpu(src_buf, src_data_sz, MPI_BYTE, NULL, - MPIDIG_REQUEST(rreq, buffer), - MPIDIG_REQUEST(rreq, count), - MPIDIG_REQUEST(rreq, datatype), &attr, - MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH, true); + MPL_gpu_engine_type_t engine = + MPIDI_IPCI_choose_engine(ipc_hdr->ipc_handle.gpu.global_dev_id, dev_id); + mpi_errno 
= + MPIR_Localcopy_gpu(src_buf, src_data_sz, MPI_BYTE, NULL, + MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), + MPIDIG_REQUEST(rreq, datatype), &attr, engine, true); MPIR_ERR_CHECK(mpi_errno); } else { /* TODO: get sender datatype and call MPIR_Typerep_op with mapped_device set to dev_id */ From 9a2664b2e9f642e2004a380d69e5a90b00a9240e Mon Sep 17 00:00:00 2001 From: Gengbin Zheng Date: Sat, 16 Jul 2022 23:33:44 -0700 Subject: [PATCH 6/6] mpl/gpu: delay creation of command queues until needed Only create command queues for a device and engine type when they are needed. --- src/mpl/src/gpu/mpl_gpu_ze.c | 121 +++++++++++++++++++++-------------- 1 file changed, 73 insertions(+), 48 deletions(-) diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c index 272b587f462..24267765d4c 100644 --- a/src/mpl/src/gpu/mpl_gpu_ze.c +++ b/src/mpl/src/gpu/mpl_gpu_ze.c @@ -59,10 +59,11 @@ typedef struct { } MPL_ze_engine_entry_t; typedef struct { - MPL_ze_engine_entry_t *engines; + int dev_id; unsigned int numQueueGroups; - ze_event_handle_t prev_event; /* for imemcpy */ - MPL_cmdlist_pool_t *last_cmdList_entry; /* for imemcpy */ + MPL_ze_engine_entry_t *engines; + ze_event_handle_t prev_event; /* for imemcpy */ + MPL_cmdlist_pool_t *last_cmdList_entry; /* for imemcpy */ #ifdef ZE_PCI_PROPERTIES_EXT_NAME ze_pci_address_ext_t pci; int pci_avail; @@ -804,7 +805,7 @@ static int gpu_ze_init_driver(void) ret = zeDriverGet(&driver_count, all_drivers); ZE_ERR_CHECK(ret); - int i, j, d; + int i, d; /* Find a driver instance with a GPU device */ for (i = 0; i < driver_count; ++i) { device_count = 0; @@ -901,20 +902,10 @@ static int gpu_ze_init_driver(void) (MPL_ze_device_entry_t *) MPL_malloc(sizeof(MPL_ze_device_entry_t) * local_ze_device_count, MPL_MEM_OTHER); - /* create command queues */ for (d = 0; d < local_ze_device_count; d++) { unsigned int numQueueGroups = 0; - ze_command_queue_desc_t cmdQueueDesc = { - .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, - 
.pNext = NULL, - .index = 0, - .flags = 0, - .ordinal = 0, - .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, - }; - MPL_ze_device_entry_t *device_state = device_states + d; + device_state->dev_id = d; device_state->prev_event = NULL; device_state->last_cmdList_entry = NULL; ret = zeDeviceGetCommandQueueGroupProperties(ze_devices_handle[d], &numQueueGroups, NULL); @@ -931,33 +922,25 @@ static int gpu_ze_init_driver(void) device_state->numQueueGroups = numQueueGroups; for (i = 0; i < numQueueGroups; i++) { - cmdQueueDesc.ordinal = -1; + int ordinal = -1; if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) { - cmdQueueDesc.ordinal = i; + ordinal = i; } else if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY && queueProperties[i].numQueues >= 1 && !(queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) { - cmdQueueDesc.ordinal = i; + ordinal = i; } device_state->engines[i].cmdList_pool = NULL; - if (cmdQueueDesc.ordinal == -1) { - device_state->engines[i].curQueue = 0; - device_state->engines[i].numQueues = 0; - device_state->engines[i].cmdQueues = NULL; - } else { - device_state->engines[i].numQueues = queueProperties[i].numQueues; - device_state->engines[i].curQueue = 0; + device_state->engines[i].curQueue = 0; + device_state->engines[i].numQueues = ordinal == -1 ? 
0 : queueProperties[i].numQueues; + device_state->engines[i].cmdQueues = NULL; + if (device_state->engines[i].numQueues) { device_state->engines[i].cmdQueues = (ze_command_queue_handle_t *) MPL_malloc(sizeof(ze_command_queue_handle_t) * - queueProperties[i].numQueues, + device_state->engines[i].numQueues, MPL_MEM_OTHER); - for (j = 0; j < queueProperties[i].numQueues; j++) { - cmdQueueDesc.index = j; - ret = - zeCommandQueueCreate(ze_context, ze_devices_handle[d], &cmdQueueDesc, - &device_state->engines[i].cmdQueues[j]); - ZE_ERR_CHECK(ret); - } + memset(device_state->engines[i].cmdQueues, 0, + sizeof(ze_command_queue_handle_t) * device_state->engines[i].numQueues); } } #ifdef ZE_PCI_PROPERTIES_EXT_NAME @@ -1181,15 +1164,16 @@ int MPL_gpu_finalize(void) for (j = 0; j < device_state->numQueueGroups; j++) { MPL_ze_engine_entry_t *engine = device_state->engines + j; for (k = 0; k < engine->numQueues; k++) { - zeCommandQueueDestroy(engine->cmdQueues[k]); + if (engine->cmdQueues[k]) + zeCommandQueueDestroy(engine->cmdQueues[k]); } + MPL_free(engine->cmdQueues); MPL_cmdlist_pool_t *cmdlist, *t, *pool = engine->cmdList_pool; DL_FOREACH_SAFE(pool, cmdlist, t) { zeCommandListDestroy(cmdlist->cmdList); DL_DELETE(pool, cmdlist); MPL_free(cmdlist); } - MPL_free(engine->cmdQueues); } MPL_free(device_state->engines); } @@ -1782,6 +1766,42 @@ int MPL_gpu_get_buffer_bounds(const void *ptr, void **pbase, uintptr_t * len) } /* command list utility functions - used for MPL_gpu_imemcpy */ +/* create complete set of command queues for an engine type */ +static int create_cmdqueue(int dev, int engine) +{ + int mpl_err = MPL_SUCCESS; + int ret; + + ze_command_queue_desc_t cmdQueueDesc = { + .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, + .pNext = NULL, + .index = 0, + .flags = 0, + .ordinal = engine, + .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, + .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL, + }; + + MPL_ze_device_entry_t *device_state = device_states + dev; + assert(engine < 
device_state->numQueueGroups); + MPL_ze_engine_entry_t *engine_state = device_state->engines + engine; + assert(engine_state->numQueues); + + for (int i = 0; i < engine_state->numQueues; i++) { + cmdQueueDesc.index = i; + ret = + zeCommandQueueCreate(ze_context, ze_devices_handle[dev], &cmdQueueDesc, + &engine_state->cmdQueues[i]); + ZE_ERR_CHECK(ret); + } + + fn_exit: + return mpl_err; + fn_fail: + mpl_err = MPL_ERR_GPU_INTERNAL; + goto fn_exit; +} + static int get_cmdlist(int dev, int engine, MPL_cmdlist_pool_t ** cl_entry) { int mpl_err = MPL_SUCCESS; @@ -1879,18 +1899,16 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev, if (dest_ptr && src_ptr) { ret = get_next_event(&event); ZE_ERR_CHECK(ret); - if (device_states[dev].last_cmdList_entry == NULL) { + if (device_states[orig_dev].last_cmdList_entry == NULL) { MPL_cmdlist_pool_t *cmdList_entry; - ret = get_cmdlist(dev, engine, &cmdList_entry); + ret = get_cmdlist(orig_dev, engine, &cmdList_entry); ZE_ERR_CHECK(ret); - cmdList = cmdList_entry->cmdList; - device_states[dev].last_cmdList_entry = cmdList_entry; + device_states[orig_dev].last_cmdList_entry = cmdList_entry; dev = cmdList_entry->dev; - } else { - cmdList = device_states[dev].last_cmdList_entry->cmdList; - if (device_states[dev].last_cmdList_entry->dev != dev) - goto fn_fail; } + cmdList = device_states[orig_dev].last_cmdList_entry->cmdList; + if (device_states[orig_dev].last_cmdList_entry->dev != dev) + goto fn_fail; assert(dev < local_ze_device_count); device_state = device_states + dev; ret = @@ -1911,12 +1929,19 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev, if (commit && cmdList) { ret = zeCommandListClose(cmdList); ZE_ERR_CHECK(ret); - ret = - zeCommandQueueExecuteCommandLists(device_state-> - engines[engine].cmdQueues[device_state-> - engines[engine].curQueue], - 1, &cmdList, NULL); + int q_index = device_state->engines[engine].curQueue; + assert(device_state->engines[engine].cmdQueues); + 
ze_command_queue_handle_t cmdq = device_state->engines[engine].cmdQueues[q_index]; + if (cmdq == NULL) { + mpl_err = create_cmdqueue(device_state->dev_id, engine); + if (mpl_err != MPL_SUCCESS) + goto fn_fail; + cmdq = device_state->engines[engine].cmdQueues[q_index]; + assert(cmdq); + } + ret = zeCommandQueueExecuteCommandLists(cmdq, 1, &cmdList, NULL); ZE_ERR_CHECK(ret); + /* move to next queue */ device_state->engines[engine].curQueue++; if (device_state->engines[engine].curQueue == device_state->engines[engine].numQueues) device_state->engines[engine].curQueue = 0;