From 91d68a691792f523e41f1668866f360f8c735d64 Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Wed, 13 Jul 2022 14:45:05 -0700
Subject: [PATCH 1/6] mpl/gpu: add a new API MPL_gpu_query_pointer_is_dev

Add MPL_gpu_query_pointer_is_dev to check whether a pointer should be
treated as a device allocation. For ZE, even host-registered memory
should be treated as a device allocation because of how it is
implemented by the driver.
---
 src/mpl/include/mpl_gpu.h      |  5 +++++
 src/mpl/src/gpu/mpl_gpu_cuda.c | 11 +++++++++++
 src/mpl/src/gpu/mpl_gpu_hip.c  | 11 +++++++++++
 src/mpl/src/gpu/mpl_gpu_ze.c   | 28 ++++++++++++++++++++++++++++
 4 files changed, 55 insertions(+)

diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h
index 91349c08fda..e4b22ef2728 100644
--- a/src/mpl/include/mpl_gpu.h
+++ b/src/mpl/include/mpl_gpu.h
@@ -77,10 +77,15 @@ static inline int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t
     return MPL_SUCCESS;
 }
 
+static inline int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
+{
+    return 0;
+}
 #endif /* ! MPL_HAVE_GPU */
 
 int MPL_gpu_query_support(MPL_gpu_type_t * type);
 int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr);
+int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr);
 
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle);
diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index 9bf8c427bbe..c0e66fb3179 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -131,6 +131,17 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr)
     goto fn_exit;
 }
 
+int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
+{
+    MPL_pointer_attr_t a;       /* scratch, used only when attr == NULL */
+    if (attr == NULL) {
+        if (MPL_gpu_query_pointer_attr(ptr, &a) != MPL_SUCCESS)
+            return 0;           /* a failed query is not guaranteed to have filled 'a' */
+        attr = &a;
+    }
+    return attr->type == MPL_GPU_POINTER_DEV;
+}
+
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * ipc_handle)
 {
diff --git a/src/mpl/src/gpu/mpl_gpu_hip.c b/src/mpl/src/gpu/mpl_gpu_hip.c
index c0eb60af4f7..059affe9f8e 100644
--- a/src/mpl/src/gpu/mpl_gpu_hip.c
+++ b/src/mpl/src/gpu/mpl_gpu_hip.c
@@ -123,6 +123,17 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr)
     goto fn_exit;
 }
 
+int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
+{
+    MPL_pointer_attr_t a;       /* scratch, used only when attr == NULL */
+    if (attr == NULL) {
+        if (MPL_gpu_query_pointer_attr(ptr, &a) != MPL_SUCCESS)
+            return 0;           /* a failed query is not guaranteed to have filled 'a' */
+        attr = &a;
+    }
+    return attr->type == MPL_GPU_POINTER_DEV;
+}
+
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * ipc_handle)
 {
diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c
index 530ba9e60a7..0ce58d86071 100644
--- a/src/mpl/src/gpu/mpl_gpu_ze.c
+++ b/src/mpl/src/gpu/mpl_gpu_ze.c
@@ -1596,6 +1596,34 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr)
     goto fn_exit;
 }
 
+int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
+{
+    ze_memory_type_t type;
+
+    if (attr != NULL) {
+        /* the caller supplied attributes: use the memory type recorded there */
+        type = attr->device_attr.prop.type;
+    } else {
+        ze_result_t ret ATTRIBUTE((unused));
+        ze_device_handle_t device = NULL;
+        ze_memory_allocation_properties_t prop = {
+            .stype = ZE_STRUCTURE_TYPE_MEMORY_ALLOCATION_PROPERTIES,
+            .pNext = NULL,
+            .type = 0,
+            .id = 0,
+            .pageSize = 0,
+        };
+        ret = zeMemGetAllocProperties(ze_context, ptr, &prop, &device);
+        assert(ret == ZE_RESULT_SUCCESS);
+        type = prop.type;
+    }
+
+    /* Treat every allocation known to ZE as a device object: even host-registered
+     * memory is implemented as a device object in the driver, so such allocations
+     * do not work properly with XPMEM. */
+    return type != ZE_MEMORY_TYPE_UNKNOWN;
+}
+
 int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
 {
     int mpl_err = MPL_SUCCESS;

From ea390b56f4c78ff84f8a227602157df0b7011015 Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Tue, 12 Jul 2022 06:53:32 -0700
Subject: [PATCH 2/6] mpir/gpu: change MPIR_GPU_query_pointer_is_dev to call
 MPL API

---
 src/include/mpir_gpu.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/include/mpir_gpu.h b/src/include/mpir_gpu.h
index 3e176046be5..5ca5a84448f 100644
--- a/src/include/mpir_gpu.h
+++ b/src/include/mpir_gpu.h
@@ -99,10 +99,7 @@ MPL_STATIC_INLINE_PREFIX int MPIR_GPU_query_pointer_attr(const void *ptr, MPL_po
 MPL_STATIC_INLINE_PREFIX bool MPIR_GPU_query_pointer_is_dev(const void *ptr)
 {
     if (ENABLE_GPU && ptr != NULL) {
-        MPL_pointer_attr_t attr;
-        MPL_gpu_query_pointer_attr(ptr, &attr);
-
-        return attr.type == MPL_GPU_POINTER_DEV;
+        return MPL_gpu_query_pointer_is_dev(ptr, NULL);
     }
 
     return false;

From c23100da617737209591ca6ab69e090b7ed1a4f1 Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Thu, 7 Jul 2022 12:41:09 -0700
Subject: [PATCH 3/6] ch4/shm: ensure xpmem can handle the provided buffer

Utilize MPL_gpu_query_pointer_is_dev to ensure the provided buffer can
be passed to XPMEM and work as intended.
---
 src/mpid/ch4/shm/ipc/src/ipc_send.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/mpid/ch4/shm/ipc/src/ipc_send.h b/src/mpid/ch4/shm/ipc/src/ipc_send.h
index 2b176343347..b7333fc7c13 100644
--- a/src/mpid/ch4/shm/ipc/src/ipc_send.h
+++ b/src/mpid/ch4/shm/ipc/src/ipc_send.h
@@ -118,7 +118,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_try_lmt_isend(const void *buf, MPI_Aint
     if (ipc_attr.gpu_attr.type == MPL_GPU_POINTER_DEV) {
         mpi_errno = MPIDI_GPU_get_ipc_attr(mem_addr, rank, comm, &ipc_attr);
         MPIR_ERR_CHECK(mpi_errno);
-    } else {
+    } else if (!MPL_gpu_query_pointer_is_dev(buf, &ipc_attr.gpu_attr)) {
+        /* The result of MPL_gpu_query_pointer_is_dev is not necessarily equivalent to
+         * (gpu_attr.type == MPL_GPU_POINTER_DEV) depending on the backend. This explicit check
+         * ensures the pointer can be accepted by XPMEM and work as intended. */
         mpi_errno = MPIDI_XPMEM_get_ipc_attr(mem_addr, mem_size, &ipc_attr);
         MPIR_ERR_CHECK(mpi_errno);
     }

From e60f0a9bde25416077c5d42e1c24926597b4ae2b Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Thu, 14 Jul 2022 08:42:10 -0700
Subject: [PATCH 4/6] mpl/gpu: add MPL function MPL_gpu_query_is_same_dev

Add a new function, MPL_gpu_query_is_same_dev, to detect whether two
devices/subdevices are actually one device.
---
 src/mpl/include/mpl_gpu.h      |  6 ++++
 src/mpl/src/gpu/mpl_gpu_cuda.c |  5 ++++
 src/mpl/src/gpu/mpl_gpu_hip.c  |  5 ++++
 src/mpl/src/gpu/mpl_gpu_ze.c   | 51 ++++++++++++++++++++++++++++++++++
 4 files changed, 67 insertions(+)

diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h
index e4b22ef2728..c9a34244127 100644
--- a/src/mpl/include/mpl_gpu.h
+++ b/src/mpl/include/mpl_gpu.h
@@ -81,11 +81,17 @@ static inline int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr
 {
     return 0;
 }
+
+static inline int MPL_gpu_query_is_same_dev(int dev1, int dev2)
+{
+    return dev1 == dev2;        /* no-GPU stub: the ids are compared directly */
+}
 #endif /* ! MPL_HAVE_GPU */
 
 int MPL_gpu_query_support(MPL_gpu_type_t * type);
 int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr);
 int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr);
+int MPL_gpu_query_is_same_dev(int dev1, int dev2);
 
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle);
diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
index c0e66fb3179..b3c1a650483 100644
--- a/src/mpl/src/gpu/mpl_gpu_cuda.c
+++ b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -142,6 +142,11 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
     return attr->type == MPL_GPU_POINTER_DEV;
 }
 
+int MPL_gpu_query_is_same_dev(int dev1, int dev2)
+{
+    return dev1 == dev2;        /* ids are compared directly; no root-device or PCI lookup */
+}
+
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * ipc_handle)
 {
diff --git a/src/mpl/src/gpu/mpl_gpu_hip.c b/src/mpl/src/gpu/mpl_gpu_hip.c
index 059affe9f8e..93598409a93 100644
--- a/src/mpl/src/gpu/mpl_gpu_hip.c
+++ b/src/mpl/src/gpu/mpl_gpu_hip.c
@@ -134,6 +134,11 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
     return attr->type == MPL_GPU_POINTER_DEV;
 }
 
+int MPL_gpu_query_is_same_dev(int dev1, int dev2)
+{
+    return dev1 == dev2;        /* ids are compared directly; no root-device or PCI lookup */
+}
+
 int MPL_gpu_ipc_handle_create(const void *ptr, MPL_gpu_device_attr * ptr_attr,
                               MPL_gpu_ipc_mem_handle_t * ipc_handle)
 {
diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c
index 0ce58d86071..272b587f462 100644
--- a/src/mpl/src/gpu/mpl_gpu_ze.c
+++ b/src/mpl/src/gpu/mpl_gpu_ze.c
@@ -63,6 +63,10 @@ typedef struct {
     unsigned int numQueueGroups;
     ze_event_handle_t prev_event;       /* for imemcpy */
     MPL_cmdlist_pool_t *last_cmdList_entry;     /* for imemcpy */
+#ifdef ZE_PCI_PROPERTIES_EXT_NAME
+    ze_pci_address_ext_t pci;
+    int pci_avail;
+#endif
 } MPL_ze_device_entry_t;
 
 static MPL_ze_device_entry_t *device_states;
@@ -956,6 +960,19 @@ static int gpu_ze_init_driver(void)
                 }
             }
         }
+#ifdef ZE_PCI_PROPERTIES_EXT_NAME
+        ze_pci_ext_properties_t pci_property = {
+            .stype = ZE_STRUCTURE_TYPE_PCI_EXT_PROPERTIES,
+            .pNext = NULL,
+        };
+        ret = zeDevicePciGetPropertiesExt(ze_devices_handle[d], &pci_property);
+        if (ret == ZE_RESULT_SUCCESS) {
+            device_state->pci_avail = 1;
+            device_state->pci = pci_property.address;
+        } else {
+            device_state->pci_avail = 0;
+        }
+#endif
         MPL_free(queueProperties);
     }
 
@@ -1624,6 +1641,38 @@ int MPL_gpu_query_pointer_is_dev(const void *ptr, MPL_pointer_attr_t * attr)
     return type != ZE_MEMORY_TYPE_UNKNOWN;
 }
 
+/* Return 1 if the two global device ids refer to the same device, 0 otherwise. */
+int MPL_gpu_query_is_same_dev(int global_dev1, int global_dev2)
+{
+    int local_dev1, local_dev2;
+
+    assert(global_dev1 >= 0 && global_dev1 < global_ze_device_count);
+    assert(global_dev2 >= 0 && global_dev2 < global_ze_device_count);
+
+    local_dev1 = MPL_gpu_global_to_local_dev_id(global_dev1);
+    local_dev2 = MPL_gpu_global_to_local_dev_id(global_dev2);
+    /* an invisible device (local id -1) is never reported as the same device */
+    if (local_dev1 == -1 || local_dev2 == -1)
+        return 0;
+
+    /* devices/subdevices that share a root device are one device */
+    if (MPL_gpu_get_root_device(local_dev1) == MPL_gpu_get_root_device(local_dev2))
+        return 1;
+
+#ifdef ZE_PCI_PROPERTIES_EXT_NAME
+    /* different roots: compare the PCI addresses recorded in gpu_ze_init_driver();
+     * declared here so builds without the PCI extension have no unused locals */
+    MPL_ze_device_entry_t *device_state1 = device_states + local_dev1;
+    MPL_ze_device_entry_t *device_state2 = device_states + local_dev2;
+    if (device_state1->pci_avail && device_state2->pci_avail)
+        return device_state1->pci.domain == device_state2->pci.domain &&
+            device_state1->pci.bus == device_state2->pci.bus &&
+            device_state1->pci.device == device_state2->pci.device &&
+            device_state1->pci.function == device_state2->pci.function;
+#endif
+    return 0;
+}
+
 int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
 {
     int mpl_err = MPL_SUCCESS;
@@ -1825,6 +1874,8 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
     int orig_dev = dev;
     int engine = engine_conversion[engine_type];
 
+    assert(dev >= 0 && dev < local_ze_device_count);
+
     if (dest_ptr && src_ptr) {
         ret = get_next_event(&event);
         ZE_ERR_CHECK(ret);

From 4365b610857dc21c09a8ca6e87ed52667780b10d Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Thu, 14 Jul 2022 08:44:06 -0700
Subject: [PATCH 5/6] shm/ipc: add a strategy to choose engine type in IPC path

When two devices/subdevices are one device, a high bandwidth engine is
preferred; all other cases use a low latency engine. If either device
id is -1, the high bandwidth engine is used, as before this change.
---
 src/mpid/ch4/shm/ipc/src/ipc_p2p.h | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h
index a0225c43a7b..1cc48e8191d 100644
--- a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h
+++ b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h
@@ -151,6 +151,18 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_copy_data(MPIDI_IPC_hdr * ipc_hdr, MPIR_
     goto fn_exit;
 }
 
+/* Pick the GPU copy engine for an IPC copy between global devices dev1 and dev2. */
+MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_IPCI_choose_engine(int dev1, int dev2)
+{
+    /* an id of -1 cannot be passed to MPL_gpu_query_is_same_dev; use the high
+     * bandwidth engine, which this path used unconditionally before */
+    if (dev1 == -1 || dev2 == -1)
+        return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH;
+
+    return MPL_gpu_query_is_same_dev(dev1, dev2) ?
+        MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH : MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY;
+}
+
 MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr,
                                                         size_t src_data_sz,
                                                         MPIR_Request * sreq_ptr,
@@ -200,11 +212,12 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr,
         MPIR_ERR_CHECK(mpi_errno);
         /* copy */
         if (ipc_hdr->is_contig && dt_contig) {
-            mpi_errno = MPIR_Localcopy_gpu(src_buf, src_data_sz, MPI_BYTE, NULL,
-                                           MPIDIG_REQUEST(rreq, buffer),
-                                           MPIDIG_REQUEST(rreq, count),
-                                           MPIDIG_REQUEST(rreq, datatype), &attr,
-                                           MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH, true);
+            MPL_gpu_engine_type_t engine =
+                MPIDI_IPCI_choose_engine(ipc_hdr->ipc_handle.gpu.global_dev_id, dev_id);
+            mpi_errno =
+                MPIR_Localcopy_gpu(src_buf, src_data_sz, MPI_BYTE, NULL,
+                                   MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count),
+                                   MPIDIG_REQUEST(rreq, datatype), &attr, engine, true);
             MPIR_ERR_CHECK(mpi_errno);
         } else {
             /* TODO: get sender datatype and call MPIR_Typerep_op with mapped_device set to dev_id */

From 9a2664b2e9f642e2004a380d69e5a90b00a9240e Mon Sep 17 00:00:00 2001
From: Gengbin Zheng <gengbin.zheng@intel.com>
Date: Sat, 16 Jul 2022 23:33:44 -0700
Subject: [PATCH 6/6] mpl/gpu: delay creation of command queues until needed

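Only create the command queues for a device and engine type when they
are needed. The queue slots are zeroed at init time and filled the
first time a command list is committed in MPL_gpu_imemcpy.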
---
 src/mpl/src/gpu/mpl_gpu_ze.c | 121 +++++++++++++++++++++--------------
 1 file changed, 73 insertions(+), 48 deletions(-)

diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c
index 272b587f462..24267765d4c 100644
--- a/src/mpl/src/gpu/mpl_gpu_ze.c
+++ b/src/mpl/src/gpu/mpl_gpu_ze.c
@@ -59,10 +59,11 @@ typedef struct {
 } MPL_ze_engine_entry_t;
 
 typedef struct {
-    MPL_ze_engine_entry_t *engines;
+    int dev_id;
     unsigned int numQueueGroups;
-    ze_event_handle_t prev_event;       /* for imemcpy */
-    MPL_cmdlist_pool_t *last_cmdList_entry;     /* for imemcpy */
+    MPL_ze_engine_entry_t *engines;
+    ze_event_handle_t prev_event;       /* for imemcpy */
+    MPL_cmdlist_pool_t *last_cmdList_entry;     /* for imemcpy */
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
     ze_pci_address_ext_t pci;
     int pci_avail;
@@ -804,7 +805,7 @@ static int gpu_ze_init_driver(void)
     ret = zeDriverGet(&driver_count, all_drivers);
     ZE_ERR_CHECK(ret);
 
-    int i, j, d;
+    int i, d;
     /* Find a driver instance with a GPU device */
     for (i = 0; i < driver_count; ++i) {
         device_count = 0;
@@ -901,20 +902,10 @@ static int gpu_ze_init_driver(void)
         (MPL_ze_device_entry_t *) MPL_malloc(sizeof(MPL_ze_device_entry_t) * local_ze_device_count,
                                              MPL_MEM_OTHER);
 
-    /* create command queues */
     for (d = 0; d < local_ze_device_count; d++) {
         unsigned int numQueueGroups = 0;
-        ze_command_queue_desc_t cmdQueueDesc = {
-            .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
-            .pNext = NULL,
-            .index = 0,
-            .flags = 0,
-            .ordinal = 0,
-            .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
-            .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
-        };
-
         MPL_ze_device_entry_t *device_state = device_states + d;
+        device_state->dev_id = d;
         device_state->prev_event = NULL;
         device_state->last_cmdList_entry = NULL;
         ret = zeDeviceGetCommandQueueGroupProperties(ze_devices_handle[d], &numQueueGroups, NULL);
@@ -931,33 +922,25 @@ static int gpu_ze_init_driver(void)
         device_state->numQueueGroups = numQueueGroups;
 
         for (i = 0; i < numQueueGroups; i++) {
-            cmdQueueDesc.ordinal = -1;
+            int ordinal = -1;
             if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE) {
-                cmdQueueDesc.ordinal = i;
+                ordinal = i;
             } else if (queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COPY &&
                        queueProperties[i].numQueues >= 1 &&
                        !(queueProperties[i].flags & ZE_COMMAND_QUEUE_GROUP_PROPERTY_FLAG_COMPUTE)) {
-                cmdQueueDesc.ordinal = i;
+                ordinal = i;
             }
             device_state->engines[i].cmdList_pool = NULL;
-            if (cmdQueueDesc.ordinal == -1) {
-                device_state->engines[i].curQueue = 0;
-                device_state->engines[i].numQueues = 0;
-                device_state->engines[i].cmdQueues = NULL;
-            } else {
-                device_state->engines[i].numQueues = queueProperties[i].numQueues;
-                device_state->engines[i].curQueue = 0;
+            device_state->engines[i].curQueue = 0;
+            device_state->engines[i].numQueues = ordinal == -1 ? 0 : queueProperties[i].numQueues;
+            device_state->engines[i].cmdQueues = NULL;
+            if (device_state->engines[i].numQueues) {
                 device_state->engines[i].cmdQueues =
                     (ze_command_queue_handle_t *) MPL_malloc(sizeof(ze_command_queue_handle_t) *
-                                                             queueProperties[i].numQueues,
+                                                             device_state->engines[i].numQueues,
                                                              MPL_MEM_OTHER);
-                for (j = 0; j < queueProperties[i].numQueues; j++) {
-                    cmdQueueDesc.index = j;
-                    ret =
-                        zeCommandQueueCreate(ze_context, ze_devices_handle[d], &cmdQueueDesc,
-                                             &device_state->engines[i].cmdQueues[j]);
-                    ZE_ERR_CHECK(ret);
-                }
+                memset(device_state->engines[i].cmdQueues, 0,
+                       sizeof(ze_command_queue_handle_t) * device_state->engines[i].numQueues);
             }
         }
 #ifdef ZE_PCI_PROPERTIES_EXT_NAME
@@ -1181,15 +1164,16 @@ int MPL_gpu_finalize(void)
         for (j = 0; j < device_state->numQueueGroups; j++) {
             MPL_ze_engine_entry_t *engine = device_state->engines + j;
             for (k = 0; k < engine->numQueues; k++) {
-                zeCommandQueueDestroy(engine->cmdQueues[k]);
+                if (engine->cmdQueues[k])
+                    zeCommandQueueDestroy(engine->cmdQueues[k]);
             }
+            MPL_free(engine->cmdQueues);
             MPL_cmdlist_pool_t *cmdlist, *t, *pool = engine->cmdList_pool;
             DL_FOREACH_SAFE(pool, cmdlist, t) {
                 zeCommandListDestroy(cmdlist->cmdList);
                 DL_DELETE(pool, cmdlist);
                 MPL_free(cmdlist);
             }
-            MPL_free(engine->cmdQueues);
         }
         MPL_free(device_state->engines);
     }
@@ -1782,6 +1766,42 @@ int MPL_gpu_get_buffer_bounds(const void *ptr, void **pbase, uintptr_t * len)
 }
 
 /* command list utility functions - used for MPL_gpu_imemcpy */
+/* create the command queues of one engine type on a device; existing queues are kept */
+static int create_cmdqueue(int dev, int engine)
+{
+    int mpl_err = MPL_SUCCESS;
+    ze_result_t ret;
+    ze_command_queue_desc_t cmdQueueDesc = {
+        .stype = ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC,
+        .pNext = NULL,
+        .index = 0,
+        .flags = 0,
+        .ordinal = engine,
+        .mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS,
+        .priority = ZE_COMMAND_QUEUE_PRIORITY_NORMAL,
+    };
+
+    MPL_ze_device_entry_t *device_state = device_states + dev;
+    assert(engine < device_state->numQueueGroups);
+    MPL_ze_engine_entry_t *engine_state = device_state->engines + engine;
+    assert(engine_state->numQueues);
+
+    for (int i = 0; i < engine_state->numQueues; i++) {
+        if (engine_state->cmdQueues[i])
+            continue;           /* already created; recreating it would leak the handle */
+        cmdQueueDesc.index = i;
+        ret = zeCommandQueueCreate(ze_context, ze_devices_handle[dev], &cmdQueueDesc,
+                                   &engine_state->cmdQueues[i]);
+        ZE_ERR_CHECK(ret);
+    }
+
+  fn_exit:
+    return mpl_err;
+  fn_fail:
+    mpl_err = MPL_ERR_GPU_INTERNAL;
+    goto fn_exit;
+}
+
 static int get_cmdlist(int dev, int engine, MPL_cmdlist_pool_t ** cl_entry)
 {
     int mpl_err = MPL_SUCCESS;
@@ -1879,18 +1899,16 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
     if (dest_ptr && src_ptr) {
         ret = get_next_event(&event);
         ZE_ERR_CHECK(ret);
-        if (device_states[dev].last_cmdList_entry == NULL) {
+        if (device_states[orig_dev].last_cmdList_entry == NULL) {
             MPL_cmdlist_pool_t *cmdList_entry;
-            ret = get_cmdlist(dev, engine, &cmdList_entry);
+            ret = get_cmdlist(orig_dev, engine, &cmdList_entry);
             ZE_ERR_CHECK(ret);
-            cmdList = cmdList_entry->cmdList;
-            device_states[dev].last_cmdList_entry = cmdList_entry;
+            device_states[orig_dev].last_cmdList_entry = cmdList_entry;
             dev = cmdList_entry->dev;
-        } else {
-            cmdList = device_states[dev].last_cmdList_entry->cmdList;
-            if (device_states[dev].last_cmdList_entry->dev != dev)
-                goto fn_fail;
         }
+        cmdList = device_states[orig_dev].last_cmdList_entry->cmdList;
+        if (device_states[orig_dev].last_cmdList_entry->dev != dev)
+            goto fn_fail;
         assert(dev < local_ze_device_count);
         device_state = device_states + dev;
         ret =
@@ -1911,12 +1929,19 @@ int MPL_gpu_imemcpy(void *dest_ptr, void *src_ptr, size_t size, int dev,
     if (commit && cmdList) {
         ret = zeCommandListClose(cmdList);
         ZE_ERR_CHECK(ret);
-        ret =
-            zeCommandQueueExecuteCommandLists(device_state->
-                                              engines[engine].cmdQueues[device_state->
-                                                                        engines[engine].curQueue],
-                                              1, &cmdList, NULL);
+        int q_index = device_state->engines[engine].curQueue;
+        assert(device_state->engines[engine].cmdQueues);
+        ze_command_queue_handle_t cmdq = device_state->engines[engine].cmdQueues[q_index];
+        if (cmdq == NULL) {
+            mpl_err = create_cmdqueue(device_state->dev_id, engine);
+            if (mpl_err != MPL_SUCCESS)
+                goto fn_fail;
+            cmdq = device_state->engines[engine].cmdQueues[q_index];
+            assert(cmdq);
+        }
+        ret = zeCommandQueueExecuteCommandLists(cmdq, 1, &cmdList, NULL);
         ZE_ERR_CHECK(ret);
+        /* move to next queue */
         device_state->engines[engine].curQueue++;
         if (device_state->engines[engine].curQueue == device_state->engines[engine].numQueues)
             device_state->engines[engine].curQueue = 0;