diff --git a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
index e17692ef56..0c877c8749 100755
--- a/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
+++ b/qa/L0_cuda_shared_memory/cuda_shared_memory_test.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -33,6 +33,7 @@
 import os
 import unittest
 
+import infer_util as iu
 import numpy as np
 import test_util as tu
 import tritonclient.grpc as grpcclient
@@ -42,6 +43,24 @@
 
 
 class CudaSharedMemoryTest(tu.TestResultCollector):
+    DEFAULT_SHM_BYTE_SIZE = 64
+
+    def setUp(self):
+        self._setup_client()
+
+    def _setup_client(self):
+        self.protocol = os.environ.get("CLIENT_TYPE", "http")
+        if self.protocol == "http":
+            self.url = "localhost:8000"
+            self.triton_client = httpclient.InferenceServerClient(
+                self.url, verbose=True
+            )
+        else:
+            self.url = "localhost:8001"
+            self.triton_client = grpcclient.InferenceServerClient(
+                self.url, verbose=True
+            )
+
     def test_invalid_create_shm(self):
         # Raises error since tried to create invalid cuda shared memory region
         try:
@@ -52,19 +71,15 @@ def test_invalid_create_shm(self):
     def test_valid_create_set_register(self):
         # Create a valid cuda shared memory region, fill data in it and register
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
         cshm.set_shared_memory_region(
             shm_op0_handle, [np.array([1, 2], dtype=np.float32)]
         )
-        triton_client.register_cuda_shared_memory(
+        self.triton_client.register_cuda_shared_memory(
             "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
         )
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 1)
         else:
             self.assertEqual(len(shm_status.regions), 1)
         cshm.destroy_shared_memory_region(shm_op0_handle)
@@ -72,14 +87,10 @@ def test_valid_create_set_register(self):
     def test_unregister_before_register(self):
         # Create a valid cuda shared memory region and unregister before register
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        triton_client.unregister_cuda_shared_memory("dummy_data")
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        self.triton_client.unregister_cuda_shared_memory("dummy_data")
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 0)
         else:
             self.assertEqual(len(shm_status.regions), 0)
         cshm.destroy_shared_memory_region(shm_op0_handle)
@@ -87,17 +98,13 @@ def test_unregister_before_register(self):
     def test_unregister_after_register(self):
         # Create a valid cuda shared memory region and unregister after register
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        triton_client.register_cuda_shared_memory(
+        self.triton_client.register_cuda_shared_memory(
             "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
         )
-        triton_client.unregister_cuda_shared_memory("dummy_data")
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        self.triton_client.unregister_cuda_shared_memory("dummy_data")
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 0)
         else:
             self.assertEqual(len(shm_status.regions), 0)
         cshm.destroy_shared_memory_region(shm_op0_handle)
@@ -105,54 +112,92 @@ def test_unregister_after_register(self):
     def test_reregister_after_register(self):
         # Create a valid cuda shared memory region and unregister after register
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
         shm_op0_handle = cshm.create_shared_memory_region("dummy_data", 8, 0)
-        triton_client.register_cuda_shared_memory(
+        self.triton_client.register_cuda_shared_memory(
             "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
         )
         try:
-            triton_client.register_cuda_shared_memory(
+            self.triton_client.register_cuda_shared_memory(
                 "dummy_data", cshm.get_raw_handle(shm_op0_handle), 0, 8
             )
         except Exception as ex:
             self.assertIn(
                 "shared memory region 'dummy_data' already in manager", str(ex)
             )
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 1)
         else:
             self.assertEqual(len(shm_status.regions), 1)
         cshm.destroy_shared_memory_region(shm_op0_handle)
 
-    def _configure_sever(self):
-        shm_ip0_handle = cshm.create_shared_memory_region("input0_data", 64, 0)
-        shm_ip1_handle = cshm.create_shared_memory_region("input1_data", 64, 0)
-        shm_op0_handle = cshm.create_shared_memory_region("output0_data", 64, 0)
-        shm_op1_handle = cshm.create_shared_memory_region("output1_data", 64, 0)
+    def _configure_server(
+        self,
+        create_byte_size=DEFAULT_SHM_BYTE_SIZE,
+        register_byte_size=DEFAULT_SHM_BYTE_SIZE,
+        device_id=0,
+    ):
+        """Creates and registers cuda shared memory regions for testing.
+
+        Parameters
+        ----------
+        create_byte_size: int
+            Size of each cuda shared memory region to create.
+            NOTE: This should be sufficiently large to hold the inputs/outputs
+            stored in shared memory.
+
+        register_byte_size: int
+            Size of each cuda shared memory region to register with server.
+            NOTE: The register_byte_size should be less than or equal
+            to the create_byte_size. Otherwise an exception will be raised for
+            an invalid set of registration args.
+
+        device_id: int
+            The GPU device ID of the cuda shared memory region to be created.
+ + """ + + shm_ip0_handle = cshm.create_shared_memory_region( + "input0_data", create_byte_size, device_id + ) + shm_ip1_handle = cshm.create_shared_memory_region( + "input1_data", create_byte_size, device_id + ) + shm_op0_handle = cshm.create_shared_memory_region( + "output0_data", create_byte_size, device_id + ) + shm_op1_handle = cshm.create_shared_memory_region( + "output1_data", create_byte_size, device_id + ) input0_data = np.arange(start=0, stop=16, dtype=np.int32) input1_data = np.ones(shape=16, dtype=np.int32) cshm.set_shared_memory_region(shm_ip0_handle, [input0_data]) cshm.set_shared_memory_region(shm_ip1_handle, [input1_data]) - if _protocol == "http": - triton_client = httpclient.InferenceServerClient(_url, verbose=True) - else: - triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - triton_client.register_cuda_shared_memory( - "input0_data", cshm.get_raw_handle(shm_ip0_handle), 0, 64 + + self.triton_client.register_cuda_shared_memory( + "input0_data", + cshm.get_raw_handle(shm_ip0_handle), + device_id, + register_byte_size, ) - triton_client.register_cuda_shared_memory( - "input1_data", cshm.get_raw_handle(shm_ip1_handle), 0, 64 + self.triton_client.register_cuda_shared_memory( + "input1_data", + cshm.get_raw_handle(shm_ip1_handle), + device_id, + register_byte_size, ) - triton_client.register_cuda_shared_memory( - "output0_data", cshm.get_raw_handle(shm_op0_handle), 0, 64 + self.triton_client.register_cuda_shared_memory( + "output0_data", + cshm.get_raw_handle(shm_op0_handle), + device_id, + register_byte_size, ) - triton_client.register_cuda_shared_memory( - "output1_data", cshm.get_raw_handle(shm_op1_handle), 0, 64 + self.triton_client.register_cuda_shared_memory( + "output1_data", + cshm.get_raw_handle(shm_op1_handle), + device_id, + register_byte_size, ) return [shm_ip0_handle, shm_ip1_handle, shm_op0_handle, shm_op1_handle] @@ -160,79 +205,27 @@ def _cleanup_server(self, shm_handles): for shm_handle in shm_handles: cshm.destroy_shared_memory_region(shm_handle) - def _basic_inference( - self, - shm_ip0_handle, - shm_ip1_handle, - shm_op0_handle, - shm_op1_handle, - error_msg, - big_shm_name="", - big_shm_size=64, - ): - input0_data = np.arange(start=0, stop=16, dtype=np.int32) - input1_data = np.ones(shape=16, dtype=np.int32) - inputs = [] - outputs = [] - if _protocol == "http": - triton_client = httpclient.InferenceServerClient(_url, verbose=True) - inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32")) - inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32")) - outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True)) - outputs.append( - httpclient.InferRequestedOutput("OUTPUT1", binary_data=False) - ) - else: - triton_client = grpcclient.InferenceServerClient(_url, verbose=True) - inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32")) - inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32")) - outputs.append(grpcclient.InferRequestedOutput("OUTPUT0")) - outputs.append(grpcclient.InferRequestedOutput("OUTPUT1")) - inputs[0].set_shared_memory("input0_data", 64) - if type(shm_ip1_handle) == np.array: - inputs[1].set_data_from_numpy(input0_data, binary_data=True) - elif big_shm_name != "": - inputs[1].set_shared_memory(big_shm_name, big_shm_size) - else: - inputs[1].set_shared_memory("input1_data", 64) - outputs[0].set_shared_memory("output0_data", 64) - outputs[1].set_shared_memory("output1_data", 64) - - try: - results = triton_client.infer( - "simple", inputs, model_version="", outputs=outputs 
-            )
-            output = results.get_output("OUTPUT0")
-            if _protocol == "http":
-                output_datatype = output["datatype"]
-                output_shape = output["shape"]
-            else:
-                output_datatype = output.datatype
-                output_shape = output.shape
-            output_dtype = triton_to_np_dtype(output_datatype)
-            output_data = cshm.get_contents_as_numpy(
-                shm_op0_handle, output_dtype, output_shape
-            )
-            self.assertTrue((output_data[0] == (input0_data + input1_data)).all())
-        except Exception as ex:
-            error_msg.append(str(ex))
-
     def test_unregister_after_inference(self):
         # Unregister after inference
         error_msg = []
-        shm_handles = self._configure_sever()
-        self._basic_inference(
-            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        shm_handles = self._configure_server()
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
+            shm_handles[0],
+            shm_handles[1],
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            protocol=self.protocol,
+            use_cuda_shared_memory=True,
         )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        triton_client.unregister_cuda_shared_memory("output0_data")
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+
+        self.triton_client.unregister_cuda_shared_memory("output0_data")
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 3)
         else:
             self.assertEqual(len(shm_status.regions), 3)
@@ -241,22 +234,26 @@ def test_unregister_after_inference(self):
     def test_register_after_inference(self):
         # Register after inference
         error_msg = []
-        shm_handles = self._configure_sever()
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        self._basic_inference(
-            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+        shm_handles = self._configure_server()
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
+            shm_handles[0],
+            shm_handles[1],
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            protocol=self.protocol,
+            use_cuda_shared_memory=True,
        )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 64, 0)
-        triton_client.register_cuda_shared_memory(
+        self.triton_client.register_cuda_shared_memory(
             "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 64
         )
-        shm_status = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        shm_status = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(shm_status), 5)
         else:
             self.assertEqual(len(shm_status.regions), 5)
@@ -266,23 +263,23 @@ def test_register_after_inference(self):
     def test_too_big_shm(self):
         # Shared memory input region larger than needed - Throws error
         error_msg = []
-        shm_handles = self._configure_sever()
+        shm_handles = self._configure_server()
         shm_ip2_handle = cshm.create_shared_memory_region("input2_data", 128, 0)
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        triton_client.register_cuda_shared_memory(
+        self.triton_client.register_cuda_shared_memory(
             "input2_data", cshm.get_raw_handle(shm_ip2_handle), 0, 128
         )
-        self._basic_inference(
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
             shm_handles[0],
             shm_ip2_handle,
             shm_handles[2],
             shm_handles[3],
             error_msg,
-            "input2_data",
-            128,
+            big_shm_name="input2_data",
+            big_shm_size=128,
+            protocol=self.protocol,
+            use_cuda_shared_memory=True,
         )
         if len(error_msg) > 0:
             self.assertIn(
@@ -295,40 +292,52 @@ def test_too_big_shm(self):
     def test_mixed_raw_shm(self):
         # Mix of shared memory and RAW inputs
         error_msg = []
-        shm_handles = self._configure_sever()
+        shm_handles = self._configure_server()
         input1_data = np.ones(shape=16, dtype=np.int32)
-        self._basic_inference(
-            shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
+            shm_handles[0],
+            [input1_data],
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            protocol=self.protocol,
+            use_cuda_shared_memory=True,
         )
+
         if len(error_msg) > 0:
             raise Exception(error_msg[-1])
         self._cleanup_server(shm_handles)
 
     def test_unregisterall(self):
         # Unregister all shared memory blocks
-        shm_handles = self._configure_sever()
-        if _protocol == "http":
-            triton_client = httpclient.InferenceServerClient(_url, verbose=True)
-        else:
-            triton_client = grpcclient.InferenceServerClient(_url, verbose=True)
-        status_before = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        shm_handles = self._configure_server()
+        status_before = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(status_before), 4)
         else:
             self.assertEqual(len(status_before.regions), 4)
-        triton_client.unregister_cuda_shared_memory()
-        status_after = triton_client.get_cuda_shared_memory_status()
-        if _protocol == "http":
+        self.triton_client.unregister_cuda_shared_memory()
+        status_after = self.triton_client.get_cuda_shared_memory_status()
+        if self.protocol == "http":
             self.assertEqual(len(status_after), 0)
         else:
             self.assertEqual(len(status_after.regions), 0)
         self._cleanup_server(shm_handles)
 
+    def test_register_out_of_bound(self):
+        create_byte_size = self.DEFAULT_SHM_BYTE_SIZE
+        # Verify various edge cases of registered region size don't go out of bounds of the actual created shm region's size.
+        with self.assertRaisesRegex(
+            InferenceServerException,
+            "failed to register shared memory region.*invalid args",
+        ):
+            self._configure_server(
+                create_byte_size=create_byte_size,
+                register_byte_size=create_byte_size + 1,
+            )
+
 
 if __name__ == "__main__":
-    _protocol = os.environ.get("CLIENT_TYPE", "http")
-    if _protocol == "http":
-        _url = "localhost:8000"
-    else:
-        _url = "localhost:8001"
     unittest.main()
diff --git a/qa/L0_cuda_shared_memory/test.sh b/qa/L0_cuda_shared_memory/test.sh
index b011244174..1daa9724d4 100755
--- a/qa/L0_cuda_shared_memory/test.sh
+++ b/qa/L0_cuda_shared_memory/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
+# Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -47,7 +47,8 @@ for i in \
         test_register_after_inference \
         test_too_big_shm \
         test_mixed_raw_shm \
-        test_unregisterall; do
+        test_unregisterall \
+        test_register_out_of_bound; do
     for client_type in http grpc; do
         SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
         SERVER_LOG="./$i.$client_type.server.log"
diff --git a/qa/L0_grpc/python_grpc_aio_test.py b/qa/L0_grpc/python_grpc_aio_test.py
index f342f19ad5..ba43b36abb 100755
--- a/qa/L0_grpc/python_grpc_aio_test.py
+++ b/qa/L0_grpc/python_grpc_aio_test.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python
-# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -113,7 +113,7 @@ async def test_get_cuda_shared_memory_status(self):
     async def test_register_cuda_shared_memory(self):
         with self.assertRaisesRegex(
             InferenceServerException,
-            "\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument",
+            "failed to register shared memory region.*invalid args",
         ):
             await self._triton_client.register_cuda_shared_memory("", b"", 0, 0)
diff --git a/qa/L0_shared_memory/shared_memory_test.py b/qa/L0_shared_memory/shared_memory_test.py
index 828c714ec6..ca2f2e6abe 100755
--- a/qa/L0_shared_memory/shared_memory_test.py
+++ b/qa/L0_shared_memory/shared_memory_test.py
@@ -33,6 +33,7 @@
 import os
 import unittest
 
+import infer_util as iu
 import numpy as np
 import test_util as tu
 import tritonclient.grpc as grpcclient
@@ -186,84 +187,20 @@ def _cleanup_server(self, shm_handles):
         for shm_handle in shm_handles:
             shm.destroy_shared_memory_region(shm_handle)
 
-    def _basic_inference(
-        self,
-        shm_ip0_handle,
-        shm_ip1_handle,
-        shm_op0_handle,
-        shm_op1_handle,
-        error_msg,
-        big_shm_name="",
-        big_shm_size=DEFAULT_SHM_BYTE_SIZE,
-        shm_output_offset=0,
-        shm_output_byte_size=DEFAULT_SHM_BYTE_SIZE,
-        default_shm_byte_size=DEFAULT_SHM_BYTE_SIZE,
-    ):
-        input0_data = np.arange(start=0, stop=16, dtype=np.int32)
-        input1_data = np.ones(shape=16, dtype=np.int32)
-        inputs = []
-        outputs = []
-        if self.protocol == "http":
-            inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
-            inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
-            outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
-            outputs.append(
-                httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)
-            )
-        else:
-            inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
-            inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
-            outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
-            outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))
-
-        inputs[0].set_shared_memory("input0_data", default_shm_byte_size)
-
-        if type(shm_ip1_handle) == np.array:
-            inputs[1].set_data_from_numpy(input0_data, binary_data=True)
-        elif big_shm_name != "":
-            inputs[1].set_shared_memory(big_shm_name, big_shm_size)
-        else:
-            inputs[1].set_shared_memory("input1_data", default_shm_byte_size)
-
-        outputs[0].set_shared_memory(
-            "output0_data", shm_output_byte_size, offset=shm_output_offset
-        )
-        outputs[1].set_shared_memory(
-            "output1_data", shm_output_byte_size, offset=shm_output_offset
-        )
-
-        try:
-            results = self.triton_client.infer(
-                "simple", inputs, model_version="", outputs=outputs
-            )
-            output = results.get_output("OUTPUT0")
-            if self.protocol == "http":
-                output_datatype = output["datatype"]
-                output_shape = output["shape"]
-            else:
-                output_datatype = output.datatype
-                output_shape = output.shape
-            output_dtype = utils.triton_to_np_dtype(output_datatype)
-            output_data = shm.get_contents_as_numpy(
-                shm_op0_handle, output_dtype, output_shape
-            )
-            self.assertTrue(
-                (output_data[0] == (input0_data + input1_data)).all(),
-                "Model output does not match expected output",
-            )
-        except Exception as ex:
-            error_msg.append(str(ex))
-
     def test_unregister_after_inference(self):
         # Unregister after inference
         error_msg = []
         shm_handles = self._configure_server()
-        self._basic_inference(
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
             shm_handles[0],
             shm_handles[1],
             shm_handles[2],
             shm_handles[3],
             error_msg,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
@@ -279,9 +216,19 @@ def test_register_after_inference(self):
         # Register after inference
         error_msg = []
         shm_handles = self._configure_server()
-        self._basic_inference(
-            shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg
+
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
+            shm_handles[0],
+            shm_handles[1],
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
+
         if len(error_msg) > 0:
             raise Exception(str(error_msg))
         shm_ip2_handle = shm.create_shared_memory_region(
@@ -308,14 +255,19 @@ def test_too_big_shm(self):
         self.triton_client.register_system_shared_memory(
             "input2_data", "/input2_data", 128
         )
-        self._basic_inference(
+
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
             shm_handles[0],
             shm_ip2_handle,
             shm_handles[2],
             shm_handles[3],
             error_msg,
-            "input2_data",
-            128,
+            big_shm_name="input2_data",
+            big_shm_size=128,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
         if len(error_msg) > 0:
             self.assertTrue(
@@ -330,8 +282,17 @@ def test_mixed_raw_shm(self):
         error_msg = []
         shm_handles = self._configure_server()
         input1_data = np.ones(shape=16, dtype=np.int32)
-        self._basic_inference(
-            shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg
+
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
+            shm_handles[0],
+            [input1_data],
+            shm_handles[2],
+            shm_handles[3],
+            error_msg,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
         if len(error_msg) > 0:
             raise Exception(error_msg[-1])
@@ -365,14 +326,20 @@ def test_infer_offset_out_of_bound(self):
         # gRPC will throw an error if > 2**63 - 1, so instead test for
         # exceeding shm region size by 1 byte, given its size is 64 bytes
         offset = 64
-        self._basic_inference(
+
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
             shm_handles[0],
             shm_handles[1],
             shm_handles[2],
             shm_handles[3],
             error_msg,
             shm_output_offset=offset,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
+
         self.assertEqual(len(error_msg), 1)
         self.assertIn("Invalid offset for shared memory region", error_msg[0])
         self._cleanup_server(shm_handles)
@@ -384,7 +351,9 @@ def test_infer_byte_size_out_of_bound(self):
         offset = 60
         byte_size = self.DEFAULT_SHM_BYTE_SIZE
 
-        self._basic_inference(
+        iu.shm_basic_infer(
+            self,
+            self.triton_client,
             shm_handles[0],
             shm_handles[1],
             shm_handles[2],
@@ -392,6 +361,8 @@
             error_msg,
             shm_output_offset=offset,
             shm_output_byte_size=byte_size,
+            protocol=self.protocol,
+            use_system_shared_memory=True,
         )
         self.assertEqual(len(error_msg), 1)
         self.assertIn(
diff --git a/qa/common/infer_util.py b/qa/common/infer_util.py
index 9a181c1d29..18512d9927 100755
--- a/qa/common/infer_util.py
+++ b/qa/common/infer_util.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -1306,3 +1306,90 @@ def infer_zero(
                 shm.destroy_shared_memory_region(shm_op_handles[io_num])
 
     return results
+
+
+# Perform basic inference for shared memory tests
+def shm_basic_infer(
+    tester,
+    triton_client,
+    shm_ip0_handle,
+    shm_ip1_handle,
+    shm_op0_handle,
+    shm_op1_handle,
+    error_msg,
+    big_shm_name="",
+    big_shm_size=64,
+    default_shm_byte_size=64,
+    shm_output_offset=0,
+    shm_output_byte_size=64,
+    protocol="http",
+    use_system_shared_memory=False,
+    use_cuda_shared_memory=False,
+):
+    # Lazy shm imports...
+    if use_system_shared_memory:
+        import tritonclient.utils.shared_memory as shm
+    elif use_cuda_shared_memory:
+        import tritonclient.utils.cuda_shared_memory as cudashm
+    else:
+        raise Exception("No shared memory type specified")
+
+    input0_data = np.arange(start=0, stop=16, dtype=np.int32)
+    input1_data = np.ones(shape=16, dtype=np.int32)
+    inputs = []
+    outputs = []
+    if protocol == "http":
+        inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
+        inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
+        outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False))
+    else:
+        inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
+        inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
+        outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
+        outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))
+
+    inputs[0].set_shared_memory("input0_data", default_shm_byte_size)
+
+    if type(shm_ip1_handle) == np.array:
+        inputs[1].set_data_from_numpy(input0_data, binary_data=True)
+    elif big_shm_name != "":
+        inputs[1].set_shared_memory(big_shm_name, big_shm_size)
+    else:
+        inputs[1].set_shared_memory("input1_data", default_shm_byte_size)
+
+    outputs[0].set_shared_memory(
+        "output0_data", shm_output_byte_size, offset=shm_output_offset
+    )
+    outputs[1].set_shared_memory(
+        "output1_data", shm_output_byte_size, offset=shm_output_offset
+    )
+
+    try:
+        results = triton_client.infer(
+            "simple", inputs, model_version="", outputs=outputs
+        )
+        output = results.get_output("OUTPUT0")
+        if protocol == "http":
+            output_datatype = output["datatype"]
+            output_shape = output["shape"]
+        else:
+            output_datatype = output.datatype
+            output_shape = output.shape
+        output_dtype = triton_to_np_dtype(output_datatype)
+
+        if use_system_shared_memory:
+            output_data = shm.get_contents_as_numpy(
+                shm_op0_handle, output_dtype, output_shape
+            )
+        elif use_cuda_shared_memory:
+            output_data = cudashm.get_contents_as_numpy(
+                shm_op0_handle, output_dtype, output_shape
+            )
+
+        tester.assertTrue(
+            (output_data[0] == (input0_data + input1_data)).all(),
+            "Model output does not match expected output",
+        )
+    except Exception as ex:
+        error_msg.append(str(ex))
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 783275d8d7..53c8add989 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -180,6 +180,7 @@ if(${TRITON_ENABLE_GPU})
   target_link_libraries(
     main
     PRIVATE
      CUDA::cudart
+      -lcuda
   )
 endif() # TRITON_ENABLE_GPU
diff --git a/src/shared_memory_manager.cc b/src/shared_memory_manager.cc
index 1064982669..8101a2e236 100644
--- a/src/shared_memory_manager.cc
+++ b/src/shared_memory_manager.cc
@@ -255,11 +255,57 @@ OpenCudaIPCRegion(
   cudaError_t err = cudaIpcOpenMemHandle(
       data_ptr, *cuda_shm_handle, cudaIpcMemLazyEnablePeerAccess);
   if (err != cudaSuccess) {
+    // Log detailed error message and send generic error to client
+    LOG_ERROR << "failed to open CUDA IPC handle: " << cudaGetErrorString(err);
     return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INTERNAL, std::string(
-                                         "failed to open CUDA IPC handle: " +
-                                         std::string(cudaGetErrorString(err)))
-                                         .c_str());
+        TRITONSERVER_ERROR_INVALID_ARG,
+        std::string("failed to register shared memory region: invalid args")
+            .c_str());
+  }
+
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+GetCudaSharedMemoryRegionSize(CUdeviceptr data_ptr, size_t& shm_region_size)
+{
+  CUdeviceptr* base = nullptr;
+  CUresult result = cuMemGetAddressRange(base, &shm_region_size, data_ptr);
+  if (result != CUDA_SUCCESS) {
+    const char* errorString;
+    if (cuGetErrorString(result, &errorString) != CUDA_SUCCESS) {
+      return TRITONSERVER_ErrorNew(
+          TRITONSERVER_ERROR_INTERNAL, "Failed to get CUDA error string");
+    }
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INTERNAL,
+        std::string(
+            "Failed to get CUDA address range: " + std::string(errorString))
+            .c_str());
+  }
+  return nullptr;
+}
+
+TRITONSERVER_Error*
+CheckCudaSharedMemoryRegionSize(
+    const std::string& name, CUdeviceptr data_ptr, size_t byte_size)
+{
+  size_t shm_region_size = 0;
+  auto err = GetCudaSharedMemoryRegionSize(data_ptr, shm_region_size);
+
+  // User-provided offset and byte_size should not go out-of-bounds.
+  if (err != nullptr || byte_size > shm_region_size) {
+    if (err != nullptr) {
+      // Log detailed error message and send generic error to client
+      LOG_ERROR << TRITONSERVER_ErrorMessage(err);
+      TRITONSERVER_ErrorDelete(err);
+    }
+    return TRITONSERVER_ErrorNew(
+        TRITONSERVER_ERROR_INVALID_ARG,
+        std::string(
+            "failed to register shared memory region '" + name +
+            "': invalid args")
+            .c_str());
   }
 
   return nullptr;
@@ -363,16 +409,11 @@ SharedMemoryManager::RegisterCUDASharedMemory(
   void* mapped_addr;
 
   // Get CUDA shared memory base address
-  TRITONSERVER_Error* err =
-      OpenCudaIPCRegion(cuda_shm_handle, &mapped_addr, device_id);
-  if (err != nullptr) {
-    return TRITONSERVER_ErrorNew(
-        TRITONSERVER_ERROR_INVALID_ARG,
-        std::string(
-            "failed to register CUDA shared memory region '" + name +
-            "': " + TRITONSERVER_ErrorMessage(err))
-            .c_str());
-  }
+  RETURN_IF_ERR(OpenCudaIPCRegion(cuda_shm_handle, &mapped_addr, device_id));
+
+  // Enforce that registered region is in-bounds of shm file object.
+  RETURN_IF_ERR(CheckCudaSharedMemoryRegionSize(
+      name, reinterpret_cast<CUdeviceptr>(mapped_addr), byte_size));
 
   shared_memory_map_.insert(std::make_pair(
       name, std::unique_ptr<SharedMemoryInfo>(new CUDASharedMemoryInfo(
diff --git a/src/shared_memory_manager.h b/src/shared_memory_manager.h
index f079308bd5..51eb0f0786 100644
--- a/src/shared_memory_manager.h
+++ b/src/shared_memory_manager.h
@@ -39,6 +39,7 @@
 #include "triton/common/triton_json.h"
 
 #ifdef TRITON_ENABLE_GPU
+#include <cuda.h>
 #include <cuda_runtime_api.h>
 #endif  // TRITON_ENABLE_GPU
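
Note (not part of the patch): a minimal client-side sketch of the new out-of-bounds behavior. It assumes a Triton server built with this change is running at localhost:8000 (the HTTP default used by the tests above) and that the tritonclient package with the CUDA shared memory utilities is installed; the region name and sizes are illustrative only, mirroring test_register_out_of_bound.

# Sketch: registering more bytes than the underlying CUDA allocation holds
# should now fail with the generic "invalid args" error; the detailed
# cuMemGetAddressRange failure is only written to the server log.
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cshm
from tritonclient.utils import InferenceServerException

client = httpclient.InferenceServerClient("localhost:8000")

# Create a 64-byte region on GPU 0, then try to register 65 bytes of it.
handle = cshm.create_shared_memory_region("oob_data", 64, 0)
try:
    client.register_cuda_shared_memory(
        "oob_data", cshm.get_raw_handle(handle), 0, 64 + 1
    )
except InferenceServerException as ex:
    # Client only sees the sanitized message introduced by this change.
    assert "failed to register shared memory region" in str(ex)
finally:
    cshm.destroy_shared_memory_region(handle)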