
Add support for response sender in the default mode (#7311)
kthui authored Jun 6, 2024
1 parent 797d296 · commit 1f68c0d
Showing 10 changed files with 1,052 additions and 46 deletions.
qa/L0_backend_python/bls/test.sh (3 changes: 2 additions & 1 deletion)
@@ -100,7 +100,8 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then
echo "instance_group [ { kind: KIND_CPU} ]" >> models/libtorch_cpu/config.pbtxt

# Test with different sizes of CUDA memory pool
for CUDA_MEMORY_POOL_SIZE_MB in 64 128 ; do
# TODO: Why 256 worked in place of 128, on decoupled data pipeline?
for CUDA_MEMORY_POOL_SIZE_MB in 64 256 ; do
CUDA_MEMORY_POOL_SIZE_BYTES=$((CUDA_MEMORY_POOL_SIZE_MB * 1024 * 1024))
SERVER_ARGS="--model-repository=${MODELDIR}/bls/models --backend-directory=${BACKEND_DIR} --log-verbose=1 --cuda-memory-pool-byte-size=0:${CUDA_MEMORY_POOL_SIZE_BYTES}"
for TRIAL in non_decoupled decoupled ; do
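For reference, the loop above converts each pool size from megabytes to bytes before handing it to the server's `--cuda-memory-pool-byte-size` flag. A minimal sketch of the same arithmetic (the values mirror the 64 MB and 256 MB sizes used in the loop and are purely illustrative):

```python
# Sketch of the MB-to-bytes conversion behind --cuda-memory-pool-byte-size=0:<bytes>.
for pool_size_mb in (64, 256):
    pool_size_bytes = pool_size_mb * 1024 * 1024
    # 64 MB -> 67108864 bytes, 256 MB -> 268435456 bytes
    print(f"--cuda-memory-pool-byte-size=0:{pool_size_bytes}")
```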
qa/L0_backend_python/decoupled/decoupled_test.py (6 changes: 3 additions & 3 deletions)
@@ -243,12 +243,12 @@ def test_decoupled_return_response_error(self):
                 client.async_stream_infer(model_name=model_name, inputs=inputs)
                 data_item = user_data._completed_requests.get()
                 if type(data_item) == InferenceServerException:
-                    self.assertEqual(
-                        data_item.message(),
+                    self.assertIn(
                         "Python model 'decoupled_return_response_error_0_0' is using "
                         "the decoupled mode and the execute function must return "
                         "None.",
-                        "Exception message didn't match.",
+                        data_item.message(),
+                        "Exception message didn't show up.",
                     )

     def test_decoupled_send_after_close_error(self):
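The change above swaps `assertEqual` for `assertIn`, turning an exact comparison into a substring check, so the assertion still passes if the server adds detail (such as an instance name) around the expected text. A minimal, self-contained sketch of the pattern; the message below is hypothetical and stands in for `data_item.message()`:

```python
import unittest


class ErrorMessageCheckSketch(unittest.TestCase):
    def test_substring_match(self):
        # Hypothetical server error text; in the real test this comes from
        # data_item.message().
        full_message = (
            "Failed to process the request(s) for model 'example_model_0_0': "
            "the execute function must return None."
        )
        expected = "the execute function must return None."
        # assertIn(member, container, msg): passes when `expected` appears
        # anywhere inside `full_message`, unlike assertEqual's exact match.
        self.assertIn(expected, full_message, "Exception message didn't show up.")


if __name__ == "__main__":
    unittest.main()
```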
qa/L0_backend_python/lifecycle/lifecycle_test.py (41 changes: 1 addition & 40 deletions)
@@ -199,7 +199,7 @@ def test_infer_pymodel_error(self):
                     print(e.message())
                     self.assertTrue(
                         e.message().startswith(
-                            "Failed to process the request(s) for model instance"
+                            "Failed to process the request(s) for model "
                         ),
                         "Exception message is not correct",
                     )
@@ -208,45 +208,6 @@ def test_infer_pymodel_error(self):
False, "Wrong exception raised or did not raise an exception"
)

def test_incorrect_execute_return(self):
model_name = "execute_return_error"
shape = [1, 1]
with self._shm_leak_detector.Probe() as shm_probe:
with httpclient.InferenceServerClient(
f"{_tritonserver_ipaddr}:8000"
) as client:
input_data = (5 * np.random.randn(*shape)).astype(np.float32)
inputs = [
httpclient.InferInput(
"INPUT", input_data.shape, np_to_triton_dtype(input_data.dtype)
)
]
inputs[0].set_data_from_numpy(input_data)

# The first request to this model will return None.
with self.assertRaises(InferenceServerException) as e:
client.infer(model_name, inputs)

self.assertTrue(
"Failed to process the request(s) for model instance "
"'execute_return_error_0_0', message: Expected a list in the "
"execute return" in str(e.exception),
"Exception message is not correct.",
)

# The second inference request will return a list of None object
# instead of Python InferenceResponse objects.
with self.assertRaises(InferenceServerException) as e:
client.infer(model_name, inputs)

self.assertTrue(
"Failed to process the request(s) for model instance "
"'execute_return_error_0_0', message: Expected an "
"'InferenceResponse' object in the execute function return"
" list" in str(e.exception),
"Exception message is not correct.",
)


if __name__ == "__main__":
unittest.main()
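The deleted `test_incorrect_execute_return` enforced that a default-mode model's `execute` must return a list of `InferenceResponse` objects. Given the commit title, a default-mode model can apparently now reply through a response sender and return `None` instead. The sketch below illustrates that pattern as an assumption drawn from the commit title, not code taken from this change; the tensor names are illustrative.

```python
import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """Sketch of a default-mode (non-decoupled) model that answers each request
    through a response sender rather than returning a list of responses."""

    def execute(self, requests):
        for request in requests:
            # Echo the input back as the output; tensor names are illustrative.
            in_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT")
            out_tensor = pb_utils.Tensor("OUTPUT", in_tensor.as_numpy())
            response = pb_utils.InferenceResponse(output_tensors=[out_tensor])

            # Send exactly one response per request and mark it final.
            sender = request.get_response_sender()
            sender.send(response, flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)

        # When a response sender is used, execute returns None instead of a
        # list of InferenceResponse objects.
        return None
```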