From 85a658a799cc88907711b095ae3d440ea33cff07 Mon Sep 17 00:00:00 2001
From: kthui <18255193+kthui@users.noreply.github.com>
Date: Thu, 29 Feb 2024 14:12:57 -0800
Subject: [PATCH] Add test for max queue delay timeout prompt response

---
 qa/L0_batcher/queue_timeout_test.py | 88 +++++++++++++++++++++++++++++
 qa/L0_batcher/test.sh               | 40 ++++++++++++-
 2 files changed, 127 insertions(+), 1 deletion(-)
 create mode 100755 qa/L0_batcher/queue_timeout_test.py

diff --git a/qa/L0_batcher/queue_timeout_test.py b/qa/L0_batcher/queue_timeout_test.py
new file mode 100755
index 00000000000..cbe9fc2ca63
--- /dev/null
+++ b/qa/L0_batcher/queue_timeout_test.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import concurrent.futures
+import time
+import unittest
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException
+
+
+class TestMaxQueueDelayTimeout(unittest.TestCase):
+    def setUp(self):
+        # Initialize client
+        self._triton = grpcclient.InferenceServerClient("localhost:8001")
+
+    def _get_inputs(self, batch_size):
+        self.assertIsInstance(batch_size, int)
+        self.assertGreater(batch_size, 0)
+        shape = [batch_size, 8]
+        inputs = [grpcclient.InferInput("INPUT0", shape, "FP32")]
+        inputs[0].set_data_from_numpy(np.ones(shape, dtype=np.float32))
+        return inputs
+
+    def _generate_callback_and_response_pair(self):
+        response = {"responded": False, "result": None, "error": None}
+
+        def callback(result, error):
+            response["responded"] = True
+            response["result"] = result
+            response["error"] = error
+
+        return callback, response
+
+    # Test queued requests are rejected promptly once their queue timeout expires
+    def test_default_queue_policy_timeout_prompt_response(self):
+        model_name = "dynamic_batch"
+        with concurrent.futures.ThreadPoolExecutor() as pool:
+            # Saturate the slots on the model
+            saturate_thread = pool.submit(
+                self._triton.infer, model_name, self._get_inputs(batch_size=1)
+            )
+            time.sleep(2)  # ensure the slots are filled
+            # The next request should be queued
+            callback, response = self._generate_callback_and_response_pair()
+            queue_future = self._triton.async_infer(
+                model_name, self._get_inputs(batch_size=1), callback
+            )
+            time.sleep(2)  # ensure the request is queued
+            # Check if the request has timed-out
+            time.sleep(2)  # ensure the timeout period has expired
+            self.assertTrue(response["responded"])
+            self.assertEqual(response["result"], None)
+            self.assertIsInstance(response["error"], InferenceServerException)
+            self.assertEqual(response["error"].status(), "StatusCode.UNAVAILABLE")
+            self.assertEqual(response["error"].message(), "Request timeout expired")
+            # Join saturating thread
+            saturate_thread.result()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/qa/L0_batcher/test.sh b/qa/L0_batcher/test.sh
index c5f8819276e..dae674552d4 100755
--- a/qa/L0_batcher/test.sh
+++ b/qa/L0_batcher/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -736,6 +736,44 @@ if [[ "$(< /proc/sys/kernel/osrelease)" != *microsoft* ]]; then
     unset TRITONSERVER_DELAY_SCHEDULER
 fi
 
+# Test requests should be returned immediately upon timeout, without waiting for
+# the next slot to be available and then returned.
+rm -rf models && mkdir models
+mkdir -p models/dynamic_batch/1 && (cd models/dynamic_batch && \
+    echo 'backend: "identity"' >> config.pbtxt && \
+    echo 'max_batch_size: 1' >> config.pbtxt && \
+    echo -e 'input [{ name: "INPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo -e 'output [{ name: "OUTPUT0" \n data_type: TYPE_FP32 \n dims: [ -1 ] }]' >> config.pbtxt && \
+    echo -e 'instance_group [{ count: 1 \n kind: KIND_CPU }]' >> config.pbtxt && \
+    echo -e 'dynamic_batching {' >> config.pbtxt && \
+    echo -e '  preferred_batch_size: [ 1 ]' >> config.pbtxt && \
+    echo -e '  default_queue_policy { timeout_action: REJECT \n default_timeout_microseconds: 1000000 \n max_queue_size: 8 }' >> config.pbtxt && \
+    echo -e '}' >> config.pbtxt && \
+    echo -e 'parameters [{ key: "execute_delay_ms" \n value: { string_value: "8000" } }]' >> config.pbtxt)
+
+TEST_LOG="queue_timeout_test.log"
+SERVER_LOG="./queue_timeout_test.server.log"
+
+SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=2"
+run_server
+if [ "$SERVER_PID" == "0" ]; then
+    echo -e "\n***\n*** Failed to start $SERVER\n***"
+    cat $SERVER_LOG
+    exit 1
+fi
+
+set +e
+python queue_timeout_test.py > $TEST_LOG 2>&1
+if [ $? -ne 0 ]; then
+    echo -e "\n***\n*** Scheduler Tests Failed\n***"
+    cat $TEST_LOG
+    RET=1
+fi
+set -e
+
+kill $SERVER_PID
+wait $SERVER_PID
+
 if [ $RET -eq 0 ]; then
     echo -e "\n***\n*** Test Passed\n***"
 else