Merge branch 'triton-inference-server:main' into improve-k8sonprem-chart
okyspace authored Feb 22, 2024
2 parents 62f26f9 + 5732163 commit 45e7a82
Showing 4 changed files with 128 additions and 46 deletions.
52 changes: 17 additions & 35 deletions qa/L0_memory_growth/busy_op_test.py
@@ -1,6 +1,6 @@
#!/usr/bin/python

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -30,9 +30,8 @@
from builtins import range

import numpy as np
import tritongrpcclient as grpcclient
import tritonhttpclient as httpclient
from tritonclientutils import np_to_triton_dtype
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

FLAGS = None

@@ -54,15 +53,6 @@
default="localhost:8000",
help="Inference server URL. Default is localhost:8000.",
)
parser.add_argument(
"-i",
"--protocol",
type=str,
required=False,
default="http",
help='Protocol ("http"/"grpc") used to '
+ 'communicate with inference service. Default is "http".',
)
parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
parser.add_argument(
"-n",
@@ -71,47 +61,39 @@
required=True,
help="Number of asynchronous requests to launch.",
)
parser.add_argument(
"-d",
"--delay",
type=int,
required=True,
help="Number of delay cycles to use as input to model.",
)

FLAGS = parser.parse_args()
if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
print(
'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
)
exit(1)

client_util = httpclient if FLAGS.protocol == "http" else grpcclient

# Run the busyop model which takes a delay as input.
model_name = FLAGS.model

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)
# Create the inference context for the model. The client concurrency needs to
# be set based on the number of requests so that delivery of the async
# requests is not blocked.
# See this comment for more details: https://github.com/triton-inference-server/client/blob/r24.02/src/python/library/tritonclient/http/_client.py#L1501
client = httpclient.InferenceServerClient(
FLAGS.url, verbose=FLAGS.verbose, concurrency=FLAGS.num_requests
)

# Collect async requests here
requests = []

# Create the data for the one input tensor
input_data = np.array([FLAGS.delay], dtype=np.int32)
# Create the data for the input tensor: 5M (5 * 1024 * 1024) FP32 elements, ~20 MB per request.
tensor_size = [1, 5 * 1024 * 1024]
input_data = np.random.randn(*tensor_size).astype(np.float32)

inputs = [
client_util.InferInput(
"in", input_data.shape, np_to_triton_dtype(input_data.dtype)
httpclient.InferInput(
"INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype)
)
]
inputs[0].set_data_from_numpy(input_data)

# Send requests
for i in range(FLAGS.num_requests):
requests.append(client.async_infer(model_name, inputs))
print("Sent request %d" % i)
print("Sent request %d" % i, flush=True)
# wait for requests to finish
for i in range(len(requests)):
requests[i].get_result()
print("Received result %d" % i)
print("Received result %d" % i, flush=True)
21 changes: 10 additions & 11 deletions qa/L0_memory_growth/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -213,20 +213,18 @@ for MODEL in $(ls models); do
set -e
done

# Next perform a test that has unbound memory growth. Use the busy op model
# with a high delay in order to force requests to sit in the queue, and result
# Next perform a test that has unbounded memory growth. Use the busy op Python model
# with a sleep function in order to force requests to sit in the queue, resulting
# in memory growth.
BUSY_OP_TEST=busy_op_test.py
DELAY_CYCLES=2100000000
NUM_REQUESTS=100

rm -rf test_repo && mkdir test_repo
cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/
mkdir -p test_repo/busy_op/1/
cp ../python_models/busy_op/model.py test_repo/busy_op/1/
cp ../python_models/busy_op/config.pbtxt test_repo/busy_op

# Explicitly set library path so custom ops can find TF
LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow:$LD_LIBRARY_PATH
SERVER_ARGS="--model-repository=`pwd`/test_repo"
SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

LEAKCHECK_LOG="test_busyop.valgrind.log"
MASSIF_LOG="test_busyop.massif"
@@ -254,12 +252,12 @@ set +e
# Run the busy_op test if no PTX issue was observed when launching server
if [ $SKIP_BUSYOP -ne 1 ]; then
SECONDS=0
python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
python $BUSY_OP_TEST -v -m busy_op -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
TEST_RETCODE=$?
TEST_DURATION=$SECONDS
if [ ${TEST_RETCODE} -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
echo -e "\n***\n*** busy_op_test.py Failed\n***"
RET=1
fi
set -e
@@ -291,12 +289,13 @@ if [ $SKIP_BUSYOP -ne 1 ]; then
# Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future
grep -i "Change in memory allocation" "${CLIENT_LOG}" || true
fi

set -e

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
echo -e "\n***\n*** Test Failed\n***"
fi

# Run only if both TRITON_FROM and TRITON_TO_DL are set
52 changes: 52 additions & 0 deletions qa/python_models/busy_op/config.pbtxt
@@ -0,0 +1,52 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "busy_op"
backend: "python"
max_batch_size: 1

input [
  {
    name: "INPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]

output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
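
A note on the shapes above: with `max_batch_size: 1`, the `dims` entries describe each tensor without the leading batch dimension, so the client's `[1, 5*1024*1024]` FP32 input (a batch of one, variable-length second axis) satisfies INPUT0, and OUTPUT0 mirrors it. The snippet below is only an illustrative sketch of that reading of the config, not Triton's actual validation code.

```python
import numpy as np

MAX_BATCH_SIZE = 1
CONFIG_DIMS = [-1]  # per-tensor dims from config.pbtxt, batch dimension excluded


def matches_config(shape):
    """True if a client-side tensor shape fits max_batch_size plus dims."""
    batch, *rest = shape
    if not (1 <= batch <= MAX_BATCH_SIZE) or len(rest) != len(CONFIG_DIMS):
        return False
    return all(d == -1 or d == s for d, s in zip(CONFIG_DIMS, rest))


print(matches_config(np.zeros((1, 5 * 1024 * 1024), np.float32).shape))  # True
print(matches_config((2, 10)))   # False: batch exceeds max_batch_size
print(matches_config((1,)))      # False: missing the variable-length axis
```
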
49 changes: 49 additions & 0 deletions qa/python_models/busy_op/model.py
@@ -0,0 +1,49 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import time

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """
    This model sleeps while handling the first request in order to force
    subsequent requests to sit in the queue, resulting in memory growth.
    """

    def initialize(self, args):
        self.sleep = True

    def execute(self, requests):
        if self.sleep:
            time.sleep(50)
            self.sleep = False
        responses = []
        for request in requests:
            input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
            responses.append(pb_utils.InferenceResponse([out_tensor]))
        return responses
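
A rough back-of-the-envelope view of why the 50-second sleep above translates into measurable server-side growth (an assumption about the mechanism based on the client and test.sh values, not something the test itself asserts): while the first request sleeps, the remaining async requests queue up, each carrying the ~20 MB FP32 payload sent by busy_op_test.py.

```python
# Rough arithmetic using the values from busy_op_test.py and test.sh.
num_requests = 100          # NUM_REQUESTS in test.sh
elements = 5 * 1024 * 1024  # per-request tensor size in busy_op_test.py
bytes_per_element = 4       # FP32

per_request_mb = elements * bytes_per_element / (1024 * 1024)
queued_gb = (num_requests - 1) * per_request_mb / 1024
print(f"~{per_request_mb:.0f} MB per request, "
      f"~{queued_gb:.1f} GB queued while the first request sleeps")
# -> ~20 MB per request, ~1.9 GB queued while the first request sleeps
```
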
