Merge branch 'triton-inference-server:main' into improve-k8sonprem-chart
okyspace authored Feb 22, 2024
2 parents 62f26f9 + 5732163 commit 45e7a82
Showing 4 changed files with 128 additions and 46 deletions.
52 changes: 17 additions & 35 deletions qa/L0_memory_growth/busy_op_test.py
@@ -1,6 +1,6 @@
#!/usr/bin/python

# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -30,9 +30,8 @@
from builtins import range

import numpy as np
import tritongrpcclient as grpcclient
import tritonhttpclient as httpclient
from tritonclientutils import np_to_triton_dtype
import tritonclient.http as httpclient
from tritonclient.utils import np_to_triton_dtype

FLAGS = None

@@ -54,15 +53,6 @@
default="localhost:8000",
help="Inference server URL. Default is localhost:8000.",
)
parser.add_argument(
"-i",
"--protocol",
type=str,
required=False,
default="http",
help='Protocol ("http"/"grpc") used to '
+ 'communicate with inference service. Default is "http".',
)
parser.add_argument("-m", "--model", type=str, required=True, help="Name of model.")
parser.add_argument(
"-n",
@@ -71,47 +61,39 @@
required=True,
help="Number of asynchronous requests to launch.",
)
parser.add_argument(
"-d",
"--delay",
type=int,
required=True,
help="Number of delay cycles to use as input to model.",
)

FLAGS = parser.parse_args()
if (FLAGS.protocol != "http") and (FLAGS.protocol != "grpc"):
print(
'unexpected protocol "{}", expects "http" or "grpc"'.format(FLAGS.protocol)
)
exit(1)

client_util = httpclient if FLAGS.protocol == "http" else grpcclient

# Run the busyop model which takes a delay as input.
model_name = FLAGS.model

# Create the inference context for the model.
client = client_util.InferenceServerClient(FLAGS.url, verbose=FLAGS.verbose)
# Create the inference context for the model. The client concurrency needs to
# be set based on the number of requests so that delivery of the async
# requests is not blocked.
# See this comment for more details: https://github.com/triton-inference-server/client/blob/r24.02/src/python/library/tritonclient/http/_client.py#L1501
client = httpclient.InferenceServerClient(
FLAGS.url, verbose=FLAGS.verbose, concurrency=FLAGS.num_requests
)

# Collect async requests here
requests = []

# Create the data for the one input tensor
input_data = np.array([FLAGS.delay], dtype=np.int32)
# Create the data for the input tensor: 5M (5 * 1024 * 1024) FP32 elements, ~20 MB per request.
tensor_size = [1, 5 * 1024 * 1024]
input_data = np.random.randn(*tensor_size).astype(np.float32)

inputs = [
client_util.InferInput(
"in", input_data.shape, np_to_triton_dtype(input_data.dtype)
httpclient.InferInput(
"INPUT0", input_data.shape, np_to_triton_dtype(input_data.dtype)
)
]
inputs[0].set_data_from_numpy(input_data)

# Send requests
for i in range(FLAGS.num_requests):
requests.append(client.async_infer(model_name, inputs))
print("Sent request %d" % i)
print("Sent request %d" % i, flush=True)
# wait for requests to finish
for i in range(len(requests)):
requests[i].get_result()
print("Received result %d" % i)
print("Received result %d" % i, flush=True)
21 changes: 10 additions & 11 deletions qa/L0_memory_growth/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2020-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -213,20 +213,18 @@ for MODEL in $(ls models); do
set -e
done

# Next perform a test that has unbound memory growth. Use the busy op model
# with a high delay in order to force requests to sit in the queue, and result
# Next perform a test that has unbounded memory growth. Use the busy op Python model
# with a sleep function in order to force requests to sit in the queue, resulting
# in memory growth.
BUSY_OP_TEST=busy_op_test.py
DELAY_CYCLES=2100000000
NUM_REQUESTS=100

rm -rf test_repo && mkdir test_repo
cp -r ${DATADIR}/qa_custom_ops/tf_custom_ops/graphdef_busyop test_repo/
mkdir -p test_repo/busy_op/1/
cp ../python_models/busy_op/model.py test_repo/busy_op/1/
cp ../python_models/busy_op/config.pbtxt test_repo/busy_op

# Explicitly set library path so custom ops can find TF
LD_LIBRARY_PATH=/opt/tritonserver/backends/tensorflow:$LD_LIBRARY_PATH
SERVER_ARGS="--model-repository=`pwd`/test_repo"
SERVER_LD_PRELOAD="${DATADIR}/qa_custom_ops/tf_custom_ops/libbusyop.so"

LEAKCHECK_LOG="test_busyop.valgrind.log"
MASSIF_LOG="test_busyop.massif"
@@ -254,12 +252,12 @@ set +e
# Run the busy_op test if no PTX issue was observed when launching server
if [ $SKIP_BUSYOP -ne 1 ]; then
SECONDS=0
python $BUSY_OP_TEST -v -m graphdef_busyop -d $DELAY_CYCLES -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
python $BUSY_OP_TEST -v -m busy_op -n $NUM_REQUESTS > $CLIENT_LOG 2>&1
TEST_RETCODE=$?
TEST_DURATION=$SECONDS
if [ ${TEST_RETCODE} -ne 0 ]; then
cat $CLIENT_LOG
echo -e "\n***\n*** Test graphdef_busyop Failed\n***"
echo -e "\n***\n*** busy_op_test.py Failed\n***"
RET=1
fi
set -e
@@ -291,12 +289,13 @@ if [ $SKIP_BUSYOP -ne 1 ]; then
# Always output memory usage for easier triage of MAX_ALLOWED_ALLOC settings in the future
grep -i "Change in memory allocation" "${CLIENT_LOG}" || true
fi

set -e

if [ $RET -eq 0 ]; then
echo -e "\n***\n*** Test Passed\n***"
else
echo -e "\n***\n*** Test FAILED\n***"
echo -e "\n***\n*** Test Failed\n***"
fi

# Run only if both TRITON_FROM and TRITON_TO_DL are set
52 changes: 52 additions & 0 deletions qa/python_models/busy_op/config.pbtxt
@@ -0,0 +1,52 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

name: "busy_op"
backend: "python"
max_batch_size: 1

input [
  {
    name: "INPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]

output [
  {
    name: "OUTPUT0"
    data_type: TYPE_FP32
    dims: [ -1 ]
  }
]

instance_group [
  {
    count: 1
    kind: KIND_CPU
  }
]
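
A note on the shapes above: with `max_batch_size: 1`, the `dims` entries describe each tensor without the leading batch dimension, so the client's `[1, 5*1024*1024]` FP32 input (a batch of one, variable-length second axis) satisfies INPUT0, and OUTPUT0 mirrors it. The snippet below is only an illustrative sketch of that reading of the config, not Triton's actual validation code.

```python
import numpy as np

MAX_BATCH_SIZE = 1
CONFIG_DIMS = [-1]  # per-tensor dims from config.pbtxt, batch dimension excluded


def matches_config(shape):
    """True if a client-side tensor shape fits max_batch_size plus dims."""
    batch, *rest = shape
    if not (1 <= batch <= MAX_BATCH_SIZE) or len(rest) != len(CONFIG_DIMS):
        return False
    return all(d == -1 or d == s for d, s in zip(CONFIG_DIMS, rest))


print(matches_config(np.zeros((1, 5 * 1024 * 1024), np.float32).shape))  # True
print(matches_config((2, 10)))   # False: batch exceeds max_batch_size
print(matches_config((1,)))      # False: missing the variable-length axis
```
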
49 changes: 49 additions & 0 deletions qa/python_models/busy_op/model.py
@@ -0,0 +1,49 @@
# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import time

import triton_python_backend_utils as pb_utils


class TritonPythonModel:
    """
    This model sleeps while handling the first request in order to force
    subsequent requests to sit in the queue, resulting in memory growth.
    """

    def initialize(self, args):
        self.sleep = True

    def execute(self, requests):
        if self.sleep:
            time.sleep(50)
            self.sleep = False
        responses = []
        for request in requests:
            input_tensor = pb_utils.get_input_tensor_by_name(request, "INPUT0")
            out_tensor = pb_utils.Tensor("OUTPUT0", input_tensor.as_numpy())
            responses.append(pb_utils.InferenceResponse([out_tensor]))
        return responses
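
A rough back-of-the-envelope view of why the 50-second sleep above translates into measurable server-side growth (an assumption about the mechanism based on the client and test.sh values, not something the test itself asserts): while the first request sleeps, the remaining async requests queue up, each carrying the ~20 MB FP32 payload sent by busy_op_test.py.

```python
# Rough arithmetic using the values from busy_op_test.py and test.sh.
num_requests = 100          # NUM_REQUESTS in test.sh
elements = 5 * 1024 * 1024  # per-request tensor size in busy_op_test.py
bytes_per_element = 4       # FP32

per_request_mb = elements * bytes_per_element / (1024 * 1024)
queued_gb = (num_requests - 1) * per_request_mb / 1024
print(f"~{per_request_mb:.0f} MB per request, "
      f"~{queued_gb:.1f} GB queued while the first request sleeps")
# -> ~20 MB per request, ~1.9 GB queued while the first request sleeps
```
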
