Skip to content

Commit

Permalink
ci: Return custom exit code to indicate known shm leak failure in L0_…
Browse files Browse the repository at this point in the history
…backend_python bls test (#7485)
  • Loading branch information
krishung5 authored Jul 31, 2024
1 parent 41844a8 commit e181662
Show file tree
Hide file tree
Showing 10 changed files with 81 additions and 45 deletions.
2 changes: 1 addition & 1 deletion qa/L0_backend_python/argument_validation/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./arg_validation_client.log"
TEST_RESULT_FILE='test_results.txt'
SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
Expand Down
41 changes: 23 additions & 18 deletions qa/L0_backend_python/bls/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./bls_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh

TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server}

RET=0
rm -fr *.log ./models *.txt
rm -fr *.log ./models *.txt *.xml

# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU
if [[ ${TEST_WINDOWS} == 0 ]]; then
Expand Down Expand Up @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then

for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do
export MODEL_NAME=${MODEL_NAME}

python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
# Run with pytest to capture the return code correctly
pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***"
RET=$EXIT_CODE
cat $SERVER_LOG
cat $CLIENT_LOG
RET=1
fi
done

set -e

kill_server

# Check for bls 'test_timeout' to ensure timeout value is being correctly passed
if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
cat $SERVER_LOG
RET=1
set -e

# Only check the timeout value if there was no error, since the test
# may fail before the test_timeout case is run.
if [ $RET -eq 0 ]; then
# Check for bls 'test_timeout' to ensure timeout value is being correctly passed
if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
cat $SERVER_LOG
RET=1
fi
fi

if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then
if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then
if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then
echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***"
echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***"
cat $SERVER_LOG
RET=1
fi
Expand Down Expand Up @@ -342,10 +347,10 @@ set -e

kill_server

if [ $RET -eq 1 ]; then
echo -e "\n***\n*** BLS test FAILED. \n***"
else
if [ $RET -eq 0 ]; then
echo -e "\n***\n*** BLS test PASSED. \n***"
else
echo -e "\n***\n*** BLS test FAILED. \n***"
fi

exit $RET
2 changes: 1 addition & 1 deletion qa/L0_backend_python/custom_metrics/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./custom_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh
Expand Down
2 changes: 1 addition & 1 deletion qa/L0_backend_python/request_rescheduling/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY="../python_unittest.py"
CLIENT_PY="../test_infer_shm_leak.py"
CLIENT_LOG="./request_rescheduling_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh
Expand Down
2 changes: 1 addition & 1 deletion qa/L0_backend_python/setup_python_enviroment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \
libboost-dev
rm -f /usr/bin/python3 && \
ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3
pip3 install --upgrade install requests numpy virtualenv protobuf
pip3 install --upgrade requests numpy virtualenv protobuf
find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \
"tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
xargs pip3 install --upgrade
Expand Down
29 changes: 26 additions & 3 deletions qa/L0_backend_python/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then
# between dependencies.
setup_virtualenv

set +e
(cd ${TEST} && bash -ex test.sh)
if [ $? -ne 0 ]; then
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo "Subtest ${TEST} FAILED"
RET=1
RET=$EXIT_CODE

# In the bls test, a known shared memory leak of exactly 480 bytes is allowed; it is reported with exit code '123'.
# Propagate that exit code so it is not overwritten by other subtests.
if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then
BLS_RET=$RET
fi
fi
set -e

deactivate_virtualenv
done
Expand All @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then
if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then
# In 'env' test we use miniconda for dependency management. No need to run
# the test in a virtual environment.
set +e
(cd env && bash -ex test.sh)
if [ $? -ne 0 ]; then
echo "Subtest env FAILED"
RET=1
fi
set -e
fi
fi

Expand All @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do
# between dependencies.
setup_virtualenv

set +e
(cd ${TEST} && bash -ex test.sh)

if [ $? -ne 0 ]; then
echo "Subtest ${TEST} FAILED"
RET=1
fi
set -e

deactivate_virtualenv
done
Expand All @@ -475,4 +488,14 @@ else
echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
# Exit with RET if it is 1, meaning that the test failed.
# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured.
if [ $RET -eq 1 ]; then
exit $RET
else
if [ -z "$BLS_RET" ]; then
exit $RET
else
exit $BLS_RET
fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import os
import unittest

import pytest
import shm_util
import tritonclient.grpc as grpcclient
from tritonclient.utils import *
Expand All @@ -41,11 +42,13 @@
# we overwrite the IP address with the TRITONSERVER_IPADDR envvar
_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

# Exit code 123 indicates that the shm leak probe detected the known 480-byte
# leak in the bls sub-test. Any leak of a different size causes the test to
# fail with the default exit code 1.
ALLOWED_FAILURE_EXIT_CODE = 123

class PythonUnittest(unittest.TestCase):
def setUp(self):
self._shm_leak_detector = shm_util.ShmLeakDetector()

class TestInferShmLeak:
def _run_unittest(self, model_name):
with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client:
# No input is required
Expand All @@ -54,15 +57,17 @@ def _run_unittest(self, model_name):

# The model returns 1 if the tests were successfully passed.
# Otherwise, it will return 0.
self.assertEqual(
output0, [1], f"python_unittest failed for model {model_name}"
)

def test_python_unittest(self):
model_name = os.environ["MODEL_NAME"]
with self._shm_leak_detector.Probe() as shm_probe:
self._run_unittest(model_name)
assert output0 == [1], f"python_unittest failed for model {model_name}"

def test_shm_leak(self):
self._shm_leak_detector = shm_util.ShmLeakDetector()
model_name = os.environ.get("MODEL_NAME", "default_model")

if __name__ == "__main__":
unittest.main()
try:
with self._shm_leak_detector.Probe() as shm_probe:
self._run_unittest(model_name)
except AssertionError as e:
if "Known shared memory leak of 480 bytes detected" in str(e):
pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE)
else:
raise e
6 changes: 3 additions & 3 deletions qa/L0_dlpack_multi_gpu/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

SERVER=/opt/tritonserver/bin/tritonserver
SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
CLIENT_PY=./python_unittest.py
CLIENT_PY=./test_infer_shm_leak.py
CLIENT_LOG="./client.log"
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'
Expand All @@ -52,8 +52,8 @@ rm -fr *.log ./models
mkdir -p models/dlpack_test/1/
cp ../python_models/dlpack_test/model.py models/dlpack_test/1/
cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test
cp ../L0_backend_python/python_unittest.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py
cp ../L0_backend_python/test_infer_shm_leak.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py

run_server
if [ "$SERVER_PID" == "0" ]; then
Expand Down
6 changes: 3 additions & 3 deletions qa/L0_warmup/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0

CLIENT=../clients/image_client
CLIENT_LOG="./client.log"
CLIENT_PY=./python_unittest.py
CLIENT_PY=./test_infer_shm_leak.py
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'

Expand Down Expand Up @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/
cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/
cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/.

cp ../L0_backend_python/python_unittest.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py
cp ../L0_backend_python/test_infer_shm_leak.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py

run_server
if [ "$SERVER_PID" == "0" ]; then
Expand Down
5 changes: 4 additions & 1 deletion qa/common/shm_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback):
print(
f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)."
)
# FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test.
if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056:
assert False, f"Known shared memory leak of 480 bytes detected."
assert not shm_leak_detected, f"Shared memory leak detected."

def _get_shm_free_sizes(self, delay_sec=0):
Expand Down

0 comments on commit e181662

Please sign in to comment.