Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: Return custom exit code to indicate known shm leak failure in L0_backend_python bls test #7485

Merged
merged 14 commits into from
Jul 31, 2024
2 changes: 1 addition & 1 deletion qa/L0_backend_python/argument_validation/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./arg_validation_client.log"
TEST_RESULT_FILE='test_results.txt'
SERVER_ARGS="--model-repository=${MODELDIR}/argument_validation/models --backend-directory=${BACKEND_DIR} --log-verbose=1"
Expand Down
41 changes: 23 additions & 18 deletions qa/L0_backend_python/bls/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,15 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./bls_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh

TRITON_REPO_ORGANIZATION=${TRITON_REPO_ORGANIZATION:=http://github.com/triton-inference-server}

RET=0
rm -fr *.log ./models *.txt
rm -fr *.log ./models *.txt *.xml

# FIXME: [DLIS-5970] Until Windows supports GPU tensors, only test CPU
if [[ ${TEST_WINDOWS} == 0 ]]; then
Expand Down Expand Up @@ -119,30 +119,35 @@ if [[ ${TEST_WINDOWS} == 0 ]]; then

for MODEL_NAME in bls bls_memory bls_memory_async bls_async; do
export MODEL_NAME=${MODEL_NAME}

python3 -m pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
if [ $? -ne 0 ]; then
# Run with pytest to capture the return code correctly
pytest --junitxml="${MODEL_NAME}.${TRIAL}.${CUDA_MEMORY_POOL_SIZE_MB}.report.xml" $CLIENT_PY >> $CLIENT_LOG 2>&1
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo -e "\n***\n*** ${MODEL_NAME} ${BLS_KIND} test FAILED. \n***"
RET=$EXIT_CODE
cat $SERVER_LOG
cat $CLIENT_LOG
RET=1
fi
done

set -e

kill_server

# Check for bls 'test_timeout' to ensure timeout value is being correctly passed
if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
cat $SERVER_LOG
RET=1
set -e

# Only check the timeout value if there is no error since the test
# may fail before the test_timeout case gets run.
if [ $RET -eq 0 ]; then
# Check for bls 'test_timeout' to ensure timeout value is being correctly passed
if [ `grep -c "Request timeout: 11000000000" $SERVER_LOG` == "0" ]; then
echo -e "\n***\n*** BLS timeout value not correctly passed to model: line ${LINENO}\n***"
cat $SERVER_LOG
RET=1
fi
fi

if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 128 ]]; then
if [[ $CUDA_MEMORY_POOL_SIZE_MB -eq 256 ]]; then
if [ `grep -c "Failed to allocate memory from CUDA memory pool" $SERVER_LOG` != "0" ]; then
echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMOY_POOL_SIZE_MB is 128 MB for 'bls' $BLS_KIND test\n***"
echo -e "\n***\n*** Expected to use CUDA memory pool for all tests when CUDA_MEMORY_POOL_SIZE_MB is 256 MB for 'bls' $BLS_KIND test\n***"
cat $SERVER_LOG
RET=1
fi
Expand Down Expand Up @@ -342,10 +347,10 @@ set -e

kill_server

if [ $RET -eq 1 ]; then
echo -e "\n***\n*** BLS test FAILED. \n***"
else
if [ $RET -eq 0 ]; then
echo -e "\n***\n*** BLS test PASSED. \n***"
else
echo -e "\n***\n*** BLS test FAILED. \n***"
fi

exit $RET
2 changes: 1 addition & 1 deletion qa/L0_backend_python/custom_metrics/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY=../python_unittest.py
CLIENT_PY=../test_infer_shm_leak.py
CLIENT_LOG="./custom_metrics_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh
Expand Down
2 changes: 1 addition & 1 deletion qa/L0_backend_python/request_rescheduling/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CLIENT_PY="../python_unittest.py"
CLIENT_PY="../test_infer_shm_leak.py"
CLIENT_LOG="./request_rescheduling_client.log"
TEST_RESULT_FILE='test_results.txt'
source ../../common/util.sh
Expand Down
2 changes: 1 addition & 1 deletion qa/L0_backend_python/setup_python_enviroment.sh
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ apt-get update && apt-get -y install \
libboost-dev
rm -f /usr/bin/python3 && \
ln -s "/usr/bin/python3.${PYTHON_ENV_VERSION}" /usr/bin/python3
pip3 install --upgrade install requests numpy virtualenv protobuf
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice find 🚀

pip3 install --upgrade requests numpy virtualenv protobuf
find /opt/tritonserver/qa/pkgs/ -maxdepth 1 -type f -name \
"tritonclient-*linux*.whl" | xargs printf -- '%s[all]' | \
xargs pip3 install --upgrade
Expand Down
29 changes: 26 additions & 3 deletions qa/L0_backend_python/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -425,11 +425,20 @@ if [ "$TEST_JETSON" == "0" ]; then
# between dependencies.
setup_virtualenv

set +e
(cd ${TEST} && bash -ex test.sh)
if [ $? -ne 0 ]; then
EXIT_CODE=$?
if [ $EXIT_CODE -ne 0 ]; then
echo "Subtest ${TEST} FAILED"
RET=1
RET=$EXIT_CODE

# In the bls test, a known shared memory leak of exactly 480 bytes is tolerated and
# reported with exit code '123'.
# Propagate that exit code to make sure it's not overwritten by other tests.
if [[ ${TEST} == "bls" ]] && [[ $EXIT_CODE -ne 1 ]] ; then
BLS_RET=$RET
fi
fi
set -e

deactivate_virtualenv
done
Expand All @@ -438,11 +447,13 @@ if [ "$TEST_JETSON" == "0" ]; then
if [[ ${PYTHON_ENV_VERSION} = "10" ]] && [[ ${TEST_WINDOWS} == 0 ]]; then
# In 'env' test we use miniconda for dependency management. No need to run
# the test in a virtual environment.
set +e
(cd env && bash -ex test.sh)
if [ $? -ne 0 ]; then
echo "Subtest env FAILED"
RET=1
fi
set -e
fi
fi

Expand All @@ -459,12 +470,14 @@ for TEST in ${SUBTESTS}; do
# between dependencies.
setup_virtualenv

set +e
(cd ${TEST} && bash -ex test.sh)

if [ $? -ne 0 ]; then
echo "Subtest ${TEST} FAILED"
RET=1
fi
set -e

deactivate_virtualenv
done
Expand All @@ -475,4 +488,14 @@ else
echo -e "\n***\n*** Test FAILED\n***"
fi

exit $RET
# Exit with RET if it is 1, meaning that the test failed.
# Otherwise, exit with BLS_RET if it is set, meaning that the known memory leak is captured.
if [ $RET -eq 1 ]; then
exit $RET
else
if [ -z "$BLS_RET" ]; then
exit $RET
else
exit $BLS_RET
fi
fi
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import os
import unittest

import pytest
import shm_util
import tritonclient.grpc as grpcclient
from tritonclient.utils import *
Expand All @@ -41,11 +42,13 @@
# we overwrite the IP address with the TRITONSERVER_IPADDR envvar
_tritonserver_ipaddr = os.environ.get("TRITONSERVER_IPADDR", "localhost")

# The exit code 123 is used to indicate that the shm leak probe detected a known
# 480-byte leak in the bls sub-test. Any leak of a different size will cause the
# test to fail with the default exit code 1.
ALLOWED_FAILURE_EXIT_CODE = 123

class PythonUnittest(unittest.TestCase):
def setUp(self):
self._shm_leak_detector = shm_util.ShmLeakDetector()

class TestInferShmLeak:
def _run_unittest(self, model_name):
with grpcclient.InferenceServerClient(f"{_tritonserver_ipaddr}:8001") as client:
# No input is required
Expand All @@ -54,15 +57,17 @@ def _run_unittest(self, model_name):

# The model returns 1 if the tests were successfully passed.
# Otherwise, it will return 0.
self.assertEqual(
output0, [1], f"python_unittest failed for model {model_name}"
)

def test_python_unittest(self):
model_name = os.environ["MODEL_NAME"]
with self._shm_leak_detector.Probe() as shm_probe:
self._run_unittest(model_name)
assert output0 == [1], f"python_unittest failed for model {model_name}"

def test_shm_leak(self):
self._shm_leak_detector = shm_util.ShmLeakDetector()
model_name = os.environ.get("MODEL_NAME", "default_model")

if __name__ == "__main__":
unittest.main()
try:
with self._shm_leak_detector.Probe() as shm_probe:
self._run_unittest(model_name)
except AssertionError as e:
if "Known shared memory leak of 480 bytes detected" in str(e):
pytest.exit(str(e), returncode=ALLOWED_FAILURE_EXIT_CODE)
else:
raise e
6 changes: 3 additions & 3 deletions qa/L0_dlpack_multi_gpu/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@

SERVER=/opt/tritonserver/bin/tritonserver
SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
CLIENT_PY=./python_unittest.py
CLIENT_PY=./test_infer_shm_leak.py
CLIENT_LOG="./client.log"
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'
Expand All @@ -52,8 +52,8 @@ rm -fr *.log ./models
mkdir -p models/dlpack_test/1/
cp ../python_models/dlpack_test/model.py models/dlpack_test/1/
cp ../python_models/dlpack_test/config.pbtxt models/dlpack_test
cp ../L0_backend_python/python_unittest.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py
cp ../L0_backend_python/test_infer_shm_leak.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py

run_server
if [ "$SERVER_PID" == "0" ]; then
Expand Down
6 changes: 3 additions & 3 deletions qa/L0_warmup/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ export CUDA_VISIBLE_DEVICES=0

CLIENT=../clients/image_client
CLIENT_LOG="./client.log"
CLIENT_PY=./python_unittest.py
CLIENT_PY=./test_infer_shm_leak.py
EXPECTED_NUM_TESTS="1"
TEST_RESULT_FILE='test_results.txt'

Expand Down Expand Up @@ -449,8 +449,8 @@ mkdir -p models/bls_onnx_warmup/1/
cp ../python_models/bls_onnx_warmup/model.py models/bls_onnx_warmup/1/
cp ../python_models/bls_onnx_warmup/config.pbtxt models/bls_onnx_warmup/.

cp ../L0_backend_python/python_unittest.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' python_unittest.py
cp ../L0_backend_python/test_infer_shm_leak.py .
sed -i 's#sys.path.append("../../common")#sys.path.append("../common")#g' test_infer_shm_leak.py

run_server
if [ "$SERVER_PID" == "0" ]; then
Expand Down
5 changes: 4 additions & 1 deletion qa/common/shm_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
Expand Down Expand Up @@ -441,6 +441,9 @@ def __exit__(self, type, value, traceback):
print(
f"Shared memory leak detected [{shm_region}]: {curr_shm_free_size} (curr free) < {prev_shm_free_size} (prev free)."
)
# FIXME DLIS-7122: Known shared memory leak of 480 bytes in BLS test.
if curr_shm_free_size == 1006576 and prev_shm_free_size == 1007056:
assert False, f"Known shared memory leak of 480 bytes detected."
assert not shm_leak_detected, f"Shared memory leak detected."

def _get_shm_free_sizes(self, delay_sec=0):
Expand Down
Loading