diff --git a/qa/L0_perf_nomodel/run_test.sh b/qa/L0_perf_nomodel/run_test.sh
index b1e2702ecb..ce3350d97b 100755
--- a/qa/L0_perf_nomodel/run_test.sh
+++ b/qa/L0_perf_nomodel/run_test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -38,6 +38,7 @@ PERF_CLIENT_PERCENTILE=${PERF_CLIENT_PERCENTILE:=95}
 PERF_CLIENT_STABILIZE_WINDOW=${PERF_CLIENT_STABILIZE_WINDOW:=5000}
 PERF_CLIENT_STABILIZE_THRESHOLD=${PERF_CLIENT_STABILIZE_THRESHOLD:=5}
 TENSOR_SIZE=${TENSOR_SIZE:=1}
+TENSOR_ELEMENT_BYTES=${TENSOR_ELEMENT_BYTES:=4}
 SHARED_MEMORY=${SHARED_MEMORY:="none"}
 
 REPORTER=../common/reporter.py
@@ -126,6 +127,16 @@ for BACKEND in $BACKENDS; do
         fi
     fi
 
+    # set shared memory output size
+    OUTPUT_SHARED_MEMORY_SIZE=""
+    if [[ "$SHARED_MEMORY" != "none" ]]; then
+        OUTPUT_SHARED_MEMORY_SIZE=$((TENSOR_ELEMENT_BYTES*TENSOR_SIZE))
+        if [ $MAX_BATCH -gt 1 ]; then
+            OUTPUT_SHARED_MEMORY_SIZE=$((OUTPUT_SHARED_MEMORY_SIZE*MAX_BATCH))
+        fi
+        OUTPUT_SHARED_MEMORY_SIZE="--output-shared-memory-size $OUTPUT_SHARED_MEMORY_SIZE"
+    fi
+
     if [ $DYNAMIC_BATCH > 1 ]; then
         NAME=${BACKEND}_sbatch${STATIC_BATCH}_dbatch${DYNAMIC_BATCH}_instance${INSTANCE_CNT}
     else
@@ -189,6 +200,7 @@ for BACKEND in $BACKENDS; do
                 -p${PERF_CLIENT_STABILIZE_WINDOW} \
                 -s${PERF_CLIENT_STABILIZE_THRESHOLD} \
                 ${PERF_CLIENT_EXTRA_ARGS} \
+                ${OUTPUT_SHARED_MEMORY_SIZE} \
                 -m ${MODEL_NAME} \
                 -b${STATIC_BATCH} -t${CONCURRENCY} \
                 --max-trials "${PA_MAX_TRIALS}" \
diff --git a/qa/L0_perf_nomodel/test.sh b/qa/L0_perf_nomodel/test.sh
index 6ff68303ed..a213d24e9d 100755
--- a/qa/L0_perf_nomodel/test.sh
+++ b/qa/L0_perf_nomodel/test.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright 2019-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -72,14 +72,14 @@ RUNTEST=./run_test.sh
 # by 4.
 TENSOR_SIZE_16MB=$((4*1024*1024))
 
-if [ "$BENCHMARK_TEST_SHARED_MEMORY" == "system" ]; then
+if [ "$TEST_SHARED_MEMORY" == "system" ]; then
     UNDERTEST_NAME="$UNDERTEST_NAME System Shared Memory";
     SUFFIX="_shm"
-elif [ "$BENCHMARK_TEST_SHARED_MEMORY" == "cuda" ]; then
+elif [ "$TEST_SHARED_MEMORY" == "cuda" ]; then
     UNDERTEST_NAME="$UNDERTEST_NAME CUDA Shared Memory";
     SUFFIX="_cudashm"
 else
-    BENCHMARK_TEST_SHARED_MEMORY="none"
+    TEST_SHARED_MEMORY="none"
     TEST_NAMES=(
         "${UNDERTEST_NAME} Minimum Latency GRPC"
         "${UNDERTEST_NAME} Minimum Latency HTTP"
@@ -188,7 +188,7 @@ for idx in "${!TEST_NAMES[@]}"; do
     TEST_CONCURRENCY=${TEST_CONCURRENCY[$idx]}
 
     # FIXME: If PA C API adds SHMEM support, remove this.
-    if [[ "${BENCHMARK_TEST_SHARED_MEMORY}" != "none" ]] && \
+    if [[ "${TEST_SHARED_MEMORY}" != "none" ]] && \
        [[ "${TEST_PROTOCOL}" == "triton_c_api" ]]; then
         echo "WARNING: Perf Analyzer does not support shared memory I/O when benchmarking directly with Triton C API, skipping."
         continue
@@ -202,7 +202,7 @@ for idx in "${!TEST_NAMES[@]}"; do
     PERF_CLIENT_PROTOCOL=${TEST_PROTOCOL} \
     TENSOR_SIZE=${TEST_TENSOR_SIZE} \
    BACKENDS=${TEST_BACKENDS} \
-    SHARED_MEMORY=${BENCHMARK_TEST_SHARED_MEMORY} \
+    SHARED_MEMORY=${TEST_SHARED_MEMORY} \
     STATIC_BATCH_SIZES=1 \
     DYNAMIC_BATCH_SIZES=1 \
     INSTANCE_COUNTS=${TEST_INSTANCE_COUNT} \
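For reference, a minimal standalone sketch of the size arithmetic the new run_test.sh block performs, assuming FP32 tensors (TENSOR_ELEMENT_BYTES=4), the 16MB tensor case defined in test.sh, and a hypothetical MAX_BATCH of 4; variable names mirror the script above but the snippet is illustrative only, not part of the diff.

    #!/bin/bash
    # Sketch of the --output-shared-memory-size computation, assuming FP32
    # tensors (4 bytes per element) and a hypothetical max batch size of 4.
    TENSOR_ELEMENT_BYTES=4
    TENSOR_SIZE=$((4*1024*1024))   # 4M elements, as TENSOR_SIZE_16MB in test.sh
    MAX_BATCH=4                    # hypothetical value for illustration

    OUTPUT_SHARED_MEMORY_SIZE=$((TENSOR_ELEMENT_BYTES*TENSOR_SIZE))
    if [ $MAX_BATCH -gt 1 ]; then
        OUTPUT_SHARED_MEMORY_SIZE=$((OUTPUT_SHARED_MEMORY_SIZE*MAX_BATCH))
    fi
    # 4 bytes * 4194304 elements * 4 batches = 67108864 bytes (64MB)
    echo "--output-shared-memory-size $OUTPUT_SHARED_MEMORY_SIZE"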