diff --git a/qa/L0_perf_tensorrt_llm/test.sh b/qa/L0_perf_tensorrt_llm/test.sh
index 35d360498d..e74b01e568 100755
--- a/qa/L0_perf_tensorrt_llm/test.sh
+++ b/qa/L0_perf_tensorrt_llm/test.sh
@@ -34,7 +34,7 @@ TRT_ROOT="/usr/local/tensorrt"
 MODEL_NAME="gpt2_tensorrt_llm"
 NAME="tensorrt_llm_benchmarking_test"
 MODEL_REPOSITORY="$(pwd)/triton_model_repo"
-TENSORRTLLM_BACKEND_DIR="/opt/tritonserver/tensorrtllm_backend"
+TENSORRTLLM_BACKEND_DIR="/workspace/tensorrtllm_backend"
 GPT_DIR="$TENSORRTLLM_BACKEND_DIR/tensorrt_llm/examples/gpt"
 TOKENIZER_DIR="$GPT_DIR/gpt2"
 ENGINES_DIR="${BASE_DIR}/engines/inflight_batcher_llm/${NUM_GPUS}-gpu"
@@ -47,40 +47,27 @@ SERVER_TIMEOUT=${SERVER_TIMEOUT:=120}
 function clone_tensorrt_llm_backend_repo {
     rm -rf $TENSORRTLLM_BACKEND_DIR && mkdir $TENSORRTLLM_BACKEND_DIR
     apt-get update && apt-get install git-lfs -y --no-install-recommends
-    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} https://github.com/triton-inference-server/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
+    git clone --single-branch --depth=1 -b ${TENSORRTLLM_BACKEND_REPO_TAG} ${TRITON_REPO_ORG}/tensorrtllm_backend.git $TENSORRTLLM_BACKEND_DIR
     cd $TENSORRTLLM_BACKEND_DIR && git lfs install && git submodule update --init --recursive
 }
 
 # Update Open MPI to a version compatible with SLURM.
 function upgrade_openmpi {
-    cd /tmp/
     local CURRENT_VERSION=$(mpirun --version 2>&1 | awk '/Open MPI/ {gsub(/rc[0-9]+/, "", $NF); print $NF}')
 
     if [ -n "$CURRENT_VERSION" ] && dpkg --compare-versions "$CURRENT_VERSION" lt "5.0.1"; then
         # Uninstall the current version of Open MPI
-        wget "https://download.open-mpi.org/release/open-mpi/v$(echo "${CURRENT_VERSION}" | awk -F. '{print $1"."$2}')/openmpi-${CURRENT_VERSION}.tar.gz" || {
-            echo "Failed to download Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf "openmpi-${CURRENT_VERSION}" && tar -xzf "openmpi-${CURRENT_VERSION}.tar.gz" && cd "openmpi-${CURRENT_VERSION}" || {
-            echo "Failed to extract Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        unset PMIX_VERSION && ./configure --prefix=/opt/hpcx/ompi/ && make uninstall || {
-            echo "Failed to uninstall Open MPI ${CURRENT_VERSION}"
-            exit 1
-        }
-        rm -rf /opt/hpcx/ompi/ /usr/local/mpi/ || {
-            echo "Failed to remove Open MPI ${CURRENT_VERSION} installation directories"
+        rm -r /opt/hpcx/ompi/ /usr/local/mpi && rm -rf /usr/lib/$(gcc -print-multiarch)/openmpi || {
+            echo "Failed to uninstall the existing Open MPI version $CURRENT_VERSION."
             exit 1
         }
-        cd ../ && rm -r openmpi-${CURRENT_VERSION}
     else
-        echo "Installed Open MPI version is not less than 5.0.1. Skipping the upgrade."
+        echo "The installed Open MPI version ($CURRENT_VERSION) is 5.0.1 or higher. Skipping the upgrade."
         return
     fi
 
     # Install SLURM supported Open MPI version
+    cd /tmp/
     wget "https://download.open-mpi.org/release/open-mpi/v5.0/openmpi-5.0.1.tar.gz" || {
         echo "Failed to download Open MPI 5.0.1"
         exit 1
@@ -108,18 +95,6 @@ function upgrade_openmpi {
     mpirun --version
 }
 
-function install_tensorrt_llm {
-    # Install CMake
-    bash ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm/docker/common/install_cmake.sh
-    export PATH="/usr/local/cmake/bin:${PATH}"
-
-    TORCH_INSTALL_TYPE="pypi" &&
-        (cd ${TENSORRTLLM_BACKEND_DIR}/tensorrt_llm &&
-            bash docker/common/install_pytorch.sh $TORCH_INSTALL_TYPE &&
-            python3 ./scripts/build_wheel.py --trt_root=/usr/local/tensorrt &&
-            pip3 install ./build/tensorrt_llm*.whl)
-}
-
 function build_gpt2_base_model {
     # Download weights from HuggingFace Transformers
     cd ${GPT_DIR} && rm -rf gpt2 && git clone https://huggingface.co/gpt2-medium gpt2 && cd gpt2
@@ -131,24 +106,21 @@ function build_gpt2_base_model {
     cd ${GPT_DIR}
 
     # Convert weights from HF Tranformers to FT format
-    python3 hf_gpt_convert.py -p 1 -i gpt2 -o ./c-model/gpt2 --tensor-parallelism ${NUM_GPUS} --storage-type float16
+    python3 convert_checkpoint.py --model_dir gpt2 --dtype float16 --tp_size ${NUM_GPUS} --output_dir "./c-model/gpt2/${NUM_GPUS}-gpu/"
     cd ${BASE_DIR}
 }
 
 function build_gpt2_tensorrt_engine {
     # Build TensorRT engines
     cd ${GPT_DIR}
-    python3 build.py --model_dir="./c-model/gpt2/${NUM_GPUS}-gpu/" \
-        --world_size="${NUM_GPUS}" \
-        --dtype float16 \
-        --use_inflight_batching \
-        --use_gpt_attention_plugin float16 \
-        --paged_kv_cache \
-        --use_gemm_plugin float16 \
-        --remove_input_padding \
-        --hidden_act gelu \
-        --parallel_build \
-        --output_dir="${ENGINES_DIR}"
+    trtllm-build --checkpoint_dir "./c-model/gpt2/${NUM_GPUS}-gpu/" \
+        --gpt_attention_plugin float16 \
+        --remove_input_padding enable \
+        --paged_kv_cache enable \
+        --gemm_plugin float16 \
+        --workers "${NUM_GPUS}" \
+        --output_dir "${ENGINES_DIR}"
+
     cd ${BASE_DIR}
 }
 
@@ -172,18 +144,18 @@ function prepare_model_repository {
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${preprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/preprocessing/config.pbtxt"
 
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${postprocessing_instance_count}' '1' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
     replace_config_tags '${tokenizer_dir}' "${TOKENIZER_DIR}/" "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
-    replace_config_tags '${tokenizer_type}' 'auto' "${MODEL_REPOSITORY}/postprocessing/config.pbtxt"
 
     replace_config_tags '${triton_max_batch_size}' "128" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${decoupled_mode}' 'true' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${max_queue_delay_microseconds}' "1000000" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${batching_strategy}' 'inflight_fused_batching' "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
     replace_config_tags '${engine_dir}' "${ENGINES_DIR}" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${triton_backend}' "tensorrtllm" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
+    replace_config_tags '${max_queue_size}' "0" "${MODEL_REPOSITORY}/tensorrt_llm/config.pbtxt"
 }
 
 # Wait until server health endpoint shows ready. Sets WAIT_RET to 0 on
@@ -244,13 +216,12 @@ function kill_server {
 
 upgrade_openmpi
 clone_tensorrt_llm_backend_repo
-install_tensorrt_llm
build_gpt2_base_model
 build_gpt2_tensorrt_engine
 prepare_model_repository
 
 # Install perf_analyzer
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 
 ARCH="amd64"
 STATIC_BATCH=1
diff --git a/qa/L0_perf_vllm/test.sh b/qa/L0_perf_vllm/test.sh
index 498f6f8e14..e1ce8cf2ed 100755
--- a/qa/L0_perf_vllm/test.sh
+++ b/qa/L0_perf_vllm/test.sh
@@ -41,7 +41,7 @@ SERVER_ARGS="--model-repository=${MODEL_REPO} --backend-directory=${BACKEND_DIR}
 export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:=0}
 EXPORT_FILE=profile-export-vllm-model.json
 
-pip3 install tritonclient nvidia-ml-py3
+pip3 install tritonclient
 
 rm -rf $MODEL_REPO $EXPORT_FILE *.tjson *.json *.csv
 mkdir -p $MODEL_REPO/$MODEL_NAME/1