From 13fd025d0888053b00df62d1c3ac0146363caa79 Mon Sep 17 00:00:00 2001 From: Zach Kimberg Date: Tue, 18 Jun 2024 14:12:29 -0700 Subject: [PATCH] [CI] LLM Integration Tests through pytest suite (#2023) --- .github/workflows/llm_integration.yml | 941 +----------------- serving/docker/scripts/docker_name_builder.sh | 6 +- tests/integration/.gitignore | 1 + tests/integration/tests.py | 435 +++++++- 4 files changed, 417 insertions(+), 966 deletions(-) diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml index e42f428a3..a74879060 100644 --- a/.github/workflows/llm_integration.yml +++ b/.github/workflows/llm_integration.yml @@ -7,10 +7,6 @@ on: description: 'The released version of DJL' required: false default: '' - run_test: - description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm, vllm-lora, lmi-dist-lora ]' - required: false - default: '' schedule: - cron: '0 15 * * *' @@ -51,118 +47,24 @@ jobs: gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} - hf-handler-test: - if: contains(fromJson('["", "hf"]'), github.event.inputs.run_test) + test: runs-on: [ self-hosted, g6 ] timeout-minutes: 60 needs: create-runners strategy: + fail-fast: false matrix: - arch: [ lmi ] - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh ${{ matrix.arch }} ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt-neo - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface gpt-neo-2.7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py huggingface gpt-neo-2.7b - docker rm -f $(docker ps -aq) - - name: Test bloom-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface bloom-7b1 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface bloom-7b1 - docker rm -f $(docker ps -aq) - - name: Test LLAMA-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface llama-2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface llama-2-7b - docker rm -f $(docker ps -aq) - - name: Test GPTJ-6B - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface gpt-j-6b - docker rm -f $(docker ps -aq) - - name: Test gpt4all-lora - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface 
gpt4all-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface gpt4all-lora - docker rm -f $(docker ps -aq) - - name: Test streaming bigscience/bloom-3b - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=1,2" > docker_env - python3 llm/prepare.py huggingface bigscience/bloom-3b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface bigscience/bloom-3b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: Test streaming t5-large - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=1" > docker_env - python3 llm/prepare.py huggingface t5-large - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface t5-large - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: hf-handler-${{ matrix.arch }}-logs - path: tests/integration/logs/ - - trt-llm-handler-test: - if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 120 - needs: create-runners + test: + - TestHfHandler + - TestTrtLlmHandler1 + - TestTrtLlmHandler2 + - TestSchedulerSingleGPU + - TestSchedulerMultiGPU + - TestLmiDist1 + - TestLmiDist2 + - TestVllm1 + - TestVllmLora + - TestLmiDistLora steps: - uses: actions/checkout@v4 - name: Clean env @@ -176,240 +78,25 @@ jobs: with: python-version: '3.10.x' - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: llama2-13b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm llama2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm llama2-13b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: falcon-7b triton repo with tp=1 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env - python3 llm/prepare.py trtllm falcon-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm falcon-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: internlm-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm internlm-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm internlm-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: baichuan2-13b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - 
python3 llm/prepare.py trtllm baichuan2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm baichuan2-13b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: chatglm3-6b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm chatglm3-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm chatglm3-6b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: GPT2 HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm gpt2 - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: SantaCoder HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm santacoder - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm santacoder - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: trtllm-handler-logs - path: tests/integration/logs/ - - trt-llm-handler-test-2: - if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 120 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: llama2-7b HF model with tp=4 and smoothquant - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm llama2-7b-smoothquant - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm-sq \ - serve - python3 llm/client.py trtllm llama2-7b-smoothquant - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: mistral-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm mistral-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: gpt-j-6b HF model with tp=1 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env - python3 llm/prepare.py trtllm gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm gpt-j-6b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: qwen-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm qwen-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm qwen-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: trtllm-handler-quantization-logs - path: tests/integration/logs/ - - scheduler-single-gpu-test: - if: contains(fromJson('["", "scheduler"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' + run: pip3 install pytest requests "numpy<2" huggingface_hub - name: Install awscurl working-directory: tests/integration run: | curl -OL https://github.com/frankfliu/junkyard/releases/download/v0.2.2/awscurl chmod +x awscurl mkdir outputs - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt2 - working-directory: tests/integration - run: | - # Correctness test - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py correctness gpt2 - docker rm -f $(docker ps -aq) - - name: Test bloom-560m + - name: Test working-directory: tests/integration + env: + TEST_DJL_VERSION: ${{ inputs.djl-version }} run: | - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler bloom-560m - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py scheduler_single_gpu bloom-560m - docker rm -f $(docker ps -aq) - - name: Print outputs - working-directory: tests/integration - run: for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done + pytest -k ${{ matrix.test }} tests.py - name: Cleanup working-directory: tests/integration run: | rm -rf outputs rm awscurl - - name: On fail step + - name: On Failure if: ${{ failure() }} working-directory: tests/integration run: | @@ -417,599 +104,17 @@ jobs: rm -rf outputs && rm -rf models rm awscurl docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: rb-single-gpu-logs - path: tests/integration/logs/ - - scheduler-multi-gpu-test: - if: contains(fromJson('["", "scheduler"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install awscurl - working-directory: tests/integration - run: | - curl -OL https://github.com/frankfliu/junkyard/releases/download/v0.2.2/awscurl - chmod +x awscurl - mkdir outputs - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gptj-6b - working-directory: tests/integration - run: | - # Concurrent requests test - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py scheduler_multi_gpu gpt-j-6b - docker rm -f $(docker ps -aq) - - name: Print outputs - working-directory: tests/integration - run: for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done - - name: Cleanup - working-directory: tests/integration - run: | - rm -rf models && rm -rf outputs - rm awscurl - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done - rm -rf outputs && rm -rf models - rm awscurl - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: rb-multi-gpu-logs - path: tests/integration/logs/ - - lmi-dist-test-1: - if: contains(fromJson('["", "lmi-dist"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt-neox-20b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gpt-neox-20b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gpt-neox-20b - docker rm -f $(docker ps -aq) - - name: Test falcon-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist falcon-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist falcon-7b - docker rm -f $(docker ps -aq) - - name: Test falcon2-11b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist falcon-11b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist falcon-11b - docker rm -f $(docker ps -aq) - - name: Test flan-t5-xxl - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist flan-t5-xxl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist flan-t5-xxl - docker rm -f $(docker ps -aq) - - name: Test gpt2 - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gpt2 - docker rm -f $(docker ps -aq) - - name: Test mpt-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mpt-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mpt-7b - docker rm -f $(docker ps -aq) - - name: Test llama-2-tiny AutoAwq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-2-tiny - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - # partition - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot - # launch the container again - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/aot - python3 llm/client.py lmi_dist llama-2-tiny - sudo rm -rf models - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Remove model directory - working-directory: tests/integration - run: | - sudo rm -rf models - - 
name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: lmi-dist-logs-1 - path: tests/integration/logs/ - - lmi-dist-test-2: - if: contains(fromJson('["", "lmi-dist"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test octocoder - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist octocoder - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist octocoder - docker rm -f $(docker ps -aq) - - name: Test speculative-llama-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist speculative-llama-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist speculative-llama-13b - docker rm -f $(docker ps -aq) - - name: Test starcoder2-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist starcoder2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist starcoder2-7b - docker rm -f $(docker ps -aq) - - name: Test gemma-2b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gemma-2b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gemma-2b - docker rm -f $(docker ps -aq) - - name: Test llama2-13b-gptq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-13b-gptq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist llama2-13b-gptq - docker rm -f $(docker ps -aq) - - name: Test Mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mistral-7b - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-32k - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-7b-32k - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist llama2-7b-32k - docker rm -f $(docker ps -aq) - - name: Test mistral-7b-128k-awq - 
working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-128k-awq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mistral-7b-128k-awq - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-chat - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-7b-chat - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist_chat llama2-7b-chat - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: lmi-dist-logs-2 - path: tests/integration/logs/ - - vllm-test: - if: contains(fromJson('["", "vllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm llama2-13b - docker rm -f $(docker ps -aq) - - name: Test llama2-13b awq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b-awq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm llama2-13b - docker rm -f $(docker ps -aq) - - name: Test gpt-neox-20b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm gpt-neox-20b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm gpt-neox-20b - docker rm -f $(docker ps -aq) - - name: Test Mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm mistral-7b - docker rm -f $(docker ps -aq) - - name: Test phi-2 - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm phi-2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm 
phi-2 - docker rm -f $(docker ps -aq) - - name: Test starcoder2-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm starcoder2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm starcoder2-7b - docker rm -f $(docker ps -aq) - - name: Test gemma-2b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm gemma-2b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm gemma-2b - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-chat - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-7b-chat - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm_chat llama2-7b-chat - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: vllm-logs - path: tests/integration/logs/ - - - vllm-lora-test: - if: contains(fromJson('["", "vllm-lora"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test vllm unmerged lora - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm unmerged lora overflow - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama-7b-unmerged-lora-overflow - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama-7b-unmerged-lora-overflow - docker rm -f $(docker ps -aq) - - name: Test vllm lora awq - llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama2-13b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b-unmerged-lora - 
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters mistral-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora awq - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters mistral-7b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora - llama-3-8b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama3-8b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama3-8b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: vllm-lora-logs - path: tests/integration/logs/ - - lmi-dist-lora-test: - if: contains(fromJson('["", "lmi-dist-lora"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test lmi-dist unmerged lora - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist unmerged lora overflow - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora awq - llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-13b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama2-13b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 
llm/client.py lmi_dist_adapters mistral-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora awq - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters mistral-7b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora - llama-3-8b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama3-8b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama3-8b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - name: Upload test logs + if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: lmi-dist-lora-logs - path: tests/integration/logs/ + name: test-${{ matrix.test }}-logs + path: tests/integration/all_logs/ stop-runners: if: always() runs-on: [ self-hosted, scheduler ] - needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test, vllm-lora-test, lmi-dist-lora-test] + needs: [ create-runners, test] steps: - name: Stop all instances run: | diff --git a/serving/docker/scripts/docker_name_builder.sh b/serving/docker/scripts/docker_name_builder.sh index e2d78f6c4..250e17a89 100755 --- a/serving/docker/scripts/docker_name_builder.sh +++ b/serving/docker/scripts/docker_name_builder.sh @@ -13,8 +13,4 @@ else fi fi -if [[ -n "$GITHUB_ENV" ]]; then - echo "DJLSERVING_DOCKER_TAG=$image" >> $GITHUB_ENV -else - echo "$image" -fi +echo "DJLSERVING_DOCKER_TAG=$image" >> $GITHUB_ENV diff --git a/tests/integration/.gitignore b/tests/integration/.gitignore index 3875dea7c..65e4bde6c 100644 --- a/tests/integration/.gitignore +++ b/tests/integration/.gitignore @@ -1,3 +1,4 @@ /docker_env /logs +/all_logs /models diff --git a/tests/integration/tests.py b/tests/integration/tests.py index 648700b04..93ec1c450 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -4,21 +4,26 @@ import subprocess import llm.prepare as prepare import llm.client as client -import rb_client +import rb_client as rb_client -djl_version = '' +djl_version = os.environ.get('TEST_DJL_VERSION', '').strip() class Runner: - def __init__(self, container): + def __init__(self, container, test_name=None): self.container = container - flavor = subprocess.run([ - '../../serving/docker/scripts/docker_name_builder.sh', container, - djl_version - ], - capture_output=True, - text=True).stdout.strip() + self.test_name = test_name + + # Compute flavor + if djl_version is not None and len(djl_version) > 0: + if container == "cpu": + flavor = djl_version + else: + flavor = f"{djl_version}-{container}" + else: + flavor = f"{container}-nightly" + self.image = f"deepjavalibrary/djl-serving:{flavor}" def __enter__(self): @@ -27,57 +32,401 @@ def __enter__(self): return self def __exit__(self, *args): - container = subprocess.run(['docker', 'ps', '-aq'], - capture_output=True, - text=True).stdout.strip() - if container != '': - subprocess.run(['docker', 'rm', '-f', container], - shell=True, - check=True) + if self.test_name is not None: 
+            esc_test_name = self.test_name.replace("/", "-")
+            os.system(f"mkdir -p all_logs/{esc_test_name}")
+            os.system(f"cp -r logs all_logs/{esc_test_name}")
+        subprocess.run(["./remove_container.sh"], check=True)
         os.system("cat logs/serving.log")
 
-    def launch(self, env_vars=None):
+    def launch(self, env_vars=None, cmd=None):
         if env_vars is not None:
             with open("docker_env", "w") as f:
                 f.write(env_vars)
+        else:
+            if os.path.isfile("docker_env"):
+                os.remove("docker_env")
+
+        if cmd is None:
+            cmd = 'serve -m test=file:/opt/ml/model/test/'
 
         model_dir = os.path.join(os.getcwd(), 'models')
         subprocess.run(
-            f'./launch_container.sh {self.image} {model_dir} {self.container} serve -m test=file:/opt/ml/model/test/'
+            f'./launch_container.sh {self.image} {model_dir} {self.container} {cmd}'
             .split(),
             check=True)
 
 
-def test_gpt_neo():
-    with Runner('deepspeed') as r:
-        prepare.build_hf_handler_model("gpt-neo-2.7b")
-        r.launch()
-        client.run("huggingface gpt-neo-2.7b".split())
+class TestHfHandler:
+    # Runs on g5.12xl
+    def test_gpt_neo(self):
+        with Runner('lmi', 'gpt-neo-2.7b') as r:
+            prepare.build_hf_handler_model("gpt-neo-2.7b")
+            r.launch()
+            client.run("huggingface gpt-neo-2.7b".split())
+
+    def test_bloom_7b(self):
+        with Runner('lmi', 'bloom-7b1') as r:
+            prepare.build_hf_handler_model("bloom-7b1")
+            r.launch()
+            client.run("huggingface bloom-7b1".split())
+
+    def test_llama2_7b(self):
+        with Runner('lmi', 'llama-2-7b') as r:
+            prepare.build_hf_handler_model("llama-2-7b")
+            r.launch()
+            client.run("huggingface llama-2-7b".split())
+
+    def test_gptj_6B(self):
+        with Runner('lmi', 'gpt-j-6b') as r:
+            prepare.build_hf_handler_model("gpt-j-6b")
+            r.launch()
+            client.run("huggingface gpt-j-6b".split())
+
+    def test_gpt4all_lora(self):
+        with Runner('lmi', 'gpt4all-lora') as r:
+            prepare.build_hf_handler_model("gpt4all-lora")
+            r.launch()
+            client.run("huggingface gpt4all-lora".split())
+
+    def test_streaming_bigscience_bloom_3b(self):
+        with Runner('lmi', 'bigscience/bloom-3b') as r:
+            prepare.build_hf_handler_model("bigscience/bloom-3b")
+            r.launch("CUDA_VISIBLE_DEVICES=1,2")
+            client.run("huggingface bigscience/bloom-3b".split())
+
+    def test_streaming_t5_large(self):
+        with Runner('lmi', 't5-large') as r:
+            prepare.build_hf_handler_model("t5-large")
+            r.launch("CUDA_VISIBLE_DEVICES=1")
+            client.run("huggingface t5-large".split())
+
+
+class TestTrtLlmHandler1:
+    # Runs on g5.12xl
+    def test_llama2_13b_tp4(self):
+        with Runner('tensorrt-llm', 'llama2-13b') as r:
+            prepare.build_trtllm_handler_model("llama2-13b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm llama2-13b".split())
+
+    def test_falcon_triton(self):
+        with Runner('tensorrt-llm', 'falcon-7b') as r:
+            prepare.build_trtllm_handler_model("falcon-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0")
+            client.run("trtllm falcon-7b".split())
+
+    def test_internlm_7b(self):
+        with Runner('tensorrt-llm', 'internlm-7b') as r:
+            prepare.build_trtllm_handler_model("internlm-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm internlm-7b".split())
+
+    def test_baichuan2_13b(self):
+        with Runner('tensorrt-llm', 'baichuan2-13b') as r:
+            prepare.build_trtllm_handler_model("baichuan2-13b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm baichuan2-13b".split())
+
+    def test_chatglm3_6b(self):
+        with Runner('tensorrt-llm', 'chatglm3-6b') as r:
+            prepare.build_trtllm_handler_model("chatglm3-6b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm chatglm3-6b".split())
+
+    def test_gpt2(self):
+        with Runner('tensorrt-llm', 'gpt2') as r:
+            prepare.build_trtllm_handler_model("gpt2")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm gpt2".split())
+
+    def test_santacoder(self):
+        with Runner('tensorrt-llm', 'santacoder') as r:
+            prepare.build_trtllm_handler_model("santacoder")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm santacoder".split())
+
+
+class TestTrtLlmHandler2:
+    # Runs on g5.12xl
+    def test_llama2_7b_hf_smoothquant(self):
+        with Runner('tensorrt-llm', 'llama2-7b-smoothquant') as r:
+            prepare.build_trtllm_handler_model("llama2-7b-smoothquant")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm llama2-7b-smoothquant".split())
+
+    def test_mistral(self):
+        with Runner('tensorrt-llm', 'mistral-7b') as r:
+            prepare.build_trtllm_handler_model("mistral-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm mistral-7b".split())
+
+    def test_gpt_j_6b(self):
+        with Runner('tensorrt-llm', 'gpt-j-6b') as r:
+            prepare.build_trtllm_handler_model("gpt-j-6b")
+            r.launch("CUDA_VISIBLE_DEVICES=0")
+            client.run("trtllm gpt-j-6b".split())
+
+    def test_qwen_7b(self):
+        with Runner('tensorrt-llm', 'qwen-7b') as r:
+            prepare.build_trtllm_handler_model("qwen-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm qwen-7b".split())
+
+
+class TestSchedulerSingleGPU:
+    # Runs on g5.12xl
+
+    def test_gpt2(self):
+        with Runner('lmi', 'gpt2') as r:
+            prepare.build_rolling_batch_model("gpt2")
+            r.launch()
+            rb_client.run("correctness gpt2".split())
+
+    def test_bloom_560m(self):
+        with Runner('lmi', 'bloom-560m') as r:
+            prepare.build_rolling_batch_model("bloom-560m")
+            r.launch()
+            rb_client.run("scheduler_single_gpu bloom-560m".split())
+
+
+class TestSchedulerMultiGPU:
+    # Runs on g5.12xl
+
+    def test_gptj_6b(self):
+        with Runner('lmi', 'gpt-j-6b') as r:
+            prepare.build_rolling_batch_model("gpt-j-6b")
+            r.launch()
+            rb_client.run("scheduler_multi_gpu gpt-j-6b".split())
+
+
+class TestLmiDist1:
+    # Runs on g5.12xl
+
+    def test_gpt_neox_20b(self):
+        with Runner('lmi', 'gpt-neox-20b') as r:
+            prepare.build_lmi_dist_model("gpt-neox-20b")
+            r.launch()
+            client.run("lmi_dist gpt-neox-20b".split())
+
+    def test_falcon_7b(self):
+        with Runner('lmi', 'falcon-7b') as r:
+            prepare.build_lmi_dist_model("falcon-7b")
+            r.launch()
+            client.run("lmi_dist falcon-7b".split())
+
+    def test_falcon2_11b(self):
+        with Runner('lmi', 'falcon-11b') as r:
+            prepare.build_lmi_dist_model("falcon-11b")
+            r.launch()
+            client.run("lmi_dist falcon-11b".split())
+
+    def test_gpt2(self):
+        with Runner('lmi', 'gpt2') as r:
+            envs = [
+                "OPTION_MAX_ROLLING_BATCH_SIZE=2",
+                "OPTION_OUTPUT_FORMATTER=jsonlines",
+                "TENSOR_PARALLEL_DEGREE=1", "HF_MODEL_ID=gpt2",
+                "OPTION_TASK=text-generation", "OPTION_ROLLING_BATCH=lmi-dist"
+            ]
+            r.launch("\n".join(envs))
+            client.run("lmi_dist gpt2".split())
+
+    def test_mpt_7b(self):
+        with Runner('lmi', 'mpt-7b') as r:
+            prepare.build_lmi_dist_model("mpt-7b")
+            r.launch()
+            client.run("lmi_dist mpt-7b".split())
+
+    def test_llama2_tiny_autoawq(self):
+        with Runner('lmi', 'llama-2-tiny-autoawq') as r:
+            prepare.build_lmi_dist_model("llama-2-tiny")
+            r.launch(
+                "CUDA_VISIBLE_DEVICES=0,1,2,3",
+                cmd=
+                "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot"
+            )
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3",
+                     cmd="serve -m test=file:/opt/ml/model/test/aot")
+            client.run("lmi_dist llama-2-tiny".split())
+
+
+class TestLmiDist2:
+    # Runs on g5.12xl
+
+    def test_octocoder(self):
+        with 
Runner('lmi', 'octocoder') as r: + prepare.build_lmi_dist_model("octocoder") + r.launch() + client.run("lmi_dist octocoder".split()) + + def test_speculative_llama_13b(self): + with Runner('lmi', 'speculative-llama-13b') as r: + prepare.build_lmi_dist_model("speculative-llama-13b") + r.launch() + client.run("lmi_dist speculative-llama-13b".split()) + + def test_starcoder2_7b(self): + with Runner('lmi', 'starcoder2-7b') as r: + prepare.build_lmi_dist_model("starcoder2-7b") + r.launch() + client.run("lmi_dist starcoder2-7b".split()) + + def test_gemma_2b(self): + with Runner('lmi', 'gemma-2b') as r: + prepare.build_lmi_dist_model("gemma-2b") + r.launch() + client.run("lmi_dist gemma-2b".split()) + + def test_llama2_13b_gptq(self): + with Runner('lmi', 'llama2-13b-gptq') as r: + prepare.build_lmi_dist_model("llama2-13b-gptq") + r.launch() + client.run("lmi_dist llama2-13b-gptq".split()) + + def test_mistral_7b(self): + with Runner('lmi', 'mistral-7b') as r: + prepare.build_lmi_dist_model("mistral-7b") + r.launch() + client.run("lmi_dist mistral-7b".split()) + + def test_llama2_7b_32k(self): + with Runner('lmi', 'llama2-7b-32k') as r: + prepare.build_lmi_dist_model("llama2-7b-32k") + r.launch() + client.run("lmi_dist llama2-7b-32k".split()) + + def test_mistral_7b_128k_awq(self): + with Runner('lmi', 'mistral-7b-128k-awq') as r: + prepare.build_lmi_dist_model("mistral-7b-128k-awq") + r.launch() + client.run("lmi_dist mistral-7b-128k-awq".split()) + + def test_llama2_7b_chat(self): + with Runner('lmi', 'llama2-7b-chat') as r: + prepare.build_lmi_dist_model("llama2-7b-chat") + r.launch() + client.run("lmi_dist_chat llama2-7b-chat".split()) + + +class TestVllm1: + # Runs on g5.12xl + + def test_gpt_neox_20b(self): + with Runner('lmi', 'gpt-neox-20b') as r: + prepare.build_vllm_model("gpt-neox-20b") + r.launch() + client.run("vllm gpt-neox-20b".split()) + + def test_mistral_7b(self): + with Runner('lmi', 'mistral-7b') as r: + prepare.build_vllm_model("mistral-7b") + r.launch() + client.run("vllm mistral-7b".split()) + + def test_phi2(self): + with Runner('lmi', 'phi-2') as r: + prepare.build_vllm_model("phi-2") + r.launch() + client.run("vllm phi-2".split()) + + def test_starcoder2_7b(self): + with Runner('lmi', 'starcoder2-7b') as r: + prepare.build_vllm_model("starcoder2-7b") + r.launch() + client.run("vllm starcoder2-7b".split()) + + def test_gemma_2b(self): + with Runner('lmi', 'gemma-2b') as r: + prepare.build_vllm_model("gemma-2b") + r.launch() + client.run("vllm gemma-2b".split()) + + def test_llama2_7b_chat(self): + with Runner('lmi', 'llama2-7b-chat') as r: + prepare.build_vllm_model("llama2-7b-chat") + r.launch() + client.run("vllm_chat llama2-7b-chat".split()) + + +class TestVllmLora: + # Runs on g5.12xl + + def test_lora_unmerged(self): + with Runner('lmi', 'llama-7b-unmerged-lora') as r: + prepare.build_vllm_model("llama-7b-unmerged-lora") + r.launch() + client.run("vllm_adapters llama-7b-unmerged-lora".split()) + + def test_lora_unmerged_overflow(self): + with Runner('lmi', 'llama-7b-unmerged-lora-overflow') as r: + prepare.build_vllm_model("llama-7b-unmerged-lora-overflow") + r.launch() + client.run("vllm_adapters llama-7b-unmerged-lora-overflow".split()) + + def test_lora_awq_llama2_13b(self): + with Runner('lmi', 'llama2-13b-awq-unmerged-lora') as r: + prepare.build_vllm_model("llama2-13b-awq-unmerged-lora") + r.launch() + client.run("vllm_adapters llama2-13b-awq-unmerged-lora".split()) + + def test_lora_mistral_7b(self): + with Runner('lmi', 'mistral-7b-unmerged-lora') as r: + 
prepare.build_vllm_model("mistral-7b-unmerged-lora") + r.launch() + client.run("vllm_adapters mistral-7b-unmerged-lora".split()) + + def test_lora_awq_mistral_7b(self): + with Runner('lmi', 'mistral-7b-awq-unmerged-lora') as r: + prepare.build_vllm_model("mistral-7b-awq-unmerged-lora") + r.launch() + client.run("vllm_adapters mistral-7b-awq-unmerged-lora".split()) + + def test_lora_llama3_8b(self): + with Runner('lmi', 'llama3-8b-unmerged-lora') as r: + prepare.build_vllm_model("llama3-8b-unmerged-lora") + r.launch() + client.run("vllm_adapters llama3-8b-unmerged-lora".split()) -def test_llama_7b(): - with Runner('deepspeed') as r: - prepare.build_hf_handler_model("open-llama-7b") - r.launch() - client.run("huggingface open-llama-7b".split()) +class TestLmiDistLora: + # Runs on g5.12xl + def test_lora_unmerged(self): + with Runner('lmi', 'llama-7b-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama-7b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters llama-7b-unmerged-lora".split()) -def test_unmerged_lora_llama7b(): - with Runner('deepspeed') as r: - prepare.build_hf_handler_model("llama-7b-unmerged-lora") - r.launch() - client.run("huggingface llama-7b-unmerged-lora".split()) + def test_lora_unmerged_overflow(self): + with Runner('lmi', 'llama-7b-unmerged-lora-overflow') as r: + prepare.build_lmi_dist_model("llama-7b-unmerged-lora-overflow") + r.launch() + client.run( + "lmi_dist_adapters llama-7b-unmerged-lora-overflow".split()) + def test_lora_awq_llama2_13b(self): + with Runner('lmi', 'llama2-13b-awq-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama2-13b-awq-unmerged-lora") + r.launch() + client.run( + "lmi_dist_adapters llama2-13b-awq-unmerged-lora".split()) -def test_falcon_7b_triton_tp1(): - with Runner('tensorrt-llm') as r: - prepare.build_trtllm_handler_model("falcon-7b") - r.launch("CUDA_VISIBLE_DEVICES=0") - client.run("trtllm falcon-7b".split()) + def test_lora_mistral_7b(self): + with Runner('lmi', 'mistral-7b-unmerged-lora') as r: + prepare.build_lmi_dist_model("mistral-7b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters mistral-7b-unmerged-lora".split()) + def test_lora_awq_mistral_7b(self): + with Runner('lmi', 'mistral-7b-awq-unmerged-lora') as r: + prepare.build_lmi_dist_model("mistral-7b-awq-unmerged-lora") + r.launch() + client.run( + "lmi_dist_adapters mistral-7b-awq-unmerged-lora".split()) -def test_bloom_560m(): - with Runner('deepspeed') as r: - prepare.build_rolling_batch_model("bloom-560m") - r.launch() - rb_client.run("scheduler_single_gpu bloom-560m".split()) + def test_lora_llama3_8b(self): + with Runner('lmi', 'llama3-8b-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama3-8b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split())