From 13fd025d0888053b00df62d1c3ac0146363caa79 Mon Sep 17 00:00:00 2001 From: Zach Kimberg Date: Tue, 18 Jun 2024 14:12:29 -0700 Subject: [PATCH] [CI] LLM Integration Tests through pytest suite (#2023) --- .github/workflows/llm_integration.yml | 941 +----------------- serving/docker/scripts/docker_name_builder.sh | 6 +- tests/integration/.gitignore | 1 + tests/integration/tests.py | 435 +++++++- 4 files changed, 417 insertions(+), 966 deletions(-) diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml index e42f428a3..a74879060 100644 --- a/.github/workflows/llm_integration.yml +++ b/.github/workflows/llm_integration.yml @@ -7,10 +7,6 @@ on: description: 'The released version of DJL' required: false default: '' - run_test: - description: 'Run only the tests you need [ hf, trtllm, scheduler, lmi-dist, vllm, vllm-lora, lmi-dist-lora ]' - required: false - default: '' schedule: - cron: '0 15 * * *' @@ -51,118 +47,24 @@ jobs: gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }} gpu_instance_id_3: ${{ steps.create_gpu3.outputs.action_g6_instance_id }} - hf-handler-test: - if: contains(fromJson('["", "hf"]'), github.event.inputs.run_test) + test: runs-on: [ self-hosted, g6 ] timeout-minutes: 60 needs: create-runners strategy: + fail-fast: false matrix: - arch: [ lmi ] - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh ${{ matrix.arch }} ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt-neo - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface gpt-neo-2.7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py huggingface gpt-neo-2.7b - docker rm -f $(docker ps -aq) - - name: Test bloom-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface bloom-7b1 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface bloom-7b1 - docker rm -f $(docker ps -aq) - - name: Test LLAMA-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface llama-2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface llama-2-7b - docker rm -f $(docker ps -aq) - - name: Test GPTJ-6B - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface gpt-j-6b - docker rm -f $(docker ps -aq) - - name: Test gpt4all-lora - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py huggingface 
gpt4all-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface gpt4all-lora - docker rm -f $(docker ps -aq) - - name: Test streaming bigscience/bloom-3b - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=1,2" > docker_env - python3 llm/prepare.py huggingface bigscience/bloom-3b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface bigscience/bloom-3b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: Test streaming t5-large - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=1" > docker_env - python3 llm/prepare.py huggingface t5-large - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py huggingface t5-large - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: hf-handler-${{ matrix.arch }}-logs - path: tests/integration/logs/ - - trt-llm-handler-test: - if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 120 - needs: create-runners + test: + - TestHfHandler + - TestTrtLlmHandler1 + - TestTrtLlmHandler2 + - TestSchedulerSingleGPU + - TestSchedulerMultiGPU + - TestLmiDist1 + - TestLmiDist2 + - TestVllm1 + - TestVllmLora + - TestLmiDistLora steps: - uses: actions/checkout@v4 - name: Clean env @@ -176,240 +78,25 @@ jobs: with: python-version: '3.10.x' - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: llama2-13b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm llama2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm llama2-13b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: falcon-7b triton repo with tp=1 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env - python3 llm/prepare.py trtllm falcon-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm falcon-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: internlm-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm internlm-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm internlm-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: baichuan2-13b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - 
python3 llm/prepare.py trtllm baichuan2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm baichuan2-13b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: chatglm3-6b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm chatglm3-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm chatglm3-6b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: GPT2 HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm gpt2 - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: SantaCoder HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm santacoder - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm santacoder - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: trtllm-handler-logs - path: tests/integration/logs/ - - trt-llm-handler-test-2: - if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 120 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: llama2-7b HF model with tp=4 and smoothquant - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm llama2-7b-smoothquant - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm-sq \ - serve - python3 llm/client.py trtllm llama2-7b-smoothquant - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: mistral-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm mistral-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: gpt-j-6b HF model with tp=1 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env - python3 llm/prepare.py trtllm gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm gpt-j-6b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: qwen-7b HF model with tp=4 - working-directory: tests/integration - run: | - rm -rf models - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - python3 llm/prepare.py trtllm qwen-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ - serve - python3 llm/client.py trtllm qwen-7b - rm -rf docker_env - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: trtllm-handler-quantization-logs - path: tests/integration/logs/ - - scheduler-single-gpu-test: - if: contains(fromJson('["", "scheduler"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' + run: pip3 install pytest requests "numpy<2" huggingface_hub - name: Install awscurl working-directory: tests/integration run: | curl -OL https://github.com/frankfliu/junkyard/releases/download/v0.2.2/awscurl chmod +x awscurl mkdir outputs - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt2 - working-directory: tests/integration - run: | - # Correctness test - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py correctness gpt2 - docker rm -f $(docker ps -aq) - - name: Test bloom-560m + - name: Test working-directory: tests/integration + env: + TEST_DJL_VERSION: ${{ inputs.djl-version }} run: | - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler bloom-560m - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py scheduler_single_gpu bloom-560m - docker rm -f $(docker ps -aq) - - name: Print outputs - working-directory: tests/integration - run: for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done + pytest -k ${{ matrix.test }} tests.py - name: Cleanup working-directory: tests/integration run: | rm -rf outputs rm awscurl - - name: On fail step + - name: On Failure if: ${{ failure() }} working-directory: tests/integration run: | @@ -417,599 +104,17 @@ jobs: rm -rf outputs && rm -rf models rm awscurl docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: rb-single-gpu-logs - path: tests/integration/logs/ - - scheduler-multi-gpu-test: - if: contains(fromJson('["", "scheduler"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install awscurl - working-directory: tests/integration - run: | - curl -OL https://github.com/frankfliu/junkyard/releases/download/v0.2.2/awscurl - chmod +x awscurl - mkdir outputs - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download models and dockers - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gptj-6b - working-directory: tests/integration - run: | - # Concurrent requests test - rm -rf models - python3 llm/prepare.py rolling_batch_scheduler gpt-j-6b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 rb_client.py scheduler_multi_gpu gpt-j-6b - docker rm -f $(docker ps -aq) - - name: Print outputs - working-directory: tests/integration - run: for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done - - name: Cleanup - working-directory: tests/integration - run: | - rm -rf models && rm -rf outputs - rm awscurl - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done - rm -rf outputs && rm -rf models - rm awscurl - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: rb-multi-gpu-logs - path: tests/integration/logs/ - - lmi-dist-test-1: - if: contains(fromJson('["", "lmi-dist"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." 
- while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test gpt-neox-20b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gpt-neox-20b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gpt-neox-20b - docker rm -f $(docker ps -aq) - - name: Test falcon-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist falcon-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist falcon-7b - docker rm -f $(docker ps -aq) - - name: Test falcon2-11b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist falcon-11b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist falcon-11b - docker rm -f $(docker ps -aq) - - name: Test flan-t5-xxl - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist flan-t5-xxl - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist flan-t5-xxl - docker rm -f $(docker ps -aq) - - name: Test gpt2 - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gpt2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gpt2 - docker rm -f $(docker ps -aq) - - name: Test mpt-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mpt-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mpt-7b - docker rm -f $(docker ps -aq) - - name: Test llama-2-tiny AutoAwq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-2-tiny - echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env - # partition - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot - # launch the container again - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/aot - python3 llm/client.py lmi_dist llama-2-tiny - sudo rm -rf models - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Remove model directory - working-directory: tests/integration - run: | - sudo rm -rf models - - 
name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: lmi-dist-logs-1 - path: tests/integration/logs/ - - lmi-dist-test-2: - if: contains(fromJson('["", "lmi-dist"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test octocoder - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist octocoder - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist octocoder - docker rm -f $(docker ps -aq) - - name: Test speculative-llama-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist speculative-llama-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist speculative-llama-13b - docker rm -f $(docker ps -aq) - - name: Test starcoder2-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist starcoder2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist starcoder2-7b - docker rm -f $(docker ps -aq) - - name: Test gemma-2b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist gemma-2b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist gemma-2b - docker rm -f $(docker ps -aq) - - name: Test llama2-13b-gptq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-13b-gptq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist llama2-13b-gptq - docker rm -f $(docker ps -aq) - - name: Test Mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mistral-7b - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-32k - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-7b-32k - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist llama2-7b-32k - docker rm -f $(docker ps -aq) - - name: Test mistral-7b-128k-awq - 
working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-128k-awq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist mistral-7b-128k-awq - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-chat - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-7b-chat - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py lmi_dist_chat llama2-7b-chat - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: lmi-dist-logs-2 - path: tests/integration/logs/ - - vllm-test: - if: contains(fromJson('["", "vllm"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm llama2-13b - docker rm -f $(docker ps -aq) - - name: Test llama2-13b awq - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b-awq - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm llama2-13b - docker rm -f $(docker ps -aq) - - name: Test gpt-neox-20b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm gpt-neox-20b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm gpt-neox-20b - docker rm -f $(docker ps -aq) - - name: Test Mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm mistral-7b - docker rm -f $(docker ps -aq) - - name: Test phi-2 - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm phi-2 - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm 
phi-2 - docker rm -f $(docker ps -aq) - - name: Test starcoder2-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm starcoder2-7b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm starcoder2-7b - docker rm -f $(docker ps -aq) - - name: Test gemma-2b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm gemma-2b - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm gemma-2b - docker rm -f $(docker ps -aq) - - name: Test llama2-7b-chat - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-7b-chat - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve -m test=file:/opt/ml/model/test/ - python3 llm/client.py vllm_chat llama2-7b-chat - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: vllm-logs - path: tests/integration/logs/ - - - vllm-lora-test: - if: contains(fromJson('["", "vllm-lora"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test vllm unmerged lora - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm unmerged lora overflow - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama-7b-unmerged-lora-overflow - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama-7b-unmerged-lora-overflow - docker rm -f $(docker ps -aq) - - name: Test vllm lora awq - llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama2-13b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama2-13b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b-unmerged-lora - 
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters mistral-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora awq - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm mistral-7b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters mistral-7b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test vllm lora - llama-3-8b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py vllm llama3-8b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py vllm_adapters llama3-8b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - - name: Upload test logs - uses: actions/upload-artifact@v3 - with: - name: vllm-lora-logs - path: tests/integration/logs/ - - lmi-dist-lora-test: - if: contains(fromJson('["", "lmi-dist-lora"]'), github.event.inputs.run_test) - runs-on: [ self-hosted, g6 ] - timeout-minutes: 60 - needs: create-runners - steps: - - uses: actions/checkout@v4 - - name: Clean env - run: | - yes | docker system prune -a --volumes - sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ - echo "wait dpkg lock..." - while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done - - name: Set up Python3 - uses: actions/setup-python@v5 - with: - python-version: '3.10.x' - - name: Install pip dependencies - run: pip3 install requests "numpy<2" huggingface_hub - - name: Build container name - run: ./serving/docker/scripts/docker_name_builder.sh lmi ${{ github.event.inputs.djl-version }} - - name: Download docker - working-directory: tests/integration - run: | - docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG - - name: Test lmi-dist unmerged lora - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist unmerged lora overflow - llama7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama-7b-unmerged-lora-overflow - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama-7b-unmerged-lora-overflow - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora awq - llama2-13b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama2-13b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama2-13b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 
llm/client.py lmi_dist_adapters mistral-7b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora awq - mistral-7b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist mistral-7b-awq-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters mistral-7b-awq-unmerged-lora - docker rm -f $(docker ps -aq) - - name: Test lmi-dist lora - llama-3-8b - working-directory: tests/integration - run: | - rm -rf models - python3 llm/prepare.py lmi_dist llama3-8b-unmerged-lora - ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models lmi \ - serve - python3 llm/client.py lmi_dist_adapters llama3-8b-unmerged-lora - docker rm -f $(docker ps -aq) - - name: On fail step - if: ${{ failure() }} - working-directory: tests/integration - run: | - docker rm -f $(docker ps -aq) || true - cat logs/serving.log - name: Upload test logs + if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: lmi-dist-lora-logs - path: tests/integration/logs/ + name: test-${{ matrix.test }}-logs + path: tests/integration/all_logs/ stop-runners: if: always() runs-on: [ self-hosted, scheduler ] - needs: [ create-runners, hf-handler-test, trt-llm-handler-test, trt-llm-handler-test-2, scheduler-single-gpu-test, scheduler-multi-gpu-test, lmi-dist-test-1, lmi-dist-test-2, vllm-test, vllm-lora-test, lmi-dist-lora-test] + needs: [ create-runners, test] steps: - name: Stop all instances run: | diff --git a/serving/docker/scripts/docker_name_builder.sh b/serving/docker/scripts/docker_name_builder.sh index e2d78f6c4..250e17a89 100755 --- a/serving/docker/scripts/docker_name_builder.sh +++ b/serving/docker/scripts/docker_name_builder.sh @@ -13,8 +13,4 @@ else fi fi -if [[ -n "$GITHUB_ENV" ]]; then - echo "DJLSERVING_DOCKER_TAG=$image" >> $GITHUB_ENV -else - echo "$image" -fi +echo "DJLSERVING_DOCKER_TAG=$image" >> $GITHUB_ENV diff --git a/tests/integration/.gitignore b/tests/integration/.gitignore index 3875dea7c..65e4bde6c 100644 --- a/tests/integration/.gitignore +++ b/tests/integration/.gitignore @@ -1,3 +1,4 @@ /docker_env /logs +/all_logs /models diff --git a/tests/integration/tests.py b/tests/integration/tests.py index 648700b04..93ec1c450 100644 --- a/tests/integration/tests.py +++ b/tests/integration/tests.py @@ -4,21 +4,26 @@ import subprocess import llm.prepare as prepare import llm.client as client -import rb_client +import rb_client as rb_client -djl_version = '' +djl_version = os.environ.get('TEST_DJL_VERSION', '').strip() class Runner: - def __init__(self, container): + def __init__(self, container, test_name=None): self.container = container - flavor = subprocess.run([ - '../../serving/docker/scripts/docker_name_builder.sh', container, - djl_version - ], - capture_output=True, - text=True).stdout.strip() + self.test_name = test_name + + # Compute flavor + if djl_version is not None and len(djl_version) > 0: + if container == "cpu": + flavor = djl_version + else: + flavor = f"{djl_version}-{container}" + else: + flavor = f"{container}-nightly" + self.image = f"deepjavalibrary/djl-serving:{flavor}" def __enter__(self): @@ -27,57 +32,401 @@ def __enter__(self): return self def __exit__(self, *args): - container = subprocess.run(['docker', 'ps', '-aq'], - capture_output=True, - text=True).stdout.strip() - if container != '': - subprocess.run(['docker', 'rm', '-f', container], - shell=True, - check=True) + if self.test_name is not None: 
+            esc_test_name = self.test_name.replace("/", "-")
+            os.system(f"mkdir -p all_logs/{esc_test_name}")
+            os.system(f"cp -r logs all_logs/{esc_test_name}")
+        subprocess.run(["./remove_container.sh"], check=True)
         os.system("cat logs/serving.log")
 
-    def launch(self, env_vars=None):
+    def launch(self, env_vars=None, cmd=None):
         if env_vars is not None:
             with open("docker_env", "w") as f:
                 f.write(env_vars)
+        else:
+            if os.path.isfile("docker_env"):
+                os.remove("docker_env")
+
+        if cmd is None:
+            cmd = 'serve -m test=file:/opt/ml/model/test/'
 
         model_dir = os.path.join(os.getcwd(), 'models')
         subprocess.run(
-            f'./launch_container.sh {self.image} {model_dir} {self.container} serve -m test=file:/opt/ml/model/test/'
+            f'./launch_container.sh {self.image} {model_dir} {self.container} {cmd}'
             .split(),
             check=True)
 
 
-def test_gpt_neo():
-    with Runner('deepspeed') as r:
-        prepare.build_hf_handler_model("gpt-neo-2.7b")
-        r.launch()
-        client.run("huggingface gpt-neo-2.7b".split())
+class TestHfHandler:
+    # Runs on g5.12xl
+    def test_gpt_neo(self):
+        with Runner('lmi', 'gpt-neo-2.7b') as r:
+            prepare.build_hf_handler_model("gpt-neo-2.7b")
+            r.launch()
+            client.run("huggingface gpt-neo-2.7b".split())
+
+    def test_bloom_7b(self):
+        with Runner('lmi', 'bloom-7b1') as r:
+            prepare.build_hf_handler_model("bloom-7b1")
+            r.launch()
+            client.run("huggingface bloom-7b1".split())
+
+    def test_llama2_7b(self):
+        with Runner('lmi', 'llama-2-7b') as r:
+            prepare.build_hf_handler_model("llama-2-7b")
+            r.launch()
+            client.run("huggingface llama-2-7b".split())
+
+    def test_gptj_6B(self):
+        with Runner('lmi', 'gpt-j-6b') as r:
+            prepare.build_hf_handler_model("gpt-j-6b")
+            r.launch()
+            client.run("huggingface gpt-j-6b".split())
+
+    def test_gpt4all_lora(self):
+        with Runner('lmi', 'gpt4all-lora') as r:
+            prepare.build_hf_handler_model("gpt4all-lora")
+            r.launch()
+            client.run("huggingface gpt4all-lora".split())
+
+    def test_streaming_bigscience_bloom_3b(self):
+        with Runner('lmi', 'bigscience/bloom-3b') as r:
+            prepare.build_hf_handler_model("bigscience/bloom-3b")
+            r.launch("CUDA_VISIBLE_DEVICES=1,2")
+            client.run("huggingface bigscience/bloom-3b".split())
+
+    def test_streaming_t5_large(self):
+        with Runner('lmi', 't5-large') as r:
+            prepare.build_hf_handler_model("t5-large")
+            r.launch("CUDA_VISIBLE_DEVICES=1")
+            client.run("huggingface t5-large".split())
+
+
+class TestTrtLlmHandler1:
+    # Runs on g5.12xl
+    def test_llama2_13b_tp4(self):
+        with Runner('tensorrt-llm', 'llama2-13b') as r:
+            prepare.build_trtllm_handler_model("llama2-13b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm llama2-13b".split())
+
+    def test_falcon_triton(self):
+        with Runner('tensorrt-llm', 'falcon-7b') as r:
+            prepare.build_trtllm_handler_model("falcon-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0")
+            client.run("trtllm falcon-7b".split())
+
+    def test_internlm_7b(self):
+        with Runner('tensorrt-llm', 'internlm-7b') as r:
+            prepare.build_trtllm_handler_model("internlm-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm internlm-7b".split())
+
+    def test_baichuan2_13b(self):
+        with Runner('tensorrt-llm', 'baichuan2-13b') as r:
+            prepare.build_trtllm_handler_model("baichuan2-13b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm baichuan2-13b".split())
+
+    def test_chatglm3_6b(self):
+        with Runner('tensorrt-llm', 'chatglm3-6b') as r:
+            prepare.build_trtllm_handler_model("chatglm3-6b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm chatglm3-6b".split())
+
+    def test_gpt2(self):
+        with Runner('tensorrt-llm', 'gpt2') as r:
+            prepare.build_trtllm_handler_model("gpt2")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm gpt2".split())
+
+    def test_santacoder(self):
+        with Runner('tensorrt-llm', 'santacoder') as r:
+            prepare.build_trtllm_handler_model("santacoder")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm santacoder".split())
+
+
+class TestTrtLlmHandler2:
+    # Runs on g5.12xl
+    def test_llama2_7b_hf_smoothquant(self):
+        with Runner('tensorrt-llm', 'llama2-7b-smoothquant') as r:
+            prepare.build_trtllm_handler_model("llama2-7b-smoothquant")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm llama2-7b-smoothquant".split())
+
+    def test_mistral(self):
+        with Runner('tensorrt-llm', 'mistral-7b') as r:
+            prepare.build_trtllm_handler_model("mistral-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm mistral-7b".split())
+
+    def test_gpt_j_6b(self):
+        with Runner('tensorrt-llm', 'gpt-j-6b') as r:
+            prepare.build_trtllm_handler_model("gpt-j-6b")
+            r.launch("CUDA_VISIBLE_DEVICES=0")
+            client.run("trtllm gpt-j-6b".split())
+
+    def test_qwen_7b(self):
+        with Runner('tensorrt-llm', 'qwen-7b') as r:
+            prepare.build_trtllm_handler_model("qwen-7b")
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3")
+            client.run("trtllm qwen-7b".split())
+
+
+class TestSchedulerSingleGPU:
+    # Runs on g5.12xl
+
+    def test_gpt2(self):
+        with Runner('lmi', 'gpt2') as r:
+            prepare.build_rolling_batch_model("gpt2")
+            r.launch()
+            rb_client.run("correctness gpt2".split())
+
+    def test_bloom_560m(self):
+        with Runner('lmi', 'bloom-560m') as r:
+            prepare.build_rolling_batch_model("bloom-560m")
+            r.launch()
+            rb_client.run("scheduler_single_gpu bloom-560m".split())
+
+
+class TestSchedulerMultiGPU:
+    # Runs on g5.12xl
+
+    def test_gptj_6b(self):
+        with Runner('lmi', 'gpt-j-6b') as r:
+            prepare.build_rolling_batch_model("gpt-j-6b")
+            r.launch()
+            rb_client.run("scheduler_multi_gpu gpt-j-6b".split())
+
+
+class TestLmiDist1:
+    # Runs on g5.12xl
+
+    def test_gpt_neox_20b(self):
+        with Runner('lmi', 'gpt-neox-20b') as r:
+            prepare.build_lmi_dist_model("gpt-neox-20b")
+            r.launch()
+            client.run("lmi_dist gpt-neox-20b".split())
+
+    def test_falcon_7b(self):
+        with Runner('lmi', 'falcon-7b') as r:
+            prepare.build_lmi_dist_model("falcon-7b")
+            r.launch()
+            client.run("lmi_dist falcon-7b".split())
+
+    def test_falcon2_11b(self):
+        with Runner('lmi', 'falcon-11b') as r:
+            prepare.build_lmi_dist_model("falcon-11b")
+            r.launch()
+            client.run("lmi_dist falcon-11b".split())
+
+    def test_gpt2(self):
+        with Runner('lmi', 'gpt2') as r:
+            envs = [
+                "OPTION_MAX_ROLLING_BATCH_SIZE=2",
+                "OPTION_OUTPUT_FORMATTER=jsonlines",
+                "TENSOR_PARALLEL_DEGREE=1", "HF_MODEL_ID=gpt2",
+                "OPTION_TASK=text-generation", "OPTION_ROLLING_BATCH=lmi-dist"
+            ]
+            r.launch("\n".join(envs))
+            client.run("lmi_dist gpt2".split())
+
+    def test_mpt_7b(self):
+        with Runner('lmi', 'mpt-7b') as r:
+            prepare.build_lmi_dist_model("mpt-7b")
+            r.launch()
+            client.run("lmi_dist mpt-7b".split())
+
+    def test_llama2_tiny_autoawq(self):
+        with Runner('lmi', 'llama-2-tiny-autoawq') as r:
+            prepare.build_lmi_dist_model("llama-2-tiny")
+            r.launch(
+                "CUDA_VISIBLE_DEVICES=0,1,2,3",
+                cmd=
+                "partition --model-dir /opt/ml/input/data/training --save-mp-checkpoint-path /opt/ml/input/data/training/aot"
+            )
+            r.launch("CUDA_VISIBLE_DEVICES=0,1,2,3",
+                     cmd="serve -m test=file:/opt/ml/model/test/aot")
+            client.run("lmi_dist llama-2-tiny".split())
+
+
+class TestLmiDist2:
+    # Runs on g5.12xl
+
+    def test_octocoder(self):
+        with 
Runner('lmi', 'octocoder') as r: + prepare.build_lmi_dist_model("octocoder") + r.launch() + client.run("lmi_dist octocoder".split()) + + def test_speculative_llama_13b(self): + with Runner('lmi', 'speculative-llama-13b') as r: + prepare.build_lmi_dist_model("speculative-llama-13b") + r.launch() + client.run("lmi_dist speculative-llama-13b".split()) + + def test_starcoder2_7b(self): + with Runner('lmi', 'starcoder2-7b') as r: + prepare.build_lmi_dist_model("starcoder2-7b") + r.launch() + client.run("lmi_dist starcoder2-7b".split()) + + def test_gemma_2b(self): + with Runner('lmi', 'gemma-2b') as r: + prepare.build_lmi_dist_model("gemma-2b") + r.launch() + client.run("lmi_dist gemma-2b".split()) + + def test_llama2_13b_gptq(self): + with Runner('lmi', 'llama2-13b-gptq') as r: + prepare.build_lmi_dist_model("llama2-13b-gptq") + r.launch() + client.run("lmi_dist llama2-13b-gptq".split()) + + def test_mistral_7b(self): + with Runner('lmi', 'mistral-7b') as r: + prepare.build_lmi_dist_model("mistral-7b") + r.launch() + client.run("lmi_dist mistral-7b".split()) + + def test_llama2_7b_32k(self): + with Runner('lmi', 'llama2-7b-32k') as r: + prepare.build_lmi_dist_model("llama2-7b-32k") + r.launch() + client.run("lmi_dist llama2-7b-32k".split()) + + def test_mistral_7b_128k_awq(self): + with Runner('lmi', 'mistral-7b-128k-awq') as r: + prepare.build_lmi_dist_model("mistral-7b-128k-awq") + r.launch() + client.run("lmi_dist mistral-7b-128k-awq".split()) + + def test_llama2_7b_chat(self): + with Runner('lmi', 'llama2-7b-chat') as r: + prepare.build_lmi_dist_model("llama2-7b-chat") + r.launch() + client.run("lmi_dist_chat llama2-7b-chat".split()) + + +class TestVllm1: + # Runs on g5.12xl + + def test_gpt_neox_20b(self): + with Runner('lmi', 'gpt-neox-20b') as r: + prepare.build_vllm_model("gpt-neox-20b") + r.launch() + client.run("vllm gpt-neox-20b".split()) + + def test_mistral_7b(self): + with Runner('lmi', 'mistral-7b') as r: + prepare.build_vllm_model("mistral-7b") + r.launch() + client.run("vllm mistral-7b".split()) + + def test_phi2(self): + with Runner('lmi', 'phi-2') as r: + prepare.build_vllm_model("phi-2") + r.launch() + client.run("vllm phi-2".split()) + + def test_starcoder2_7b(self): + with Runner('lmi', 'starcoder2-7b') as r: + prepare.build_vllm_model("starcoder2-7b") + r.launch() + client.run("vllm starcoder2-7b".split()) + + def test_gemma_2b(self): + with Runner('lmi', 'gemma-2b') as r: + prepare.build_vllm_model("gemma-2b") + r.launch() + client.run("vllm gemma-2b".split()) + + def test_llama2_7b_chat(self): + with Runner('lmi', 'llama2-7b-chat') as r: + prepare.build_vllm_model("llama2-7b-chat") + r.launch() + client.run("vllm_chat llama2-7b-chat".split()) + + +class TestVllmLora: + # Runs on g5.12xl + + def test_lora_unmerged(self): + with Runner('lmi', 'llama-7b-unmerged-lora') as r: + prepare.build_vllm_model("llama-7b-unmerged-lora") + r.launch() + client.run("vllm_adapters llama-7b-unmerged-lora".split()) + + def test_lora_unmerged_overflow(self): + with Runner('lmi', 'llama-7b-unmerged-lora-overflow') as r: + prepare.build_vllm_model("llama-7b-unmerged-lora-overflow") + r.launch() + client.run("vllm_adapters llama-7b-unmerged-lora-overflow".split()) + + def test_lora_awq_llama2_13b(self): + with Runner('lmi', 'llama2-13b-awq-unmerged-lora') as r: + prepare.build_vllm_model("llama2-13b-awq-unmerged-lora") + r.launch() + client.run("vllm_adapters llama2-13b-awq-unmerged-lora".split()) + + def test_lora_mistral_7b(self): + with Runner('lmi', 'mistral-7b-unmerged-lora') as r: + 
prepare.build_vllm_model("mistral-7b-unmerged-lora") + r.launch() + client.run("vllm_adapters mistral-7b-unmerged-lora".split()) + + def test_lora_awq_mistral_7b(self): + with Runner('lmi', 'mistral-7b-awq-unmerged-lora') as r: + prepare.build_vllm_model("mistral-7b-awq-unmerged-lora") + r.launch() + client.run("vllm_adapters mistral-7b-awq-unmerged-lora".split()) + + def test_lora_llama3_8b(self): + with Runner('lmi', 'llama3-8b-unmerged-lora') as r: + prepare.build_vllm_model("llama3-8b-unmerged-lora") + r.launch() + client.run("vllm_adapters llama3-8b-unmerged-lora".split()) -def test_llama_7b(): - with Runner('deepspeed') as r: - prepare.build_hf_handler_model("open-llama-7b") - r.launch() - client.run("huggingface open-llama-7b".split()) +class TestLmiDistLora: + # Runs on g5.12xl + def test_lora_unmerged(self): + with Runner('lmi', 'llama-7b-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama-7b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters llama-7b-unmerged-lora".split()) -def test_unmerged_lora_llama7b(): - with Runner('deepspeed') as r: - prepare.build_hf_handler_model("llama-7b-unmerged-lora") - r.launch() - client.run("huggingface llama-7b-unmerged-lora".split()) + def test_lora_unmerged_overflow(self): + with Runner('lmi', 'llama-7b-unmerged-lora-overflow') as r: + prepare.build_lmi_dist_model("llama-7b-unmerged-lora-overflow") + r.launch() + client.run( + "lmi_dist_adapters llama-7b-unmerged-lora-overflow".split()) + def test_lora_awq_llama2_13b(self): + with Runner('lmi', 'llama2-13b-awq-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama2-13b-awq-unmerged-lora") + r.launch() + client.run( + "lmi_dist_adapters llama2-13b-awq-unmerged-lora".split()) -def test_falcon_7b_triton_tp1(): - with Runner('tensorrt-llm') as r: - prepare.build_trtllm_handler_model("falcon-7b") - r.launch("CUDA_VISIBLE_DEVICES=0") - client.run("trtllm falcon-7b".split()) + def test_lora_mistral_7b(self): + with Runner('lmi', 'mistral-7b-unmerged-lora') as r: + prepare.build_lmi_dist_model("mistral-7b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters mistral-7b-unmerged-lora".split()) + def test_lora_awq_mistral_7b(self): + with Runner('lmi', 'mistral-7b-awq-unmerged-lora') as r: + prepare.build_lmi_dist_model("mistral-7b-awq-unmerged-lora") + r.launch() + client.run( + "lmi_dist_adapters mistral-7b-awq-unmerged-lora".split()) -def test_bloom_560m(): - with Runner('deepspeed') as r: - prepare.build_rolling_batch_model("bloom-560m") - r.launch() - rb_client.run("scheduler_single_gpu bloom-560m".split()) + def test_lora_llama3_8b(self): + with Runner('lmi', 'llama3-8b-unmerged-lora') as r: + prepare.build_lmi_dist_model("llama3-8b-unmerged-lora") + r.launch() + client.run("lmi_dist_adapters llama3-8b-unmerged-lora".split())