This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

andy/bump main to v0.3.2 #49

Closed
wants to merge 113 commits
Commits (113)
6b7de1a
[ROCm] add support to ROCm 6.0 and MI300 (#2274)
hongxiayang Jan 26, 2024
3a0e1fc
Support for Stable LM 2 (#2598)
dakotamahan-stability Jan 26, 2024
390b495
Don't build punica kernels by default (#2605)
pcmoritz Jan 26, 2024
beb89f6
AWQ: Up to 2.66x higher throughput (#2566)
casper-hansen Jan 27, 2024
220a476
Use head_dim in config if exists (#2622)
xiangxu-google Jan 27, 2024
3801700
Implement custom all reduce kernels (#2192)
hanzhi713 Jan 27, 2024
5f036d2
[Minor] Fix warning on Ray dependencies (#2630)
WoosukKwon Jan 27, 2024
f8ecb84
Speed up Punica compilation (#2632)
WoosukKwon Jan 28, 2024
89be30f
Small async_llm_engine refactor (#2618)
andoorve Jan 28, 2024
7d64841
Update Ray version requirements (#2636)
simon-mo Jan 28, 2024
9090bf0
Support FP8-E5M2 KV Cache (#2279)
zhaoyang-star Jan 29, 2024
b72af8f
Fix error when tp > 1 (#2644)
zhaoyang-star Jan 29, 2024
1b20639
No repeated IPC open (#2642)
hanzhi713 Jan 29, 2024
ea8489f
ROCm: Allow setting compilation target (#2581)
rlrs Jan 29, 2024
5d60def
DeepseekMoE support with Fused MoE kernel (#2453)
zwd003 Jan 30, 2024
ab40644
Fused MOE for Mixtral (#2542)
pcmoritz Jan 30, 2024
d79ced3
Fix 'Actor methods cannot be called directly' when using `--engine-us…
HermitSun Jan 30, 2024
4f65af0
Add swap_blocks unit tests (#2616)
sh1ng Jan 30, 2024
bbe9bd9
[Minor] Fix a small typo (#2672)
pcmoritz Jan 30, 2024
105a40f
[Minor] Fix false warning when TP=1 (#2674)
WoosukKwon Jan 30, 2024
3dad944
Add quantized mixtral support (#2673)
WoosukKwon Jan 31, 2024
1af090b
Bump up version to v0.3.0 (#2656)
zhuohan123 Jan 31, 2024
d69ff0c
Fixes assertion failure in prefix caching: the lora index mapping sho…
sighingnow Jan 31, 2024
c664b0e
fix some bugs (#2689)
zspo Jan 31, 2024
89efcf1
[Minor] Fix test_cache.py CI test failure (#2684)
pcmoritz Jan 31, 2024
d0d93b9
Add unit test for Mixtral MoE layer (#2677)
pcmoritz Jan 31, 2024
93b38be
Refactor Prometheus and Add Request Level Metrics (#2316)
robertgshaw2-redhat Jan 31, 2024
cd9e60c
Add Internlm2 (#2666)
Feb 1, 2024
923797f
Fix compile error when using rocm (#2648)
zhaoyang-star Feb 1, 2024
b9e96b1
fix python 3.8 syntax (#2716)
simon-mo Feb 1, 2024
bb8c697
Update README for meetup slides (#2718)
simon-mo Feb 1, 2024
c410f5d
Use revision when downloading the quantization config file (#2697)
Pernekhan Feb 1, 2024
96b6f47
Remove hardcoded `device="cuda" ` to support more devices (#2503)
jikunshang Feb 1, 2024
0e163fc
Fix default length_penalty to 1.0 (#2667)
zspo Feb 1, 2024
4abf633
Add one example to run batch inference distributed on Ray (#2696)
c21 Feb 2, 2024
5ed704e
docs: fix langchain (#2736)
mspronesti Feb 4, 2024
51cd22c
set&get llm internal tokenizer instead of the TokenizerGroup (#2741)
dancingpipi Feb 4, 2024
5a6c81b
Remove eos tokens from output by default (#2611)
zcnrex Feb 4, 2024
c9b45ad
Require triton >= 2.1.0 (#2746)
whyiug Feb 5, 2024
72d3a30
[Minor] Fix benchmark_latency script (#2765)
WoosukKwon Feb 5, 2024
56f738a
[ROCm] Fix some kernels failed unit tests (#2498)
hongxiayang Feb 5, 2024
b92adec
Set local logging level via env variable (#2774)
gardberg Feb 5, 2024
2ccee3d
[ROCm] Fixup arch checks for ROCM (#2627)
dllehr-amd Feb 5, 2024
f0d4e14
Add fused top-K softmax kernel for MoE (#2769)
WoosukKwon Feb 6, 2024
ed70c70
modelscope: fix issue when model parameter is not a model id but path…
liuyhwangyh Feb 6, 2024
fe6d09a
[Minor] More fix of test_cache.py CI test failure (#2750)
LiuXiaoxuanPKU Feb 6, 2024
c81dddb
[ROCm] Fix build problem resulted from previous commit related to FP8…
hongxiayang Feb 7, 2024
931746b
Add documentation on how to do incremental builds (#2796)
pcmoritz Feb 7, 2024
65b89d1
[Ray] Integration compiled DAG off by default (#2471)
rkooo567 Feb 8, 2024
3711811
Disable custom all reduce by default (#2808)
WoosukKwon Feb 8, 2024
0580aab
[ROCm] support Radeon™ 7900 series (gfx1100) without using flash-atte…
hongxiayang Feb 11, 2024
4ca2c35
Add documentation section about LoRA (#2834)
pcmoritz Feb 12, 2024
5638364
Refactor 2 awq gemm kernels into m16nXk32 (#2723)
zcnrex Feb 12, 2024
a4211a4
Serving Benchmark Refactoring (#2433)
ywang96 Feb 13, 2024
f964493
[CI] Ensure documentation build is checked in CI (#2842)
simon-mo Feb 13, 2024
5c976a7
Refactor llama family models (#2637)
esmeetu Feb 13, 2024
ea35600
Revert "Refactor llama family models (#2637)" (#2851)
pcmoritz Feb 13, 2024
a463c33
Use CuPy for CUDA graphs (#2811)
WoosukKwon Feb 13, 2024
317b29d
Remove Yi model definition, please use `LlamaForCausalLM` instead (#2…
pcmoritz Feb 13, 2024
2a543d6
Add LoRA support for Mixtral (#2831)
tterrysun Feb 13, 2024
7eacffd
Migrate InternLMForCausalLM to LlamaForCausalLM (#2860)
pcmoritz Feb 14, 2024
0c48b37
Fix internlm after https://github.com/vllm-project/vllm/pull/2860 (#2…
pcmoritz Feb 14, 2024
7e45107
[Fix] Fix memory profiling when GPU is used by multiple processes (#2…
WoosukKwon Feb 14, 2024
87069cc
Fix docker python version (#2845)
NikolaBorisov Feb 14, 2024
4efbac6
Migrate AquilaForCausalLM to LlamaForCausalLM (#2867)
esmeetu Feb 14, 2024
25e86b6
Don't use cupy NCCL for AMD backends (#2855)
WoosukKwon Feb 14, 2024
31348df
Align LoRA code between Mistral and Mixtral (fixes #2875) (#2880)
pcmoritz Feb 15, 2024
d7afab6
[BugFix] Fix GC bug for `LLM` class (#2882)
WoosukKwon Feb 15, 2024
4f2ad11
Fix DeciLM (#2883)
pcmoritz Feb 15, 2024
5255d99
[ROCm] Dockerfile fix for flash-attention build (#2885)
hongxiayang Feb 15, 2024
64da65b
Prefix Caching- fix t4 triton error (#2517)
caoshiyi Feb 16, 2024
5f08050
Bump up to v0.3.1 (#2887)
WoosukKwon Feb 16, 2024
185b2c2
Defensively copy `sampling_params` (#2881)
njhill Feb 17, 2024
8f36444
multi-LoRA as extra models in OpenAI server (#2775)
jvmncs Feb 17, 2024
786b7f1
Add code-revision config argument for Hugging Face Hub (#2892)
mbm-ai Feb 18, 2024
537c975
[Minor] Small fix to make distributed init logic in worker looks clea…
zhuohan123 Feb 18, 2024
a61f052
[Test] Add basic correctness test (#2908)
zhuohan123 Feb 19, 2024
ab3a5a8
Support OLMo models. (#2832)
Isotr0py Feb 19, 2024
86fd8bb
Add warning to prevent changes to benchmark api server (#2858)
simon-mo Feb 19, 2024
e433c11
Fix `vllm:prompt_tokens_total` metric calculation (#2869)
ronensc Feb 19, 2024
264017a
[ROCm] include gfx908 as supported (#2792)
jamestwhedbee Feb 20, 2024
63e2a64
[FIX] Fix beam search test (#2930)
zhuohan123 Feb 20, 2024
181b27d
Make vLLM logging formatting optional (#2877)
Yard1 Feb 20, 2024
017d9f1
Add metrics to RequestOutput (#2876)
Yard1 Feb 21, 2024
5253eda
Add Gemma model (#2964)
xiangxu-google Feb 21, 2024
c20ecb6
Upgrade transformers to v4.38.0 (#2965)
WoosukKwon Feb 21, 2024
a9c8212
[FIX] Add Gemma model to the doc (#2966)
zhuohan123 Feb 21, 2024
dc903e7
[ROCm] Upgrade transformers to v4.38.0 (#2967)
WoosukKwon Feb 21, 2024
7d2dcce
Support per-request seed (#2514)
njhill Feb 21, 2024
8fbd84b
Bump up version to v0.3.2 (#2968)
zhuohan123 Feb 21, 2024
7c4304b
Add sparsity support based with magic_wand GPU kernels
robertgshaw2-redhat Feb 1, 2024
5344a01
Update README.md
mgoin Feb 2, 2024
81dba47
Semi-structured 2:4 sparsity via SparseSemiStructuredTensor #4
afeldman-nm Feb 2, 2024
cf8eed7
Sparse fused gemm integration (#12)
LucasWilkinson Feb 14, 2024
7527b9c
Abf149/fix semi structured sparse (#16)
afeldman-nm Feb 16, 2024
3c11f56
Enable bfloat16 for sparse_w16a16 (#18)
mgoin Feb 16, 2024
8147811
seed workflow (#19)
andy-neuma Feb 16, 2024
e802bc2
Add bias support for sparse layers (#25)
mgoin Feb 16, 2024
b976653
Use naive decompress for SM<8.0 (#32)
mgoin Feb 21, 2024
78ba5c1
Varun/benchmark workflow (#28)
varun-sundar-rabindranath Feb 21, 2024
fbfd764
initial GHA workflows for "build test" and "remote push" (#27)
andy-neuma Feb 21, 2024
37883e0
Only import magic_wand if sparsity is enabled (#37)
mgoin Feb 21, 2024
acf16bf
manually reverted requirements to match v0.3.2
robertgshaw2-redhat Feb 22, 2024
dbf3cab
Merge branch 'main' into rs/bump-main-to-v0.3.2
robertgshaw2-redhat Feb 22, 2024
0feedf9
reverted requirements
robertgshaw2-redhat Feb 22, 2024
ce8164d
removed duplicate
robertgshaw2-redhat Feb 22, 2024
166c13b
format
robertgshaw2-redhat Feb 22, 2024
1b395b4
added noqa to upstream scripts for linter
robertgshaw2-redhat Feb 22, 2024
8d935be
format
robertgshaw2-redhat Feb 22, 2024
acb8615
Sparsity fix (#40)
robertgshaw2-redhat Feb 22, 2024
4b44479
Rs/marlin downstream v0.3.2 (#43)
robertgshaw2-redhat Feb 22, 2024
9209f15
additional updates to "bump-to-v0.3.2" (#39)
andy-neuma Feb 23, 2024
b1e14c2
move to 4 x gpu
Feb 23, 2024
Files changed
14 changes: 10 additions & 4 deletions .buildkite/run-benchmarks.sh
@@ -6,27 +6,31 @@ set -o pipefail
# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

(wget && curl) || (apt-get update && apt-get install -y wget curl)
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# run benchmarks and upload the result to buildkite
# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend openai \
--dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer meta-llama/Llama-2-7b-chat-hf 2>&1 | tee benchmark_serving.txt
--tokenizer meta-llama/Llama-2-7b-chat-hf \
--save-result \
2>&1 | tee benchmark_serving.txt
bench_serving_exit_code=$?
kill $server_pid

@@ -44,7 +48,7 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
tail -n 5 benchmark_serving.txt >> benchmark_results.md # last 5 lines
tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines

# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -61,3 +65,5 @@ fi
if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi

/workspace/buildkite-agent artifact upload openai-*.json
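
Note: benchmark_serving.py is now run with --save-result, and the final line above uploads the resulting openai-*.json file as a Buildkite artifact. A later pipeline step (not part of this diff; shown only as a hypothetical sketch) could pull that artifact back down for inspection:

# Hypothetical follow-up step -- not in this PR. Downloads the serving-benchmark
# JSON uploaded by run-benchmarks.sh; the glob matches the upload pattern above.
buildkite-agent artifact download "openai-*.json" .
cat openai-*.json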
19 changes: 17 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -11,8 +11,16 @@ steps:
- label: AsyncEngine Test
command: pytest -v -s async_engine

- label: Distributed Test
command: pytest -v -s test_comm_ops.py
- label: Basic Correctness Test
command: pytest -v -s --forked basic_correctness

- label: Distributed Comm Ops Test
command: pytest -v -s --forked test_comm_ops.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.

- label: Distributed Correctness Test
command: pytest -v -s --forked test_basic_distributed_correctness.py
working_dir: "/vllm-workspace/tests/distributed"
num_gpus: 2 # only support 1 or 2 for now.

@@ -49,3 +57,10 @@ steps:
commands:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Documentation Build
working_dir: "/vllm-workspace/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- SPHINXOPTS=\"-W\" make html
6 changes: 4 additions & 2 deletions .buildkite/test-template.j2
@@ -5,7 +5,7 @@
steps:
- label: ":docker: build image"
commands:
- "docker build --tag {{ docker_image }} --target test --progress plain ."
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
- "docker push {{ docker_image }}"
env:
DOCKER_BUILDKIT: "1"
@@ -35,13 +35,15 @@ steps:
- image: "{{ docker_image }}"
command: ["bash"]
args:
- "-c"
- '-c'
- "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
{% if not step.no_gpu %}
resources:
requests:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
limits:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
- name: HF_TOKEN
valueFrom:
2 changes: 0 additions & 2 deletions .github/actions/nm-build-vllm/action.yml
@@ -19,8 +19,6 @@ runs:
steps:
- id: build
run: |
# TODO: this is a hack ... fix it later
# pyenv hardcoded ... python version hardcoded ...
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
13 changes: 9 additions & 4 deletions .github/actions/nm-set-env/action.yml
@@ -1,21 +1,26 @@
name: set neuralmagic env
description: 'sets environment variables for neuralmagic'
inputs:
hf_home:
hf_token:
description: 'Hugging Face home'
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
required: true
runs:
using: composite
steps:
- run: |
echo "HF_HOME=${HF_HOME_TOKEN}" >> $GITHUB_ENV
echo "TORCH_CUDA_ARCH_LIST=8.0+PTX" >> $GITHUB_ENV
echo "HF_TOKEN=${HF_TOKEN_SECRET}" >> $GITHUB_ENV
NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }})
echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV
echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV
echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV
echo "XDG_CONFIG_HOME=/usr/local/apps" >> $GITHUB_ENV
WHOAMI=$(whoami)
echo "PATH=/usr/local/apps/pyenv/plugins/pyenv-virtualenv/shims:/usr/local/apps/pyenv/shims:/usr/local/apps/pyenv/bin:/usr/local/apps/nvm/versions/node/v16.20.2/bin:/usr/local/cuda-12.1/bin:/usr/local/cuda-12.1/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/${WHOAMI}/.local/bin:" >> $GITHUB_ENV
echo "LD_LIBRARY_PATH=/usr/local/cuda-12.1/lib64::/usr/local/cuda-12.1/lib64:" >> $GITHUB_ENV
echo "PROJECT_ID=12" >> $GITHUB_ENV
env:
HF_HOME_TOKEN: ${{ inputs.hf_home }}
HF_TOKEN_SECRET: ${{ inputs.hf_token }}
shell: bash
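
Note: the new Gi_per_thread input is passed to ./.github/scripts/determine-threading, whose output sets MAX_JOBS so that parallel compile jobs do not exhaust memory (the same concern build.sh addresses with MAX_JOBS=1). That helper script is not included in this diff; the following is only a rough, hypothetical sketch of the kind of calculation it might perform:

# Hypothetical sketch -- the real determine-threading script is not shown in this PR.
# Assumption: it divides available memory (GiB) by the requested GiB per thread
# and caps the result at the number of CPUs, never going below 1.
GI_PER_THREAD=4   # value passed via -G
AVAIL_GI=$(awk '/MemAvailable/ {print int($2/1024/1024)}' /proc/meminfo)
THREADS=$(( AVAIL_GI / GI_PER_THREAD ))
CPUS=$(nproc)
if [ "${THREADS}" -gt "${CPUS}" ]; then THREADS=${CPUS}; fi
if [ "${THREADS}" -lt 1 ]; then THREADS=1; fi
echo "${THREADS}"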
12 changes: 6 additions & 6 deletions .github/actions/nm-test-vllm/action.yml
@@ -4,8 +4,8 @@ inputs:
test_directory:
description: 'test directory, path is relative to neuralmagic-vllm'
required: true
test_xml:
description: 'filename for xml test results'
test_results:
description: 'top-level directory for xml test results'
required: true
python:
description: 'python version, e.g. 3.10.12'
@@ -22,15 +22,15 @@ runs:
steps:
- id: test
run: |
SUCCESS=0
# TODO: this is a hack ... fix it later
# pyenv hardcoded ... python version hardcoded ...
COMMIT=${{ github.sha }}
VENV="${{ inputs.venv }}-${COMMIT:0:7}"
source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate
pip3 install --index-url http://192.168.201.226:8080/ --trusted-host 192.168.201.226 magic-wand
pip3 install -r requirements-dev.txt
pytest --junitxml=${{ inputs.test_xml }} ${{ inputs.test_directory }} || SUCCESS=$?
# run tests via runner script (serially)
SUCCESS=0
./.github/scripts/run-tests -t ${{ inputs.test_directory }} -r ${{ inputs.test_results }} || SUCCESS=$?
echo "was this a SUCCESS? ${SUCCESS}"
echo "status=${SUCCESS}" >> "$GITHUB_OUTPUT"
exit ${SUCCESS}
shell: bash
6 changes: 6 additions & 0 deletions .github/pull_request_template.md
@@ -0,0 +1,6 @@
SUMMARY:
"please provide a brief summary"

TEST PLAN:
"please outline how the changes were tested"

66 changes: 66 additions & 0 deletions .github/scripts/run-tests
@@ -0,0 +1,66 @@
#!/bin/bash -e

# simple helper script to manage concurrency while running tests

usage() {
echo "Usage: ${0} <options>"
echo
echo " -t - test directory, i.e. location of *.py test files. (default 'tests/')"
echo " -r - desired results base directory. xml results will mirror provided tests directory structure. (default 'test-results/')"
echo " -h - this list of options"
echo
echo "note: all paths are relative to 'neuralmagic-vllm' root"
echo
exit 1
}

TEST_DIR=tests
RESULTS_DIR=test-results

while getopts "ht:r:" OPT; do
case "${OPT}" in
h)
usage
;;
t)
TEST_DIR="${OPTARG}"
;;
r)
RESULTS_DIR="${OPTARG}"
;;
esac
done

# check if variables are valid
if [ -z "${RESULTS_DIR}" ]; then
echo "please set desired results base directory"
usage
fi

if [ -z "${TEST_DIR}" ]; then
echo "please set test directory"
usage
fi

if [ ! -d "${TEST_DIR}" ]; then
echo "specified test directory, '${TEST_DIR}' does not exist ..."
usage
fi

# run tests serially
TESTS_DOT_PY=$(find ${TEST_DIR} -not -name "__init__.py" -name "*.py")
TESTS_TO_RUN=($TESTS_DOT_PY)
SUCCESS=0
for TEST in "${TESTS_TO_RUN[@]}"
do
LOCAL_SUCCESS=0
RESULT_XML=$(echo ${TEST} | sed -e "s/${TEST_DIR}/${RESULTS_DIR}/" | sed -e "s/.py/.xml/")
pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
done

if [ "${SUCCESS}" -eq "0" ]; then
exit 0
else
exit 1
fi
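
This runner script is what the updated nm-test-vllm action invokes. A minimal local invocation from the repository root, assuming the default tests/ and test-results/ layout described in the usage text, would be:

# Run the test files serially, mirroring the call in .github/actions/nm-test-vllm/action.yml.
./.github/scripts/run-tests -t tests -r test-results
# Each tests/<subdir>/test_foo.py produces a matching test-results/<subdir>/test_foo.xml report.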
24 changes: 17 additions & 7 deletions .github/workflows/build-test.yml
@@ -15,6 +15,10 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -35,6 +39,10 @@ on:
description: "git commit hash or branch name"
type: string
required: true
Gi_per_thread:
description: 'requested GiB to reserve per thread'
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -61,7 +69,8 @@ jobs:
id: setenv
uses: ./.github/actions/nm-set-env/
with:
hf_home: ${{ secrets.NM_HF_HOME }}
hf_token: ${{ secrets.NM_HF_TOKEN }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}

- name: set python
id: set_python
@@ -88,7 +97,7 @@ jobs:
id: build
uses: ./.github/actions/nm-build-vllm/
with:
Gi_per_thread: 1
Gi_per_thread: ${{ inputs.Gi_per_thread }}
python: ${{ inputs.python }}
venv: TEST

@@ -97,7 +106,7 @@ jobs:
uses: ./.github/actions/nm-test-vllm/
with:
test_directory: tests
test_xml: test-results/all_tests.xml
test_results: test-results
python: ${{ inputs.python }}
venv: TEST

@@ -134,12 +143,13 @@ jobs:
TEST_STATUS: ${{ steps.test.outputs.status }}
run: |
echo "checkout status: ${CHECKOUT}"
if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi
if [ ${LINT_STATUS} -ne 0 ]; then exit 1; fi
if [ ${BUILD_STATUS} -ne 0 ]; then exit 1; fi
echo "lint status: ${LINT_STATUS}"
echo "build status: ${BUILD_STATUS}"
if [ ${TEST_STATUS} -ne 0 ]; then exit 1; fi
echo "test status: ${TEST_STATUS}"
if [[ "${CHECKOUT}" != *"success"* ]]; then exit 1; fi
if [ -z "${LINT_STATUS}" ] || [ "${LINT_STATUS}" -ne "0" ]; then exit 1; fi
if [ -z "${BUILD_STATUS}" ] || [ "${BUILD_STATUS}" -ne "0" ]; then exit 1; fi
if [ -z "${TEST_STATUS}" ] || [ "${TEST_STATUS}" -ne "0" ]; then exit 1; fi

- name: complete testmo run
uses: ./.github/actions/nm-testmo-run-complete/
9 changes: 4 additions & 5 deletions .github/workflows/remote-push.yml
@@ -13,18 +13,17 @@ jobs:

# TODO: expand python matrix later, once CI system has
# matured.
# TODO: adjust timeout after we get a bit more experience.
# making it 60 is a bit permissive.

# TODO: enable this later
AWS-AVX2-32G-A10G-24G:
AWS-AVX2-192G-4-A10G-96G:
strategy:
matrix:
python: [3.10.12]
uses: ./.github/workflows/build-test.yml
with:
label: aws-avx2-32G-a10g-24G
timeout: 60
label: aws-avx2-192G-4-a10g-96G
timeout: 180
gitref: '${{ github.ref }}'
Gi_per_thread: 4
python: ${{ matrix.python }}
secrets: inherit
2 changes: 2 additions & 0 deletions .github/workflows/scripts/build.sh
@@ -13,6 +13,8 @@ $python_executable -m pip install -r requirements.txt

# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1
# Make sure punica is built for the release (for LoRA)
export VLLM_INSTALL_PUNICA_KERNELS=1

# Build
$python_executable setup.py bdist_wheel --dist-dir=dist
14 changes: 12 additions & 2 deletions Dockerfile
@@ -7,6 +7,12 @@ FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git

# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
RUN ldconfig /usr/local/cuda-12.1/compat/

WORKDIR /workspace

# install build and runtime dependencies
@@ -45,6 +51,8 @@ ENV MAX_JOBS=${max_jobs}
# number of threads used by nvcc
ARG nvcc_threads=8
ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1

RUN python3 setup.py build_ext --inplace
#################### EXTENSION Build IMAGE ####################
@@ -67,8 +75,10 @@ RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip instal


#################### RUNTIME BASE IMAGE ####################
# use CUDA base as CUDA runtime dependencies are already installed via pip
FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
# We used base cuda image because pytorch installs its own cuda libraries.
# However cupy depends on cuda libraries so we had to switch to the runtime image
# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base

# libnccl required for ray
RUN apt-get update -y \