This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Upstream sync 2024 03 24 #143

Merged
merged 197 commits on Mar 26, 2024
Changes from all commits
Commits
197 commits
d7f3964
Update comment (#2934)
ronensc Feb 22, 2024
5574081
Added early stopping to completion APIs (#2939)
Maxusmusti Feb 22, 2024
344020c
Migrate MistralForCausalLM to LlamaForCausalLM (#2868)
esmeetu Feb 22, 2024
95529e3
Use Llama RMSNorm custom op for Gemma (#2974)
WoosukKwon Feb 22, 2024
93dc5a2
chore(vllm): codespell for spell checking (#2820)
mspronesti Feb 22, 2024
fd5dcc5
Optimize GeGLU layer in Gemma (#2975)
WoosukKwon Feb 22, 2024
c530e2c
[FIX] Fix a bug in initializing Yarn RoPE (#2983)
44670 Feb 22, 2024
6f32cdd
Remove Flash Attention in test env (#2982)
WoosukKwon Feb 22, 2024
4caf704
Include tokens from prompt phase in `counter_generation_tokens` (#2802)
ronensc Feb 22, 2024
57f0449
Fix nvcc not found in vllm-openai image (#2781)
zhaoyang-star Feb 22, 2024
f7c1234
[Fix] Fix assertion on YaRN model len (#2984)
WoosukKwon Feb 23, 2024
ef978fe
Port metrics from `aioprometheus` to `prometheus_client` (#2730)
hmellor Feb 25, 2024
70f3e8e
Add LogProbs for Chat Completions in OpenAI (#2918)
jlcmoore Feb 26, 2024
cfc15a1
Optimize Triton MoE Kernel (#2979)
pcmoritz Feb 26, 2024
d6e4a13
[Minor] Remove gather_cached_kv kernel (#3043)
WoosukKwon Feb 26, 2024
d9f726c
[Minor] Remove unused config files (#3039)
esmeetu Feb 27, 2024
c1c0d00
Don't use cupy when `enforce_eager=True` (#3037)
esmeetu Feb 27, 2024
4dd6416
Fix stablelm (#3038)
esmeetu Feb 27, 2024
48a8f4a
Support Orion model (#2539)
dachengai Feb 27, 2024
2410e32
fix `get_ip` error in pure ipv6 environment (#2931)
Jingru Feb 27, 2024
4bd18ec
[Minor] Fix type annotation in fused moe (#3045)
WoosukKwon Feb 27, 2024
e0ade06
Support logit bias for OpenAI API (#3027)
dylanwhawk Feb 27, 2024
8b430d7
[Minor] Fix StableLMEpochForCausalLM -> StableLmForCausalLM (#3046)
WoosukKwon Feb 27, 2024
71bcaf9
Enable GQA support in the prefix prefill kernels (#3007)
sighingnow Feb 27, 2024
a868310
multi-lora documentation fix (#3064)
ElefHead Feb 28, 2024
e46fa5d
Restrict prometheus_client >= 0.18.0 to prevent errors when importing…
AllenDou Feb 28, 2024
3b7178c
[Neuron] Support inference with transformers-neuronx (#2569)
liangfu Feb 28, 2024
929b4f2
Add LoRA support for Gemma (#3050)
WoosukKwon Feb 28, 2024
01a5d18
Add Support for 2/3/8-bit GPTQ Quantization Models (#2330)
chu-tianxiang Feb 29, 2024
a6d471c
Fix: `AttributeError` in OpenAI-compatible server (#3018)
jaywonchung Feb 29, 2024
9289e57
add cache_config's info to prometheus metrics. (#3100)
AllenDou Feb 29, 2024
bfdcfa6
Support starcoder2 architecture (#3089)
sh0416 Feb 29, 2024
2c08ff2
Fix building from source on WSL (#3112)
aliencaocao Feb 29, 2024
29a8d6a
[Fix] Don't deep-copy LogitsProcessors when copying SamplingParams (#…
njhill Feb 29, 2024
703e42e
Add guided decoding for OpenAI API server (#2819)
felixzhu555 Feb 29, 2024
54d3544
Fix: Output text is always truncated in some models (#3016)
HyperdriveHustle Mar 1, 2024
27ca23d
Remove exclude_unset in streaming response (#3143)
sh0416 Mar 1, 2024
49d849b
docs: Add tutorial on deploying vLLM model with KServe (#2586)
terrytangyuan Mar 1, 2024
90fbf12
fix relative import path of protocol.py (#3134)
Huarong Mar 1, 2024
c0c2335
Integrate Marlin Kernels for Int4 GPTQ inference (#2497)
robertgshaw2-redhat Mar 1, 2024
82091b8
Bump up to v0.3.3 (#3129)
WoosukKwon Mar 1, 2024
29e70e3
allow user to choose log level by --log-level instead of fixed 'info'. (#…
AllenDou Mar 1, 2024
baee28c
Reorder kv dtype check to avoid nvcc not found error on AMD platform …
cloudhan Mar 2, 2024
ce4f5a2
Add Automatic Prefix Caching (#2762)
SageMoore Mar 2, 2024
d65fac2
Add vLLM version info to logs and openai API server (#3161)
jasonacox Mar 3, 2024
996d095
[FIX] Fix styles in automatic prefix caching & add a automatic prefix…
zhuohan123 Mar 3, 2024
17c3103
Make it easy to profile workers with nsight (#3162)
pcmoritz Mar 4, 2024
d0fae88
[DOC] add setup document to support neuron backend (#2777)
liangfu Mar 4, 2024
901cf4c
[Minor Fix] Remove unused code in benchmark_prefix_caching.py (#3171)
gty111 Mar 4, 2024
27a7b07
Add document for vllm paged attention kernel. (#2978)
pian13131 Mar 4, 2024
9cbc7e5
enable --gpu-memory-utilization in benchmark_throughput.py (#3175)
AllenDou Mar 4, 2024
76e8a70
[Minor fix] The domain dns.google may cause a socket.gaierror excepti…
ttbachyinsda Mar 4, 2024
22de452
Push logprob generation to LLMEngine (#3065)
Yard1 Mar 4, 2024
ff578ca
Add health check, make async Engine more robust (#3015)
Yard1 Mar 4, 2024
9a4548b
Fix the openai benchmarking requests to work with latest OpenAI apis …
wangchen615 Mar 4, 2024
05af6da
[ROCm] enable cupy in order to enable cudagraph mode for AMD GPUs (#…
hongxiayang Mar 5, 2024
8999ec3
Store `eos_token_id` in `Sequence` for easy access (#3166)
njhill Mar 5, 2024
2efce05
[Fix] Avoid pickling entire LLMEngine for Ray workers (#3207)
njhill Mar 6, 2024
24aecf4
[Tests] Add block manager and scheduler tests (#3108)
rkooo567 Mar 6, 2024
a33ce60
[Testing] Fix core tests (#3224)
cadedaniel Mar 6, 2024
4cb3b92
Add tqdm `dynamic_ncols=True` (#3242)
chujiezheng Mar 6, 2024
d3c04b6
Add GPTQ support for Gemma (#3200)
TechxGenus Mar 7, 2024
cbf4c05
Update requirements-dev.txt to include package for benchmarking scrip…
wangchen615 Mar 7, 2024
2daf23a
Separate attention backends (#3005)
WoosukKwon Mar 7, 2024
385da2d
Measure model memory usage (#3120)
mgoin Mar 7, 2024
8cbba46
Possible fix for conflict between Automated Prefix Caching (#2762) an…
jacobthebanana Mar 7, 2024
b35cc93
Fix auto prefix bug (#3239)
ElizaWszola Mar 8, 2024
d2339d6
Connect engine healthcheck to openai server (#3260)
njhill Mar 8, 2024
c59e120
Feature add lora support for Qwen2 (#3177)
whyiug Mar 8, 2024
1ece1ae
[Minor Fix] Fix comments in benchmark_serving (#3252)
gty111 Mar 8, 2024
99c3cfb
[Docs] Fix Unmocked Imports (#3275)
ywang96 Mar 8, 2024
1cb0cc2
[FIX] Make `flash_attn` optional (#3269)
WoosukKwon Mar 8, 2024
c2c5e09
Move model filelocks from `/tmp/` to `~/.cache/vllm/locks/` dir (#3241)
mgoin Mar 8, 2024
f48c679
[FIX] Fix prefix test error on main (#3286)
zhuohan123 Mar 9, 2024
8437bae
[Speculative decoding 3/9] Worker which speculates, scores, and appli…
cadedaniel Mar 9, 2024
0bba88d
Enhance lora tests with more layer and rank variations (#3243)
tterrysun Mar 10, 2024
e4a28e5
[ROCM] Fix blockReduceSum to use correct warp counts for ROCm and CUD…
dllehr-amd Mar 10, 2024
9e8744a
[BugFix] Fix get tokenizer when using ray (#3301)
esmeetu Mar 11, 2024
4b59f00
[Fix] Fix best_of behavior when n=1 (#3298)
njhill Mar 11, 2024
2f8844b
Re-enable the 80 char line width limit (#3305)
zhuohan123 Mar 11, 2024
657061f
[docs] Add LoRA support information for models (#3299)
pcmoritz Mar 11, 2024
4c92270
Add distributed model executor abstraction (#3191)
zhuohan123 Mar 11, 2024
c9415c1
[ROCm] Fix warp and lane calculation in blockReduceSum (#3321)
kliuae Mar 11, 2024
654865e
Support Mistral Model Inference with transformers-neuronx (#3153)
DAIZHENWEI Mar 11, 2024
b0925b3
docs: Add BentoML deployment doc (#3336)
Sherlock113 Mar 12, 2024
49a3c86
Fixes #1556 double free (#3347)
br3no Mar 13, 2024
602358f
Add kernel for GeGLU with approximate GELU (#3337)
WoosukKwon Mar 13, 2024
b167109
[Fix] Fix quantization="gptq" when using Marlin (#3319)
DreamTeamWangbowen Mar 13, 2024
e221910
add hf_transfer to requirements.txt (#3031)
RonanKMcGovern Mar 13, 2024
ba8dc95
[Minor] Fix bias in if to remove ambiguity (#3259)
hliuca Mar 13, 2024
739c350
[Minor Fix] Use cupy-cuda11x in CUDA 11.8 build (#3256)
chenxu2048 Mar 13, 2024
ae0ccb4
Add missing kernel for CodeLlama-34B on A/H100 (no tensor parallelism…
orsharir Mar 13, 2024
7e9bd08
Add batched RoPE kernel (#3095)
tterrysun Mar 13, 2024
c33afd8
Fix lint (#3388)
Yard1 Mar 13, 2024
eeab52a
[FIX] Simpler fix for async engine running on ray (#3371)
zhuohan123 Mar 13, 2024
81653d9
[Hotfix] [Debug] test_openai_server.py::test_guided_regex_completion …
simon-mo Mar 14, 2024
a37415c
allow user to choose which of vLLM's metrics to display in grafana (#3393)
AllenDou Mar 14, 2024
8fe8386
[Kernel] change benchmark script so that result can be directly used;…
youkaichao Mar 14, 2024
06ec486
Install `flash_attn` in Docker image (#3396)
tdoublep Mar 14, 2024
c17ca8e
Add args for mTLS support (#3410)
declark1 Mar 14, 2024
dfc7740
[issue templates] add some issue templates (#3412)
youkaichao Mar 14, 2024
54be8a0
Fix assertion failure in Qwen 1.5 with prefix caching enabled (#3373)
chenxu2048 Mar 14, 2024
b983ba3
fix marlin config repr (#3414)
qeternity Mar 14, 2024
78b6c48
Dynamically configure shared memory size for moe_align_block_size_ker…
akhoroshev Mar 15, 2024
b522c44
[Misc] add HOST_IP env var (#3419)
youkaichao Mar 15, 2024
21539e6
Add chat templates for Falcon (#3420)
Dinghow Mar 15, 2024
253a980
Add chat templates for ChatGLM (#3418)
Dinghow Mar 15, 2024
429284d
Fix `dist.broadcast` stall without group argument (#3408)
GindaChen Mar 15, 2024
a7c8716
Fix tie_word_embeddings for Qwen2. (#3344)
fyabc Mar 15, 2024
03d37f2
[Fix] Add args for mTLS support (#3430)
declark1 Mar 15, 2024
14b8ae0
Fixes the misuse/mixuse of time.time()/time.monotonic() (#3220)
sighingnow Mar 15, 2024
604f235
[Misc] add error message in non linux platform (#3438)
youkaichao Mar 15, 2024
a7af453
Fix issue templates (#3436)
hmellor Mar 15, 2024
8fa7357
fix document error for value and v_vec illustration (#3421)
laneeeee Mar 15, 2024
fb96c1e
Asynchronous tokenization (#2879)
Yard1 Mar 15, 2024
10585e0
Removed Extraneous Print Message From OAI Server (#3440)
robertgshaw2-redhat Mar 16, 2024
413366e
[Misc] PR templates (#3413)
youkaichao Mar 16, 2024
3123f15
Fixes the incorrect argument in the prefix-prefill test cases (#3246)
sighingnow Mar 16, 2024
14e3f9a
Replace `lstrip()` with `removeprefix()` to fix Ruff linter warning (…
ronensc Mar 16, 2024
cf6ff18
Fix Baichuan chat template (#3340)
Dinghow Mar 16, 2024
ad50bf4
fix lint
simon-mo Mar 16, 2024
8e67598
[Misc] fix line length for entire codebase (#3444)
simon-mo Mar 16, 2024
120157f
Support arbitrary json_object in OpenAI and Context Free Grammar (#3211)
simon-mo Mar 16, 2024
6b78837
Fix setup.py neuron-ls issue (#2671)
simon-mo Mar 16, 2024
abfc4f3
[Misc] Use dataclass for InputMetadata (#3452)
WoosukKwon Mar 17, 2024
93348d9
[CI] Shard tests for LoRA and Kernels to speed up (#3445)
simon-mo Mar 17, 2024
9101d83
[Bugfix] Make moe_align_block_size AMD-compatible (#3470)
WoosukKwon Mar 18, 2024
8c654c0
CI: Add ROCm Docker Build (#2886)
simon-mo Mar 18, 2024
482b0ad
[Testing] Add test_config.py to CI (#3437)
cadedaniel Mar 18, 2024
097aa0e
[CI/Build] Fix Bad Import In Test (#3473)
robertgshaw2-redhat Mar 18, 2024
c0c17d4
[Misc] Fix PR Template (#3478)
zhuohan123 Mar 18, 2024
9fdf3de
Cmake based build system (#2830)
bnellnm Mar 18, 2024
49eedea
[Core] Zero-copy asdict for InputMetadata (#3475)
Yard1 Mar 18, 2024
b30880a
[Misc] Update README for the Third vLLM Meetup (#3479)
zhuohan123 Mar 18, 2024
b37cdce
[Core] Cache some utils (#3474)
Yard1 Mar 19, 2024
6a9c583
[Core] print error before deadlock (#3459)
youkaichao Mar 19, 2024
ef65dcf
[Doc] Add docs about OpenAI compatible server (#3288)
simon-mo Mar 19, 2024
7341c77
[BugFix] Avoid initializing CUDA too early (#3487)
njhill Mar 19, 2024
c614cfe
Update dockerfile with ModelScope support (#3429)
ifsheldon Mar 19, 2024
2a60c9b
[Doc] minor fix to neuron-installation.rst (#3505)
jimburtoft Mar 19, 2024
cc63d03
Revert "[Core] Cache some utils" (#3507)
simon-mo Mar 19, 2024
63e8b28
[Doc] minor fix of spelling in amd-installation.rst (#3506)
jimburtoft Mar 19, 2024
20478c4
Use lru_cache for some environment detection utils (#3508)
simon-mo Mar 19, 2024
9474e89
[PREFIX CACHING FOLLOW UP] A bunch of fixes to block allocator perfor…
ElizaWszola Mar 20, 2024
4ad521d
[Core] Add generic typing to `LRUCache` (#3511)
njhill Mar 20, 2024
5ee1449
[Misc] Remove cache stream and cache events (#3461)
WoosukKwon Mar 20, 2024
84eaa68
Abort when nvcc command is not found in the PATH (#3527)
AllenDou Mar 20, 2024
ba8ae1d
Check for _is_cuda() in compute_num_jobs (#3481)
bnellnm Mar 20, 2024
80e2548
[Bugfix] Fix ROCm support in CMakeLists.txt (#3534)
jamestwhedbee Mar 20, 2024
426ec4e
[1/n] Triton sampling kernel (#3186)
Yard1 Mar 20, 2024
6e435de
[1/n][Chunked Prefill] Refactor input query shapes (#3236)
rkooo567 Mar 20, 2024
f1c0fc3
Migrate `logits` computation and gather to `model_runner` (#3233)
esmeetu Mar 20, 2024
523e30e
[BugFix] Hot fix in setup.py for neuron build (#3537)
zhuohan123 Mar 21, 2024
6ebd02b
[PREFIX CACHING FOLLOW UP] OrderedDict-based evictor (#3431)
ElizaWszola Mar 21, 2024
3bbff9e
Fix 1D query issue from `_prune_hidden_states` (#3539)
rkooo567 Mar 21, 2024
4c07dd2
[🚀 Ready to be merged] Added support for Jais models (#3183)
grandiose-pizza Mar 21, 2024
8657323
[Misc][Log] Add log for tokenizer length not equal to vocabulary size…
esmeetu Mar 21, 2024
c188ecb
[Misc] Bump up transformers to v4.39.0 & Remove StarCoder2Config (#3551)
WoosukKwon Mar 21, 2024
b7050ca
[BugFix] gemma loading after quantization or LoRA. (#3553)
taeminlee Mar 21, 2024
ea5f14e
[Bugfix][Model] Fix Qwen2 (#3554)
esmeetu Mar 22, 2024
e90fc21
[Hardware][Neuron] Refactor neuron support (#3471)
zhuohan123 Mar 22, 2024
f721096
[BugFix] Some fixes for custom allreduce kernels (#2760)
hanzhi713 Mar 22, 2024
cf2f084
Dynamic scheduler delay to improve ITL performance (#3279)
tdoublep Mar 22, 2024
bfdb1ba
[Core] Improve detokenization performance for prefill (#3469)
Yard1 Mar 22, 2024
743a0b7
[Bugfix] use SoftLockFile instead of LockFile (#3578)
kota-iizuka Mar 23, 2024
3c5ab9b
[Misc] Fix BLOOM copyright notice (#3591)
WoosukKwon Mar 24, 2024
f8a12ec
[Misc] Bump transformers version (#3592)
ywang96 Mar 24, 2024
af9e534
[BugFix] Fix Falcon tied embeddings (#3590)
WoosukKwon Mar 24, 2024
17ac306
Merge branch 'upstream-main' into upstream-sync-2024-03-24
afeldman-nm Mar 24, 2024
d3c6ea8
initial merge
afeldman-nm Mar 24, 2024
a828ef3
cleanup benchmark_prefix caching
afeldman-nm Mar 24, 2024
6f6ab1c
cleanup pybind
afeldman-nm Mar 24, 2024
03b78a4
cleanup requirements-dev.txt
afeldman-nm Mar 24, 2024
8c96a1c
cleanup test skip comments
afeldman-nm Mar 24, 2024
119bd05
cleanup model comments
afeldman-nm Mar 24, 2024
018c902
cleanup sampler
afeldman-nm Mar 24, 2024
6844a99
cleanup config
afeldman-nm Mar 24, 2024
474ccb7
fixed block allocator to match upstream (bad merge)
afeldman-nm Mar 24, 2024
ab76a09
cleanup engine args
afeldman-nm Mar 24, 2024
519c6fa
cleanup llm-engine
afeldman-nm Mar 24, 2024
767bf23
cleanup LLM front end
afeldman-nm Mar 24, 2024
8788f27
minor cleanups
afeldman-nm Mar 24, 2024
acd2876
linear
afeldman-nm Mar 24, 2024
23e29a9
various cleanups
afeldman-nm Mar 24, 2024
d6bd5dc
fixed Neuron
afeldman-nm Mar 24, 2024
fa7482a
removed neuron models
afeldman-nm Mar 24, 2024
571bbf7
starcoder tmp fix
afeldman-nm Mar 24, 2024
281e3c5
final neuron fixes
afeldman-nm Mar 24, 2024
2ec44fd
small cleanups
afeldman-nm Mar 24, 2024
a1f583d
fixed BlockSpaceManager
afeldman-nm Mar 24, 2024
4265468
yapf / ruff
afeldman-nm Mar 24, 2024
d696d74
ruff 2
afeldman-nm Mar 24, 2024
a102e13
format
afeldman-nm Mar 24, 2024
476798e
fixed basic correctness failure by running with --forked
afeldman-nm Mar 24, 2024
e973135
fixed tests for nightly
robertgshaw2-redhat Mar 25, 2024
4ce1f87
add nvcc_threads to gha
Mar 26, 2024
8ddab6a
Remove Gi_per_thread arg to nm-build-vllm action
Mar 26, 2024
38 changes: 38 additions & 0 deletions .buildkite/run-amd-test.sh
@@ -0,0 +1,38 @@
# This script builds the ROCm docker image and runs the API server inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Print ROCm version
rocminfo

# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .

# Setup cleanup
remove_docker_container() { docker rm -f rocm || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &

# Wait for the server to start
wait_for_server_to_start() {
timeout=300
counter=0

while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
sleep 1
counter=$((counter + 1))
if [ $counter -ge $timeout ]; then
echo "Timeout after $timeout seconds"
break
fi
done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
localhost:8000/generate \
-d '{"prompt": "San Francisco is a"}'
9 changes: 6 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -28,7 +28,7 @@ steps:
num_gpus: 2 # only support 1 or 2 for now.

- label: Engine Test
command: pytest -v -s engine tokenization test_sequence.py
command: pytest -v -s engine tokenization test_sequence.py test_config.py

- label: Entrypoints Test
command: pytest -v -s entrypoints
@@ -47,7 +47,10 @@ steps:
- pytest -v -s prefix_caching

- label: Samplers Test
command: pytest -v -s samplers --forked
command: pytest -v -s samplers

- label: LogitsProcessor Test
command: pytest -v -s test_logits_processor.py

- label: Worker Test
command: pytest -v -s worker
@@ -56,7 +59,7 @@
command: pytest -v -s spec_decode

- label: LoRA Test %N
command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4

- label: Metrics Test
5 changes: 5 additions & 0 deletions .buildkite/test-template.j2
@@ -3,6 +3,11 @@
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
- label: "AMD Test"
agents:
queue: amd
command: bash .buildkite/run-amd-test.sh

- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
14 changes: 9 additions & 5 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,6 +1,14 @@
FILL IN THE PR DESCRIPTION HERE

FIX #xxxx (*link existing issues this PR will resolve*)

**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**

---

<details>
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
<summary><b> PR Checklist (Click to expand. Please read before submitting.) </b></summary>
<summary><b> PR Checklist (Click to Expand) </b></summary>

<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>

@@ -53,8 +61,4 @@

</details>

---

Please provide a brief explanation of the motivation behind the PR and the changes it introduces. This helps reviewers understand the context and rationale for the contribution. If possible, please link existing issues this PR will resolve.


3 changes: 0 additions & 3 deletions .github/actions/nm-build-vllm/action.yml
@@ -1,9 +1,6 @@
name: build nm-vllm
description: 'build nm-vllm'
inputs:
Gi_per_thread:
description: 'requested GiB to reserve per thread'
required: true
python:
description: 'python version, e.g. 3.10.12'
required: true
5 changes: 5 additions & 0 deletions .github/actions/nm-set-env/action.yml
@@ -7,6 +7,10 @@ inputs:
Gi_per_thread:
description: 'requested GiB to reserve per thread'
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
runs:
using: composite
steps:
@@ -16,6 +20,7 @@ runs:
echo "HF_HOME=/EFS/hf_home" >> $GITHUB_ENV
NUM_THREADS=$(./.github/scripts/determine-threading -G ${{ inputs.Gi_per_thread }})
echo "MAX_JOBS=${NUM_THREADS}" >> $GITHUB_ENV
echo "NVCC_THREADS=${{ inputs.nvcc_threads }}" >> $GITHUB_ENV
echo "VLLM_INSTALL_PUNICA_KERNELS=1" >> $GITHUB_ENV
echo "NCCL_IGNORE_DISABLED_P2P=1" >> $GITHUB_ENV
echo "PYENV_ROOT=/usr/local/apps/pyenv" >> $GITHUB_ENV
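
For context on the new `nvcc_threads` input: the action above only exports `NVCC_THREADS` alongside `MAX_JOBS`; how the build consumes them is not shown in this diff. The sketch below illustrates the usual relationship under that assumption, where `MAX_JOBS` bounds parallel compile jobs and `NVCC_THREADS` is forwarded to `nvcc --threads`.

# Hypothetical sketch, not taken from this PR: estimate total compiler
# threads so MAX_JOBS * NVCC_THREADS stays within the runner's core count.
MAX_JOBS=${MAX_JOBS:-4}
NVCC_THREADS=${NVCC_THREADS:-1}
echo "approx. total compiler threads: $((MAX_JOBS * NVCC_THREADS))"
# Gi_per_thread plays a similar role for memory: the action's
# determine-threading script uses it to derive how many jobs fit in RAM.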
2 changes: 2 additions & 0 deletions .github/scripts/run-tests
@@ -100,6 +100,8 @@ do
coverage run --data-file=.coverage-$(basename ${TEST}) -m pytest --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"models_logprobs"* ]]; then
coverage run --data-file=.coverage-$(basename ${TEST}) -m pytest --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"basic_correctness"* ]]; then
coverage run --data-file=.coverage-$(basename ${TEST}) -m pytest --forked --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
else
coverage run --data-file=.coverage-$(basename ${TEST}) -m pytest --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
fi
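
The new `basic_correctness` branch reuses the same `--forked` invocation as the model tests; a rough local equivalent is sketched below, assuming the `--forked` flag comes from the `pytest-forked` plugin and that the tests live under a `tests/basic_correctness` directory (both are assumptions, not read from this diff).

# Hypothetical local reproduction of the CI behavior: run each test in its
# own forked subprocess so CUDA state from one test cannot leak into the next.
pip install pytest-forked
python -m pytest --forked -v tests/basic_correctness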
10 changes: 9 additions & 1 deletion .github/workflows/build-test.yml
@@ -19,6 +19,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -47,6 +51,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -79,6 +87,7 @@ jobs:
with:
hf_token: ${{ secrets.NM_HF_TOKEN }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}

- name: set python
id: set_python
@@ -111,7 +120,6 @@
id: build
uses: ./.github/actions/nm-build-vllm/
with:
Gi_per_thread: ${{ inputs.Gi_per_thread }}
python: ${{ inputs.python }}
venv: TEST
pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }}
10 changes: 9 additions & 1 deletion .github/workflows/build-whl.yml
@@ -19,6 +19,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -43,6 +47,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -76,6 +84,7 @@ jobs:
with:
hf_token: ${{ secrets.NM_HF_TOKEN }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}

- name: set python
id: set_python
@@ -101,7 +110,6 @@
id: build
uses: ./.github/actions/nm-build-vllm/
with:
Gi_per_thread: ${{ inputs.Gi_per_thread }}
python: ${{ inputs.python }}
venv: ${{ env.VENV_BUILD_BASE }}
pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }}
1 change: 1 addition & 0 deletions .github/workflows/gen-whl.yml
@@ -20,5 +20,6 @@ jobs:
timeout: 30
gitref: ${{ inputs.gitref }}
Gi_per_thread: 4
nvcc_threads: 8
python: ${{ matrix.python }}
secrets: inherit
5 changes: 5 additions & 0 deletions .github/workflows/nightly.yml
@@ -24,6 +24,7 @@ jobs:
timeout: 240
gitref: ${{ github.ref }}
Gi_per_thread: 4
nvcc_threads: 8
python: 3.10.12
test_skip_list:
secrets: inherit
@@ -35,6 +36,7 @@ jobs:
timeout: 300
gitref: ${{ github.ref }}
Gi_per_thread: 12
nvcc_threads: 1
python: 3.11.4
test_skip_list:
secrets: inherit
@@ -48,6 +50,7 @@
# timeout: 480
# gitref: '${{ github.ref }}'
# Gi_per_thread: 4
# nvcc_threads: 8
# python: "3.10.12"
# # Always push if it is a scheduled job
# push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
@@ -62,6 +65,7 @@ jobs:
timeout: 720
gitref: '${{ github.ref }}'
Gi_per_thread: 12
nvcc_threads: 1
python: "3.10.12"
# Always push if it is a scheduled job
push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"
@@ -75,5 +79,6 @@ jobs:
timeout: 60
gitref: '${{ github.ref }}'
Gi_per_thread: 12
nvcc_threads: 1
python: "3.10.12"
secrets: inherit
10 changes: 9 additions & 1 deletion .github/workflows/nm-benchmark.yml
@@ -23,6 +23,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -55,6 +59,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -89,6 +97,7 @@ jobs:
with:
hf_token: ${{ secrets.NM_HF_TOKEN }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}

- name: set python
id: set_python
@@ -107,7 +116,6 @@
id: build
uses: ./.github/actions/nm-build-vllm/
with:
Gi_per_thread: ${{ inputs.Gi_per_thread }}
python: ${{ inputs.python }}
venv: TEST
pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }}
10 changes: 9 additions & 1 deletion .github/workflows/nm-lm-eval-accuracy.yml
@@ -19,6 +19,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -43,6 +47,10 @@ on:
description: 'requested GiB to reserve per thread'
type: string
required: true
nvcc_threads:
description: "number of threads nvcc build threads"
type: string
required: true
python:
description: "python version, e.g. 3.10.12"
type: string
@@ -68,6 +76,7 @@ jobs:
with:
hf_token: ${{ secrets.NM_HF_TOKEN }}
Gi_per_thread: ${{ inputs.Gi_per_thread }}
nvcc_threads: ${{ inputs.nvcc_threads }}

- name: set python
id: set_python
@@ -86,7 +95,6 @@
id: build
uses: ./.github/actions/nm-build-vllm/
with:
Gi_per_thread: ${{ inputs.Gi_per_thread }}
python: ${{ inputs.python }}
venv: TEST
pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }}
2 changes: 2 additions & 0 deletions .github/workflows/remote-push.yml
@@ -24,6 +24,7 @@ jobs:
timeout: 240
gitref: '${{ github.ref }}'
Gi_per_thread: 4
nvcc_threads: 8
python: ${{ matrix.python }}
test_skip_list: neuralmagic/tests/skip-for-remote-push.txt
secrets: inherit
@@ -37,6 +38,7 @@ jobs:
# timeout: 60
# gitref: '${{ github.ref }}'
# Gi_per_thread: 12
# nvcc_threads: 1
# python: "3.10.12"
# push_benchmark_results_to_gh_pages: "false"
# secrets: inherit