
Commit

Merge branch 'vllm-project:main' into main
haichuan1221 authored Aug 2, 2024
2 parents a057bfd + 3bb4b1e commit a720662
Showing 37 changed files with 186 additions and 171 deletions.
3 changes: 1 addition & 2 deletions .buildkite/run-amd-test.sh
@@ -55,8 +55,7 @@ while true; do
done

echo "--- Pulling container"
- docker login registry-1.docker.io -u alexeivivanovamd -p ${DH_TOKEN}
- image_name="rocmshared/vllm-ci:${BUILDKITE_COMMIT}"
+ image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull ${image_name}

6 changes: 3 additions & 3 deletions .buildkite/test-pipeline.yaml
@@ -44,7 +44,7 @@ steps:
fast_check: true
commands:
# This flashinfer installation will fail on AMD ROCm, so it is set as optional.
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl || true
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl || true
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
- VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
@@ -164,7 +164,7 @@ steps:
- label: Models Test
#mirror_hardwares: [amd]
commands:
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- pytest -v -s models -m \"not vlm\"

- label: Vision Language Models Test
@@ -281,7 +281,7 @@ steps:
- pytest -v -s distributed/test_custom_all_reduce.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
- - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ - pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- pytest -v -s -x lora/test_mixtral.py
2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml
@@ -32,6 +32,7 @@ jobs:
pip install types-setuptools
- name: Mypy
run: |
+ mypy
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/core --follow-imports skip
@@ -44,5 +45,4 @@ jobs:
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
- mypy
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -49,7 +49,7 @@ jobs:
matrix:
os: ['ubuntu-20.04']
python-version: ['3.8', '3.9', '3.10', '3.11']
- pytorch-version: ['2.3.1'] # Must be the most recent version that meets requirements-cuda.txt.
+ pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
cuda-version: ['11.8', '12.1']

steps:
7 changes: 5 additions & 2 deletions CMakeLists.txt
@@ -32,7 +32,7 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
- set(TORCH_SUPPORTED_VERSION_CUDA "2.3.1")
+ set(TORCH_SUPPORTED_VERSION_CUDA "2.4.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.0")

#
@@ -156,12 +156,15 @@ set(VLLM_EXT_SRC

if(VLLM_GPU_LANG STREQUAL "CUDA")
include(FetchContent)
- SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
+ SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
# CUTLASS 3.5.0
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ # Shallow clone with depth 1
+ GIT_SHALLOW TRUE
+ GIT_PROGRESS TRUE
)
FetchContent_MakeAvailable(cutlass)

2 changes: 1 addition & 1 deletion Dockerfile
@@ -192,7 +192,7 @@ RUN --mount=type=bind,from=mamba-builder,src=/usr/src/mamba,target=/usr/src/mamb
python3 -m pip install /usr/src/mamba/*.whl --no-cache-dir

RUN --mount=type=cache,target=/root/.cache/pip \
- python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.9/flashinfer-0.0.9+cu121torch2.3-cp310-cp310-linux_x86_64.whl
+ python3 -m pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
#################### vLLM installation IMAGE ####################


2 changes: 1 addition & 1 deletion docs/source/serving/distributed_serving.rst
@@ -50,7 +50,7 @@ You can also additionally specify :code:`--pipeline-parallel-size` to enable pip
$ --pipeline-parallel-size 2
.. note::
- Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, and Mixtral style models.
+ Pipeline parallel is a beta feature. It is only supported for online serving as well as LLaMa, GPT2, Mixtral, Qwen, Qwen2, and Nemotron style models.

Multi-Node Inference and Serving
--------------------------------
5 changes: 3 additions & 2 deletions format.sh
@@ -96,6 +96,7 @@ echo 'vLLM yapf: Done'

# Run mypy
echo 'vLLM mypy:'
+ mypy --follow-imports skip # Note that this is less strict than CI
mypy tests --follow-imports skip
mypy vllm/attention --follow-imports skip
mypy vllm/core --follow-imports skip
@@ -108,7 +109,7 @@ mypy vllm/model_executor --follow-imports skip
mypy vllm/prompt_adapter --follow-imports skip
mypy vllm/spec_decode --follow-imports skip
mypy vllm/worker --follow-imports skip
- mypy
+ echo 'vLLM mypy: Done'


# If git diff returns a file that is in the skip list, the file may be checked anyway:
@@ -127,7 +128,7 @@ spell_check_all(){
codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}"
}

- # Spelling check of files that differ from main branch.
+ # Spelling check of files that differ from main branch.
spell_check_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause ruff to receive 0 positional arguments, making it hang
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -5,7 +5,7 @@ requires = [
"ninja",
"packaging",
"setuptools >= 49.4.0",
"torch == 2.3.1",
"torch == 2.4.0",
"wheel",
]
build-backend = "setuptools.build_meta"
2 changes: 1 addition & 1 deletion requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
ninja
packaging
setuptools>=49.4.0
- torch==2.3.1
+ torch==2.4.0
wheel
8 changes: 4 additions & 4 deletions requirements-cuda.txt
@@ -4,8 +4,8 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
nvidia-ml-py # for pynvml package
- torch == 2.3.1
+ torch == 2.4.0
# These must be updated alongside torch
- torchvision == 0.18.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
- xformers == 0.0.27 # Requires PyTorch 2.3.1
- vllm-flash-attn == 2.5.9.post1 # Requires PyTorch 2.3.1
+ torchvision == 0.19 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+ xformers == 0.0.27.post2 # Requires PyTorch 2.4.0
+ vllm-flash-attn == 2.6.1 # Requires PyTorch 2.4.0
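These pins move in lockstep with the PyTorch 2.4.0 upgrade. A quick, optional sanity check that an installed environment actually picked up the matching torch/torchvision/xformers stack (our own snippet, not part of the repository):

import torch
import torchvision
import xformers

# Expected with this commit: torch 2.4.0*, torchvision 0.19*, xformers 0.0.27.post2.
print(torch.__version__, torchvision.__version__, xformers.__version__)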
1 change: 0 additions & 1 deletion requirements-test.txt
@@ -14,7 +14,6 @@ peft
requests
ray
sentence-transformers # required for embedding
- sparseml==1.8.0 # required for compressed-tensors
compressed-tensors==0.4.0 # required for compressed-tensors
timm # required for internvl test

Expand Down
4 changes: 0 additions & 4 deletions tests/conftest.py
@@ -152,7 +152,6 @@ def __init__(
model_kwargs: Optional[Dict[str, Any]] = None,
is_embedding_model: bool = False,
is_vision_model: bool = False,
- is_sparseml_model: bool = False,
) -> None:
torch_dtype = STR_DTYPE_TO_TORCH_DTYPE[dtype]

@@ -169,9 +168,6 @@ def __init__(
else:
if is_vision_model:
auto_cls = AutoModelForVision2Seq
- elif is_sparseml_model:
- from sparseml.transformers import SparseAutoModelForCausalLM
- auto_cls = SparseAutoModelForCausalLM
else:
auto_cls = AutoModelForCausalLM

19 changes: 13 additions & 6 deletions tests/kernels/test_flash_attn.py
@@ -20,6 +20,7 @@ def ref_paged_attn(
block_tables: torch.Tensor,
scale: float,
sliding_window: Optional[int] = None,
+ soft_cap: Optional[float] = None,
) -> torch.Tensor:
num_seqs = len(query_lens)
block_tables = block_tables.cpu().numpy()
@@ -53,6 +54,8 @@ def ref_paged_attn(
(query_len + sliding_window) +
1).bool().logical_not()
mask |= sliding_window_mask
+ if soft_cap is not None:
+ attn = soft_cap * torch.tanh(attn / soft_cap)
attn.masked_fill_(mask, float("-inf"))
attn = torch.softmax(attn, dim=-1).to(v.dtype)
out = torch.einsum("hqk,khd->qhd", attn, v)
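For reference, the capping added above squashes the raw attention scores into (-soft_cap, soft_cap) before masking and softmax. A minimal, self-contained sketch of that transform (the helper name is ours, not part of this diff):

from typing import Optional

import torch

def soft_cap_logits(logits: torch.Tensor, soft_cap: Optional[float]) -> torch.Tensor:
    # tanh saturates at +/-1, so capped scores stay strictly inside (-soft_cap, soft_cap);
    # a cap of None (or 0) leaves the scores untouched.
    if not soft_cap:
        return logits
    return soft_cap * torch.tanh(logits / soft_cap)

scores = torch.randn(4, 8) * 100.0               # deliberately large logits
capped = soft_cap_logits(scores, soft_cap=30.0)
assert capped.abs().max() < 30.0                 # bounded by the cap
assert torch.equal(soft_cap_logits(scores, None), scores)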
@@ -68,13 +71,15 @@ def ref_paged_attn(
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("dtype", DTYPES)
- @torch.inference_mode
+ @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+ @torch.inference_mode()
def test_flash_attn_with_paged_kv(
kv_lens: List[int],
num_heads: Tuple[int, int],
head_size: int,
dtype: torch.dtype,
block_size: int,
+ soft_cap: Optional[float],
) -> None:
torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0)
@@ -108,6 +113,7 @@ def test_flash_attn_with_paged_kv(
causal=True,
block_table=block_tables,
cache_seqlens=kv_lens_tensor,
+ softcap=soft_cap if soft_cap is not None else 0,
).squeeze(1)

ref_output = ref_paged_attn(
@@ -118,6 +124,7 @@ def test_flash_attn_with_paged_kv(
kv_lens=kv_lens,
block_tables=block_tables,
scale=scale,
+ soft_cap=soft_cap,
)
assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
f"{torch.max(torch.abs(output - ref_output))}"
@@ -129,14 +136,16 @@ def test_flash_attn_with_paged_kv(
@pytest.mark.parametrize("block_size", BLOCK_SIZES)
@pytest.mark.parametrize("sliding_window", [None])
@pytest.mark.parametrize("dtype", DTYPES)
- @torch.inference_mode
+ @pytest.mark.parametrize("soft_cap", [None, 10.0, 50.0])
+ @torch.inference_mode()
def test_varlen_with_paged_kv(
seq_lens: List[Tuple[int, int]],
num_heads: Tuple[int, int],
head_size: int,
sliding_window: Optional[int],
dtype: torch.dtype,
block_size: int,
+ soft_cap: Optional[float],
) -> None:
torch.set_default_device("cuda")
torch.cuda.manual_seed_all(0)
@@ -163,10 +172,6 @@ def test_varlen_with_paged_kv(
head_size,
dtype=dtype)
value_cache = torch.randn_like(key_cache)
- # Normalize the scale of the key and value caches to mitigate
- # numerical instability.
- key_cache /= head_size**0.5
- value_cache /= head_size**0.5
cu_query_lens = torch.tensor([0] + query_lens,
dtype=torch.int32).cumsum(dim=0,
dtype=torch.int32)
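The cu_query_lens built here follows the standard varlen packing convention: per-sequence lengths become int32 prefix sums, so the kernel can locate each sequence's slice in the packed query tensor. A small sketch of what that construction produces (the lengths are illustrative):

import torch

query_lens = [3, 5, 2]                            # illustrative per-sequence lengths
cu_query_lens = torch.tensor([0] + query_lens,
                             dtype=torch.int32).cumsum(dim=0, dtype=torch.int32)
# cu_query_lens is tensor([0, 3, 8, 10]); sequence i occupies rows
# cu_query_lens[i]:cu_query_lens[i + 1] of the packed tensor.
for i, length in enumerate(query_lens):
    start, end = int(cu_query_lens[i]), int(cu_query_lens[i + 1])
    assert end - start == length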
@@ -192,6 +197,7 @@ def test_varlen_with_paged_kv(
causal=True,
window_size=window_size,
block_table=block_tables,
+ softcap=soft_cap if soft_cap is not None else 0,
)

ref_output = ref_paged_attn(
@@ -203,6 +209,7 @@ def test_varlen_with_paged_kv(
block_tables=block_tables,
scale=scale,
sliding_window=sliding_window,
+ soft_cap=soft_cap,
)
assert torch.allclose(output, ref_output, atol=1e-2, rtol=1e-2), \
f"{torch.max(torch.abs(output - ref_output))}"
52 changes: 0 additions & 52 deletions tests/models/test_compressed_tensors.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/quantization/test_compressed_tensors.py
@@ -1,4 +1,4 @@
"""Test model set-up and weight loading for sparseml-quantized models.
"""Test model set-up and weight loading for llmcompressor-quantized models.
Run `pytest tests/quantization/test_compressed_tensors.py`.
"""
Expand Down
1 change: 1 addition & 0 deletions vllm/attention/backends/abstract.py
@@ -150,6 +150,7 @@ def __init__(
sliding_window: Optional[int] = None,
kv_cache_dtype: str = "auto",
blocksparse_params: Optional[Dict[str, Any]] = None,
+ logits_soft_cap: Optional[float] = None,
) -> None:
raise NotImplementedError

3 changes: 3 additions & 0 deletions vllm/attention/backends/blocksparse_attn.py
@@ -283,12 +283,15 @@ def __init__(
sliding_window: Optional[int],
kv_cache_dtype: str,
blocksparse_params: Optional[Dict[str, Any]] = None,
+ logits_soft_cap: Optional[float] = None,
) -> None:
assert blocksparse_params is not None
assert alibi_slopes is None, ValueError(
"Alibi not support for blocksparse flash attention.")
assert sliding_window is None, ValueError(
"sliding_window is invalid for blocksparse attention.")
+ assert logits_soft_cap is None, ValueError(
+ "logits_soft_cap is invalid for blocksparse attention.")

if "num_heads" not in blocksparse_params:
blocksparse_params["num_heads"] = num_heads
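The new logits_soft_cap argument follows the same pattern as the other optional constructor features: every backend implementation accepts the keyword, and a backend that cannot honor it rejects a non-None value up front, as the blocksparse assert above does. A schematic sketch of the two behaviors, using hypothetical class names rather than vLLM's actual implementations:

from typing import Optional

import torch

class CappingImpl:
    """Hypothetical backend with soft-cap support: store the cap and apply it to scores."""

    def __init__(self, logits_soft_cap: Optional[float] = None) -> None:
        # The kernels touched by this diff treat 0 as "capping disabled", so map None -> 0.
        self.logits_soft_cap = logits_soft_cap if logits_soft_cap is not None else 0.0

    def maybe_cap(self, scores: torch.Tensor) -> torch.Tensor:
        if self.logits_soft_cap > 0:
            return self.logits_soft_cap * torch.tanh(scores / self.logits_soft_cap)
        return scores

class NonCappingImpl:
    """Hypothetical backend without soft-cap support: fail fast at construction time."""

    def __init__(self, logits_soft_cap: Optional[float] = None) -> None:
        assert logits_soft_cap is None, (
            "logits_soft_cap is not supported by this backend.")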
