Commit 2d306b7
Merge branch 'vllm-project:main' into optimize-minicpmv-code
jeejeelee authored Aug 8, 2024
2 parents 25aea1c + 6dffa4b
Showing 47 changed files with 1,155 additions and 488 deletions.
27 changes: 20 additions & 7 deletions .buildkite/release-pipeline.yaml
@@ -1,9 +1,27 @@
steps:
- label: "Build wheel - CUDA {{matrix.cuda_version}}"
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION={{matrix.cuda_version}} --tag vllm-ci:build-image --target build --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"

- block: "Build CUDA 11.8 wheel"
key: block-build-cu118-wheel

- label: "Build wheel - CUDA 11.8"
depends_on: block-build-cu118-wheel
agents:
queue: cpu_queue
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg buildkite_commit=$BUILDKITE_COMMIT --build-arg USE_SCCACHE=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
@@ -12,8 +30,3 @@ steps:
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
env:
DOCKER_BUILDKIT: "1"
matrix:
setup:
cuda_version:
- "11.8.0"
- "12.1.0"
2 changes: 1 addition & 1 deletion Dockerfile.openvino
@@ -21,7 +21,7 @@ COPY setup.py /workspace/vllm/
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace/vllm/

COPY examples/ /workspace/vllm/examples
COPY benchmarks/ /workspace/vllm/benchmarks
2 changes: 1 addition & 1 deletion docs/source/getting_started/openvino-installation.rst
@@ -57,7 +57,7 @@ Install from source

.. code-block:: console
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
.. _openvino_backend_performance_tips:

66 changes: 55 additions & 11 deletions docs/source/models/spec_decode.rst
@@ -14,17 +14,17 @@ Speculative decoding is a technique which improves inter-token latency in memory
Speculating with a draft model
------------------------------

The following code configures vLLM to use speculative decoding with a draft model, speculating 5 tokens at a time.
The following code configures vLLM in offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -33,12 +33,56 @@ The following code configures vLLM to use speculative decoding with a draft mode
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
To do the same in online mode, launch the server:

.. code-block:: bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m --use-v2-block-manager \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8
Then use a client:
.. code-block:: python
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Completion API
stream = False
completion = client.completions.create(
model=model,
prompt="The future of AI is",
echo=False,
n=1,
stream=stream,
)
print("Completion results:")
if stream:
for c in completion:
print(c)
else:
print(completion)
Speculating by matching n-grams in the prompt
---------------------------------------------

@@ -48,12 +92,12 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
.. code-block:: python
from vllm import LLM, SamplingParams
prompts = [
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(
model="facebook/opt-6.7b",
tensor_parallel_size=1,
@@ -63,7 +107,7 @@ matching n-grams in the prompt. For more information read `this thread. <https:/
use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
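Because the diff elides the middle of this example, a minimal complete sketch of the n-gram configuration may help (the `ngram_prompt_lookup_max` value is illustrative):

from vllm import LLM, SamplingParams

prompts = ["The future of AI is"]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Draft tokens are proposed by matching n-grams already present in the
# prompt, so no separate draft model is loaded.
llm = LLM(
    model="facebook/opt-6.7b",
    tensor_parallel_size=1,
    speculative_model="[ngram]",
    num_speculative_tokens=5,
    ngram_prompt_lookup_max=4,
    use_v2_block_manager=True,
)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")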
@@ -74,7 +118,7 @@ Speculating using MLP speculators

The following code configures vLLM to use speculative decoding where proposals are generated by
draft models that condition draft predictions on both context vectors and sampled tokens.
For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/>`_ or
`this technical report <https://arxiv.org/abs/2404.19124>`_.

.. code-block:: python
@@ -100,9 +144,9 @@ For more information see `this blog <https://pytorch.org/blog/hitchhikers-guide-
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
Note that these speculative models currently need to be run without tensor parallelism, although
it is possible to run the main model using tensor parallelism (see example above). Since the
speculative models are relatively small, we still see significant speedups. However, this
limitation will be fixed in a future release.

A variety of speculative models of this type are available on HF hub:
28 changes: 15 additions & 13 deletions docs/source/quantization/supported_hardware.rst
@@ -5,18 +5,20 @@ Supported Hardware for Quantization Kernels

The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:

============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========
Implementation Volta  Turing  Ampere  Ada   Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU
============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========
AQLM           ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
AWQ            ❌     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
DeepSpeedFP    ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
FP8            ❌     ❌      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
Marlin         ❌     ❌      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
GPTQ           ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
SqueezeLLM     ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
bitsandbytes   ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
============== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========
===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========
Implementation        Volta  Turing  Ampere  Ada   Hopper AMD GPU Intel GPU x86 CPU AWS Inferentia Google TPU
===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========
AWQ                   ❌     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
GPTQ                  ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
Marlin (GPTQ/AWQ/FP8) ❌     ❌      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
INT8 (W8A8)           ❌     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
FP8 (W8A8)            ❌     ❌      ❌      ✅    ✅     ❌      ❌        ❌      ❌             ❌
AQLM                  ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
bitsandbytes          ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
DeepSpeedFP           ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
GGUF                  ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
SqueezeLLM            ✅     ✅      ✅      ✅    ✅     ❌      ❌        ❌      ❌             ❌
===================== ====== ======= ======= ===== ====== ======= ========= ======= ============== ==========

Notes:
^^^^^^
@@ -27,4 +29,4 @@ Notes:

Please note that this compatibility chart may be subject to change as vLLM continues to evolve and expand its support for different hardware platforms and quantization methods.

For the most up-to-date information on hardware support and quantization methods, please check the `quantization directory <https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/layers/quantization>`_ or consult with the vLLM development team.
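As a usage note, choosing one of the implementations in the table happens at model load time; a minimal sketch (the AWQ checkpoint name is illustrative):

from vllm import LLM

# The checkpoint must already be quantized in the chosen format, and the
# GPU generation must support that implementation per the table above.
llm = LLM(model="TheBloke/Llama-2-7B-Chat-AWQ", quantization="awq")
print(llm.generate("The future of AI is")[0].outputs[0].text)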
33 changes: 3 additions & 30 deletions requirements-openvino.txt
@@ -1,34 +1,7 @@
# Common dependencies
# -r requirements-common.txt
# TODO: remove temporary copy of all common dependencies once Optimum Intel will support Transformers >= 4.43.2
cmake >= 3.21
ninja # For faster builds.
psutil
sentencepiece # Required for LLaMA tokenizer.
numpy < 2.0.0
requests
tqdm
py-cpuinfo
transformers < 4.43
tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
aiohttp
openai
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
pillow # Required for image processing
prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.10.3
outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0
typing_extensions
filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
pyzmq
gguf == 0.9.1
-r requirements-common.txt

# OpenVINO dependencies
torch >= 2.1.2
openvino ~= 2024.3.0.dev
openvino-tokenizers[transformers] ~= 2024.3.0.0.dev
optimum-intel[openvino] >= 1.18.1
openvino ~= 2024.3.0
optimum-intel[openvino] >= 1.18.2
2 changes: 1 addition & 1 deletion setup.py
@@ -272,7 +272,7 @@ def _build_custom_ops() -> bool:


def _build_core_ext() -> bool:
return not _is_neuron() and not _is_tpu()
return not _is_neuron() and not _is_tpu() and not _is_openvino()


def get_hipcc_rocm_version():
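The new `_is_openvino()` guard presumably follows the same pattern as the existing `_is_neuron()`/`_is_tpu()` helpers; a sketch of the assumed shape (the actual definition lives elsewhere in setup.py):

def _is_openvino() -> bool:
    # Assumption: VLLM_TARGET_DEVICE is read from the environment near the
    # top of setup.py, as it is for the other target-device helpers.
    return VLLM_TARGET_DEVICE == "openvino"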
21 changes: 8 additions & 13 deletions tests/async_engine/test_chat_template.py
@@ -1,22 +1,16 @@
import os
import pathlib

import pytest

from vllm.entrypoints.chat_utils import load_chat_template
from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
from vllm.entrypoints.openai.protocol import ChatCompletionRequest
from vllm.transformers_utils.tokenizer import get_tokenizer

chatml_jinja_path = pathlib.Path(os.path.dirname(os.path.abspath(
__file__))).parent.parent / "examples/template_chatml.jinja"
from ..utils import VLLM_PATH

chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()

# Define models, templates, and their corresponding expected outputs
MODEL_TEMPLATE_GENERATON_OUTPUT = [
("facebook/opt-125m", None, True,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", None, False,
"Hello</s>Hi there!</s>What is the capital of</s>"),
("facebook/opt-125m", chatml_jinja_path, True, """<|im_start|>user
Hello<|im_end|>
<|im_start|>assistant
@@ -93,11 +87,12 @@ def test_get_gen_prompt(model, template, add_generation_prompt,
add_generation_prompt=add_generation_prompt)

# Call the function and get the result
result = tokenizer.apply_chat_template(
result = apply_chat_template(
tokenizer,
conversation=mock_request.messages,
tokenize=False,
chat_template=mock_request.chat_template or template_content,
add_generation_prompt=mock_request.add_generation_prompt,
chat_template=mock_request.chat_template or template_content)
)

# Test assertion
assert result == expected_output, (
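Isolated from the test, the new wrapper's call shape looks like this (a sketch; the message content is illustrative):

from vllm.entrypoints.chat_utils import apply_chat_template, load_chat_template
from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer("facebook/opt-125m")
template = load_chat_template(chatml_jinja_path)  # chatml_jinja_path as defined above

# The tokenizer is now passed explicitly instead of being the receiver.
prompt = apply_chat_template(
    tokenizer,
    conversation=[{"role": "user", "content": "Hello"}],
    chat_template=template,
    add_generation_prompt=True,
)
print(prompt)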
10 changes: 7 additions & 3 deletions tests/async_engine/test_openapi_server_ray.py
@@ -1,10 +1,12 @@
import openai # use the official client for correctness check
import pytest

from ..utils import RemoteOpenAIServer
from ..utils import VLLM_PATH, RemoteOpenAIServer

# any model with a chat template should work here
MODEL_NAME = "facebook/opt-125m"
chatml_jinja_path = VLLM_PATH / "examples/template_chatml.jinja"
assert chatml_jinja_path.exists()


@pytest.fixture(scope="module")
@@ -16,7 +18,9 @@ def server():
"--max-model-len",
"2048",
"--enforce-eager",
"--engine-use-ray"
"--engine-use-ray",
"--chat-template",
str(chatml_jinja_path),
]

with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
@@ -83,7 +87,7 @@ async def test_single_chat_session(client: openai.AsyncOpenAI):
choice = chat_completion.choices[0]
assert choice.finish_reason == "length"
assert chat_completion.usage == openai.types.CompletionUsage(
completion_tokens=10, prompt_tokens=13, total_tokens=23)
completion_tokens=10, prompt_tokens=55, total_tokens=65)

message = choice.message
assert message.content is not None and len(message.content) >= 10
35 changes: 35 additions & 0 deletions tests/entrypoints/openai/test_mp_crash.py
@@ -0,0 +1,35 @@
from typing import Any

import pytest

from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.api_server import build_async_engine_client
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.utils import FlexibleArgumentParser


def crashing_from_engine_args(
cls,
engine_args: Any = None,
start_engine_loop: Any = None,
usage_context: Any = None,
stat_loggers: Any = None,
) -> "AsyncLLMEngine":
raise Exception("foo")


@pytest.mark.asyncio
async def test_mp_crash_detection(monkeypatch):

with pytest.raises(RuntimeError) as excinfo, monkeypatch.context() as m:
m.setattr(AsyncLLMEngine, "from_engine_args",
crashing_from_engine_args)
parser = FlexibleArgumentParser(
description="vLLM's remote OpenAI server.")
parser = make_arg_parser(parser)
args = parser.parse_args([])

async with build_async_engine_client(args):
pass
assert "The server process died before responding to the readiness probe"\
in str(excinfo.value)