diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 83a56e25aca73..38aff57a410dc 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -5,6 +5,19 @@ set -ex
# Print ROCm version
rocminfo
+
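+# Request a GPU reset via the gpu_state file, then wait until it reports the GPUs as "clean" before running the tests.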
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+ sleep 3
+ if grep -q clean /opt/amdgpu/etc/gpu_state; then
+ echo "GPUs state is \"clean\""
+ break
+ fi
+done
+
+
+
# Try building the docker image
docker build -t rocm -f Dockerfile.rocm .
@@ -14,7 +27,8 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
+export HIP_VISIBLE_DEVICES=1
+docker run --device /dev/kfd --device /dev/dri --network host -e HIP_VISIBLE_DEVICES --name rocm rocm python3 -m vllm.entrypoints.api_server &
# Wait for the server to start
wait_for_server_to_start() {
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
new file mode 100644
index 0000000000000..8ba03b78e8dbf
--- /dev/null
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,37 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+ --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {
+ timeout=300
+ counter=0
+
+ while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
+ sleep 1
+ counter=$((counter + 1))
+ if [ $counter -ge $timeout ]; then
+ echo "Timeout after $timeout seconds"
+ break
+ fi
+ done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+ localhost:8000/generate \
+ -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 695290ed74ab5..11cda053260ec 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -12,7 +12,11 @@ steps:
command: pytest -v -s async_engine
- label: Basic Correctness Test
- command: pytest -v -s basic_correctness
+ commands:
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test
command: pytest -v -s core
@@ -27,13 +31,14 @@ steps:
num_gpus: 2 # only support 1 or 2 for now.
commands:
- pytest -v -s test_pynccl.py
+ - pytest -v -s test_pynccl_library.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_basic_distributed_correctness.py
- TEST_DIST_MODEL=facebook/opt-125m pytest -v -s test_chunked_prefill_distributed.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf pytest -v -s test_chunked_prefill_distributed.py
- label: Engine Test
- command: pytest -v -s engine tokenization test_sequence.py test_config.py
+ command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
commands:
@@ -85,9 +90,15 @@ steps:
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
+- label: Tensorizer Test
+ command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader
+
- label: Metrics Test
command: pytest -v -s metrics
+- label: Quantization Test
+ command: pytest -v -s quantization
+
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
commands:
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 3ed23c62c005d..fb1086db77823 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,13 +3,6 @@
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- - label: "AMD Test"
- agents:
- queue: amd
- command: bash .buildkite/run-amd-test.sh
-
- - label: "CPU Test"
- command: bash .buildkite/run-cpu-test.sh
- label: ":docker: build image"
commands:
@@ -23,6 +16,19 @@ steps:
limit: 5
- wait
+ - label: "AMD Test"
+ agents:
+ queue: amd
+ command: bash .buildkite/run-amd-test.sh
+
+ - label: "Neuron Test"
+ agents:
+ queue: neuron
+ command: bash .buildkite/run-neuron-test.sh
+
+ - label: "CPU Test"
+ command: bash .buildkite/run-cpu-test.sh
+
{% for step in steps %}
- label: "{{ step.label }}"
agents:
diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
index 4c6c96187cc6c..df41ade8c3c01 100644
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+      It is suggested to download and execute the latest version of the script, as vLLM may frequently update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
index 88227b4b2e7b9..54763af1058f6 100644
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+      It is suggested to download and execute the latest version of the script, as vLLM may frequently update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index f1124dfa78bbc..08120ad8e5a60 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+      It is suggested to download and execute the latest version of the script, as vLLM may frequently update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
@@ -57,6 +58,8 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+      If you experience crashes or hangs, it would be helpful to run vLLM with `export VLLM_TRACE_FUNCTION=1`. All function calls in vLLM will then be recorded. Inspect these log files and report which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
index 9e8e7b4aa3530..4f8843420a94e 100644
--- a/.github/ISSUE_TEMPLATE/700-performance discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
@@ -39,6 +39,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+      It is suggested to download and execute the latest version of the script, as vLLM may frequently update the diagnostic information needed to respond to issues accurately and quickly.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/scripts/run-tests b/.github/scripts/run-tests
index d0d28cdc9a0aa..046059e4088d3 100755
--- a/.github/scripts/run-tests
+++ b/.github/scripts/run-tests
@@ -113,8 +113,8 @@ do
# need to be run with specific options
if [[ "${TEST}" == *"kernels"* || "${TEST}" == *"samplers"* ]]; then
CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
- elif [[ "${TEST}" == *"test_basic_distributed_correctness"* ]]; then
- CUDA_VISIBLE_DEVICES=0,1 TEST_DIST_MODEL=facebook/opt-125m pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
+ elif [[ "${TEST}" == *"distributed"* ]]; then
+ CUDA_VISIBLE_DEVICES=0,1 pytest ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
elif [[ "${TEST}" == *"test_models_logprobs"* ]]; then
pytest --forked ${CC_PYTEST_FLAGS} --junitxml=${RESULT_XML} ${TEST} || LOCAL_SUCCESS=$?
else
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
new file mode 100644
index 0000000000000..089c7d18ad6f2
--- /dev/null
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,50 @@
+name: mypy
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ ruff:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install mypy==1.9.0
+ pip install types-setuptools
+ pip install types-PyYAML
+ pip install types-requests
+ pip install types-setuptools
+ - name: Mypy
+ run: |
+ mypy vllm/attention --config-file pyproject.toml
+ mypy vllm/distributed --config-file pyproject.toml
+ mypy vllm/entrypoints --config-file pyproject.toml
+ mypy vllm/executor --config-file pyproject.toml
+ mypy vllm/usage --config-file pyproject.toml
+ mypy vllm/*.py --config-file pyproject.toml
+ mypy vllm/transformers_utils --config-file pyproject.toml
+ mypy vllm/engine --config-file pyproject.toml
+ mypy vllm/worker --config-file pyproject.toml
+ mypy vllm/spec_decode --config-file pyproject.toml
+ mypy vllm/lora --config-file pyproject.toml
+
+ # TODO(sang): Fix nested dir
+ mypy vllm/model_executor/*.py --config-file pyproject.toml
+ mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index fc97e33c19af2..4b9fc3d04d872 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -56,6 +56,9 @@ jobs:
- name: Checkout
uses: actions/checkout@v3
+ - name: Setup ccache
+ uses: hendrikmuhs/ccache-action@v1.2
+
- name: Set up Linux Env
if: ${{ runner.os == 'Linux' }}
run: |
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index e8060e369a889..e71033f828006 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index b163c960db555..04f307bcf8b0e 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.gitignore b/.gitignore
index f411b54cc3da3..06e8923ced311 100644
--- a/.gitignore
+++ b/.gitignore
@@ -72,6 +72,8 @@ instance/
# Sphinx documentation
docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
# PyBuilder
.pybuilder/
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1845151181284..e9262b57d0867 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -167,12 +167,14 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
+ "csrc/quantization/fp8/fp8_cuda_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC
+ "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/quantization/marlin/marlin_cuda_kernel.cu"
"csrc/custom_all_reduce.cu")
@@ -210,23 +212,11 @@ define_gpu_extension_target(
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
"csrc/punica/punica_ops.cc")
#
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 0000000000000..fe42b4ef393f1
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+ && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_BUILD_WITH_NEURON 1
+RUN cd /app/vllm \
+ && pip install -e . \
+ && cd ..
+
+CMD ["/bin/bash"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index b1c5fac9d78ef..3f84b949481d1 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
@@ -92,13 +92,10 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
COPY ./ /app/vllm
RUN python3 -m pip install --upgrade pip numba
-RUN python3 -m pip install xformers==0.0.23 --no-deps
RUN cd /app \
&& cd vllm \
&& pip install -U -r requirements-rocm.txt \
- && if [ "$BUILD_FA" = "1" ]; then \
- bash patch_xformers.rocm.sh; fi \
&& patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
&& cd ..
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 51c395e5698b5..34c22d2d9879a 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -139,6 +139,7 @@ async def async_request_trt_llm(
"data:")
data = json.loads(chunk)
+ output.generated_text += data["text_output"]
timestamp = time.perf_counter()
# First token
if ttft == 0.0:
@@ -153,7 +154,6 @@ async def async_request_trt_llm(
most_recent_timestamp = timestamp
output.latency = most_recent_timestamp - st
- output.generated_text = json.loads(data)["text_output"]
output.success = True
else:
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index aadbc441713fc..44da3bad8d840 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -9,6 +9,7 @@
from tqdm import tqdm
from vllm import LLM, SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def main(args: argparse.Namespace):
@@ -101,7 +102,7 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index a1810bed0b81b..3df71ffa5662d 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -29,7 +29,7 @@
import warnings
from dataclasses import dataclass
from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
@@ -60,7 +60,11 @@ def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
+ fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
+ if fixed_output_len is not None and fixed_output_len < 4:
+ raise ValueError("output_len too small")
+
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -70,38 +74,32 @@ def sample_sharegpt_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
- # some of these will be filtered out, so sample more than we need
- sampled_indices = random.sample(range(len(dataset)),
- int(num_requests * 1.2))
- dataset = [dataset[i] for i in sampled_indices]
-
- # Tokenize the prompts and completions.
- prompts = [prompt for prompt, _ in dataset]
- prompt_token_ids = tokenizer(prompts).input_ids
- completions = [completion for _, completion in dataset]
- completion_token_ids = tokenizer(completions).input_ids
- tokenized_dataset = []
- for i in range(len(dataset)):
- output_len = len(completion_token_ids[i])
- tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+ # Shuffle the dataset.
+ random.shuffle(dataset)
- # Filter out too long sequences.
+ # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
- for prompt, prompt_token_ids, output_len in tokenized_dataset:
+ for i in range(len(dataset)):
+ if len(filtered_dataset) == num_requests:
+ break
+
+ # Tokenize the prompts and completions.
+ prompt = dataset[i][0]
+ prompt_token_ids = tokenizer(prompt).input_ids
+ completion = dataset[i][1]
+ completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
+ output_len = len(completion_token_ids
+ ) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
- # This is because TGI causes errors when the input or output length
- # is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
- # Sample the requests.
- sampled_requests = random.sample(filtered_dataset, num_requests)
- return sampled_requests
+ return filtered_dataset
def sample_sonnet_requests(
@@ -363,6 +361,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sharegpt":
@@ -370,6 +369,7 @@ def main(args: argparse.Namespace):
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sonnet":
@@ -526,6 +526,12 @@ def main(args: argparse.Namespace):
default=1000,
help="Number of prompts to process.",
)
+ parser.add_argument(
+ "--sharegpt-output-len",
+ type=int,
+ default=None,
+ help="Output length for each request. Overrides the output length "
+ "from the ShareGPT dataset.")
parser.add_argument(
"--sonnet-input-len",
type=int,
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 6df1e1d628e6c..695d06e7b243d 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -10,6 +10,8 @@
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
def sample_requests(
dataset_path: str,
@@ -101,25 +103,22 @@ def run_vllm(
)
# Add the requests to the engine.
+ prompts = []
+ sampling_params = []
for prompt, _, output_len in requests:
- sampling_params = SamplingParams(
- n=n,
- temperature=0.0 if use_beam_search else 1.0,
- top_p=1.0,
- use_beam_search=use_beam_search,
- ignore_eos=True,
- max_tokens=output_len,
- )
- # FIXME(woosuk): Do not use internal method.
- llm._add_request(
- prompt=prompt,
- prompt_token_ids=None,
- sampling_params=sampling_params,
- )
+ prompts.append(prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=0.0 if use_beam_search else 1.0,
+ top_p=1.0,
+ use_beam_search=use_beam_search,
+ ignore_eos=True,
+ max_tokens=output_len,
+ ))
start = time.perf_counter()
- # FIXME(woosuk): Do not use internal method.
- llm._run_engine(use_tqdm=True)
+ llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
@@ -267,7 +266,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
new file mode 100644
index 0000000000000..59392947b15c8
--- /dev/null
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+ dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+ optimized_dequantize_gemm)
+
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+ input: torch.Tensor, # [..., in_features]
+ weights: torch.Tensor,
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+ output = F.linear(input, weights)
+ return output
+
+
+def dequant_out_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ if bias is None:
+ output = F.linear(input, weights, bias)
+ orig_shape = output.shape
+ flattened_output = output.view(-1, output.size(-1))
+ f_scales = scales.view(-1, scales.shape[0])
+ b_scales = f_scales.expand(flattened_output.shape[0], -1)
+ flattened_output *= b_scales
+ return flattened_output.view(orig_shape)
+ else:
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+ -1, weights.shape[1])
+ weights *= b_scales
+ return F.linear(input, weights, bias)
+
+
+def dequant_weight_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+ -1, weights.shape[1])
+ weights *= b_scales
+ return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ return F.linear(input, weights, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+
+ n = parts.sum().item()
+
+ device = torch.device('cuda:0')
+
+ code_range = (1 << bits) // 2
+ ingroups = 8
+
+ codes = torch.randint(-code_range,
+ code_range,
+ size=(n, k // ingroups, nbooks),
+ dtype=get_int_dtype(bits),
+ device=device)
+
+ codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+ dtype=torch.float16,
+ device=device)
+
+ count = 0
+ for index in range(16):
+ for i in range(8):
+ for book in range(nbooks):
+ codebooks[book, index, 0, i] = count * (10**book)
+ count += 1
+
+ print("codes shape", codes.shape)
+
+ for i in range(16):
+ for book in range(nbooks):
+ codes[0, i, book] = i
+ codes[0, -i, book] = i
+
+ weights = dequantize_weight(codes, codebooks, None)
+ weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+ print("weights shape:", weights.shape)
+ print("weights2 shape:", weights2.shape)
+
+ print("weights are:", weights)
+ print("weights2 are:", weights2)
+
+ print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+ print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+ print("last 128 weights are", weights[0, -128:])
+ print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def main():
+
+ parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+ # Add arguments
+ parser.add_argument("--nbooks",
+ type=int,
+ default=1,
+ help="Number of codebooks (default: 1)")
+ parser.add_argument("--bits",
+ type=int,
+ default=16,
+ help="Number of bits per code element (default: 16)")
+ parser.add_argument(
+ "--test",
+ type=bool,
+ default=False,
+ help="Run the decompression/dequant tester rather than benchmarking "
+ "(default: False)")
+
+ # Parse the arguments
+ args = parser.parse_args()
+
+ # Extract values
+ nbooks = args.nbooks
+ bits = args.bits
+
+ if args.test:
+ dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+ return
+
+ # Otherwise, benchmark.
+ methods = [
+ ops.aqlm_gemm,
+ dequant_out_scale,
+ generic_dequantize_gemm,
+ optimized_dequantize_gemm,
+ dequant_weight_scale,
+ torch_mult,
+ dequant_no_scale,
+ ]
+
+ filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+ print(f"writing benchmarks to file {filename}")
+ with open(filename, "w") as f:
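+        # Redirect stdout into the CSV file; the per-m progress prints below still go to the terminal via sys.__stdout__.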
+ sys.stdout = f
+
+ print('m | k | n | n parts', end='')
+ for method in methods:
+ print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+ print('')
+
+ # These are reasonable prefill sizes.
+ ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+ (4096, (11008, 11008)), (11008, (4096, )))
+
+ # reasonable ranges for m.
+ for m in [
+ 1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+ 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+ ]:
+ print(f'{m}', file=sys.__stdout__)
+ for ksp in ksandpartions:
+ run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+ methods)
+
+ sys.stdout = sys.__stdout__
+
+
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+ methods):
+
+ # I didn't see visible improvements from increasing these, but feel free :)
+ num_warmup_trials = 1
+ num_trials = 1
+
+ num_calls = 100
+
+ # warmup.
+ for method in methods:
+ for _ in range(num_warmup_trials):
+ run_timing(
+ num_calls=num_calls,
+ m=m,
+ k=k,
+ parts=parts,
+ nbooks=nbooks,
+ bits=bits,
+ method=method,
+ )
+
+ n = parts.sum().item()
+ print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+ for method in methods:
+ best_time_us = 1e20
+ for _ in range(num_trials):
+ kernel_dur_ms = run_timing(
+ num_calls=num_calls,
+ m=m,
+ k=k,
+ parts=parts,
+ nbooks=nbooks,
+ bits=bits,
+ method=method,
+ )
+
+ kernel_dur_us = 1000 * kernel_dur_ms
+
+ if kernel_dur_us < best_time_us:
+ best_time_us = kernel_dur_us
+
+ print(f' | {kernel_dur_us:.0f}', end='')
+
+ print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+ nbooks: int, bits: int, method) -> float:
+
+ n = parts.sum().item()
+
+ device = torch.device('cuda:0')
+
+ input = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+ code_range = (1 << bits) // 2
+ ingroups = 8
+
+ codes = torch.randint(-code_range,
+ code_range,
+ size=(n, k // ingroups, nbooks),
+ dtype=get_int_dtype(bits),
+ device=device)
+
+ codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+ dtype=torch.float16,
+ device=device)
+
+ scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+ # for comparison to just a pytorch mult.
+ weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ start_event.record()
+
+ if method is torch_mult:
+ for i in range(num_calls):
+ torch_mult(input, weights, scales)
+ else:
+ for i in range(num_calls):
+ method(input, codes, codebooks, scales, parts, None)
+
+ end_event.record()
+ end_event.synchronize()
+
+ dur_ms = start_event.elapsed_time(end_event) / num_calls
+ return dur_ms
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/collect_env.py b/collect_env.py
index e09abfed3dfba..c89f8c64eddcb 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -63,6 +63,7 @@
"magma",
"triton",
"optree",
+ "nccl",
}
DEFAULT_PIP_PATTERNS = {
@@ -73,6 +74,7 @@
"triton",
"optree",
"onnx",
+ "nccl",
# UPSTREAM SYNC: needed for sparsity
"nm-magic-wand-nightly",
}
diff --git a/csrc/ops.h b/csrc/ops.h
index 41ecc1e89371b..ff7a3de1a0a8c 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -86,6 +86,21 @@ void gelu_fast(
torch::Tensor& input);
#ifndef USE_ROCM
+torch::Tensor aqlm_gemm(
+ const torch::Tensor& input,
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& scales,
+ const torch::Tensor& codebook_partition_sizes,
+  const std::optional<torch::Tensor>& bias
+);
+
+torch::Tensor aqlm_dequant(
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& codebook_partition_sizes
+);
+
torch::Tensor awq_gemm(
torch::Tensor _in_feats,
torch::Tensor _kernel,
@@ -131,6 +146,11 @@ void gptq_shuffle(
torch::Tensor q_perm,
int bit);
+void scaled_fp8_quant(
+ torch::Tensor& out,
+ torch::Tensor& input,
+ torch::Tensor& scale);
+
void moe_align_block_size(
torch::Tensor topk_ids,
int num_experts,
diff --git a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
deleted file mode 100644
index e8202dff561d9..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
deleted file mode 100644
index 3e7cf31dead0f..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
deleted file mode 100644
index 68277fa6b7d56..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, nv_half, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
deleted file mode 100644
index 3b7531b8fbcfc..0000000000000
--- a/csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_bfloat16, float, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_config.h b/csrc/punica/bgmv/bgmv_config.h
index 9b76b98ab3322..fec484d693055 100644
--- a/csrc/punica/bgmv/bgmv_config.h
+++ b/csrc/punica/bgmv/bgmv_config.h
@@ -47,6 +47,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 13696) \
f(in_T, out_T, W_T, narrow, 13824) \
f(in_T, out_T, W_T, narrow, 14336) \
+ f(in_T, out_T, W_T, narrow, 15360) \
f(in_T, out_T, W_T, narrow, 16384) \
f(in_T, out_T, W_T, narrow, 20480) \
f(in_T, out_T, W_T, narrow, 22016) \
@@ -59,6 +60,7 @@ void bgmv_kernel(out_T *__restrict__ Y, const in_T *__restrict__ X,
f(in_T, out_T, W_T, narrow, 32768) \
f(in_T, out_T, W_T, narrow, 33024) \
f(in_T, out_T, W_T, narrow, 36864) \
+ f(in_T, out_T, W_T, narrow, 43264) \
f(in_T, out_T, W_T, narrow, 49152) \
f(in_T, out_T, W_T, narrow, 64000) \
f(in_T, out_T, W_T, narrow, 64256) \
diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
deleted file mode 100644
index b3b74aa3ec904..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
deleted file mode 100644
index 3cc87f5df76a1..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
deleted file mode 100644
index 9eda98bd8ddcf..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
deleted file mode 100644
index 060f9ebb8c2b1..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, nv_half, float, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
deleted file mode 100644
index b37e44570bf40..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_bfloat16, nv_half)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
deleted file mode 100644
index 06718cbb0a3e9..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, nv_half, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
deleted file mode 100644
index 41fb0e45ef4e6..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_bfloat16)
diff --git a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu b/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
deleted file mode 100644
index 50b7ead9fcefd..0000000000000
--- a/csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "bgmv_config.h"
-#include "bgmv_impl.cuh"
-
-FOR_BGMV_WIDE_NARROW(INST_BGMV_TWOSIDE, float, float, nv_half)
diff --git a/csrc/punica/bgmv/generator.py b/csrc/punica/bgmv/generator.py
index c347d4f2ab9f4..9bf7f6358880f 100644
--- a/csrc/punica/bgmv/generator.py
+++ b/csrc/punica/bgmv/generator.py
@@ -18,6 +18,26 @@
if weight_dtype == "fp32":
# FP32 weights are not supported.
continue
+ if output_dtype == "fp32":
+ # LoRA A matrix.
+ if input_dtype != weight_dtype:
+ # NOTE(woosuk): While Punica supports the case where the
+ # input and weight dtypes are different, we only generate
+                    # the kernels with the same dtypes to reduce the binary size.
+ continue
+ elif input_dtype == "fp32":
+ # LoRA B matrix.
+ if output_dtype != weight_dtype:
+ # NOTE(woosuk): While Punica supports the case where the
+                    # the kernels with the same dtypes to reduce the binary size.
+ # the kernels the same dtypes to reduce the binary size.
+ continue
+ elif not (input_dtype == output_dtype == weight_dtype):
+ # NOTE(woosuk): While Punica supports mixed data types for
+ # input, output, and weight, we only generate the kernels with
+ # the same data types to reduce the binary size.
+ continue
+
kernel_definition = TEMPLATE.format(
input_dtype=DTYPE_MAP[input_dtype],
output_dtype=DTYPE_MAP[output_dtype],
diff --git a/csrc/punica/punica_ops.cc b/csrc/punica/punica_ops.cc
index 7ebfd851c4feb..a1eaa90e85f27 100644
--- a/csrc/punica/punica_ops.cc
+++ b/csrc/punica/punica_ops.cc
@@ -50,6 +50,23 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
int64_t y_offset, int64_t full_y_size,
int64_t batch_size, int64_t num_layers,
int64_t layer_idx, float scale) {
+ // NOTE(woosuk): While Punica supports various combinations of input/output
+ // data types, we limit the supported data types to reduce the binary size.
+  constexpr bool is_input_float = std::is_same<in_T, float>::value;
+  constexpr bool is_output_float = std::is_same<out_T, float>::value;
+  if (is_input_float) {
+    if (!std::is_same<out_T, W_T>::value) {
+      return false;
+    }
+  } else if (is_output_float) {
+    if (!std::is_same<in_T, W_T>::value) {
+      return false;
+    }
+  } else if (!(std::is_same<in_T, W_T>::value &&
+               std::is_same<out_T, W_T>::value)) {
+ return false;
+ }
+
switch (pack_u32(in_features, out_features)) {
#define CASE_ONESIDE(_in_T, _out_T, _W_T, feat_in, feat_out) \
case pack_u32(feat_in, feat_out): \
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index de02afc162113..a5b16c5abc3ed 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -63,6 +63,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
// Quantization ops
#ifndef USE_ROCM
+ ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
+ ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
ops.def("marlin_gemm", &marlin_gemm, "Marlin Optimized Quantized GEMM for GPTQ");
ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
@@ -71,6 +73,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+ ops.def("scaled_fp8_quant", &scaled_fp8_quant, "Compute FP8 quantized tensor and scaling factor");
ops.def(
"moe_align_block_size",
&moe_align_block_size,
diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu
new file mode 100644
index 0000000000000..4415316e1e8cd
--- /dev/null
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -0,0 +1,712 @@
+/*
+ * Modified by Neural Magic
+ * Adapted from https://github.com/Vahe1994/AQLM
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <torch/extension.h>
+#include <c10/cuda/CUDAStream.h>
+#include <c10/cuda/CUDAGuard.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <cuda.h>
+#include <cuda_fp16.h>
+
+#include <iostream>
+#include <cstdlib>
+
+
+namespace vllm {
+namespace aqlm {
+
+__global__ void Code1x16MatVec(
+ const int4* __restrict__ A,
+ const int4* __restrict__ B,
+ int4* __restrict__ C,
+ const int4* __restrict__ codebook,
+ const int prob_m,
+ const int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
+ const int codebook_stride // as int4.
+) {
+ int a_gl_stride = prob_k / 8 / 8;
+ int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ bool pred = a_gl_rd < prob_m;
+
+ if (pred)
+ {
+    // advance to the correct codebook; this is easy because we only multiply one column of the codebook.
+ auto codebook_size = &codebook_a_sizes.x;
+ while (a_gl_rd >= *codebook_size)
+ {
+ codebook += codebook_stride;
+ ++codebook_size;
+ }
+ }
+
+ int b_gl_rd = 0;
+ int c_gl_wr = a_gl_rd;
+ a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+ int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+
+ __shared__ int4 sh_b[32 * 9];
+ float res = 0;
+
+ int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
+ while (iters--) {
+ // We pad shared memory to avoid bank conflicts during reads
+ __syncthreads();
+ for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
+ if (b_gl_rd + i < prob_k / 8)
+ sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
+ }
+ __syncthreads();
+ b_gl_rd += 32 * 8;
+
+ int b_sh_rd = 9 * (threadIdx.x % 32);
+ if (pred && a_gl_rd < a_gl_end) {
+      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ uint32_t dec[4];
+ // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
+ // actually help us; this brings > 2x speedup.
+ asm volatile (
+ "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
+ : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
+ : "l"((void*) &codebook[enc[i]])
+ );
+        half2* a = reinterpret_cast<half2*>(&dec);
+        half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
+ half2 res2 = {};
+ #pragma unroll
+ for (int j = 0; j < 4; j++)
+ res2 = __hfma2(a[j], b[j], res2);
+ res += __half2float(res2.x) + __half2float(res2.y);
+ b_sh_rd++;
+ }
+ a_gl_rd += 32;
+ }
+ }
+
+ if (pred) {
+ #pragma unroll
+ for (int i = 16; i > 0; i /= 2)
+ res += __shfl_down_sync(0xffffffff, res, i);
+ if (threadIdx.x % 32 == 0)
+ reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
+ }
+}
+
+__global__ void Code2x8MatVec(
+ const int4* __restrict__ A,
+ const int4* __restrict__ B,
+ int4* __restrict__ C,
+ const int4* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
+ const int codebook_stride // as int4.
+
+) {
+ int a_gl_stride = prob_k / 8 / 8;
+ int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ bool pred = a_gl_rd < prob_m;
+
+ if (pred)
+ {
+    // advance to the correct codebook; this is easy because we only multiply one column of the codebook.
+ auto codebook_size = &codebook_a_sizes.x;
+ while (a_gl_rd >= *codebook_size)
+ {
+ codebook += codebook_stride;
+ ++codebook_size;
+ }
+ }
+
+ int b_gl_rd = 0;
+ int c_gl_wr = a_gl_rd;
+ a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+ int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+ int lane = threadIdx.x % 8;
+
+ extern __shared__ int4 sh[];
+ int4* sh_b = sh;
+ int4* sh_code = sh_b + 32 * 9;
+ int4* sh_code0 = sh_code;
+ int4* sh_code1 = sh_code + 256 * 8;
+
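+  // Each of the two 256-entry codebooks is replicated 8 times in shared memory so that each lane of a warp reads its own copy, avoiding bank conflicts.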
+ for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
+ int4 dec = codebook[i];
+ #pragma unroll
+ for (int j = 0; j < 8; j++)
+ sh_code[8 * i + (j + lane) % 8] = dec;
+ }
+ __syncthreads();
+
+ float res = 0;
+
+ int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
+ while (iters--) {
+ // We pad shared memory to avoid bank conflicts during reads
+ __syncthreads();
+ for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
+ if (b_gl_rd + i < prob_k / 8)
+ sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
+ }
+ __syncthreads();
+ b_gl_rd += 32 * 8;
+
+ int b_sh_rd = 9 * (threadIdx.x % 32);
+ if (pred && a_gl_rd < a_gl_end) {
+      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
+        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
+        half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
+ half2 res2 = {};
+ #pragma unroll
+ for (int j = 0; j < 4; j++)
+ res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
+ res += __half2float(res2.x) + __half2float(res2.y);
+ b_sh_rd++;
+ }
+ a_gl_rd += 32;
+ }
+ }
+
+ if (pred) {
+ #pragma unroll
+ for (int i = 16; i > 0; i /= 2)
+ res += __shfl_down_sync(0xffffffff, res, i);
+ if (threadIdx.x % 32 == 0)
+ reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
+ }
+}
+
+
+__global__ void Code1x16Dequant(
+ const int4* __restrict__ A,
+ int4* __restrict__ C,
+ const int4* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, sums to m.
+ const int codebook_stride // as int4
+) {
+ int a_gl_stride = prob_k / 8 / 8;
+ int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ bool pred = a_gl_rd < prob_m;
+
+ if (pred)
+ {
+    // advance to the correct codebook; this is easy because we only multiply one column of the codebook.
+ auto codebook_size = &codebook_a_sizes.x;
+ while (a_gl_rd >= *codebook_size)
+ {
+ codebook += codebook_stride;
+ ++codebook_size;
+ }
+ }
+
+ a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+ int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+
+ int c_gl_stride = prob_k / 8;
+ int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
+
+ int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
+ while (iters--) {
+ if (pred && a_gl_rd < a_gl_end) {
+      const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ int4 chunk;
+        auto dec = reinterpret_cast<uint32_t*>(&chunk);
+ // We bypass the L1 cache to avoid massive amounts of memory streaming that doesn't
+ // actually help us; this brings > 2x speedup.
+ asm volatile (
+ "ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
+ : "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
+ : "l"((void*) &codebook[enc[i]])
+ );
+
+ C[a_gl_rd * 8 + i] = chunk;
+ }
+ }
+ a_gl_rd += 32;
+ }
+}
+
+
+__global__ void Code2x8Dequant(
+ const int4* __restrict__ A,
+ int4* __restrict__ C,
+ const int4* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
+ const int codebook_stride // as int4
+) {
+ int a_gl_stride = prob_k / 8 / 8;
+ int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ bool pred = a_gl_rd < prob_m;
+
+ if (pred)
+ {
+    // advance to the correct codebook; this is easy because we only multiply one column of the codebook.
+ auto codebook_size = &codebook_a_sizes.x;
+ while (a_gl_rd >= *codebook_size)
+ {
+ codebook += codebook_stride;
+ ++codebook_size;
+ }
+ }
+
+ a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
+ int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
+ int lane = threadIdx.x % 8;
+
+ int c_gl_stride = prob_k / 8;
+ int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
+ c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
+
+ extern __shared__ int4 sh[];
+ int4* sh_code = sh;
+ int4* sh_code0 = sh_code;
+ int4* sh_code1 = sh_code + 256 * 8;
+
+ for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
+ int4 dec = codebook[i];
+ #pragma unroll
+ for (int j = 0; j < 8; j++)
+ sh_code[8 * i + (j + lane) % 8] = dec;
+ }
+ __syncthreads();
+
+ float res = 0;
+
+ int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
+ while (iters--) {
+ if (pred && a_gl_rd < a_gl_end) {
+      const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ int4 chunk;
+        half2* a0 = reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
+        half2* a1 = reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
+ #pragma unroll
+ for (int j = 0; j < 4; j++)
+          reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
+ C[a_gl_rd * 8 + i] = chunk;
+ }
+ }
+ a_gl_rd += 32;
+ }
+}
+
+inline int ceildiv(int a, int b) {
+ return (a + b - 1) / b;
+}
+
+const int THREAD_M = 16;
+
+void code1x16_matvec_cuda(
+ const void* __restrict__ A,
+ const void* __restrict__ B,
+ void* __restrict__ C,
+ const void* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes,
+ const int codebook_stride
+) {
+ int sms;
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+ int waves = 0;
+ int thread_m;
+ do {
+ waves++;
+ thread_m = ceildiv(prob_m, waves * sms);
+ } while (thread_m > THREAD_M);
+
+ int blocks = ceildiv(prob_m, thread_m);
+ int threads = 32 * thread_m;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code1x16MatVec<<<blocks, threads, 16*32*9, stream>>>(
+ (const int4*) A,
+ (const int4*) B,
+ (int4*) C,
+ (const int4*) codebook,
+ prob_m,
+ prob_k,
+ codebook_a_sizes,
+ codebook_stride
+ );
+}
+
+void code2x8_matvec_cuda(
+ const void* __restrict__ A,
+ const void* __restrict__ B,
+ void* __restrict__ C,
+ const void* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes,
+ const int codebook_stride
+) {
+ int sms;
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+ int waves = 0;
+ int thread_m;
+ do {
+ waves++;
+ thread_m = ceildiv(prob_m, waves * sms);
+ } while (thread_m > THREAD_M);
+
+ int blocks = ceildiv(prob_m, thread_m);
+ int threads = 32 * thread_m;
+ int shared = 16 * (2 * 256 * 8 + 32 * 9);
+ cudaFuncSetAttribute(
+ Code2x8MatVec, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
+ );
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code2x8MatVec<<<blocks, threads, shared, stream>>>(
+ (const int4*) A,
+ (const int4*) B,
+ (int4*) C,
+ (const int4*) codebook,
+ prob_m,
+ prob_k,
+ codebook_a_sizes,
+ codebook_stride
+ );
+}
+
+void code1x16_dequant_cuda(
+ const void* __restrict__ A,
+ void* __restrict__ C,
+ const void* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
+ const int codebook_stride // as int4.
+) {
+ int sms;
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+ int waves = 0;
+ int thread_m;
+ do {
+ waves++;
+ thread_m = ceildiv(prob_m, waves * sms);
+ } while (thread_m > THREAD_M);
+
+ int blocks = ceildiv(prob_m, thread_m);
+ int threads = 32 * thread_m;
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+  Code1x16Dequant<<<blocks, threads, 0, stream>>>(
+ (const int4*) A,
+ (int4*) C,
+ (const int4*) codebook,
+ prob_m,
+ prob_k,
+ codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long.
+ codebook_stride // as int4.
+ );
+}
+
+// Dequantizes the code and codebook into weights.
+void code2x8_dequant_cuda(
+ const void* __restrict__ A,
+ void* __restrict__ C,
+ const void* __restrict__ codebook,
+ int prob_m,
+ int prob_k,
+ const int4 codebook_a_sizes, // cumulative sizes of A spanning each codebook, at most 3 long, corresponds to cols.
+ const int codebook_stride // as int4
+) {
+ int sms;
+ cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
+ int waves = 0;
+ int thread_m;
+ do {
+ waves++;
+ thread_m = ceildiv(prob_m, waves * sms);
+ } while (thread_m > THREAD_M);
+
+ int blocks = ceildiv(prob_m, thread_m);
+ int threads = 32 * thread_m;
+ int shared = 16 * (2 * 256 * 8 + 32 * 9);
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
+
+ cudaFuncSetAttribute(
+ Code2x8Dequant, cudaFuncAttributeMaxDynamicSharedMemorySize, shared
+ );
+  Code2x8Dequant<<<blocks, threads, shared, stream>>>(
+ (const int4*) A,
+ (int4*) C,
+ (const int4*) codebook,
+ prob_m,
+ prob_k,
+ codebook_a_sizes,
+ codebook_stride
+ );
+}
+
+int codebook_stride(const torch::Tensor& codebooks)
+{
+ return codebooks.stride(0) * codebooks.element_size() / sizeof(int4);
+}
+
+void code1x16_matvec(
+ const torch::Tensor& A,
+ const torch::Tensor& B,
+ torch::Tensor& C,
+ const torch::Tensor& codebook,
+ const int4 codebook_a_sizes // cumulative sizes of A spanning each codebook, at most 3 long.
+) {
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+ int prob_m = C.size(0);
+ int prob_k = B.size(0);
+
+ code1x16_matvec_cuda(
+ A.data_ptr(),
+ B.data_ptr(),
+ C.data_ptr(),
+ codebook.data_ptr(),
+ prob_m,
+ prob_k,
+ codebook_a_sizes,
+ codebook_stride(codebook)
+ );
+}
+
+torch::Tensor code1x16_matmat(
+ const torch::Tensor& input,
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& scales,
+ const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias) {
+ auto input_sizes = input.sizes();
+ auto out_features = codes.size(0) * codebooks.size(2);
+ auto flat_input = input.reshape({-1, input.size(-1)});
+ auto flat_output = torch::empty({flat_input.size(0), out_features},
+ torch::TensorOptions()
+ .dtype(input.dtype())
+ .device(input.device())
+ );
+
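+  // The matmul is implemented as one codebook matvec per row of the flattened input; the per-output-channel scales (and optional bias) are applied to the whole output afterwards.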
+ for (int i = 0; i < flat_input.size(0); ++i) {
+ auto input_vec = flat_input.index({i});
+ auto output_vec = flat_output.index({i});
+ code1x16_matvec(
+ codes.squeeze(2),
+ input_vec,
+ output_vec,
+ codebooks,
+ codebook_a_sizes
+ );
+ }
+ flat_output *= scales.flatten().unsqueeze(0);
+
+ if (bias.has_value()) {
+ flat_output += bias->unsqueeze(0);
+ }
+
+ auto output_sizes = input_sizes.vec();
+ output_sizes.pop_back();
+ output_sizes.push_back(-1);
+ auto output = flat_output.reshape(output_sizes);
+ return output;
+}
+
+void code2x8_matvec(
+ const torch::Tensor& A,
+ const torch::Tensor& B,
+ torch::Tensor& C,
+ const torch::Tensor& codebook,
+ const int4 codebook_a_sizes
+) {
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
+ int prob_m = C.size(0);
+ int prob_k = B.size(0);
+ code2x8_matvec_cuda(
+ A.data_ptr(),
+ B.data_ptr(),
+ C.data_ptr(),
+ codebook.data_ptr(),
+ prob_m,
+ prob_k,
+ codebook_a_sizes,
+ 2 * codebook_stride(codebook)
+ );
+}
+
+torch::Tensor code2x8_matmat(
+ const torch::Tensor& input,
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& scales,
+ const int4 codebook_a_sizes,
+  const std::optional<torch::Tensor>& bias
+) {
+ auto input_sizes = input.sizes();
+ auto out_features = codes.size(0) * codebooks.size(2);
+ auto flat_input = input.reshape({-1, input.size(-1)});
+ auto flat_output = torch::empty({flat_input.size(0), out_features},
+ torch::TensorOptions()
+ .dtype(input.dtype())
+ .device(input.device())
+ );
+
+ for (int i = 0; i < flat_input.size(0); ++i) {
+ auto input_vec = flat_input.index({i});
+ auto output_vec = flat_output.index({i});
+ code2x8_matvec(
+ codes.squeeze(2),
+ input_vec,
+ output_vec,
+ codebooks,
+ codebook_a_sizes
+ );
+ }
+ flat_output *= scales.flatten().unsqueeze(0);
+ if (bias.has_value()) {
+ flat_output += bias->unsqueeze(0);
+ }
+
+ auto output_sizes = input_sizes.vec();
+ output_sizes.pop_back();
+ output_sizes.push_back(-1);
+ auto output = flat_output.reshape(output_sizes);
+ return output;
+}
+
+// Accumulate the partition sizes.
+int4 accumulate_sizes(const torch::Tensor& codebook_partition_sizes)
+{
+ int4 cumulative_sizes;
+ auto cumulative_size = &cumulative_sizes.x;
+ int i = 0;
+ int last = 0;
+ assert(codebook_partition_sizes.size(0) <= 4);
+ for (; i < codebook_partition_sizes.size(0); ++i, ++cumulative_size)
+ {
+    *cumulative_size = codebook_partition_sizes[i].item<int>() + last;
+ last = *cumulative_size;
+ }
+ // fill in the rest with unreachable.
+ for (; i < 4; ++i, ++cumulative_size)
+ {
+ *cumulative_size = last*10;
+ }
+ return cumulative_sizes;
+}
+
+} // namespace aqlm
+} // namespace vllm
+
+
+torch::Tensor aqlm_gemm(
+ const torch::Tensor& input,
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& scales,
+ const torch::Tensor& codebook_partition_sizes,
+    const std::optional<torch::Tensor>& bias
+)
+{
+ int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+ int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+ int const entries = codebooks.size(1);
+
+ if (nbooks == 1 && entries == (1 << 16))
+ {
+ return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+ }
+ if (nbooks == 2 && entries == (1 << 8))
+ {
+ return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales, cumulative_sizes, bias);
+ }
+
+ TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+ return {};
+}
+
+torch::Tensor aqlm_dequant(
+ const torch::Tensor& codes,
+ const torch::Tensor& codebooks,
+ const torch::Tensor& codebook_partition_sizes
+)
+{
+ int4 cumulative_sizes = vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
+
+ int const nbooks = codebooks.size(0) / codebook_partition_sizes.size(0);
+ int const entries = codebooks.size(1);
+
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
+ int rows = codes.size(1);
+ int cols = codes.size(0);
+
+ auto in_features = codes.size(1) * 8;
+ auto out_features = codes.size(0);
+
+  assert(out_features == codebook_partition_sizes.sum().item<int>());
+
+ auto weights = torch::empty({out_features, in_features},
+ torch::TensorOptions()
+ .dtype(codebooks.dtype())
+ .device(codebooks.device())
+ );
+
+ if (nbooks == 1 && entries == (1 << 16))
+ {
+ vllm::aqlm::code1x16_dequant_cuda(
+ codes.data_ptr(),
+ weights.data_ptr(),
+ codebooks.data_ptr(),
+ out_features,
+ in_features,
+ cumulative_sizes,
+ vllm::aqlm::codebook_stride(codebooks));
+
+ // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation.)
+ // weights *= scales.index({"...", 0, 0});
+
+ return weights;
+ }
+
+ if (nbooks == 2 && entries == (1 << 8))
+ {
+ vllm::aqlm::code2x8_dequant_cuda(
+ codes.data_ptr(),
+ weights.data_ptr(),
+ codebooks.data_ptr(),
+ out_features,
+ in_features,
+ cumulative_sizes,
+ vllm::aqlm::codebook_stride(codebooks));
+
+ // if you wanted to flip to scaling the weights, (though it's 30%-ish slower and not consistent with gemv implementation)
+ // weights *= scales.index({"...", 0, 0});
+
+ return weights;
+ }
+
+ TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries, " entries is not currently supported.")
+ return {};
+}
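
For orientation, the host-side code above only dispatches on the codebook configuration (one codebook with 2^16 entries, or two codebooks with 256 entries each) and loops a matrix-vector product over the flattened input rows; the heavy lifting happens in the CUDA kernels. The NumPy sketch below illustrates what 1x16 dequantization conceptually produces; the shapes and layout are simplifying assumptions for illustration only, not the exact format consumed by the kernels.

    import numpy as np

    def dequant_1x16(codes, codebook, scales):
        # codes:    (out_features, in_features // 8) integer indices into the codebook
        # codebook: (2**16, 8) learned group vectors
        # scales:   (out_features,) per-output-channel scales
        out_features, num_groups = codes.shape
        groups = codebook[codes]                      # (out_features, num_groups, 8)
        weights = groups.reshape(out_features, num_groups * 8)
        return weights * scales[:, None]

    # Toy sizes, just to show the call:
    rng = np.random.default_rng(0)
    codes = rng.integers(0, 2**16, size=(4, 2))
    codebook = rng.standard_normal((2**16, 8)).astype(np.float32)
    scales = np.ones(4, dtype=np.float32)
    print(dequant_1x16(codes, codebook, scales).shape)  # (4, 16)
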
diff --git a/csrc/quantization/fp8/fp8_cuda_kernels.cu b/csrc/quantization/fp8/fp8_cuda_kernels.cu
new file mode 100644
index 0000000000000..c3337cede1282
--- /dev/null
+++ b/csrc/quantization/fp8/fp8_cuda_kernels.cu
@@ -0,0 +1,103 @@
+#include <ATen/cuda/CUDAContext.h>
+#include <torch/extension.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <cmath>
+
+#include "cuda_compat.h"
+#include "dispatch_utils.h"
+
+namespace vllm {
+
+__device__ __forceinline__ float atomicMaxFloat(float* addr, float value) {
+ float old;
+ old = (value >= 0) ? __int_as_float(atomicMax((int*)addr, __float_as_int(value))) :
+ __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(value)));
+
+ return old;
+}
+
+// Compute the absolute maximum m of the input tensor and store
+// m / float8_e4m3::max() in *scale. Each thread block performs a
+// reduction tree and the memory in scale is atomically updated.
+// So to get the right answer, *scale needs to be initialized to
+// a value <= 0.0 and we need to wait for all thread blocks to
+// finish before consuming *scale.
+template<typename scalar_t>
+__global__ void segmented_max_reduction(
+ float* __restrict__ scale,
+ const scalar_t* __restrict__ input,
+ int64_t num_elems) {
+ __shared__ float cache[1024];
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+
+  // First store maximum for all values processed by
+  // the current thread in cache[threadIdx.x]
+  scalar_t tmp = 0.0;
+  while (i < num_elems) {
+    float x = static_cast<float>(input[i]);
+ tmp = max(tmp, fabs(x));
+ i += blockDim.x * gridDim.x;
+ }
+ cache[threadIdx.x] = tmp;
+
+ __syncthreads();
+
+ // Now perform parallel reduction within the thread block
+ int ib = blockDim.x / 2;
+ while (ib != 0) {
+ if (threadIdx.x < ib && cache[threadIdx.x + ib] > cache[threadIdx.x]) {
+ cache[threadIdx.x] = cache[threadIdx.x + ib];
+ }
+ __syncthreads();
+ ib /= 2;
+ }
+ // Finally, since cache[0] contains the maximum for this thread block,
+ // atomically write the max to the target location
+ if (threadIdx.x == 0) {
+    atomicMaxFloat(scale, cache[0] / std::numeric_limits<c10::Float8_e4m3fn>::max());
+ }
+}
+
+template<typename scalar_t>
+__global__ void scaled_fp8_quant_kernel(
+ c10::Float8_e4m3fn* __restrict__ out,
+ const scalar_t* __restrict__ input,
+ const float* __restrict__ scale,
+ int64_t num_elems) {
+ int i = blockDim.x * blockIdx.x + threadIdx.x;
+ while (i < num_elems) {
+    out[i] = static_cast<c10::Float8_e4m3fn>(input[i] / *scale);
+ i += blockDim.x * gridDim.x;
+ }
+}
+
+} // namespace vllm
+
+void scaled_fp8_quant(
+ torch::Tensor& out, // [..., d]
+ torch::Tensor& input, // [..., d]
+ torch::Tensor& scale) // [1]
+{
+ int64_t num_tokens = input.numel() / input.size(-1);
+ int64_t num_elems = input.numel();
+ dim3 grid(num_tokens);
+ dim3 block(1024);
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ VLLM_DISPATCH_FLOATING_TYPES(
+ input.scalar_type(),
+ "scaled_fp8_quant_kernel",
+ [&] {
+      vllm::segmented_max_reduction<scalar_t><<<grid, block, 0, stream>>>(
+        scale.data_ptr<float>(),
+        input.data_ptr<scalar_t>(),
+        num_elems);
+      vllm::scaled_fp8_quant_kernel<scalar_t><<<grid, block, 0, stream>>>(
+        out.data_ptr<c10::Float8_e4m3fn>(),
+        input.data_ptr<scalar_t>(),
+        scale.data_ptr<float>(),
+        num_elems);
+ });
+}
+
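
Taken together, the two kernels implement a simple per-tensor recipe: find the absolute maximum of the input, store absmax divided by the largest representable FP8-E4M3 value as the scale, then divide by that scale and cast. The PyTorch sketch below mirrors that math for reference (it assumes a PyTorch build with `float8_e4m3fn` support and is not the production code path).

    import torch

    def scaled_fp8_quant_reference(x: torch.Tensor):
        fp8_max = torch.finfo(torch.float8_e4m3fn).max
        # What segmented_max_reduction accumulates into *scale.
        scale = x.abs().max().float() / fp8_max
        # What scaled_fp8_quant_kernel writes for each element.
        out = (x / scale).to(torch.float8_e4m3fn)
        return out, scale

    x = torch.randn(16, 8, dtype=torch.float16)
    q, s = scaled_fp8_quant_reference(x)
    print(q.dtype, float(s))
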
diff --git a/csrc/quantization/marlin/marlin_cuda_kernel.cu b/csrc/quantization/marlin/marlin_cuda_kernel.cu
index cf1b0afdec8b4..002a70001885d 100644
--- a/csrc/quantization/marlin/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/marlin_cuda_kernel.cu
@@ -67,20 +67,13 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
-// Asynchronous global->shared copy with a cache hint indicating that the values
-// may be evicted immediately; used for quantized weights B, which are only
-// accessed precisely once and should thus not pollute the L2 cache which we
-// need for inputs A and outputs C.
-__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) {
+// Asynchronous global->shared copy
+__device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
const int BYTES = 16;
  uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
- asm volatile(
- "{\n"
- " .reg .b64 p;\n"
- " createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
- " cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
- "}\n" ::"r"(smem),
- "l"(glob_ptr), "n"(BYTES));
+ asm volatile("{\n"
+ " cp.async.cg.shared.global [%0], [%1], %2;\n"
+ "}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
}
// Async copy fence.
@@ -448,14 +441,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll
for (int i = 0; i < b_sh_wr_iters; i++) {
- cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
+ cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
B_ptr[i] += b_gl_rd_delta_o;
}
// Only fetch scales if this tile starts a new group
if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
if (s_sh_wr_pred)
- cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
+ cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
s_gl_rd += s_gl_rd_delta;
}
}
@@ -750,7 +743,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
// write-out
if (group_blocks == -1 && last) {
if (s_sh_wr_pred)
- cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]);
+ cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
cp_async_fence();
}
thread_block_reduce();
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 7a8c365ffb3bb..aac8cbb63ebeb 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -11,12 +11,14 @@
# documentation root, use os.path.abspath to make it absolute, like shown here.
import logging
+import os
import sys
from typing import List
from sphinx.ext import autodoc
logger = logging.getLogger(__name__)
+sys.path.append(os.path.abspath("../.."))
# -- Project information -----------------------------------------------------
@@ -46,7 +48,7 @@
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns: List[str] = []
+exclude_patterns: List[str] = ["**/*.template.rst"]
# Exclude the prompt "$" when copying code
copybutton_prompt_text = r"\$ "
@@ -71,6 +73,13 @@
# so a file named "default.css" will overwrite the builtin "default.css".
# html_static_path = ['_static']
+
+# Generate additional rst documentation here.
+def setup(app):
+ from docs.source.generate_examples import generate_examples
+ generate_examples()
+
+
# Mock out external dependencies here.
autodoc_mock_imports = [
"cpuinfo",
@@ -83,6 +92,7 @@
"vllm._C",
"numpy",
"tqdm",
+ "tensorizer",
]
for mock_target in autodoc_mock_imports:
diff --git a/docs/source/dev/engine/async_llm_engine.rst b/docs/source/dev/engine/async_llm_engine.rst
index 47db1e0a401b1..93fc310cb543b 100644
--- a/docs/source/dev/engine/async_llm_engine.rst
+++ b/docs/source/dev/engine/async_llm_engine.rst
@@ -1,7 +1,6 @@
-
AsyncLLMEngine
=================================
-.. autoclass:: vllm.engine.async_llm_engine.AsyncLLMEngine
- :members: generate, abort
+.. autoclass:: vllm.AsyncLLMEngine
+ :members:
:show-inheritance:
diff --git a/docs/source/dev/engine/llm_engine.rst b/docs/source/dev/engine/llm_engine.rst
index 1de6d7adc87c6..0b8c1e219d7c9 100644
--- a/docs/source/dev/engine/llm_engine.rst
+++ b/docs/source/dev/engine/llm_engine.rst
@@ -1,6 +1,6 @@
LLMEngine
=================================
-.. autoclass:: vllm.engine.llm_engine.LLMEngine
- :members: add_request, abort_request, step
- :show-inheritance:
\ No newline at end of file
+.. autoclass:: vllm.LLMEngine
+ :members:
+ :show-inheritance:
diff --git a/docs/source/dev/sampling_params.rst b/docs/source/dev/sampling_params.rst
index 844859b3ec1f0..ef3d1509bda6d 100644
--- a/docs/source/dev/sampling_params.rst
+++ b/docs/source/dev/sampling_params.rst
@@ -1,4 +1,5 @@
Sampling Params
===============
-.. automodule:: vllm.sampling_params.SamplingParams
\ No newline at end of file
+.. autoclass:: vllm.SamplingParams
+ :members:
diff --git a/docs/source/generate_examples.py b/docs/source/generate_examples.py
new file mode 100644
index 0000000000000..79b49a186236a
--- /dev/null
+++ b/docs/source/generate_examples.py
@@ -0,0 +1,61 @@
+import re
+from pathlib import Path
+
+
+def fix_case(text: str) -> str:
+ subs = [
+ ("api", "API"),
+ ("llm", "LLM"),
+ ("vllm", "vLLM"),
+ ("openai", "OpenAI"),
+ ("multilora", "MultiLoRA"),
+ ]
+ for sub in subs:
+ text = re.sub(*sub, text, flags=re.IGNORECASE)
+ return text
+
+
+def underline(title: str, character: str = "=") -> str:
+ return f"{title}\n{character * len(title)}"
+
+
+def generate_title(filename: str) -> str:
+ # Turn filename into a title
+ title = filename.replace("_", " ").title()
+ # Handle acronyms and names
+ title = fix_case(title)
+ # Underline title
+ title = underline(title)
+ return title
+
+
+def generate_examples():
+ root_dir = Path(__file__).parent.parent.parent.resolve()
+
+ # Source paths
+ script_dir = root_dir / "examples"
+ script_paths = sorted(script_dir.glob("*.py"))
+
+ # Destination paths
+ doc_dir = root_dir / "docs/source/getting_started/examples"
+ doc_paths = [doc_dir / f"{path.stem}.rst" for path in script_paths]
+
+ # Generate the example docs for each example script
+ for script_path, doc_path in zip(script_paths, doc_paths):
+ script_url = f"https://github.com/vllm-project/vllm/blob/main/examples/{script_path.name}"
+ # Make script_path relative to doc_path and call it include_path
+ include_path = '../../../..' / script_path.relative_to(root_dir)
+ content = (f"{generate_title(doc_path.stem)}\n\n"
+ f"Source {script_url}.\n\n"
+ f".. literalinclude:: {include_path}\n"
+ " :language: python\n"
+ " :linenos:\n")
+ with open(doc_path, "w+") as f:
+ f.write(content)
+
+ # Generate the toctree for the example scripts
+ with open(doc_dir / "examples_index.template.rst") as f:
+ examples_index = f.read()
+ with open(doc_dir / "examples_index.rst", "w+") as f:
+ example_docs = "\n ".join(path.stem for path in script_paths)
+ f.write(examples_index.replace(r"%EXAMPLE_DOCS%", example_docs))
diff --git a/docs/source/getting_started/examples/examples_index.template.rst b/docs/source/getting_started/examples/examples_index.template.rst
new file mode 100644
index 0000000000000..1b34cccbae15a
--- /dev/null
+++ b/docs/source/getting_started/examples/examples_index.template.rst
@@ -0,0 +1,8 @@
+Examples
+=================================
+
+.. toctree::
+ :maxdepth: 1
+ :caption: Scripts
+
+ %EXAMPLE_DOCS%
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5d5d52696ba34..e8daa5f052754 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -65,6 +65,7 @@ Documentation
getting_started/neuron-installation
getting_started/cpu-installation
getting_started/quickstart
+ getting_started/examples/examples_index
.. toctree::
:maxdepth: 1
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index a82c2cef10e83..cbc8099e6f70f 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -95,7 +95,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a
5. Register your model
----------------------
-Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py `_.
+Finally, register your :code:`*ForCausalLM` class in the :code:`_MODELS` dictionary in `vllm/model_executor/models/__init__.py <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py>`_.
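
As a rough illustration of what that registration looks like (the exact contents of :code:`_MODELS` vary between versions; the names below are placeholders for your own model):

    # Hypothetical entry in vllm/model_executor/models/__init__.py.
    # _MODELS maps the HuggingFace architecture name to the module and
    # class that implement it.
    _MODELS = {
        # ... existing entries ...
        "MyModelForCausalLM": ("my_model", "MyModelForCausalLM"),
    }
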
6. Out-of-Tree Model Integration
--------------------------------------------
diff --git a/docs/source/models/engine_args.rst b/docs/source/models/engine_args.rst
index d8a7ac72e0175..bdf566d3ebbd1 100644
--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@@ -5,132 +5,19 @@ Engine Arguments
Below, you can find an explanation of every engine argument for vLLM:
-.. option:: --model
-
- Name or path of the huggingface model to use.
-
-.. option:: --tokenizer
-
- Name or path of the huggingface tokenizer to use.
-
-.. option:: --revision
-
- The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
-
-.. option:: --tokenizer-revision
-
- The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
-
-.. option:: --tokenizer-mode {auto,slow}
-
- The tokenizer mode.
-
- * "auto" will use the fast tokenizer if available.
- * "slow" will always use the slow tokenizer.
-
-.. option:: --trust-remote-code
-
- Trust remote code from huggingface.
-
-.. option:: --download-dir
-
- Directory to download and load the weights, default to the default cache dir of huggingface.
-
-.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
-
- The format of the model weights to load.
-
- * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
- * "pt" will load the weights in the pytorch bin format.
- * "safetensors" will load the weights in the safetensors format.
- * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
- * "dummy" will initialize the weights with random values, mainly for profiling.
-
-.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
-
- Data type for model weights and activations.
-
- * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
- * "half" for FP16. Recommended for AWQ quantization.
- * "float16" is the same as "half".
- * "bfloat16" for a balance between precision and range.
- * "float" is shorthand for FP32 precision.
- * "float32" for FP32 precision.
-
-.. option:: --max-model-len
-
- Model context length. If unspecified, will be automatically derived from the model config.
-
-.. option:: --worker-use-ray
-
- Use Ray for distributed serving, will be automatically set when using more than 1 GPU.
-
-.. option:: --pipeline-parallel-size (-pp)
-
- Number of pipeline stages.
-
-.. option:: --tensor-parallel-size (-tp)
-
- Number of tensor parallel replicas.
-
-.. option:: --max-parallel-loading-workers
-
- Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
-
-.. option:: --block-size {8,16,32}
-
- Token block size for contiguous chunks of tokens.
-
-.. option:: --enable-prefix-caching
-
- Enables automatic prefix caching
-
-.. option:: --seed
-
- Random seed for operations.
-
-.. option:: --swap-space
-
- CPU swap space size (GiB) per GPU.
-
-.. option:: --gpu-memory-utilization
-
- The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.
- For example, a value of 0.5 would imply 50% GPU memory utilization.
- If unspecified, will use the default value of 0.9.
-
-.. option:: --max-num-batched-tokens
-
- Maximum number of batched tokens per iteration.
-
-.. option:: --max-num-seqs
-
- Maximum number of sequences per iteration.
-
-.. option:: --max-paddings
-
- Maximum number of paddings in a batch.
-
-.. option:: --disable-log-stats
-
- Disable logging statistics.
-
-.. option:: --quantization (-q) {awq,squeezellm,None}
-
- Method used to quantize the weights.
+.. argparse::
+ :module: vllm.engine.arg_utils
+ :func: _engine_args_parser
+ :prog: -m vllm.entrypoints.openai.api_server
+ :nodefaultconst:
Async Engine Arguments
----------------------
-Below are the additional arguments related to the asynchronous engine:
-
-.. option:: --engine-use-ray
- Use Ray to start the LLM engine in a separate process as the server process.
-
-.. option:: --disable-log-requests
-
- Disable logging requests.
-
-.. option:: --max-log-len
+Below are the additional arguments related to the asynchronous engine:
- Max number of prompt characters or prompt ID numbers being printed in log. Defaults to unlimited.
\ No newline at end of file
+.. argparse::
+ :module: vllm.engine.arg_utils
+ :func: _async_engine_args_parser
+ :prog: -m vllm.entrypoints.openai.api_server
+ :nodefaultconst:
\ No newline at end of file
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index c09b0ff250437..ceb658bbd5c66 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -30,23 +30,23 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`CohereForCausalLM`
- Command-R
- :code:`CohereForAI/c4ai-command-r-v01`, etc.
- -
+ -
* - :code:`DbrxForCausalLM`
- DBRX
- :code:`databricks/dbrx-base`, :code:`databricks/dbrx-instruct`, etc.
- -
+ -
* - :code:`DeciLMForCausalLM`
- DeciLM
- :code:`Deci/DeciLM-7B`, :code:`Deci/DeciLM-7B-instruct`, etc.
- -
+ -
* - :code:`BloomForCausalLM`
- BLOOM, BLOOMZ, BLOOMChat
- :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
- -
+ -
* - :code:`FalconForCausalLM`
- Falcon
- :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
- -
+ -
* - :code:`GemmaForCausalLM`
- Gemma
- :code:`google/gemma-2b`, :code:`google/gemma-7b`, etc.
@@ -54,19 +54,19 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`GPT2LMHeadModel`
- GPT-2
- :code:`gpt2`, :code:`gpt2-xl`, etc.
- -
+ -
* - :code:`GPTBigCodeForCausalLM`
- StarCoder, SantaCoder, WizardCoder
- :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
- -
+ -
* - :code:`GPTJForCausalLM`
- GPT-J
- :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
- -
+ -
* - :code:`GPTNeoXForCausalLM`
- GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
- :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
- -
+ -
* - :code:`InternLMForCausalLM`
- InternLM
- :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
@@ -80,8 +80,8 @@ Alongside each architecture, we include some popular models that use it.
- :code:`core42/jais-13b`, :code:`core42/jais-13b-chat`, :code:`core42/jais-30b-v3`, :code:`core42/jais-30b-chat-v3`, etc.
-
* - :code:`LlamaForCausalLM`
- - LLaMA, LLaMA-2, Vicuna, Alpaca, Yi
- - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
+ - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
+ - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
- ✅︎
* - :code:`MiniCPMForCausalLM`
- MiniCPM
@@ -93,32 +93,36 @@ Alongside each architecture, we include some popular models that use it.
- ✅︎
* - :code:`MixtralForCausalLM`
- Mixtral-8x7B, Mixtral-8x7B-Instruct
- - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.
+ - :code:`mistralai/Mixtral-8x7B-v0.1`, :code:`mistralai/Mixtral-8x7B-Instruct-v0.1`, :code:`mistral-community/Mixtral-8x22B-v0.1`, etc.
- ✅︎
* - :code:`MPTForCausalLM`
- MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
- :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
- -
+ -
* - :code:`OLMoForCausalLM`
- OLMo
- - :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc.
- -
+ - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
+ -
* - :code:`OPTForCausalLM`
- OPT, OPT-IML
- :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
- -
+ -
* - :code:`OrionForCausalLM`
- Orion
- :code:`OrionStarAI/Orion-14B-Base`, :code:`OrionStarAI/Orion-14B-Chat`, etc.
- -
+ -
* - :code:`PhiForCausalLM`
- Phi
- :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
- -
+ -
+ * - :code:`Phi3ForCausalLM`
+ - Phi-3
+ - :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
+ -
* - :code:`QWenLMHeadModel`
- Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
- -
+ -
* - :code:`Qwen2ForCausalLM`
- Qwen2
- :code:`Qwen/Qwen2-beta-7B`, :code:`Qwen/Qwen2-beta-7B-Chat`, etc.
@@ -126,11 +130,11 @@ Alongside each architecture, we include some popular models that use it.
* - :code:`Qwen2MoeForCausalLM`
- Qwen2MoE
- :code:`Qwen/Qwen1.5-MoE-A2.7B`, :code:`Qwen/Qwen1.5-MoE-A2.7B-Chat`, etc.
- -
+ -
* - :code:`StableLmForCausalLM`
- StableLM
- :code:`stabilityai/stablelm-3b-4e1t/` , :code:`stabilityai/stablelm-base-alpha-7b-v2`, etc.
- -
+ -
If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model.
diff --git a/docs/source/serving/deploying_with_docker.rst b/docs/source/serving/deploying_with_docker.rst
index 7ec769630300d..cfc462ff33b90 100644
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@@ -49,3 +49,6 @@ To run vLLM:
--env "HUGGING_FACE_HUB_TOKEN=" \
vllm/vllm-openai
+.. note::
+
+   The vLLM docker image is currently designed to run as the root user (contributions to change this are welcome!). At runtime it will try to load a library from the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`. If you run the container as a different user, you may need to change the permissions of that library (and all of its parent directories) so the user can access it, and then run vLLM with the environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1`.
diff --git a/docs/source/serving/run_on_sky.rst b/docs/source/serving/run_on_sky.rst
index 2c88d24dc5d0b..bd33c76cec3de 100644
--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
@@ -1,7 +1,7 @@
.. _on_cloud:
-Running on clouds with SkyPilot
-===============================
+Deploying and scaling up with SkyPilot
+================================================
.. raw:: html
@@ -9,51 +9,75 @@ Running on clouds with SkyPilot
-vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud.
+vLLM can be **run and scaled to multiple service replicas on clouds and Kubernetes** with `SkyPilot `__, an open-source framework for running LLMs on any cloud. More examples for various open models, such as Llama-3, Mixtral, etc., can be found in the `SkyPilot AI gallery `__.
-To install SkyPilot and setup your cloud credentials, run:
+
+Prerequisites
+-------------
+
+- Go to the `HuggingFace model page `__ and request access to the model :code:`meta-llama/Meta-Llama-3-8B-Instruct`.
+- Check that you have installed SkyPilot (`docs `__).
+- Check that :code:`sky check` shows clouds or Kubernetes are enabled.
.. code-block:: console
- $ pip install skypilot
- $ sky check
+ pip install skypilot-nightly
+ sky check
+
+
+Run on a single instance
+------------------------
See the vLLM SkyPilot YAML for serving, `serving.yaml `__.
.. code-block:: yaml
resources:
- accelerators: A100
+ accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+ use_spot: True
+ disk_size: 512 # Ensure model checkpoints can fit.
+ disk_tier: best
+ ports: 8081 # Expose to internet traffic.
envs:
- MODEL_NAME: decapoda-research/llama-13b-hf
- TOKENIZER: hf-internal-testing/llama-tokenizer
+ MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+ HF_TOKEN: # Change to your own huggingface token, or use --env to pass.
setup: |
- conda create -n vllm python=3.9 -y
+ conda create -n vllm python=3.10 -y
conda activate vllm
- git clone https://github.com/vllm-project/vllm.git
- cd vllm
- pip install .
- pip install gradio
+
+ pip install vllm==0.4.0.post1
+ # Install Gradio for web UI.
+ pip install gradio openai
+ pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
- python -u -m vllm.entrypoints.api_server \
- --model $MODEL_NAME \
- --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
- --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
+ python -u -m vllm.entrypoints.openai.api_server \
+ --port 8081 \
+ --model $MODEL_NAME \
+ --trust-remote-code \
+ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+ 2>&1 | tee api_server.log &
+
echo 'Waiting for vllm api server to start...'
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
echo 'Starting gradio server...'
- python vllm/examples/gradio_webserver.py
+ git clone https://github.com/vllm-project/vllm.git || true
+ python vllm/examples/gradio_openai_chatbot_webserver.py \
+ -m $MODEL_NAME \
+ --port 8811 \
+ --model-url http://localhost:8081/v1 \
+ --stop-token-ids 128009,128001
-Start the serving the LLaMA-13B model on an A100 GPU:
+Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
.. code-block:: console
- $ sky launch serving.yaml
+ HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
Check the output of the command. There will be a shareable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
@@ -61,9 +85,226 @@ Check the output of the command. There will be a shareable gradio link (like the
(task, pid=7431) Running on public URL: https://.gradio.live
-**Optional**: Serve the 65B model instead of the default 13B and use more GPU:
+**Optional**: Serve the 70B model instead of the default 8B and use more GPU:
+
+.. code-block:: console
+
+ HF_TOKEN="your-huggingface-token" sky launch serving.yaml --gpus A100:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3-70B-Instruct
+
+
+Scale up to multiple replicas
+-----------------------------
+
+SkyPilot can scale the service up to multiple replicas with built-in autoscaling, load balancing and fault tolerance. You can do so by adding a `service` section to the YAML file.
+
+.. code-block:: yaml
+
+ service:
+ replicas: 2
+ # An actual request for readiness probe.
+ readiness_probe:
+ path: /v1/chat/completions
+ post_data:
+ model: $MODEL_NAME
+ messages:
+ - role: user
+ content: Hello! What is your name?
+ max_tokens: 1
+
+.. raw:: html
+
+
+ Click to see the full recipe YAML
+
+
+.. code-block:: yaml
+
+ service:
+ replicas: 2
+ # An actual request for readiness probe.
+ readiness_probe:
+ path: /v1/chat/completions
+ post_data:
+ model: $MODEL_NAME
+ messages:
+ - role: user
+ content: Hello! What is your name?
+ max_tokens: 1
+
+ resources:
+ accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
+ use_spot: True
+ disk_size: 512 # Ensure model checkpoints can fit.
+ disk_tier: best
+ ports: 8081 # Expose to internet traffic.
+
+ envs:
+ MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
+ HF_TOKEN: # Change to your own huggingface token, or use --env to pass.
+
+ setup: |
+ conda create -n vllm python=3.10 -y
+ conda activate vllm
+
+ pip install vllm==0.4.0.post1
+ # Install Gradio for web UI.
+ pip install gradio openai
+ pip install flash-attn==2.5.7
+
+ run: |
+ conda activate vllm
+ echo 'Starting vllm api server...'
+ python -u -m vllm.entrypoints.openai.api_server \
+ --port 8081 \
+ --model $MODEL_NAME \
+ --trust-remote-code \
+ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+ 2>&1 | tee api_server.log &
+
+ echo 'Waiting for vllm api server to start...'
+ while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+
+ echo 'Starting gradio server...'
+ git clone https://github.com/vllm-project/vllm.git || true
+ python vllm/examples/gradio_openai_chatbot_webserver.py \
+ -m $MODEL_NAME \
+ --port 8811 \
+ --model-url http://localhost:8081/v1 \
+ --stop-token-ids 128009,128001
+
+.. raw:: html
+
+
+
+Start serving the Llama-3 8B model on multiple replicas:
+
+.. code-block:: console
+
+ HF_TOKEN="your-huggingface-token" sky serve up -n vllm serving.yaml --env HF_TOKEN
+
+
+Wait until the service is ready:
.. code-block:: console
- sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
+ watch -n10 sky serve status vllm
+
+
+.. raw:: html
+
+
+ Example outputs:
+
+.. code-block:: console
+
+ Services
+ NAME VERSION UPTIME STATUS REPLICAS ENDPOINT
+ vllm 1 35s READY 2/2 xx.yy.zz.100:30001
+
+ Service Replicas
+ SERVICE_NAME ID VERSION IP LAUNCHED RESOURCES STATUS REGION
+ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP({'L4': 1}) READY us-east4
+ vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP({'L4': 1}) READY us-east4
+
+.. raw:: html
+
+
+
+After the service is READY, you can find a single endpoint for the service and access it with that endpoint:
+
+.. code-block:: console
+
+ ENDPOINT=$(sky serve status --endpoint 8081 vllm)
+ curl -L http://$ENDPOINT/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{
+ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+ "messages": [
+ {
+ "role": "system",
+ "content": "You are a helpful assistant."
+ },
+ {
+ "role": "user",
+ "content": "Who are you?"
+ }
+ ],
+ "stop_token_ids": [128009, 128001]
+ }'
+
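
The same request can also be issued from Python with the OpenAI client (a sketch assuming `openai>=1.0`; the endpoint placeholder below should be replaced with the address printed by `sky serve status --endpoint 8081 vllm`, and the vLLM server accepts any placeholder API key):

    from openai import OpenAI

    endpoint = "xx.yy.zz.100:30001"  # placeholder; use your own service endpoint
    client = OpenAI(base_url=f"http://{endpoint}/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ],
        extra_body={"stop_token_ids": [128009, 128001]},
    )
    print(response.choices[0].message.content)
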
+To enable autoscaling, you could specify additional configs in the `service` section:
+
+.. code-block:: yaml
+
+  service:
+ replica_policy:
+ min_replicas: 0
+ max_replicas: 3
+ target_qps_per_replica: 2
+
+This will scale the service up (to at most 3 replicas) when the QPS per replica exceeds 2.
+
+
+**Optional**: Connect a GUI to the endpoint
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI are load-balanced across replicas.
+
+.. raw:: html
+
+
+ Click to see the full GUI YAML
+
+.. code-block:: yaml
+
+ envs:
+ MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
+ ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
+
+ resources:
+ cpus: 2
+
+ setup: |
+ conda activate vllm
+ if [ $? -ne 0 ]; then
+ conda create -n vllm python=3.10 -y
+ conda activate vllm
+ fi
+
+ # Install Gradio for web UI.
+ pip install gradio openai
+
+ run: |
+ conda activate vllm
+ export PATH=$PATH:/sbin
+ WORKER_IP=$(hostname -I | cut -d' ' -f1)
+ CONTROLLER_PORT=21001
+ WORKER_PORT=21002
+
+ echo 'Starting gradio server...'
+ git clone https://github.com/vllm-project/vllm.git || true
+ python vllm/examples/gradio_openai_chatbot_webserver.py \
+ -m $MODEL_NAME \
+ --port 8811 \
+ --model-url http://$ENDPOINT/v1 \
+ --stop-token-ids 128009,128001 | tee ~/gradio.log
+
+.. raw:: html
+
+
+
+1. Start the chat web UI:
+
+.. code-block:: console
+
+ sky launch -c gui ./gui.yaml --env ENDPOINT=$(sky serve status --endpoint vllm)
+
+
+2. Then, we can access the GUI at the returned gradio link:
+
+.. code-block:: console
+
+ | INFO | stdout | Running on public URL: https://6141e84201ce0bb4ed.gradio.live
+
diff --git a/examples/aqlm_example.py b/examples/aqlm_example.py
new file mode 100644
index 0000000000000..e7c17fa0362ae
--- /dev/null
+++ b/examples/aqlm_example.py
@@ -0,0 +1,46 @@
+import argparse
+
+from vllm import LLM, SamplingParams
+
+
+def main():
+
+ parser = argparse.ArgumentParser(description='AQLM examples')
+
+ parser.add_argument('--model',
+ '-m',
+ type=str,
+ default=None,
+ help='model path, as for HF')
+ parser.add_argument('--choice',
+ '-c',
+ type=int,
+ default=0,
+ help='known good models by index, [0-4]')
+ parser.add_argument('--tensor_parallel_size',
+ '-t',
+ type=int,
+ default=1,
+ help='tensor parallel size')
+
+ args = parser.parse_args()
+
+ models = [
+ "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf",
+ "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-2x8-hf",
+ "ISTA-DASLab/Llama-2-13b-AQLM-2Bit-1x16-hf",
+ "ISTA-DASLab/Mixtral-8x7b-AQLM-2Bit-1x16-hf",
+ "BlackSamorez/TinyLlama-1_1B-Chat-v1_0-AQLM-2Bit-1x16-hf",
+ ]
+
+ model = LLM(args.model if args.model is not None else models[args.choice],
+ tensor_parallel_size=args.tensor_parallel_size)
+
+ sampling_params = SamplingParams(max_tokens=100, temperature=0)
+ outputs = model.generate("Hello my name is",
+ sampling_params=sampling_params)
+ print(outputs[0].outputs[0].text)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/fp8/extract_scales.py b/examples/fp8/extract_scales.py
index 5e5b31265e3af..1eb961a5a76e3 100644
--- a/examples/fp8/extract_scales.py
+++ b/examples/fp8/extract_scales.py
@@ -11,7 +11,7 @@
from vllm.model_executor.layers.quantization.schema import QuantParamSchema
-# Adapted from vllm/model_executor/weight_utils.py
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
# The main differences are that we add the NPZ format and simplify
# its functionality drastically for our purposes (e.g. we assume that
# the quantized model exists locally and there is no need to download it)
@@ -71,7 +71,7 @@ def _prepare_hf_weights(
return hf_weights_files, use_safetensors
-# Adapted from vllm/model_executor/weight_utils.py
+# Adapted from vllm/model_executor/model_loader/weight_utils.py
def _hf_tensorfile_iterator(filename: str, load_format: str,
use_safetensors: bool):
if load_format == "npz":
diff --git a/examples/openai_chatcompletion_client.py b/examples/openai_chat_completion_client.py
similarity index 100%
rename from examples/openai_chatcompletion_client.py
rename to examples/openai_chat_completion_client.py
diff --git a/examples/tensorize_vllm_model.py b/examples/tensorize_vllm_model.py
new file mode 100644
index 0000000000000..e2456168de9d5
--- /dev/null
+++ b/examples/tensorize_vllm_model.py
@@ -0,0 +1,282 @@
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+ TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
+from vllm.model_executor.models import ModelRegistry
+
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+tensorize_vllm_model.py is a script that can be used to serialize and
+deserialize vLLM models. These models can be loaded using tensorizer
+to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
+or locally. Tensor encryption and decryption is also supported, although
+libsodium must be installed to use it. Install vllm with tensorizer support
+using `pip install vllm[tensorizer]`.
+
+To serialize a model, install vLLM from source, then run something
+like this from the root level of this repository:
+
+python -m examples.tensorize_vllm_model \
+ --model EleutherAI/gpt-j-6B \
+ --dtype float16 \
+ serialize \
+ --serialized-directory s3://my-bucket/ \
+ --suffix vllm
+
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used. This
+assumes your S3 credentials are specified as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+To provide S3 credentials directly, you can provide `--s3-access-key-id` and
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this
+script.
+
+You can also encrypt the model weights with a randomly-generated key by
+providing a `--keyfile` argument.
+
+To deserialize a model, you can run something like this from the root
+level of this repository:
+
+python -m examples.tensorize_vllm_model \
+ --model EleutherAI/gpt-j-6B \
+ --dtype float16 \
+ deserialize \
+ --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+
+Which downloads the model tensors from your S3 bucket and deserializes them.
+
+You can also provide a `--keyfile` argument to decrypt the model weights if
+they were serialized with encryption.
+
+For more information on the available arguments for serializing, run
+`python -m examples.tensorize_vllm_model serialize --help`.
+
+Or for deserializing:
+
+`python -m examples.tensorize_vllm_model deserialize --help`.
+
+Once a model is serialized, it can be loaded by the OpenAI inference server at
+`vllm/entrypoints/openai/api_server.py` by providing the `--tensorizer-uri` CLI
+argument, which is functionally the same as the `--path-to-tensors` argument in
+this script, along with `--vllm-tensorized` to signify that the model to be
+deserialized is a vLLM model rather than a HuggingFace `PreTrainedModel` (the
+latter can also be deserialized using tensorizer in the same inference server,
+albeit without the speed optimizations). To deserialize an encrypted file, the
+`--encryption-keyfile` argument can be used to provide the path to the keyfile
+used to encrypt the model weights. For information on all the arguments that
+can be used to configure tensorizer's deserialization, check out the tensorizer
+options argument group in the `vllm/entrypoints/openai/api_server.py` script
+with `--help`.
+
+Tensorizer can also be invoked with the `LLM` class directly to load models:
+
+ llm = LLM(model="facebook/opt-125m",
+ load_format="tensorizer",
+ tensorizer_uri=path_to_opt_tensors,
+ num_readers=3,
+ vllm_tensorized=True)
+"""
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="An example script that can be used to serialize and "
+ "deserialize vLLM models. These models "
+ "can be loaded using tensorizer directly to the GPU "
+ "extremely quickly. Tensor encryption and decryption is "
+ "also supported, although libsodium must be installed to "
+ "use it.")
+ parser = EngineArgs.add_cli_args(parser)
+ subparsers = parser.add_subparsers(dest='command')
+
+ serialize_parser = subparsers.add_parser(
+ 'serialize', help="Serialize a model to `--serialized-directory`")
+
+ serialize_parser.add_argument(
+ "--suffix",
+ type=str,
+ required=False,
+ help=(
+ "The suffix to append to the serialized model directory, which is "
+ "used to construct the location of the serialized model tensors, "
+ "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+ "`--suffix` is `v1`, the serialized model tensors will be "
+ "saved to "
+ "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+ "If none is provided, a random UUID will be used."))
+ serialize_parser.add_argument(
+ "--serialized-directory",
+ type=str,
+ required=True,
+ help="The directory to serialize the model to. "
+ "This can be a local directory or S3 URI. The path to where the "
+ "tensors are saved is a combination of the supplied `dir` and model "
+ "reference ID. For instance, if `dir` is the serialized directory, "
+ "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
+ "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
+ "where `suffix` is given by `--suffix` or a random UUID if not "
+ "provided.")
+
+ serialize_parser.add_argument(
+ "--keyfile",
+ type=str,
+ required=False,
+ help=("Encrypt the model weights with a randomly-generated binary key,"
+ " and save the key at this path"))
+
+ deserialize_parser = subparsers.add_parser(
+ 'deserialize',
+ help=("Deserialize a model from `--path-to-tensors`"
+ " to verify it can be loaded and used."))
+
+ deserialize_parser.add_argument(
+ "--path-to-tensors",
+ type=str,
+ required=True,
+ help="The local path or S3 URI to the model tensors to deserialize. ")
+
+ deserialize_parser.add_argument(
+ "--keyfile",
+ type=str,
+ required=False,
+ help=("Path to a binary key to use to decrypt the model weights,"
+ " if the model was serialized with encryption"))
+
+ return parser.parse_args()
+
+
+def make_model_contiguous(model):
+ # Ensure tensors are saved in memory contiguously
+ for param in model.parameters():
+ param.data = param.data.contiguous()
+
+
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+ architectures = getattr(config, "architectures", [])
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch)
+ if model_cls is not None:
+ return model_cls
+ raise ValueError(
+ f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def serialize():
+
+ eng_args_dict = {f.name: getattr(args, f.name) for f in
+ dataclasses.fields(EngineArgs)}
+ engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+ engine = LLMEngine.from_engine_args(engine_args)
+
+ model = (engine.model_executor.driver_worker.
+ model_runner.model)
+
+ encryption_params = EncryptionParams.random() if keyfile else None
+ if keyfile:
+ with _write_stream(keyfile) as stream:
+ stream.write(encryption_params.key)
+
+ with _write_stream(model_path) as stream:
+ serializer = TensorSerializer(stream, encryption=encryption_params)
+ serializer.write_module(model)
+ serializer.close()
+
+ print("Serialization complete. Model tensors saved to", model_path)
+ if keyfile:
+ print("Key saved to", keyfile)
+
+
+def deserialize():
+ config = AutoConfig.from_pretrained(model_ref)
+
+ with no_init_or_tensor():
+ model_class = _get_vllm_model_architecture(config)
+ model = model_class(config)
+
+ before_mem = get_mem_usage()
+ start = time.time()
+
+ if keyfile:
+ with _read_stream(keyfile) as stream:
+ key = stream.read()
+ decryption_params = DecryptionParams.from_key(key)
+ tensorizer_args.deserializer_params['encryption'] = \
+ decryption_params
+
+ with (_read_stream(model_path)) as stream, TensorDeserializer(
+ stream, **tensorizer_args.deserializer_params) as deserializer:
+ deserializer.load_into_module(model)
+ end = time.time()
+
+ # Brag about how fast we are.
+ total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+ duration = end - start
+ per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+ after_mem = get_mem_usage()
+ print(
+ f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+ )
+ print(f"Memory usage before: {before_mem}")
+ print(f"Memory usage after: {after_mem}")
+
+ return model
+
+
+args = parse_args()
+
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+ or None)
+s3_secret_access_key = (args.s3_secret_access_key
+ or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+
+_read_stream, _write_stream = (partial(
+ stream_io.open_stream,
+ mode=mode,
+ s3_access_key_id=s3_access_key_id,
+ s3_secret_access_key=s3_secret_access_key,
+ s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+
+model_ref = args.model
+
+model_name = model_ref.split("/")[1]
+
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+
+keyfile = args.keyfile if args.keyfile else None
+
+if args.command == "serialize":
+ input_dir = args.serialized_directory.rstrip('/')
+ suffix = args.suffix if args.suffix else uuid.uuid4().hex
+ base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+ model_path = f"{base_path}/model.tensors"
+ serialize()
+elif args.command == "deserialize":
+ tensorizer_args = TensorizerArgs.from_cli_args(args)
+ model_path = args.path_to_tensors
+ deserialize()
+else:
+ raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/format.sh b/format.sh
index deb57b2b049d1..4ac1842daef0a 100755
--- a/format.sh
+++ b/format.sh
@@ -93,9 +93,21 @@ fi
echo 'vLLM yapf: Done'
# Run mypy
-# TODO(zhuohan): Enable mypy
-# echo 'vLLM mypy:'
-# mypy
+echo 'vLLM mypy:'
+mypy vllm/attention --config-file pyproject.toml
+mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml
+mypy vllm/distributed --config-file pyproject.toml
+mypy vllm/entrypoints --config-file pyproject.toml
+mypy vllm/executor --config-file pyproject.toml
+mypy vllm/usage --config-file pyproject.toml
+mypy vllm/*.py --config-file pyproject.toml
+mypy vllm/transformers_utils --config-file pyproject.toml
+mypy vllm/engine --config-file pyproject.toml
+mypy vllm/worker --config-file pyproject.toml
+mypy vllm/spec_decode --config-file pyproject.toml
+mypy vllm/model_executor/*.py --config-file pyproject.toml
+mypy vllm/lora --config-file pyproject.toml
+
CODESPELL_EXCLUDES=(
'--skip' '*docs/source/_build/**'
@@ -228,5 +240,3 @@ if ! git diff --quiet &>/dev/null; then
exit 1
fi
-
-
diff --git a/patch_xformers.rocm.sh b/patch_xformers.rocm.sh
deleted file mode 100644
index de427b24d306f..0000000000000
--- a/patch_xformers.rocm.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-set -e
-
-XFORMERS_VERSION="0.0.23"
-
-export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')
-
-if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
- echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
- exit 1
-fi
-
-export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
-export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
-
-echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
-echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"
-
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
- echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
- patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
- echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
-else
- echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
-fi
-
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
- echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
- patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
- echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
-else
- echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
-fi
diff --git a/pyproject.toml b/pyproject.toml
index 607c09935db89..21cdeb6ef0280 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,11 +45,16 @@ ignore = [
python_version = "3.8"
ignore_missing_imports = true
+check_untyped_defs = true
+follow_imports = "skip"
files = "vllm"
# TODO(woosuk): Include the code from Megatron and HuggingFace.
-exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
-
+exclude = [
+ "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
+ # Ignore triton kernels in ops.
+ 'vllm/attention/ops/.*\.py$'
+]
[tool.codespell]
ignore-words-list = "dout, te, indicies"
diff --git a/requirements-common.txt b/requirements-common.txt
index ff053388a23e1..3cc7bba8f84db 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -5,10 +5,14 @@ sentencepiece # Required for LLaMA tokenizer.
numpy
requests
py-cpuinfo
-transformers >= 4.39.1 # Required for StarCoder2 & Llava.
+transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
+tokenizers >= 0.19.1 # Required for Llama 3.
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
tiktoken == 0.6.0 # Required for DBRX tokenizer
-outlines == 0.0.34 # Requires torch >= 2.1.0
\ No newline at end of file
+lm-format-enforcer == 0.9.8
+outlines == 0.0.34 # Requires torch >= 2.1.0
+typing_extensions
+filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 36d20bc9473ea..e911ad03295f0 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -3,4 +3,4 @@
# Dependencies for x86_64 CPUs
torch == 2.2.1+cpu
-triton >= 2.1.0 # FIXME(woosuk): This is a hack to avoid import error.
+triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
\ No newline at end of file
diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 6ee75e8139c04..1bddae4c6f40f 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -3,8 +3,7 @@
# Dependencies for NVIDIA GPUs
ray >= 2.9
-pynvml == 11.5.0
+nvidia-ml-py # for pynvml package
vllm-nccl-cu12>=2.18,<2.19 # for downloading nccl library
torch == 2.2.1
xformers == 0.0.25 # Requires PyTorch 2.2.1
-triton >= 2.1.0
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 75d22bbdb2a1b..d9816828d007d 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -7,13 +7,14 @@ codespell==2.2.6
isort==5.13.2
# type checking
-mypy==0.991
+mypy==1.9.0
types-PyYAML
types-requests
types-setuptools
# testing
pytest
+tensorizer==2.9.0a0
pytest-forked
pytest-asyncio
pytest-rerunfailures
@@ -25,7 +26,6 @@ requests
ray
peft
awscli
-ai2-olmo # required for OLMo
# Benchmarking
aiohttp
diff --git a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch b/rocm_patch/commonpy_xformers-0.0.23.rocm.patch
deleted file mode 100644
index 4d7495cf13e1d..0000000000000
--- a/rocm_patch/commonpy_xformers-0.0.23.rocm.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000
-+++ common.py 2023-11-28 16:14:19.846233146 +0000
-@@ -298,8 +298,8 @@
- dtype = d.query.dtype
- if device_type not in cls.SUPPORTED_DEVICES:
- reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
-- if device_type == "cuda" and not _built_with_cuda:
-- reasons.append("xFormers wasn't build with CUDA support")
-+ #if device_type == "cuda" and not _built_with_cuda:
-+ # reasons.append("xFormers wasn't build with CUDA support")
- if device_type == "cuda":
- device_capability = torch.cuda.get_device_capability(d.device)
- if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
diff --git a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch b/rocm_patch/flashpy_xformers-0.0.23.rocm.patch
deleted file mode 100644
index ac846728a7a91..0000000000000
--- a/rocm_patch/flashpy_xformers-0.0.23.rocm.patch
+++ /dev/null
@@ -1,152 +0,0 @@
---- flash_ori.py 2023-12-13 05:43:31.530752623 +0000
-+++ flash_patch.py 2023-12-13 06:00:45.962403104 +0000
-@@ -36,44 +36,44 @@
-
- FLASH_VERSION = "0.0.0"
- try:
-- try:
-- from ... import _C_flashattention # type: ignore[attr-defined]
-- from ..._cpp_lib import _build_metadata
--
-- if _build_metadata is not None:
-- FLASH_VERSION = _build_metadata.flash_version
-- except ImportError:
-- import flash_attn
-- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
--
-- FLASH_VERSION = flash_attn.__version__
-- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
-- if (
-- flash_ver_parsed != (2, 3, 6)
-- and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
-- ):
-- raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
-+ #try:
-+ # from ... import _C_flashattention # type: ignore[attr-defined]
-+ # from ..._cpp_lib import _build_metadata
-+
-+ # if _build_metadata is not None:
-+ # FLASH_VERSION = _build_metadata.flash_version
-+ #except ImportError:
-+ import flash_attn
-+ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
-+
-+ FLASH_VERSION = flash_attn.__version__
-+ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
-+ # if (
-+ # flash_ver_parsed != (2, 3, 6)
-+ # and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
-+ # ):
-+ # raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
-
- # create library so that flash-attn goes through the PyTorch Dispatcher
-- _flash_lib = torch.library.Library("xformers_flash", "DEF")
--
-- _flash_lib.define(
-- "flash_fwd(Tensor query, Tensor key, Tensor value, "
-- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
-- "int max_seqlen_q, int max_seqlen_k, "
-- "float p, float softmax_scale, "
-- "bool is_causal, int window_left, "
-- "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
-- )
-+ #_flash_lib = torch.library.Library("xformers_flash", "DEF")
-
-- _flash_lib.define(
-- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
-- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
-- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
-- "int max_seqlen_q, int max_seqlen_k, "
-- "float p, float softmax_scale, bool is_causal, "
-- "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
-- )
-+ #_flash_lib.define(
-+ # "flash_fwd(Tensor query, Tensor key, Tensor value, "
-+ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
-+ # "int max_seqlen_q, int max_seqlen_k, "
-+ # "float p, float softmax_scale, "
-+ # "bool is_causal, int window_left, "
-+ # "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
-+ #)
-+
-+ #_flash_lib.define(
-+ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
-+ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
-+ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
-+ # "int max_seqlen_q, int max_seqlen_k, "
-+ # "float p, float softmax_scale, bool is_causal, "
-+ # "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
-+ #)
-
- def _flash_fwd(
- query,
-@@ -111,8 +111,8 @@
- p,
- softmax_scale,
- is_causal,
-- window_left, # window_size_left
-- window_right, # window_size_right
-+ # window_left, # window_size_left
-+ # window_right, # window_size_right
- return_softmax,
- None, # rng
- )
-@@ -134,15 +134,15 @@
- out,
- cu_seq_lens_q,
- cu_seq_lens_k,
-- seqused_k,
-+ # seqused_k,
- max_seq_len_q,
- max_seq_len_k,
- p,
- softmax_scale,
- False,
- is_causal,
-- window_left,
-- window_right,
-+ # window_left,
-+ # window_right,
- return_softmax,
- None,
- )
-@@ -184,8 +184,8 @@
- p,
- softmax_scale,
- is_causal,
-- window_left,
-- window_right,
-+ # window_left,
-+ # window_right,
- None,
- rng_state,
- )
-@@ -208,15 +208,15 @@
- softmax_scale,
- False, # zero_tensors
- is_causal,
-- window_left,
-- window_right,
-+ # window_left,
-+ # window_right,
- None,
- rng_state,
- )
- return dq, dk, dv
-
-- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
-- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
-+ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
-+ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
- except ImportError:
- pass
-
-@@ -400,7 +400,7 @@
- implementation.
- """
-
-- OPERATOR = get_operator("xformers_flash", "flash_fwd")
-+ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd")
- SUPPORTED_DEVICES: Set[str] = {"cuda"}
- CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
- SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
diff --git a/setup.py b/setup.py
index 92e2b418c9419..e4c4773e9dca7 100644
--- a/setup.py
+++ b/setup.py
@@ -208,7 +208,8 @@ def _is_neuron() -> bool:
subprocess.run(["neuron-ls"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError, subprocess.CalledProcessError):
torch_neuronx_installed = False
- return torch_neuronx_installed
+ return torch_neuronx_installed or os.environ.get("VLLM_BUILD_WITH_NEURON",
+ False)
def _is_cpu() -> bool:
@@ -379,14 +380,6 @@ def _read_requirements(filename: str) -> List[str]:
# UPSTREAM SYNC: needed for sparsity
_sparsity_deps = ["nm-magic-wand-nightly"]
-
-def get_extra_requirements() -> dict:
- return {
- "sparse": _sparsity_deps,
- "sparsity": _sparsity_deps,
- }
-
-
package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
}
@@ -426,8 +419,13 @@ def get_extra_requirements() -> dict:
"tests")),
python_requires=">=3.8",
install_requires=get_requirements(),
- extras_require=get_extra_requirements(),
ext_modules=ext_modules,
+ extras_require={
+ "tensorizer": ["tensorizer==2.9.0a1"],
+ # UPSTREAM SYNC: required for sparsity
+ "sparse": _sparsity_deps,
+ "sparsity": _sparsity_deps,
+ },
cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
package_data=package_data,
)
diff --git a/tests/async_engine/test_api_server.py b/tests/async_engine/test_api_server.py
index 248bfbc8ab5c0..7f57d5cf9b182 100644
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@@ -25,21 +25,30 @@ def _query_server_long(prompt: str) -> dict:
@pytest.fixture
-def api_server(tokenizer_pool_size: int):
+def api_server(tokenizer_pool_size: int, engine_use_ray: bool,
+ worker_use_ray: bool):
script_path = Path(__file__).parent.joinpath(
"api_server_async_engine.py").absolute()
- uvicorn_process = subprocess.Popen([
+ commands = [
sys.executable, "-u",
str(script_path), "--model", "facebook/opt-125m", "--host",
"127.0.0.1", "--tokenizer-pool-size",
str(tokenizer_pool_size)
- ])
+ ]
+ if engine_use_ray:
+ commands.append("--engine-use-ray")
+ if worker_use_ray:
+ commands.append("--worker-use-ray")
+ uvicorn_process = subprocess.Popen(commands)
yield
uvicorn_process.terminate()
@pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-def test_api_server(api_server, tokenizer_pool_size: int):
+@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("engine_use_ray", [False, True])
+def test_api_server(api_server, tokenizer_pool_size: int, worker_use_ray: bool,
+ engine_use_ray: bool):
"""
Run the API server and test it.
diff --git a/tests/async_engine/test_chat_template.py b/tests/async_engine/test_chat_template.py
index 6972ae1dee4a1..8d6ad6706fb0e 100644
--- a/tests/async_engine/test_chat_template.py
+++ b/tests/async_engine/test_chat_template.py
@@ -76,20 +76,29 @@ def test_load_chat_template():
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}""" # noqa: E501
-def test_no_load_chat_template():
+def test_no_load_chat_template_filelike():
# Testing chatml template
template = "../../examples/does_not_exist"
tokenizer = MockTokenizer()
+ mock_serving_chat = MockServingChat(tokenizer)
+
+ with pytest.raises(ValueError, match="looks like a file path"):
+ OpenAIServingChat._load_chat_template(mock_serving_chat,
+ chat_template=template)
+
+
+def test_no_load_chat_template_literallike():
+ # Testing chatml template
+ template = "{{ messages }}"
+ tokenizer = MockTokenizer()
+
mock_serving_chat = MockServingChat(tokenizer)
OpenAIServingChat._load_chat_template(mock_serving_chat,
chat_template=template)
template_content = tokenizer.chat_template
- # Test assertions
- assert template_content is not None
- # Hard coded value for template_chatml.jinja
- assert template_content == """../../examples/does_not_exist"""
+ assert template_content == template
@pytest.mark.asyncio
diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index bd4c7ea3301be..97cff623c5e1d 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -4,8 +4,6 @@
"""
import pytest
-from vllm.attention.selector import VLLM_ATTENTION_BACKEND
-
MODELS = [
"facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
@@ -16,7 +14,6 @@
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [5])
@pytest.mark.parametrize("enforce_eager", [False, True])
-@pytest.mark.parametrize("attn_backend", ["XFORMERS", "FLASH_ATTN"])
def test_models(
hf_runner,
vllm_runner,
@@ -25,10 +22,7 @@ def test_models(
dtype: str,
max_tokens: int,
enforce_eager: bool,
- attn_backend: str,
- monkeypatch,
) -> None:
- monkeypatch.setenv(VLLM_ATTENTION_BACKEND, attn_backend)
hf_model = hf_runner(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
del hf_model
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 293f5539015c0..b021377f229e5 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -34,10 +34,6 @@ def test_models(
enforce_eager: bool,
tensor_parallel_size: int,
) -> None:
- if (tensor_parallel_size == 2 and chunked_prefill_token_size != 16
- and not enforce_eager):
- pytest.skip(f"Skip {chunked_prefill_token_size=} and {enforce_eager=} "
- "for high TP to save testing time.")
max_num_seqs = min(chunked_prefill_token_size, 256)
enable_chunked_prefill = False
max_num_batched_tokens = None
diff --git a/tests/conftest.py b/tests/conftest.py
index 98ae0404b4426..43e08f0fc33fa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -346,6 +346,10 @@ def generate_greedy_logprobs_nm(
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
+ def __del__(self):
+ del self.model
+ cleanup()
+
@pytest.fixture
def hf_runner_nm():
@@ -533,6 +537,10 @@ def generate_greedy_logprobs(
return [(output_ids, output_str, output_logprobs)
for output_ids, output_str, output_logprobs in outputs]
+ def __del__(self):
+ del self.model
+ cleanup()
+
@pytest.fixture
def vllm_runner_nm():
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 94b65401e1dd4..0ee78a9b0a8ea 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -230,6 +230,76 @@ def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
assert baseline_token_ids == test_token_ids
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [
+ {
+ # Use a small model for a fast test.
+ "model": "facebook/opt-125m",
+
+ # skip cuda graph creation for fast test.
+ "enforce_eager": True,
+ "enable_chunked_prefill": True,
+ "max_num_batched_tokens": 2,
+ "max_num_seqs": 2,
+ },
+ ])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [
+ {
+ "use_v2_block_manager": False,
+ },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "use_v2_block_manager": True,
+ "num_lookahead_slots": 0,
+ },
+ {
+ "use_v2_block_manager": True,
+ "num_lookahead_slots": 5,
+ },
+])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
+ test_llm_generator, batch_size):
+ """Verify that chunked prefill works with BlockManagerV2, with and without
+ lookahead scheduling.
+ """
+ output_len = 32
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+ ]
+
+ prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+ sampling_params = SamplingParams(
+ max_tokens=output_len,
+ ignore_eos=True,
+ temperature=temperature,
+ )
+
+ print('Getting token ids with BlockManagerV1')
+ baseline_token_ids = get_token_ids_from_llm_generator(
+ baseline_llm_generator, prompts, sampling_params)
+
+ print('Getting token ids with BlockManagerV2')
+ test_token_ids = get_token_ids_from_llm_generator(test_llm_generator,
+ prompts, sampling_params)
+
+ for expected_token_ids, actual_token_ids in zip(baseline_token_ids,
+ test_token_ids):
+ assert expected_token_ids == actual_token_ids
+
+ assert baseline_token_ids == test_token_ids
+
+
def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
for llm in llm_generator:
outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
diff --git a/tests/core/test_scheduler.py b/tests/core/test_scheduler.py
index 9588a1bead5f6..ab471d206618b 100644
--- a/tests/core/test_scheduler.py
+++ b/tests/core/test_scheduler.py
@@ -540,7 +540,7 @@ def test_decode_schedule_preempted():
curr_loras = None
for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
running.append(seq_group)
scheduler.block_manager.can_append_slots = MagicMock()
@@ -563,7 +563,8 @@ def cannot_append_second_group(seq_group, num_lookahead_slots):
assert len(output.preempted) == 2
# Verify budgets are updated.
assert budget.num_batched_tokens == 1
- assert budget.num_curr_seqs == 1
+ # NOTE: When enable_chunking is False, the num_seqs budget is not updated.
+ # assert budget.num_curr_seqs == 1
# Both should be preempted, not swapped.
assert output.blocks_to_swap_out == {}
# Nothing is copied.
@@ -581,7 +582,7 @@ def test_decode_swap_beam_search():
budget = create_token_budget()
for i in range(3):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60, best_of=2)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
running.append(seq_group)
append_new_token_seq_group(60, seq_group, 1)
budget.add_num_seqs(seq_group.request_id,
@@ -629,7 +630,7 @@ def test_schedule_decode_blocks_to_copy_update():
running = deque()
policy = PolicyFactory.get_policy(policy_name="fcfs")
curr_loras = None
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
running.append(seq_group)
@@ -659,7 +660,7 @@ def test_schedule_swapped_simple():
curr_loras = None
blocks_to_swap_out = {}
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
swapped.append(seq_group)
@@ -687,7 +688,7 @@ def test_schedule_swapped_max_token_budget():
blocks_to_swap_out = {}
for _ in range(2):
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
swapped.append(seq_group)
@@ -721,7 +722,7 @@ def test_schedule_swapped_max_seqs():
blocks_to_swap_out = {}
for i in range(4):
_, seq_group = create_dummy_prompt(str(i), prompt_length=60)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
swapped.append(seq_group)
@@ -759,7 +760,7 @@ def test_schedule_swapped_max_loras():
lora_name=str(i),
lora_int_id=i + 1,
lora_local_path="abc"))
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
swapped.append(seq_group)
@@ -783,7 +784,7 @@ def test_schedule_swapped_cannot_swap_in():
blocks_to_swap_out = {}
for _ in range(2):
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
scheduler._swap_out(seq_group, blocks_to_swap_out)
swapped.append(seq_group)
@@ -808,7 +809,7 @@ def test_schedule_swapped_blocks_to_copy():
policy = PolicyFactory.get_policy(policy_name="fcfs")
curr_loras = None
_, seq_group = create_dummy_prompt("1", prompt_length=60, best_of=2)
- scheduler._allocate_and_set_running(seq_group, 60)
+ scheduler._allocate_and_set_running(seq_group)
append_new_token_seq_group(60, seq_group, 1)
blocks_to_swap_out = {}
scheduler._swap_out(seq_group, blocks_to_swap_out)
diff --git a/tests/core/utils.py b/tests/core/utils.py
index fbbdb07cb8e6e..22c1d3826dff4 100644
--- a/tests/core/utils.py
+++ b/tests/core/utils.py
@@ -1,5 +1,5 @@
import time
-from typing import Optional, Tuple
+from typing import Iterable, Optional, Tuple
from vllm import SamplingParams
from vllm.lora.request import LoRARequest
@@ -31,14 +31,17 @@ def create_dummy_prompt(
def create_seq_group(
- seq_prompt_len=1024,
- seq_output_lens=(128, ),
- request_id='0',
- seq_id_start=0,
-) -> SequenceGroup:
+ seq_prompt_len: int = 1024,
+ seq_output_lens: Iterable[int] = (128, ),
+ request_id: str = '0',
+ seq_id_start: int = 0,
+ sampling_params: Optional[SamplingParams] = None) -> SequenceGroup:
assert len(seq_output_lens) > 0
+ if sampling_params is None:
+ sampling_params = SamplingParams()
+
prompt_token_ids = [0] * seq_prompt_len
seqs = []
@@ -60,7 +63,7 @@ def create_seq_group(
seq_group = SequenceGroup(
request_id=request_id,
seqs=seqs,
- sampling_params=SamplingParams(),
+ sampling_params=sampling_params,
arrival_time=time.time(),
)
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 77aa90b12bf8f..4021dba90ee0a 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -10,13 +10,15 @@
test_basic_distributed_correctness.py
```
"""
-import os
-
+# UPSTREAM SYNC: We can only run one model per invocation of the test.
+# Otherwise, we get duplicate ray.init() calls, which fail.
+# Rather than reworking .github/scripts/run-tests to pass models via env
+# variables, we just run Llama, which is sufficient for a smoke test.
import pytest
import torch
MODELS = [
- os.environ["TEST_DIST_MODEL"],
+ "meta-llama/Llama-2-7b-hf",
]
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index eeac0ae9e899a..209d03084c3e5 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -11,13 +11,14 @@
test_chunked_prefill_distributed.py
```
"""
-
+# UPSTREAM SYNC: We can only run one model per invocation of the test.
+# Otherwise, we get duplicate ray.init() calls, which fail.
+# Rather than reworking .github/scripts/run-tests to pass models via env
+# variables, we just run Llama, which is sufficient for a smoke test.
import pytest
import torch
-# UPSTREAM SYNC: our automation does not call via env variables.
MODELS = [
- "facebook/opt-125m",
"meta-llama/Llama-2-7b-hf",
]
diff --git a/tests/distributed/test_pynccl.py b/tests/distributed/test_pynccl.py
index b50eed1c8c722..6d7d4a5806bd0 100644
--- a/tests/distributed/test_pynccl.py
+++ b/tests/distributed/test_pynccl.py
@@ -1,18 +1,19 @@
import multiprocessing
-import os
import pytest
import torch
from vllm.distributed.device_communicators.pynccl import (NCCLCommunicator,
ncclGetUniqueId)
+from vllm.distributed.parallel_state import init_distributed_environment
+from vllm.utils import update_environment_variables
def distributed_run(fn, world_size):
number_of_processes = world_size
processes = []
for i in range(number_of_processes):
- env = os.environ.copy()
+ env = {}
env['RANK'] = str(i)
env['LOCAL_RANK'] = str(i)
env['WORLD_SIZE'] = str(number_of_processes)
@@ -26,20 +27,23 @@ def distributed_run(fn, world_size):
for p in processes:
p.join()
+ for p in processes:
+ assert p.exitcode == 0
+
-def update_env(fn):
+def worker_fn_wrapper(fn):
# `multiprocessing.Process` cannot accept environment variables directly
# so we need to pass the environment variables as arguments
# and update the environment variables in the function
- def wrapper(env):
- import os
- os.environ.update(env)
+ def wrapped_fn(env):
+ update_environment_variables(env)
+ init_distributed_environment()
fn()
- return wrapper
+ return wrapped_fn
-@update_env
+@worker_fn_wrapper
def worker_fn():
comm = NCCLCommunicator()
tensor = torch.ones(16, 1024, 1024, dtype=torch.float32).cuda(comm.rank)
@@ -54,7 +58,7 @@ def test_pynccl():
distributed_run(worker_fn, 2)
-@update_env
+@worker_fn_wrapper
def worker_fn_with_cudagraph():
with torch.no_grad():
graph = torch.cuda.CUDAGraph()
diff --git a/tests/distributed/test_pynccl_library.py b/tests/distributed/test_pynccl_library.py
new file mode 100644
index 0000000000000..ec60a5ed3114d
--- /dev/null
+++ b/tests/distributed/test_pynccl_library.py
@@ -0,0 +1,43 @@
+import multiprocessing
+import tempfile
+
+
+def target_fn(env, filepath):
+ from vllm.utils import update_environment_variables
+ update_environment_variables(env)
+ from vllm.utils import nccl_integrity_check
+ nccl_integrity_check(filepath)
+
+
+def test_library_file():
+ # note: don't import vllm.distributed.device_communicators.pynccl
+ # before running this test, otherwise the library file will be loaded
+ # and it might interfere with the test
+ from vllm.utils import find_nccl_library
+ so_file = find_nccl_library()
+ with open(so_file, 'rb') as f:
+ content = f.read()
+ try:
+ # corrupt the library file; the integrity check should raise an exception
+ with open(so_file, 'wb') as f:
+ f.write(content[:len(content) // 2])
+ p = multiprocessing.Process(target=target_fn, args=({}, so_file))
+ p.start()
+ p.join()
+ assert p.exitcode != 0
+
+ # write the library content to a tmp path and
+ # test loading it via VLLM_NCCL_SO_PATH
+ fd, path = tempfile.mkstemp()
+ with open(path, 'wb') as f:
+ f.write(content)
+ p = multiprocessing.Process(target=target_fn,
+ args=({
+ "VLLM_NCCL_SO_PATH": path
+ }, path))
+ p.start()
+ p.join()
+ assert p.exitcode == 0
+ finally:
+ with open(so_file, 'wb') as f:
+ f.write(content)
diff --git a/tests/engine/output_processor/test_multi_step.py b/tests/engine/output_processor/test_multi_step.py
new file mode 100644
index 0000000000000..6da3da091db78
--- /dev/null
+++ b/tests/engine/output_processor/test_multi_step.py
@@ -0,0 +1,270 @@
+import random
+from unittest.mock import MagicMock
+
+import pytest
+from transformers import PreTrainedTokenizer
+
+from tests.core.utils import create_seq_group
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.multi_step import MultiStepOutputProcessor
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, SequenceGroupOutput, SequenceOutput,
+ SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [1, 12])
+@pytest.mark.skip_global_cleanup
+def test_appends_token_ids(num_new_tokens: int, seq_output_len: int):
+ """Verify multi-step decoding appends token ids correctly.
+
+ We append token ids and verify all the token ids were appended correctly.
+ Note that ignore_eos=True.
+ """
+ detokenizer = MagicMock(spec=Detokenizer)
+ scheduler = MagicMock(spec=Scheduler)
+ stop_checker = MagicMock(spec=StopChecker)
+ seq_counter = Counter()
+
+ output_processor = MultiStepOutputProcessor(
+ detokenizer=detokenizer,
+ scheduler=scheduler,
+ seq_counter=seq_counter,
+ get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+ stop_checker=stop_checker,
+ )
+
+ seq_group = create_seq_group(
+ seq_prompt_len=1024,
+ seq_output_lens=[seq_output_len],
+ sampling_params=SamplingParams(max_tokens=seq_output_len +
+ num_new_tokens,
+ ignore_eos=True),
+ )
+
+ seq = seq_group.get_seqs()[0]
+ seq.status = SequenceStatus.RUNNING
+
+ new_token_ids = list(range(num_new_tokens))
+
+ outputs = [
+ SequenceGroupOutput(
+ samples=[
+ SequenceOutput(
+ parent_seq_id=seq.seq_id,
+ output_token=output_token,
+ logprobs={output_token: Logprob(0.0)},
+ )
+ ],
+ prompt_logprobs=None,
+ ) for output_token in new_token_ids
+ ]
+
+ assert seq.get_token_ids()[-len(new_token_ids):] != new_token_ids
+ output_processor.process_outputs(seq_group, outputs)
+ assert seq.get_token_ids()[-len(new_token_ids):] == new_token_ids
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [5, 6, 7, 8])
+@pytest.mark.parametrize("max_tokens", [128 + 3])
+@pytest.mark.skip_global_cleanup
+def test_respects_max_tokens(num_new_tokens: int, seq_prompt_len: int,
+ seq_output_len: int, max_tokens: int):
+ """Verify tokens after max_tokens are dropped and not appended to the
+ sequence.
+ """
+ detokenizer = MagicMock(spec=Detokenizer)
+ scheduler = MagicMock(spec=Scheduler)
+ stop_checker = MagicMock(spec=StopChecker)
+ seq_counter = Counter()
+
+ output_processor = MultiStepOutputProcessor(
+ detokenizer=detokenizer,
+ scheduler=scheduler,
+ seq_counter=seq_counter,
+ get_tokenizer_for_seq=lambda _: mock_tokenizer(),
+ stop_checker=stop_checker,
+ )
+
+ seq_group = create_seq_group(
+ seq_prompt_len=seq_prompt_len,
+ seq_output_lens=[seq_output_len],
+ sampling_params=SamplingParams(max_tokens=max_tokens, ),
+ )
+
+ seq = seq_group.get_seqs()[0]
+ seq.status = SequenceStatus.RUNNING
+
+ new_token_ids = list(range(num_new_tokens))
+
+ outputs = [
+ SequenceGroupOutput(
+ samples=[
+ SequenceOutput(
+ parent_seq_id=seq.seq_id,
+ output_token=output_token,
+ logprobs={output_token: Logprob(0.0)},
+ )
+ ],
+ prompt_logprobs=None,
+ ) for output_token in new_token_ids
+ ]
+
+ assert seq.get_len() == seq_prompt_len + seq_output_len
+ output_processor.process_outputs(seq_group, outputs)
+
+ # Expect the processed sequence to not go over max tokens in len.
+ assert seq.get_len() == seq_prompt_len + max_tokens
+
+ # Expect the correct tokens were appended.
+ expected_appended_tokens = new_token_ids[:max_tokens - seq_output_len]
+ assert seq.get_token_ids(
+ )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_respects_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+ seq_output_len: int, seed: int):
+ """Verify the eos token id is included in the sequence, but subsequent
+ tokens are dropped (not appended to sequence).
+ """
+ random.seed(seed)
+ detokenizer = MagicMock(spec=Detokenizer)
+ scheduler = MagicMock(spec=Scheduler)
+ stop_checker = MagicMock(spec=StopChecker)
+ seq_counter = Counter()
+
+ eos_token_id = 100
+
+ output_processor = MultiStepOutputProcessor(
+ detokenizer=detokenizer,
+ scheduler=scheduler,
+ seq_counter=seq_counter,
+ get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+ stop_checker=stop_checker,
+ )
+
+ seq_group = create_seq_group(
+ seq_prompt_len=seq_prompt_len,
+ seq_output_lens=[seq_output_len],
+ sampling_params=SamplingParams(
+ # Ensure enough space.
+ max_tokens=seq_output_len + num_new_tokens, ),
+ )
+
+ seq = seq_group.get_seqs()[0]
+ seq.status = SequenceStatus.RUNNING
+
+ new_token_ids = list(range(num_new_tokens))
+ assert eos_token_id not in new_token_ids
+ eos_index = random.randint(0, len(new_token_ids) - 1)
+ new_token_ids[eos_index] = eos_token_id
+
+ outputs = [
+ SequenceGroupOutput(
+ samples=[
+ SequenceOutput(
+ parent_seq_id=seq.seq_id,
+ output_token=output_token,
+ logprobs={output_token: Logprob(0.0)},
+ )
+ ],
+ prompt_logprobs=None,
+ ) for output_token in new_token_ids
+ ]
+
+ assert seq.get_len() == seq_prompt_len + seq_output_len
+ output_processor.process_outputs(seq_group, outputs)
+
+ # Expect the processed sequence to not go beyond provided eos.
+ assert seq.get_len() == seq_prompt_len + seq_output_len + (eos_index + 1)
+
+ # Expect the correct tokens were appended.
+ expected_appended_tokens = new_token_ids[:eos_index + 1]
+ assert seq.get_token_ids(
+ )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+@pytest.mark.parametrize("seq_prompt_len", [1024])
+@pytest.mark.parametrize("seq_output_len", [128])
+@pytest.mark.parametrize("num_new_tokens", [12])
+@pytest.mark.parametrize("seed", list(range(6)))
+@pytest.mark.skip_global_cleanup
+def test_ignores_eos_token_id(num_new_tokens: int, seq_prompt_len: int,
+ seq_output_len: int, seed: int):
+ """When sampling parameters dictate that we should ignore the eos token id,
+ ensure all token ids are appended even if the eos token id is emitted.
+ """
+ random.seed(seed)
+ detokenizer = MagicMock(spec=Detokenizer)
+ scheduler = MagicMock(spec=Scheduler)
+ stop_checker = MagicMock(spec=StopChecker)
+ seq_counter = Counter()
+
+ eos_token_id = 100
+
+ output_processor = MultiStepOutputProcessor(
+ detokenizer=detokenizer,
+ scheduler=scheduler,
+ seq_counter=seq_counter,
+ get_tokenizer_for_seq=lambda _: mock_tokenizer(eos_token_id),
+ stop_checker=stop_checker,
+ )
+
+ seq_group = create_seq_group(
+ seq_prompt_len=seq_prompt_len,
+ seq_output_lens=[seq_output_len],
+ sampling_params=SamplingParams(
+ # Ensure enough space.
+ max_tokens=seq_output_len + num_new_tokens,
+ ignore_eos=True,
+ ),
+ )
+
+ seq = seq_group.get_seqs()[0]
+ seq.status = SequenceStatus.RUNNING
+
+ new_token_ids = list(range(num_new_tokens))
+ assert eos_token_id not in new_token_ids
+ eos_index = random.randint(0, len(new_token_ids) - 1)
+ new_token_ids[eos_index] = eos_token_id
+
+ outputs = [
+ SequenceGroupOutput(
+ samples=[
+ SequenceOutput(
+ parent_seq_id=seq.seq_id,
+ output_token=output_token,
+ logprobs={output_token: Logprob(0.0)},
+ )
+ ],
+ prompt_logprobs=None,
+ ) for output_token in new_token_ids
+ ]
+
+ assert seq.get_len() == seq_prompt_len + seq_output_len
+ output_processor.process_outputs(seq_group, outputs)
+
+ # Expect the processed sequence to go beyond eos.
+ assert seq.get_len() == seq_prompt_len + seq_output_len + num_new_tokens
+
+ # Expect the correct tokens were appended.
+ expected_appended_tokens = new_token_ids[:seq_output_len + num_new_tokens -
+ seq_output_len]
+ assert seq.get_token_ids(
+ )[-len(expected_appended_tokens):] == expected_appended_tokens
+
+
+def mock_tokenizer(eos_token_id=1000):
+ tokenizer = MagicMock(spec=PreTrainedTokenizer)
+ tokenizer.eos_token_id = eos_token_id
+ return tokenizer
diff --git a/tests/engine/test_skip_tokenizer_init.py b/tests/engine/test_skip_tokenizer_init.py
new file mode 100644
index 0000000000000..baa463a316902
--- /dev/null
+++ b/tests/engine/test_skip_tokenizer_init.py
@@ -0,0 +1,23 @@
+import pytest
+
+from vllm.entrypoints.llm import LLM
+from vllm.sampling_params import SamplingParams
+
+
+@pytest.mark.parametrize("model", ["facebook/opt-125m"])
+def test_skip_tokenizer_initialization(model: str):
+ # This test checks if the flag skip_tokenizer_init skips the initialization
+ # of the tokenizer and detokenizer. The generated output is expected to
+ # contain token ids.
+ llm = LLM(model=model, skip_tokenizer_init=True)
+ sampling_params = SamplingParams(prompt_logprobs=True, detokenize=True)
+ with pytest.raises(ValueError) as err:
+ llm.generate("abc", sampling_params)
+ assert "prompts must be None if" in str(err.value)
+ outputs = llm.generate(prompt_token_ids=[[1, 2, 3]],
+ sampling_params=sampling_params)
+ assert len(outputs) > 0
+ completions = outputs[0].outputs
+ assert len(completions) > 0
+ assert completions[0].text == ""
+ assert completions[0].token_ids
diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py
index 5622744566bcc..30f0ad5d8272f 100644
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -1,11 +1,14 @@
# This unit test should be moved to a new
# tests/test_guided_decoding directory.
-
+import pytest
import torch
from transformers import AutoTokenizer
-from vllm.model_executor.guided_logits_processors import (JSONLogitsProcessor,
- RegexLogitsProcessor)
+from vllm.entrypoints.openai.protocol import CompletionRequest
+from vllm.model_executor.guided_decoding import (
+ get_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+ JSONLogitsProcessor, RegexLogitsProcessor)
TEST_SCHEMA = {
"type": "object",
@@ -73,3 +76,36 @@ def test_guided_logits_processors():
json_LP(token_ids, tensor)
assert tensor.shape == original_tensor.shape
assert not torch.allclose(tensor, original_tensor)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("backend", ["outlines", "lm-format-enforcer"])
+async def test_guided_logits_processor_black_box(backend: str):
+ tokenizer = AutoTokenizer.from_pretrained('HuggingFaceH4/zephyr-7b-beta')
+ token_ids = tokenizer.encode(
+ f"Give an example IPv4 address with this regex: {TEST_REGEX}")
+ regex_request = CompletionRequest(model='test',
+ prompt=token_ids,
+ guided_regex=TEST_REGEX)
+ regex_lp = await get_guided_decoding_logits_processor(
+ backend, regex_request, tokenizer)
+ assert regex_lp is not None
+ tensor = torch.rand(32000)
+ original_tensor = torch.clone(tensor)
+ tensor = regex_lp(token_ids, tensor)
+ assert tensor.shape == original_tensor.shape
+ assert not torch.allclose(tensor, original_tensor)
+
+ token_ids = tokenizer.encode(
+ f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
+ json_request = CompletionRequest(model='test',
+ prompt=token_ids,
+ guided_json=TEST_SCHEMA)
+ json_lp = await get_guided_decoding_logits_processor(
+ backend, json_request, tokenizer)
+ assert json_lp is not None
+ tensor = torch.rand(32000)
+ original_tensor = torch.clone(tensor)
+ tensor = json_lp(token_ids, tensor)
+ assert tensor.shape == original_tensor.shape
+ assert not torch.allclose(tensor, original_tensor)
diff --git a/tests/entrypoints/test_llm_generate.py b/tests/entrypoints/test_llm_generate.py
new file mode 100644
index 0000000000000..5e8b7ca4d9977
--- /dev/null
+++ b/tests/entrypoints/test_llm_generate.py
@@ -0,0 +1,41 @@
+import pytest
+
+from vllm import LLM, SamplingParams
+
+
+def test_multiple_sampling_params():
+
+ llm = LLM(model="facebook/opt-125m",
+ max_num_batched_tokens=4096,
+ tensor_parallel_size=1)
+
+ prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+ ]
+
+ sampling_params = [
+ SamplingParams(temperature=0.01, top_p=0.95),
+ SamplingParams(temperature=0.3, top_p=0.95),
+ SamplingParams(temperature=0.7, top_p=0.95),
+ SamplingParams(temperature=0.99, top_p=0.95),
+ ]
+
+ # A list of SamplingParams should be matched one-to-one with the prompts
+ outputs = llm.generate(prompts, sampling_params=sampling_params)
+ assert len(prompts) == len(outputs)
+
+ # An exception is raised if the number of params does not match the prompts
+ with pytest.raises(ValueError):
+ outputs = llm.generate(prompts, sampling_params=sampling_params[:3])
+
+ # Single SamplingParams should be applied to every prompt
+ single_sampling_params = SamplingParams(temperature=0.3, top_p=0.95)
+ outputs = llm.generate(prompts, sampling_params=single_sampling_params)
+ assert len(prompts) == len(outputs)
+
+ # When sampling_params is None, the default params should be applied
+ outputs = llm.generate(prompts, sampling_params=None)
+ assert len(prompts) == len(outputs)
\ No newline at end of file
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 047b2abbe28ec..5c416a12555b5 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -506,7 +506,10 @@ async def test_logits_bias(server, client: openai.AsyncOpenAI):
assert first_response != completion.choices[0].text
-async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_json_completion(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example JSON for an employee profile "
@@ -514,7 +517,8 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
n=3,
temperature=1.0,
max_tokens=500,
- extra_body=dict(guided_json=TEST_SCHEMA))
+ extra_body=dict(guided_json=TEST_SCHEMA,
+ guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
@@ -524,7 +528,10 @@ async def test_guided_json_completion(server, client: openai.AsyncOpenAI):
jsonschema.validate(instance=output_json, schema=TEST_SCHEMA)
-async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_json_chat(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -538,8 +545,9 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=500,
- extra_body=dict(guided_json=TEST_SCHEMA))
+ max_tokens=1000,
+ extra_body=dict(guided_json=TEST_SCHEMA,
+ guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json1 = json.loads(message.content)
@@ -555,8 +563,9 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
messages=messages,
- max_tokens=500,
- extra_body=dict(guided_json=TEST_SCHEMA))
+ max_tokens=1000,
+ extra_body=dict(guided_json=TEST_SCHEMA,
+ guided_decoding_backend=guided_decoding_backend))
message = chat_completion.choices[0].message
assert message.content is not None
json2 = json.loads(message.content)
@@ -565,14 +574,18 @@ async def test_guided_json_chat(server, client: openai.AsyncOpenAI):
assert json1["age"] != json2["age"]
-async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_completion(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
prompt=f"Give an example IPv4 address with this regex: {TEST_REGEX}",
n=3,
temperature=1.0,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX))
+ extra_body=dict(guided_regex=TEST_REGEX,
+ guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 3
@@ -581,7 +594,10 @@ async def test_guided_regex_completion(server, client: openai.AsyncOpenAI):
assert re.fullmatch(TEST_REGEX, completion.choices[i].text) is not None
-async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_regex_chat(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -595,7 +611,8 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
model=MODEL_NAME,
messages=messages,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX))
+ extra_body=dict(guided_regex=TEST_REGEX,
+ guided_decoding_backend=guided_decoding_backend))
ip1 = chat_completion.choices[0].message.content
assert ip1 is not None
assert re.fullmatch(TEST_REGEX, ip1) is not None
@@ -606,21 +623,26 @@ async def test_guided_regex_chat(server, client: openai.AsyncOpenAI):
model=MODEL_NAME,
messages=messages,
max_tokens=20,
- extra_body=dict(guided_regex=TEST_REGEX))
+ extra_body=dict(guided_regex=TEST_REGEX,
+ guided_decoding_backend=guided_decoding_backend))
ip2 = chat_completion.choices[0].message.content
assert ip2 is not None
assert re.fullmatch(TEST_REGEX, ip2) is not None
assert ip1 != ip2
-async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_completion(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
completion = await client.completions.create(
model=MODEL_NAME,
prompt="The best language for type-safe systems programming is ",
n=2,
temperature=1.0,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE))
+ extra_body=dict(guided_choice=TEST_CHOICE,
+ guided_decoding_backend=guided_decoding_backend))
assert completion.id is not None
assert completion.choices is not None and len(completion.choices) == 2
@@ -628,7 +650,10 @@ async def test_guided_choice_completion(server, client: openai.AsyncOpenAI):
assert completion.choices[i].text in TEST_CHOICE
-async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
messages = [{
"role": "system",
"content": "you are a helpful assistant"
@@ -642,7 +667,8 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
model=MODEL_NAME,
messages=messages,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE))
+ extra_body=dict(guided_choice=TEST_CHOICE,
+ guided_decoding_backend=guided_decoding_backend))
choice1 = chat_completion.choices[0].message.content
assert choice1 in TEST_CHOICE
@@ -655,18 +681,23 @@ async def test_guided_choice_chat(server, client: openai.AsyncOpenAI):
model=MODEL_NAME,
messages=messages,
max_tokens=10,
- extra_body=dict(guided_choice=TEST_CHOICE))
+ extra_body=dict(guided_choice=TEST_CHOICE,
+ guided_decoding_backend=guided_decoding_backend))
choice2 = chat_completion.choices[0].message.content
assert choice2 in TEST_CHOICE
assert choice1 != choice2
-async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI):
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
with pytest.raises(openai.BadRequestError):
_ = await client.completions.create(
model=MODEL_NAME,
prompt="Give an example JSON that fits this schema: 42",
- extra_body=dict(guided_json=42))
+ extra_body=dict(guided_json=42,
+ guided_decoding_backend=guided_decoding_backend))
messages = [{
"role": "system",
@@ -692,20 +723,51 @@ async def test_guided_decoding_type_error(server, client: openai.AsyncOpenAI):
extra_body=dict(guided_regex=TEST_REGEX, guided_json=TEST_SCHEMA))
-async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
- resp = await client.chat.completions.create(
+@pytest.mark.parametrize("guided_decoding_backend",
+ ["outlines", "lm-format-enforcer"])
+async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
+ guided_decoding_backend: str):
+ messages = [{
+ "role": "system",
+ "content": "you are a helpful assistant"
+ }, {
+ "role":
+ "user",
+ "content":
+ "The best language for type-safe systems programming is "
+ }]
+ chat_completion = await client.chat.completions.create(
model=MODEL_NAME,
- messages=[{
- "role":
- "user",
- "content": ('what is 1+1? please respond with a JSON object, '
- 'the format is {"result": 2}')
- }],
- response_format={"type": "json_object"})
-
- content = resp.choices[0].message.content
- loaded = json.loads(content)
- assert loaded == {"result": 2}, loaded
+ messages=messages,
+ max_tokens=10,
+ logprobs=True,
+ top_logprobs=5,
+ extra_body=dict(guided_choice=TEST_CHOICE,
+ guided_decoding_backend=guided_decoding_backend))
+ top_logprobs = chat_completion.choices[0].logprobs.top_logprobs
+
+ # -9999.0 is the minimum logprob returned by OpenAI
+ assert all(
+ isinstance(logprob, float) and logprob >= -9999.0
+ for token_dict in top_logprobs
+ for token, logprob in token_dict.items())
+
+
+async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
+ for _ in range(2):
+ resp = await client.chat.completions.create(
+ model=MODEL_NAME,
+ messages=[{
+ "role":
+ "user",
+ "content": ('what is 1+1? please respond with a JSON object, '
+ 'the format is {"result": 2}')
+ }],
+ response_format={"type": "json_object"})
+
+ content = resp.choices[0].message.content
+ loaded = json.loads(content)
+ assert loaded == {"result": 2}, loaded
async def test_guided_grammar(server, client: openai.AsyncOpenAI):
diff --git a/tests/kernels/test_prefix_prefill.py b/tests/kernels/test_prefix_prefill.py
index cd303ee4a4448..d581a0c843b76 100644
--- a/tests/kernels/test_prefix_prefill.py
+++ b/tests/kernels/test_prefix_prefill.py
@@ -10,7 +10,7 @@
NUM_HEADS = [64]
NUM_QUERIES_PER_KV = [1, 8, 64]
-HEAD_SIZES = [128]
+HEAD_SIZES = [128, 96]
DTYPES = [torch.float16]
CUDA_DEVICES = [
f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 1127cc33183c9..a3ffc53d8cd1d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -143,6 +143,12 @@ def baichuan_lora_files():
return snapshot_download(repo_id="jeeejeee/baichuan7b-text2sql-spider")
+@pytest.fixture(scope="session")
+def baichuan_zero_lora_files():
+ # all the lora_B weights are initialized to zero.
+ return snapshot_download(repo_id="jeeejeee/baichuan7b-zero-init")
+
+
@pytest.fixture(scope="session")
def tinyllama_lora_files():
return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
@@ -153,11 +159,11 @@ def llama_2_7b_engine_extra_embeddings() -> nn.Module:
cleanup()
get_model_old = get_model
- def get_model_patched(model_config, device_config, **kwargs):
- return get_model_old(model_config,
- device_config,
- lora_config=LoRAConfig(max_loras=4,
- max_lora_rank=8))
+ def get_model_patched(*, model_config, device_config, **kwargs):
+ kwargs["lora_config"] = LoRAConfig(max_loras=4, max_lora_rank=8)
+ return get_model_old(model_config=model_config,
+ device_config=device_config,
+ **kwargs)
with patch("vllm.worker.model_runner.get_model", get_model_patched):
engine = vllm.LLM("meta-llama/Llama-2-7b-hf", enable_lora=False)
diff --git a/tests/lora/test_baichuan.py b/tests/lora/test_baichuan.py
index 2178266d2e0c8..5ab863eea94b3 100644
--- a/tests/lora/test_baichuan.py
+++ b/tests/lora/test_baichuan.py
@@ -62,7 +62,7 @@ def test_baichuan_lora(baichuan_lora_files):
@pytest.mark.skip("Requires multiple GPUs")
-def test_llama_tensor_parallel_equality(baichuan_lora_files):
+def test_baichuan_tensor_parallel_equality(baichuan_lora_files):
# Cannot use as it will initialize torch.cuda too early...
# if torch.cuda.device_count() < 4:
# pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 3fed7d2a0826f..522a91dac57c8 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -422,7 +422,9 @@ def test_lm_head_logits_processor(dist_init, num_loras, device,
def _pretest():
linear = ParallelLMHead(vocab_size + lora_config.lora_extra_vocab_size,
- 1024, vocab_size)
+ 1024,
+ vocab_size,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
linear.weight.data[:, vocab_size:] = 0
logits_processor = LogitsProcessor(
@@ -454,7 +456,7 @@ def _pretest():
num_inputs=8 * num_loras, # * 3,
input_size=(1, 1024),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -503,7 +505,7 @@ def _pretest():
num_inputs=8 * num_loras * 3,
input_size=(1, 1024),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -545,11 +547,17 @@ def test_linear_parallel(dist_init, num_loras, orientation, device) -> None:
def create_random_linear_parallel_layer():
if orientation == "row":
- linear = RowParallelLinear(4096, 4096, bias=False)
+ linear = RowParallelLinear(4096,
+ 4096,
+ bias=False,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = RowParallelLinearWithLoRA(linear)
else:
- linear = ColumnParallelLinear(4096, 4096, bias=False)
+ linear = ColumnParallelLinear(4096,
+ 4096,
+ bias=False,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = ColumnParallelLinearWithLoRA(linear)
lora_linear.create_lora_weights(max_loras, lora_config)
@@ -573,7 +581,7 @@ def create_random_linear_parallel_layer():
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -612,7 +620,7 @@ def create_random_linear_parallel_layer():
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -648,15 +656,24 @@ def test_column_parallel_packed(dist_init, num_loras, repeats, device) -> None:
def create_column_parallel_packed_layer():
if repeats == 2:
linear = MergedColumnParallelLinear(4096, [4096] * repeats,
- bias=False)
+ bias=False,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedColumnParallelLinearWithLoRA(linear)
elif repeats == 3:
- linear = QKVParallelLinear(4096, 64, 32, bias=False)
+ linear = QKVParallelLinear(4096,
+ 64,
+ 32,
+ bias=False,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = MergedQKVParallelLinearWithLora(linear)
else:
- linear = QKVParallelLinear(4096, 64, 32, bias=False)
+ linear = QKVParallelLinear(4096,
+ 64,
+ 32,
+ bias=False,
+ params_dtype=torch.float16)
linear.weight.data = torch.rand_like(linear.weight.data)
lora_linear = QKVParallelLinearWithLora(linear)
@@ -691,7 +708,7 @@ class FakeConfig:
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
@@ -731,7 +748,7 @@ class FakeConfig:
num_inputs=32 * num_loras,
input_size=(1, 4096),
input_range=(0, 1),
- input_type=torch.float32,
+ input_type=torch.float16,
)
lora_mapping = LoRAMapping(index_mapping, prompt_mapping)
diff --git a/tests/lora/test_lora_checkpoints.py b/tests/lora/test_lora_checkpoints.py
index 35ad7342944cd..d4d1665b624ea 100644
--- a/tests/lora/test_lora_checkpoints.py
+++ b/tests/lora/test_lora_checkpoints.py
@@ -3,9 +3,16 @@
from vllm.lora.models import LoRAModel
from vllm.model_executor.models.baichuan import BaiChuanBaseForCausalLM
+lora_lst = ["baichuan7B", "baichuan7B-zero", "chatglm3-6b"]
-@pytest.mark.parametrize("lora_name", ["baichuan7B", "chatglm3-6b"])
-def test_load_checkpoints(lora_name, chatglm3_lora_files, baichuan_lora_files):
+
+@pytest.mark.parametrize("lora_name", lora_lst)
+def test_load_checkpoints(
+ lora_name,
+ baichuan_lora_files,
+ baichuan_zero_lora_files,
+ chatglm3_lora_files,
+):
supported_lora_modules = BaiChuanBaseForCausalLM.supported_lora_modules
packed_modules_mapping = BaiChuanBaseForCausalLM.packed_modules_mapping
embedding_modules = BaiChuanBaseForCausalLM.embedding_modules
@@ -26,6 +33,17 @@ def test_load_checkpoints(lora_name, chatglm3_lora_files, baichuan_lora_files):
device="cpu",
embedding_modules=embedding_modules,
embedding_padding_modules=embed_padding_modules)
+ elif lora_name == "baichuan7B-zero":
+ # Test that target_modules may contain a prefix
+ # such as "model.layers.0.self_attn.W_pack", and
+ # the test should pass.
+ LoRAModel.from_local_checkpoint(
+ baichuan_zero_lora_files,
+ expected_lora_modules,
+ lora_model_id=1,
+ device="cpu",
+ embedding_modules=embedding_modules,
+ embedding_padding_modules=embed_padding_modules)
else:
# For the baichuan7B model, load chatglm3-6b's LoRA,
# and the test should raise the following error.
diff --git a/tests/lora/test_punica.py b/tests/lora/test_punica.py
index cab8b44ccd2df..f3b9bd5912967 100644
--- a/tests/lora/test_punica.py
+++ b/tests/lora/test_punica.py
@@ -72,6 +72,7 @@ def _lora_ref_impl(
11008,
13824,
14336,
+ 15360,
22016,
24576,
27392,
@@ -81,6 +82,7 @@ def _lora_ref_impl(
32768,
33024,
36864,
+ 43264,
49152,
64000,
64256,
diff --git a/tests/lora/test_worker.py b/tests/lora/test_worker.py
index 54594690f7922..732e91a52c0a9 100644
--- a/tests/lora/test_worker.py
+++ b/tests/lora/test_worker.py
@@ -3,8 +3,8 @@
import tempfile
from unittest.mock import patch
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+ ModelConfig, ParallelConfig, SchedulerConfig)
from vllm.lora.models import LoRAMapping
from vllm.lora.request import LoRARequest
from vllm.worker.worker import Worker
@@ -18,12 +18,14 @@ def test_worker_apply_lora(sql_lora_files):
"meta-llama/Llama-2-7b-hf",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
),
+ load_config=LoadConfig(
+ download_dir=None,
+ load_format="dummy",
+ ),
parallel_config=ParallelConfig(1, 1, False),
scheduler_config=SchedulerConfig(32, 32, 32),
device_config=DeviceConfig("cuda"),
diff --git a/tests/model_executor/weight_utils.py b/tests/model_executor/weight_utils.py
index 3154f2826d10c..b0086dd7a7d71 100644
--- a/tests/model_executor/weight_utils.py
+++ b/tests/model_executor/weight_utils.py
@@ -3,7 +3,7 @@
import huggingface_hub.constants
import pytest
-from vllm.model_executor.weight_utils import enable_hf_transfer
+from vllm.model_executor.model_loader.weight_utils import enable_hf_transfer
def test_hf_transfer_auto_activation():
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
new file mode 100644
index 0000000000000..a7abc011f57d7
--- /dev/null
+++ b/tests/models/test_aqlm.py
@@ -0,0 +1,95 @@
+"""Compare the outputs of a AQLM model between vLLM and HF Transformers
+
+Run `pytest tests/models/test_aqlm.py`.
+"""
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+aqlm_not_supported = (capability <
+ QUANTIZATION_METHODS["aqlm"].get_min_capability())
+
+# In this test we hardcode prompts and generations for the model so we don't
+# need to require the AQLM package as a dependency
+example_prompts = [
+ 'vLLM is a high-throughput and memory-efficient inference and serving '
+ 'engine for LLMs.\n',
+ 'Briefly describe the major milestones in the development of artificial '
+ 'intelligence from 1950 to 2020.\n',
+ 'Compare and contrast artificial intelligence with human intelligence in '
+ 'terms of processing information.\n',
+ 'Describe the basic components of a neural network and how it can be '
+ 'trained.\n',
+ 'Write a short story about a robot that dreams for the first time.\n',
+ 'Analyze the impact of the COVID-19 pandemic on global economic structures '
+ 'and future business models.\n',
+ 'Explain the cultural significance of the Mona Lisa painting, and how its '
+ 'perception might vary in Western versus Eastern societies.\n',
+ "Translate the following English sentence into Japanese, French, and "
+ "Swahili: 'The early bird catches the worm.'\n"
+]
+
+# These ground truth generations were generated using `transformers==4.38.1
+# aqlm==1.1.0 torch==2.2.0`
+# and the below code:
+# ```python
+# from transformers import AutoTokenizer, AutoModelForCausalLM
+# model_id = "ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"
+# quantized_model = AutoModelForCausalLM.from_pretrained(model_id,
+# torch_dtype="auto", device_map="cuda").cuda()
+# tokenizer = AutoTokenizer.from_pretrained(model_id)
+# outputs = []
+# for prompt in example_prompts:
+# input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
+# hf_outputs = quantized_model.generate(input_ids, max_new_tokens=32)
+# outputs.append(tokenizer.decode(hf_outputs[0][input_ids.shape[1]:]))
+# print(outputs)
+# ```
+ground_truth_generations = [
+ '\n### Features\n\n- **High-throughput**: v',
+ 'The major milestones in the development of artificial intelligence from '
+ '195',
+ 'Compare and contrast artificial intelligence with human intelligence in '
+ 'terms of processing information. The',
+ 'Explain the difference between supervised and unsupervised learning.'
+ '\nExplain',
+ 'Write a short story about a robot that dreams for the first time. The',
+ 'Analyze the impact of the COVID-19 pandemic on global economic',
+ 'The Mona Lisa is a painting by Leonardo da Vinci, and it',
+ 'The early bird catches the worm.\nThe early bird catches the'
+]
+
+
+@pytest.mark.skipif(aqlm_not_supported,
+ reason="AQLM is not supported on this GPU type.")
+@pytest.mark.parametrize("model", ["ISTA-DASLab/Llama-2-7b-AQLM-2Bit-1x16-hf"])
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [16])
+@pytest.mark.parametrize("num_logprobs", [1])
+def test_models(
+ vllm_runner,
+ example_prompts,
+ model: str,
+ dtype: str,
+ max_tokens: int,
+ num_logprobs: int,
+) -> None:
+
+ vllm_model = vllm_runner(model, dtype=dtype)
+ vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
+ max_tokens,
+ num_logprobs)
+
+ # loop through the prompts to compare against the ground truth generations
+ for prompt_idx in range(len(example_prompts)):
+ vllm_output_ids, vllm_output_str, vllm_logprobs = vllm_outputs[
+ prompt_idx]
+
+ print("Prompt: ", repr(example_prompts[prompt_idx]))
+ print("Reference output:", repr(ground_truth_generations[prompt_idx]))
+ print("Output output: ", repr(vllm_output_str))
+ assert vllm_output_str == ground_truth_generations[prompt_idx]
diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py
index a72d01d8956ed..5b4446fcbd9d2 100644
--- a/tests/models/test_compressed.py
+++ b/tests/models/test_compressed.py
@@ -26,7 +26,7 @@
@pytest.mark.parametrize("max_tokens", [32])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(
- vllm_runner_nm,
+ vllm_runner,
example_prompts,
model_format_pairs,
dtype: str,
@@ -35,20 +35,20 @@ def test_models(
) -> None:
model_name, sparsity = model_format_pairs
- sparse_model = vllm_runner_nm(model_name=model_name,
- sparsity=sparsity,
- dtype=dtype,
- max_model_len=MAX_MODEL_LEN)
+ sparse_model = vllm_runner(model_name=model_name,
+ sparsity=sparsity,
+ dtype=dtype,
+ max_model_len=MAX_MODEL_LEN)
sparse_outputs = sparse_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
del sparse_model
gc.collect()
- dense_model = vllm_runner_nm(model_name=model_name,
- sparsity=None,
- dtype=dtype,
- max_model_len=MAX_MODEL_LEN)
+ dense_model = vllm_runner(model_name=model_name,
+ sparsity=None,
+ dtype=dtype,
+ max_model_len=MAX_MODEL_LEN)
dense_outputs = dense_model.generate_greedy_logprobs(
example_prompts, max_tokens, num_logprobs)
diff --git a/tests/models/test_compressed_memory.py b/tests/models/test_compressed_memory.py
index 15e6824b05b3b..ab8a08f7a58dd 100644
--- a/tests/models/test_compressed_memory.py
+++ b/tests/models/test_compressed_memory.py
@@ -22,21 +22,17 @@
@pytest.mark.parametrize("model_format_extrablocks", MODEL_FORMAT_EXTRABLOCKS)
@pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [3])
def test_models(
- vllm_runner_nm,
- example_prompts,
+ vllm_runner,
model_format_extrablocks,
dtype: str,
- max_tokens: int,
- num_logprobs: int,
) -> None:
model_name, sparsity, num_extra_blocks = model_format_extrablocks
- dense_model = vllm_runner_nm(model_name=model_name,
- sparsity=None,
- dtype=dtype,
- max_model_len=1024)
+ dense_model = vllm_runner(model_name=model_name,
+ enforce_eager=True,
+ sparsity=None,
+ dtype=dtype,
+ max_model_len=1024)
dense_gpu_alloc = (
dense_model.model.llm_engine.scheduler.block_manager.gpu_allocator)
dense_num_kv_blocks = dense_gpu_alloc.num_blocks
@@ -45,8 +41,9 @@ def test_models(
torch.cuda.empty_cache()
gc.collect()
- sparse_model = vllm_runner_nm(
+ sparse_model = vllm_runner(
model_name=model_name,
+ enforce_eager=True,
sparsity=sparsity,
dtype=dtype,
max_model_len=1024,
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index eb225522447aa..3c2418fca972a 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -23,15 +23,14 @@
import torch
from compare_utils import check_logprobs_close
-from vllm.model_executor.layers.quantization import (
- _QUANTIZATION_CONFIG_REGISTRY)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
MAX_MODEL_LEN = 1024
capability = torch.cuda.get_device_capability()
capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (
- capability < _QUANTIZATION_CONFIG_REGISTRY["marlin"].get_min_capability())
+marlin_not_supported = (capability <
+ QUANTIZATION_METHODS["marlin"].get_min_capability())
@dataclass
diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py
index 9818a9db62f5a..8081c22442f95 100644
--- a/tests/models/test_models_logprobs.py
+++ b/tests/models/test_models_logprobs.py
@@ -18,14 +18,20 @@
"EleutherAI/gpt-j-6b",
"EleutherAI/pythia-1b",
"bigscience/bloom-1b1",
- # "mosaicml/mpt-7b", # vLLM upstream bug in mpt right now # noqa
+ "mosaicml/mpt-7b",
"microsoft/phi-2",
"stabilityai/stablelm-3b-4e1t",
- # "allenai/OLMo-1B", # dependencies are not installed right now # noqa
+ "allenai/OLMo-1B",
"bigcode/starcoder2-3b",
"Qwen/Qwen1.5-0.5B",
]
+SKIPPED_MODELS = [
+ "mosaicml/mpt-7b",
+ "allenai/OLMo-1B",
+ "bigcode/starcoder2-3b",
+]
+
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16", "half"])
@@ -40,6 +46,10 @@ def test_models(
max_tokens: int,
num_logprobs: int,
) -> None:
+ if model in SKIPPED_MODELS:
+ pytest.skip(reason="Low priority models not currently passing. "
+ "We need to re-enable these.")
+
hf_model = hf_runner_nm(model, dtype=dtype)
hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts,
max_tokens, num_logprobs)
diff --git a/tests/quantization/test_autogptq_marlin_configs.py b/tests/quantization/test_autogptq_marlin_configs.py
index cd64622e2226f..1310b4da218b5 100644
--- a/tests/quantization/test_autogptq_marlin_configs.py
+++ b/tests/quantization/test_autogptq_marlin_configs.py
@@ -36,8 +36,6 @@ def test_auto_gptq(model_quant_type: str, ) -> None:
model_path,
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
@@ -49,8 +47,6 @@ def test_auto_gptq(model_quant_type: str, ) -> None:
model_path,
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
new file mode 100644
index 0000000000000..fa10e60de10a7
--- /dev/null
+++ b/tests/quantization/test_fp8.py
@@ -0,0 +1,24 @@
+"""Tests whether FP8 computation is enabled correctly.
+
+Run `pytest tests/quantization/test_fp8.py --forked`.
+"""
+import pytest
+import torch
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.layers.quantization.fp8 import Fp8LinearMethod
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+@pytest.mark.skipif(
+ capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
+ reason="FP8 is not supported on this GPU type.")
+def test_load_fp16_model(vllm_runner) -> None:
+ llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+
+ model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+ fc1 = model.model.decoder.layers[0].fc1
+ assert isinstance(fc1.linear_method, Fp8LinearMethod)
+ assert fc1.weight.dtype == torch.float8_e4m3fn
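
Usage note (not part of the patch): the `capability` value above packs the CUDA compute capability into a single integer, e.g. a device reporting `(8, 9)` becomes `89`, which is then compared against the quantization method's `get_min_capability()`. The same `quantization="fp8"` flag exercised through `vllm_runner` is available on the public `LLM` entry point; a minimal sketch, assuming an FP8-capable GPU:

```
import torch
from vllm import LLM, SamplingParams

# Same arithmetic as the test: (major, minor) -> major * 10 + minor.
major, minor = torch.cuda.get_device_capability()
print("compute capability:", major * 10 + minor)

# Illustrative only: request FP8 weight quantization via the offline API.
llm = LLM(model="facebook/opt-125m", quantization="fp8")
out = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=8))
print(out[0].outputs[0].text)
```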
diff --git a/tests/samplers/test_rejection_sampler.py b/tests/samplers/test_rejection_sampler.py
index 9cfbfadfe98c5..55dd1a8f54cc6 100644
--- a/tests/samplers/test_rejection_sampler.py
+++ b/tests/samplers/test_rejection_sampler.py
@@ -96,12 +96,16 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
bonus_token_ids,
)
+ # Bonus tokens are currently disabled. Verify they're set to -1.
+ # See https://github.com/vllm-project/vllm/issues/4212
+ expected_bonus_token_ids = bonus_token_ids.clone() * 0 - 1
+
if which_tokens_accepted == "all_tokens_accepted":
# Expect all tokens to be equal to draft tokens.
assert torch.equal(output_token_ids[:, :-1], draft_token_ids)
# Expect all bonus tokens to be included.
- assert torch.equal(output_token_ids[:, -1:], bonus_token_ids)
+ assert torch.equal(output_token_ids[:, -1:], expected_bonus_token_ids)
elif which_tokens_accepted == "no_tokens_accepted":
# Expect first token to be equal to recovered tokens.
assert torch.equal(output_token_ids[:, 0], recovered_token_ids[:, 0])
@@ -111,7 +115,7 @@ def test_correct_output_format(which_tokens_accepted: str, seed: int,
torch.ones_like(output_token_ids[:, 1:]) * -1)
elif which_tokens_accepted == "some_tokens_accepted":
recovered_plus_bonus = torch.cat(
- (recovered_token_ids, bonus_token_ids), dim=-1)
+ (recovered_token_ids, expected_bonus_token_ids), dim=-1)
# Assert first rejected token is a recovered token or bonus token.
assert torch.equal(
recovered_plus_bonus[torch.arange(0, batch_size),
diff --git a/tests/samplers/test_sampler.py b/tests/samplers/test_sampler.py
index 7c73943f4df9f..f1670c539fd00 100644
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@@ -35,8 +35,12 @@ def _prepare_test(
dtype=input_tensor.dtype)
sampler = MockLogitsSampler(fake_logits)
# UPSTREAM SYNC: passing device required for multi-gpu tests
- model_runner = ModelRunner(None, None, None, DeviceConfig(device=device),
- None)
+ model_runner = ModelRunner(model_config=None,
+ parallel_config=None,
+ scheduler_config=None,
+ device_config=DeviceConfig(device=device),
+ load_config=None,
+ lora_config=None)
return input_tensor, fake_logits, sampler, model_runner
@@ -603,7 +607,12 @@ def test_sampler_top_k_top_p(seed: int, device: str):
device=input_tensor.device,
dtype=input_tensor.dtype)
sampler = MockLogitsSampler(fake_logits)
- model_runner = ModelRunner(None, None, None, None, None)
+ model_runner = ModelRunner(model_config=None,
+ parallel_config=None,
+ scheduler_config=None,
+ device_config=None,
+ load_config=None,
+ lora_config=None)
generation_model = GenerationMixin()
generation_config = GenerationConfig(top_k=top_k,
@@ -638,7 +647,8 @@ def test_sampler_top_k_top_p(seed: int, device: str):
def mock_sample(probs, *args, **kwargs):
nonlocal sample_probs
sample_probs = probs
- return [[prob.topk(1, dim=-1).indices.tolist(), [0]] for prob in probs]
+ return ([[prob.topk(1, dim=-1).indices.tolist(), [0]]
+ for prob in probs], None)
with patch("vllm.model_executor.layers.sampler._sample", mock_sample):
sampler(logits=fake_logits, sampling_metadata=sampling_metadata)
diff --git a/tests/spec_decode/e2e/__init__.py b/tests/spec_decode/e2e/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 1d99cb5d32219..59fb8311fc5b7 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -1,3 +1,5 @@
+from typing import List, Tuple
+
import pytest
from tests.conftest import cleanup
@@ -6,28 +8,34 @@
@pytest.fixture
-def baseline_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
- baseline_llm_kwargs, seed):
- return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+def baseline_llm_generator(request, common_llm_kwargs,
+ per_test_common_llm_kwargs, baseline_llm_kwargs,
+ seed):
+ return create_llm_generator("baseline", request, common_llm_kwargs,
+ per_test_common_llm_kwargs,
baseline_llm_kwargs, seed)
@pytest.fixture
-def test_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
+def test_llm_generator(request, common_llm_kwargs, per_test_common_llm_kwargs,
test_llm_kwargs, seed):
- return create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
- test_llm_kwargs, seed)
+ return create_llm_generator("test", request, common_llm_kwargs,
+ per_test_common_llm_kwargs, test_llm_kwargs,
+ seed)
-def create_llm_generator(common_llm_kwargs, per_test_common_llm_kwargs,
- distinct_llm_kwargs, seed):
+def create_llm_generator(baseline_or_test, request, common_llm_kwargs,
+ per_test_common_llm_kwargs, distinct_llm_kwargs,
+ seed):
kwargs = {
**common_llm_kwargs,
**per_test_common_llm_kwargs,
**distinct_llm_kwargs,
}
+ test_name = request.node.name
def generator_inner():
+ print(f'Creating {baseline_or_test=} LLM for {test_name=}. {kwargs=}')
llm = LLM(**kwargs)
set_random_seed(seed)
@@ -36,6 +44,23 @@ def generator_inner():
del llm
cleanup()
- for llm in generator_inner():
- yield llm
+ def generator_outer():
+ for llm in generator_inner():
+ yield llm
+ del llm
+
+ return generator_outer
+
+
+def get_output_from_llm_generator(
+ llm_generator, prompts,
+ sampling_params) -> Tuple[List[str], List[List[int]]]:
+ tokens = []
+ token_ids = []
+ for llm in llm_generator():
+ outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
+ token_ids = [output.outputs[0].token_ids for output in outputs]
+ tokens = [output.outputs[0].text for output in outputs]
del llm
+
+ return tokens, token_ids
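
Note on the fixture change above (not part of the patch): `create_llm_generator` now returns a callable (`generator_outer`) instead of yielding an `LLM` directly, so the engine is only constructed when a test iterates it, and `get_output_from_llm_generator` performs that iteration and collects both decoded text and token ids. A minimal sketch of the consumption pattern, with a placeholder prompt:

```
from vllm import SamplingParams

from .conftest import get_output_from_llm_generator  # helper defined above


def example_usage(test_llm_generator):
    """Sketch: how a test drives the generator fixtures defined above."""
    sampling_params = SamplingParams(max_tokens=32, temperature=0.0)
    tokens, token_ids = get_output_from_llm_generator(
        test_llm_generator, ["Hello, my name is"], sampling_params)
    # One generation (decoded text and token ids) per prompt.
    assert len(tokens) == len(token_ids) == 1
```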
diff --git a/tests/spec_decode/e2e/test_compatibility.py b/tests/spec_decode/e2e/test_compatibility.py
new file mode 100644
index 0000000000000..fde950c14382c
--- /dev/null
+++ b/tests/spec_decode/e2e/test_compatibility.py
@@ -0,0 +1,169 @@
+import pytest
+
+from vllm import SamplingParams
+
+from .conftest import get_output_from_llm_generator
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "JackFram/llama-68m",
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ {
+ # Expect failure as spec decode not supported by
+ # Ray backend.
+ "worker_use_ray": True,
+ },
+ ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_ray(test_llm_generator):
+ """Verify that speculative decoding with Ray fails.
+ """
+ output_len = 128
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ ]
+
+ sampling_params = SamplingParams(
+ max_tokens=output_len,
+ ignore_eos=True,
+ temperature=temperature,
+ )
+
+ with pytest.raises(AssertionError,
+ match="Speculative decoding not yet supported for "):
+ get_output_from_llm_generator(test_llm_generator, prompts,
+ sampling_params)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "JackFram/llama-68m",
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+ {
+ "enable_chunked_prefill": True,
+ },
+])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_chunked_prefill(test_llm_generator):
+ """Verify that speculative decoding with chunked prefill fails.
+ """
+ output_len = 128
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ ]
+
+ sampling_params = SamplingParams(
+ max_tokens=output_len,
+ ignore_eos=True,
+ temperature=temperature,
+ )
+
+ with pytest.raises(ValueError,
+ match="Speculative decoding and chunked prefill"):
+ get_output_from_llm_generator(test_llm_generator, prompts,
+ sampling_params)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "meta-llama/Llama-2-7b-chat-hf",
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ {
+ # Speculative max model len > overridden max model len should raise.
+ "max_model_len": 128,
+ "speculative_max_model_len": 129,
+ },
+ {
+ # Speculative max model len > draft max model len should raise.
+ # https://huggingface.co/JackFram/llama-68m/blob/3b606af5198a0b26762d589a3ee3d26ee6fa6c85/config.json#L12
+ "speculative_max_model_len": 2048 + 1,
+ },
+ {
+ # Speculative max model len > target max model len should raise.
+ # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/f5db02db724555f92da89c216ac04704f23d4590/config.json#L12
+ "speculative_max_model_len": 4096 + 1,
+ },
+ ])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_spec_max_model_len(test_llm_generator):
+ """Verify that speculative decoding validates speculative_max_model_len.
+ """
+ output_len = 128
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ ]
+
+ sampling_params = SamplingParams(
+ max_tokens=output_len,
+ ignore_eos=True,
+ temperature=temperature,
+ )
+
+ with pytest.raises(ValueError, match="cannot be larger than"):
+ get_output_from_llm_generator(test_llm_generator, prompts,
+ sampling_params)
+
+
+@pytest.mark.parametrize("common_llm_kwargs", [{
+ "model": "JackFram/llama-68m",
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+}])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_xfail_block_manager_v1(test_llm_generator):
+ """Verify that speculative decoding with block manager v1 fails.
+ """
+ output_len = 128
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ ]
+
+ sampling_params = SamplingParams(
+ max_tokens=output_len,
+ ignore_eos=True,
+ temperature=temperature,
+ )
+
+ with pytest.raises(ValueError,
+ match="Speculative decoding requires usage of the V2"):
+ get_output_from_llm_generator(test_llm_generator, prompts,
+ sampling_params)
diff --git a/tests/spec_decode/e2e/test_correctness.py b/tests/spec_decode/e2e/test_correctness.py
index b5a6fcb7900a3..0536cc4ecde76 100644
--- a/tests/spec_decode/e2e/test_correctness.py
+++ b/tests/spec_decode/e2e/test_correctness.py
@@ -1,24 +1,78 @@
+"""The tests in this file verify end-to-end speculative decoding correctness.
+
+This docstring details important information on the testing methodology.
+
+Most of the tests rely on "greedy equality", where we expect the output of
+speculative decoding on a sequence to exactly match the output of normal non-
+speculative decoding.
+
+Since speculative decoding with rejection sampling guarantees that the output
+distribution matches the target model's output distribution (up to hardware
+numerics, see https://arxiv.org/pdf/2302.01318.pdf), we can expect greedy
+equality. This gives us good coverage of temp=0.
+
+For temp>0, we rely on unit tests on the rejection sampler to verify that the
+output distribution is the same with spec decode vs. no spec decode (this would
+be prohibitively expensive to run with a real model).
+
+NOTE: Speculative decoding's distribution equality requires that the measured
+distributions of the target model and proposal model be deterministic given the
+same input. vLLM largely guarantees this.
+
+@cadedaniel has seen cases where the output probabilities of a draft/target
+model change slightly with certain batch sizes or prompts, even with Torch
+determinism flags set. It is unclear if this is a bug in vLLM, due to non-
+determinism in on-device batched operations, a bug in vLLM's spec decode
+implementation, or the "hardware numerics" limitations. Either way, rejection
+sampling ensures the output distribution matches the target model's, but it
+breaks greedy-equality tests for those batch sizes/prompts.
+"""
+
+from itertools import cycle
+
import pytest
+from transformers import AutoTokenizer
from vllm import SamplingParams
+from .conftest import get_output_from_llm_generator
+
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
# Use a small model for a fast test.
- "model": "facebook/opt-125m",
- "speculative_model": "facebook/opt-125m",
- "num_speculative_tokens": 5,
+        # Note this is repeated in the test body to initialize a tokenizer.
+ "model": "JackFram/llama-68m",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
# Required for spec decode.
"use_v2_block_manager": True
}])
-@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+ {
+ # Verify the detokenizer assertions in the test work when spec
+ # decode is disabled.
+ },
+ ])
@pytest.mark.parametrize("test_llm_kwargs", [{}])
+@pytest.mark.parametrize("batch_size", [1, 32])
@pytest.mark.parametrize("seed", [1])
-def test_spec_decode_config(test_llm_generator):
- output_len = 1024
+def test_spec_decode_e2e_with_detokenization(test_llm_generator,
+ batch_size: int):
+ """Run generation with speculative decoding on a batch. Verify the engine
+ generates the correct number of tokens (via ignore_eos=True), and that the
+ detokenization matches HF transformers.
+ """
+ output_len = 32
temperature = 0.0
prompts = [
@@ -28,23 +82,516 @@ def test_spec_decode_config(test_llm_generator):
"The future of AI is",
]
+ prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
sampling_params = SamplingParams(
max_tokens=output_len,
ignore_eos=True,
temperature=temperature,
)
- with pytest.raises(
- AssertionError,
- match="Speculative decoding not yet supported for GPU backend"):
- get_token_ids_from_llm_generator(test_llm_generator, prompts,
- sampling_params)
+ batch_tokens, batch_token_ids = get_output_from_llm_generator(
+ test_llm_generator, prompts, sampling_params)
+
+ # Expect a generation for each prompt in the batch.
+ assert len(batch_token_ids) == len(prompts)
+
+ # Expect each generation to have expected number of tokens (note ignore_eos
+ # is True).
+ assert [len(token_ids)
+ for token_ids in batch_token_ids] == ([output_len] * batch_size)
+
+ # Expect detokenized string to match.
+ tok = AutoTokenizer.from_pretrained("JackFram/llama-68m")
+ for actual_tokens, actual_token_ids in zip(batch_tokens, batch_token_ids):
+ expected_tokens = tok.decode(actual_token_ids)
+ print(f"{actual_token_ids=}")
+ assert actual_tokens.strip() == expected_tokens.strip()
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True,
+
+ # Print spec metrics.
+ "disable_log_stats": False,
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ # Try two different tiny base models.
+ # Note that one is equal to the draft model, another isn't.
+ {
+ "model": "JackFram/llama-68m",
+ },
+ {
+ "model": "JackFram/llama-160m",
+ },
+ ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use long output len for the small model test.
+ 1536,
+ ])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality on a tiny model with batch size of one.
+
+ Since this test is cheaper than other e2e correctness tests, we generate
+ with a higher output_len.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True,
+
+ # Print spec metrics.
+ "disable_log_stats": False,
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ # Try two different tiny base models.
+ # Note that one is equal to the draft model, another isn't.
+ {
+ "model": "JackFram/llama-68m",
+ },
+ {
+ "model": "JackFram/llama-160m",
+ },
+ ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use small output len for fast test.
+ 256,
+ ])
+@pytest.mark.parametrize("batch_size", [64])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality on a tiny model and large batch size.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ # Try two different tiny base models.
+ # Note that one is equal to the draft model, another isn't.
+ {
+ "model": "JackFram/llama-68m",
+ },
+ {
+ "model": "JackFram/llama-160m",
+ },
+ ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize("max_output_len", [
+ 256,
+])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ max_output_len: int):
+ """Verify greedy equality on a tiny model, with a large batch size, and when
+ sampling respects the EOS token.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len,
+ force_output_len=False)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ # A "real" model (not tiny).
+ "model": "meta-llama/Llama-2-7b-chat-hf",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True,
+
+ # Print spec metrics.
+ "disable_log_stats": False,
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize("batch_size", [1])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use decently long output len for a high quality test.
+ 256,
+ ])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_real_model_bs1(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality on a "real" model and batch size of 1. This is
+ separate from large BS tests to make identifying the source of bugs easier.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ # A "real" model (not tiny).
+ "model": "meta-llama/Llama-2-7b-chat-hf",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True,
+
+ # Print spec metrics.
+ "disable_log_stats": False,
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use smaller output len for fast test.
+ 64,
+ ])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality with a "real" model on a nontrivial batch size.
+ This is the closest test to a real production workload.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "block_size": 8,
+        # 2 blocks for the small prompt, 256 // 8 blocks for generated tokens.
+ "num_gpu_blocks_override": 2 + 256 // 8,
+ "max_model_len": (2 + 256 // 8) * 8,
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [
+ {
+ "model": "JackFram/llama-160m",
+ },
+])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use small output len for fast test.
+ 256,
+ ])
+@pytest.mark.parametrize("batch_size", [4])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_with_preemption(
+ baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality, even when some sequences are preempted mid-
+ generation.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
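
Worked arithmetic for the preemption setup above (illustrative only, derived from the parametrized values):

```
block_size = 8
prompt_blocks = 2                      # room for the short prompts
generated_blocks = 256 // block_size   # 32 blocks for 256 generated tokens
num_gpu_blocks_override = prompt_blocks + generated_blocks  # 34 blocks total
max_model_len = num_gpu_blocks_override * block_size        # 272 tokens

# With batch_size=4 and output_len=256, four sequences compete for those 34
# shared KV-cache blocks, so the scheduler must preempt some of them mid-
# generation -- exactly the condition this test is meant to exercise.
```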
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "JackFram/llama-160m",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize(
+ "per_test_common_llm_kwargs",
+ [
+ # As of this writing, vLLM only compiles with these 3 block sizes by
+ # default.
+ {
+ "block_size": 8,
+ },
+ {
+ "block_size": 16,
+ },
+ {
+ "block_size": 32,
+ },
+ ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+ },
+])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use smaller output len for fast test.
+ 32,
+ ])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_different_block_size(baseline_llm_generator,
+ test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify greedy equality over different block sizes.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "JackFram/llama-160m",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+ "test_llm_kwargs",
+ [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": 5,
+
+ # Artificially limit the draft model max model len; this forces vLLM
+            # to skip speculation once the sequences grow beyond 32 - k tokens.
+ "speculative_max_model_len": 32,
+ },
+ ])
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # This must be a good bit larger than speculative_max_model_len so that
+ # we can test the case where all seqs are skipped, but still small to
+        # ensure a fast test.
+ 64,
+ ])
+@pytest.mark.parametrize("seed", [1])
+def test_skip_speculation(baseline_llm_generator, test_llm_generator,
+ batch_size: int, output_len: int):
+ """Verify greedy equality when some (or all) sequences skip speculation.
+ We do this by setting the max model len of the draft model to an
+ artificially low value, such that when the sequences grow beyond it, they
+ are skipped in speculative decoding.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+@pytest.mark.parametrize(
+ "common_llm_kwargs",
+ [{
+ "model": "JackFram/llama-68m",
+
+ # Skip cuda graph recording for fast test.
+ "enforce_eager": True,
+
+ # Required for spec decode.
+ "use_v2_block_manager": True
+ }])
+@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize(
+ "test_llm_kwargs",
+ [
+ {
+ "speculative_model": "JackFram/llama-68m",
+ "num_speculative_tokens": k,
+ }
+ # Try a range of common k, as well as large speculation.
+ for k in [1, 2, 3, 4, 5, 6, 7, 8, 9, 63]
+ ])
+@pytest.mark.parametrize("batch_size", [2])
+@pytest.mark.parametrize(
+ "output_len",
+ [
+ # Use smaller output len for fast test.
+ 32,
+ ])
+@pytest.mark.parametrize("seed", [1])
+def test_many_k(baseline_llm_generator, test_llm_generator, batch_size: int,
+ output_len: int):
+ """Verify that speculative decoding produces exact equality to without spec
+ decode with many different values of k.
+ """
+ run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len=output_len,
+ force_output_len=True)
+
+
+def run_greedy_equality_correctness_test(baseline_llm_generator,
+ test_llm_generator,
+ batch_size,
+ max_output_len,
+ force_output_len: bool,
+ print_tokens: bool = False):
+ """Helper method that compares the outputs of both the baseline LLM and
+    the test LLM. It asserts greedy equality, i.e. that the outputs are exactly
+ the same when temperature is zero.
+ """
+ temperature = 0.0
+
+ prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+ "San Francisco is know for its",
+ "Facebook was created in 2004 by",
+ "Curious George is a",
+ "Python 3.11 brings improvements to its",
+ ]
+
+ prompts = [prompt for prompt, _ in zip(cycle(prompts), range(batch_size))]
+
+    # If the test requires that we generate max_output_len tokens, then set the
+    # sampling params to ignore the EOS token.
+ ignore_eos = force_output_len
+
+ sampling_params = SamplingParams(
+ max_tokens=max_output_len,
+ ignore_eos=ignore_eos,
+ temperature=temperature,
+ )
+
+ spec_batch_tokens, spec_batch_token_ids = get_output_from_llm_generator(
+ test_llm_generator, prompts, sampling_params)
+ (baseline_batch_tokens,
+ baseline_batch_token_ids) = get_output_from_llm_generator(
+ baseline_llm_generator, prompts, sampling_params)
-def get_token_ids_from_llm_generator(llm_generator, prompts, sampling_params):
- for llm in llm_generator:
- outputs = llm.generate(prompts, sampling_params, use_tqdm=True)
- token_ids = [output.outputs[0].token_ids for output in outputs]
- del llm
+ assert len(baseline_batch_token_ids) == len(prompts)
+ assert len(spec_batch_token_ids) == len(prompts)
- return token_ids
+ for i, (baseline_token_ids, baseline_tokens, spec_token_ids,
+ spec_tokens) in enumerate(
+ zip(baseline_batch_token_ids, baseline_batch_tokens,
+ spec_batch_token_ids, spec_batch_tokens)):
+ if print_tokens:
+ print(f'{i=} {baseline_tokens=}')
+ print(f'{i=} {spec_tokens=}')
+ print(f'{i=} {baseline_token_ids=}')
+ print(f'{i=} {spec_token_ids=}')
+ assert baseline_token_ids == spec_token_ids
diff --git a/tests/spec_decode/test_metrics.py b/tests/spec_decode/test_metrics.py
index 36e91672069dc..312878804b86e 100644
--- a/tests/spec_decode/test_metrics.py
+++ b/tests/spec_decode/test_metrics.py
@@ -119,7 +119,7 @@ def test_initial_metrics_has_correct_values(has_data: bool):
num_draft_tokens = 0
k = 5
- num_possible_tokens = AsyncMetricsCollector.get_max_num_accepted_tokens(
+ max_num_emitted_tokens = AsyncMetricsCollector.get_max_num_emitted_tokens(
num_draft_tokens, k)
rej_sampler = MagicMock()
@@ -153,7 +153,7 @@ def test_initial_metrics_has_correct_values(has_data: bool):
assert (metrics.draft_acceptance_rate == num_accepted_tokens /
num_draft_tokens)
assert (metrics.system_efficiency == num_emitted_tokens /
- num_possible_tokens)
+ max_num_emitted_tokens)
else:
assert math.isnan(metrics.draft_acceptance_rate)
assert math.isnan(metrics.system_efficiency)
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index f4d44108b47c2..e7aaa1ff4eff8 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -125,7 +125,7 @@ def test_same_output_for_single_step():
zero_kv_cache(worker.cache_engine)
set_random_seed(seed)
expected_output = worker.execute_model(
- **single_step_execute_model_data.to_dict(), )
+ **single_step_execute_model_data.to_dict(), )[0]
actual_token_ids = [
output.samples[0].output_token for output in actual_output
@@ -219,7 +219,7 @@ def test_same_output_for_multi_step():
continuations=continuations,
final_seq_lens=final_seq_lens))
- single_step_output.append(
+ single_step_output.extend(
worker.execute_model(**execute_model_data.to_dict(), ))
# Append output tokens to new sequence data.
@@ -344,8 +344,8 @@ def test_draft_proposals_no_speculations():
assert torch.is_tensor(proposals.proposal_token_ids)
assert torch.is_tensor(proposals.proposal_probs)
- assert proposals.proposal_token_ids.shape == torch.Size([0, k])
- assert proposals.proposal_probs.shape[:-1] == torch.Size([0, k])
+ assert proposals.proposal_token_ids.shape == torch.Size([batch_size, k])
+ assert proposals.proposal_probs.shape[:-1] == torch.Size([batch_size, k])
assert proposals.proposal_lens.shape == torch.Size([batch_size])
assert proposals.proposal_lens.tolist() == [0 for _ in range(batch_size)]
diff --git a/tests/spec_decode/test_spec_decode_worker.py b/tests/spec_decode/test_spec_decode_worker.py
index 47aff8f575413..d24d726c9c0cf 100644
--- a/tests/spec_decode/test_spec_decode_worker.py
+++ b/tests/spec_decode/test_spec_decode_worker.py
@@ -1,4 +1,5 @@
import random
+from types import SimpleNamespace
from unittest.mock import MagicMock
import pytest
@@ -6,6 +7,7 @@
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.model_executor.utils import set_random_seed
+from vllm.sequence import SamplerOutput
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.metrics import (AsyncMetricsCollector,
SpecDecodeWorkerMetrics)
@@ -37,7 +39,8 @@ def test_correctly_calls_draft_model(k: int, batch_size: int):
execute_model_data, _, _ = create_batch(batch_size, k)
with pytest.raises(ValueError, match=exception_secret):
- worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+ worker.execute_model(**execute_model_data.to_dict(),
+ num_lookahead_slots=k)
call_args_list = draft_worker.get_spec_proposals.call_args_list
assert len(call_args_list) == 1
@@ -60,8 +63,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
"""Verify SpecDecodeWorker calls the target model with correct
inputs. Everything else is mocked out.
"""
- draft_worker = mock_worker(cls=MultiStepWorker)
- target_worker = mock_worker()
+ draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
+ target_worker = mock_worker(use_spec=False)
rejection_sampler = MagicMock(spec=RejectionSampler)
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -102,7 +105,8 @@ def test_correctly_calls_target_model(k: int, batch_size: int):
target_worker.execute_model.side_effect = ValueError(exception_secret)
with pytest.raises(ValueError, match=exception_secret):
- worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+ worker.execute_model(**execute_model_data.to_dict(),
+ num_lookahead_slots=k)
seen_contexts = []
@@ -141,8 +145,10 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
"""
vocab_size = 32_000
- draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
- target_worker = mock_worker(vocab_size=vocab_size)
+ draft_worker = mock_worker(cls=MultiStepWorker,
+ vocab_size=vocab_size,
+ use_spec=False)
+ target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
rejection_sampler = MagicMock(spec=RejectionSampler)
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -189,26 +195,26 @@ def test_correctly_calls_rejection_sampler(k: int, batch_size: int):
target_output = create_sampler_output_list(target_token_ids,
target_token_probs)
- target_worker.execute_model.return_value = target_output[0]
+ target_worker.execute_model.return_value = [target_output[0]]
exception_secret = 'artifical stop'
rejection_sampler.side_effect = ValueError(exception_secret)
with pytest.raises(ValueError, match=exception_secret):
- worker.execute_model(**execute_model_data.to_dict(), num_spec_tokens=k)
+ worker.execute_model(**execute_model_data.to_dict(),
+ num_lookahead_slots=k)
assert len(rejection_sampler.call_args_list) == 1
- args, _ = rejection_sampler.call_args_list[0]
- (actual_proposal_scores, actual_bonus_token_ids, actual_proposal_probs,
- actual_proposal_token_ids) = args
+ _, kwargs = rejection_sampler.call_args_list[0]
+ actual = SimpleNamespace(**kwargs)
- assert torch.equal(actual_bonus_token_ids,
+ assert torch.equal(actual.bonus_token_ids,
target_token_ids.reshape(batch_size, k + 1)[:, -1:])
assert torch.equal(
- actual_proposal_scores,
+ actual.target_probs,
target_token_probs.reshape(batch_size, k + 1, -1)[:, :-1])
- assert torch.equal(actual_proposal_token_ids, proposal_token_ids)
- assert torch.equal(actual_proposal_probs, proposal_probs)
+ assert torch.equal(actual.draft_token_ids, proposal_token_ids)
+ assert torch.equal(actual.draft_probs, proposal_probs)
@pytest.mark.parametrize('k', [1, 2, 6])
@@ -220,8 +226,10 @@ def test_correctly_formats_output(k: int, batch_size: int):
"""
vocab_size = 32_000
- draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
- target_worker = mock_worker(vocab_size=vocab_size)
+ draft_worker = mock_worker(cls=MultiStepWorker,
+ vocab_size=vocab_size,
+ use_spec=False)
+ target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
rejection_sampler = MagicMock(spec=RejectionSampler)
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -268,7 +276,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
target_output = create_sampler_output_list(target_token_ids,
target_token_probs)
- target_worker.execute_model.return_value = target_output[0]
+ target_worker.execute_model.return_value = [target_output[0]]
rejection_sampler_output = torch.randint(low=0,
high=vocab_size,
@@ -283,7 +291,7 @@ def test_correctly_formats_output(k: int, batch_size: int):
rejection_sampler.return_value = rejection_sampler_output
output = worker.execute_model(**execute_model_data.to_dict(),
- num_spec_tokens=k)
+ num_lookahead_slots=k)
expected_output = create_sampler_output_list(
rejection_sampler_output.transpose(0, 1), [None for _ in range(k + 1)])
@@ -332,8 +340,10 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
"""
vocab_size = 32_000
- draft_worker = mock_worker(cls=MultiStepWorker, vocab_size=vocab_size)
- target_worker = mock_worker(vocab_size=vocab_size)
+ draft_worker = mock_worker(cls=MultiStepWorker,
+ vocab_size=vocab_size,
+ use_spec=False)
+ target_worker = mock_worker(vocab_size=vocab_size, use_spec=False)
rejection_sampler = MagicMock(spec=RejectionSampler)
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
@@ -380,7 +390,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
target_output = create_sampler_output_list(target_token_ids,
target_token_probs)
- target_worker.execute_model.return_value = target_output[0]
+ target_worker.execute_model.return_value = [target_output[0]]
rejection_sampler_output = torch.randint(low=0,
high=vocab_size,
@@ -400,7 +410,7 @@ def test_collects_metrics(k: int, batch_size: int, returns_metrics: bool):
mock_rejsample_metrics)
output = worker.execute_model(**execute_model_data.to_dict(),
- num_spec_tokens=k)
+ num_lookahead_slots=k)
assert output[0].spec_decode_worker_metrics == mock_rejsample_metrics
call_args_list = (
@@ -423,6 +433,8 @@ def test_k_equals_zero(k: int, batch_size: int):
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+ target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+
draft_worker.device = 'cuda'
target_worker.device = 'cuda'
@@ -435,7 +447,7 @@ def test_k_equals_zero(k: int, batch_size: int):
batch_size, k, prev_output_token_len=0)
out = worker.execute_model(**execute_model_data.to_dict(),
- num_spec_tokens=k)
+ num_lookahead_slots=k)
assert len(out) == 1, f"expected only one token output when {k=}"
assert out[0].probs is None, "expect gpu tensor references to be None"
@@ -443,7 +455,7 @@ def test_k_equals_zero(k: int, batch_size: int):
0].sampled_tokens is None, "expect gpu tensor references to be None"
draft_worker.execute_model.assert_called_once_with(
- **execute_model_data.to_dict(), return_python_output=False)
+ **execute_model_data.to_dict())
target_worker.execute_model.assert_called_once_with(
**execute_model_data.to_dict())
@@ -462,6 +474,8 @@ def test_empty_input_batch(k: int, batch_size: int):
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
+ target_worker.execute_model.return_value = [MagicMock(spec=SamplerOutput)]
+
draft_worker.device = 'cuda'
target_worker.device = 'cuda'
@@ -474,7 +488,7 @@ def test_empty_input_batch(k: int, batch_size: int):
batch_size, k, prev_output_token_len=0)
out = worker.execute_model(**execute_model_data.to_dict(),
- num_spec_tokens=k)
+ num_lookahead_slots=k)
assert len(out) == 1, f"expected only one token output when {k=}"
assert out[0].probs is None, "expect gpu tensor references to be None"
@@ -482,7 +496,7 @@ def test_empty_input_batch(k: int, batch_size: int):
0].sampled_tokens is None, "expect gpu tensor references to be None"
draft_worker.execute_model.assert_called_once_with(
- **execute_model_data.to_dict(), return_python_output=False)
+ **execute_model_data.to_dict())
target_worker.execute_model.assert_called_once_with(
**execute_model_data.to_dict())
@@ -492,8 +506,8 @@ def test_init_device():
"""Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
well as other GPU initialization.
"""
- draft_worker = mock_worker(cls=MultiStepWorker)
- target_worker = mock_worker()
+ draft_worker = mock_worker(cls=MultiStepWorker, use_spec=False)
+ target_worker = mock_worker(use_spec=False)
rejection_sampler = MagicMock(spec=RejectionSampler)
rejection_sampler.token_id_dtype = torch.int64
metrics_collector = MagicMock(spec=AsyncMetricsCollector)
diff --git a/tests/spec_decode/utils.py b/tests/spec_decode/utils.py
index 4637826f254d6..4f8295d25cf41 100644
--- a/tests/spec_decode/utils.py
+++ b/tests/spec_decode/utils.py
@@ -63,11 +63,14 @@ def create_execute_model_data(
def mock_worker(cls=None,
vocab_size: int = 30_000,
max_model_len: int = 2048,
- rank: int = 0) -> MagicMock:
+ rank: int = 0,
+ use_spec: bool = True) -> MagicMock:
if cls is None:
cls = Worker
- worker = MagicMock(spec=cls)
+ spec = cls if use_spec else None
+
+ worker = MagicMock(spec=spec)
worker.vocab_size = vocab_size
worker.max_model_len = max_model_len
worker.rank = rank
@@ -118,6 +121,7 @@ def create_worker(cls: type,
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
cache_config=engine_config.cache_config,
+ load_config=engine_config.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
@@ -211,7 +215,7 @@ def create_sampler_output_list(
SequenceOutput(
output_token=token_id,
parent_seq_id=seq_ids[seq_index],
- logprobs={token_id: 0},
+ logprobs={token_id: Logprob(0)},
)
],
prompt_logprobs=None,
diff --git a/tests/tensorizer_loader/__init__.py b/tests/tensorizer_loader/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py b/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
new file mode 100644
index 0000000000000..e4b15fd57add4
--- /dev/null
+++ b/tests/tensorizer_loader/tensorize_vllm_model_for_testing.py
@@ -0,0 +1,245 @@
+import argparse
+import dataclasses
+import os
+import time
+import uuid
+from functools import partial
+from typing import Type
+
+import torch
+import torch.nn as nn
+from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
+ TensorSerializer, stream_io)
+from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.distributed import initialize_model_parallel
+from vllm.engine.arg_utils import EngineArgs
+from vllm.engine.llm_engine import LLMEngine
+from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
+from vllm.model_executor.models import ModelRegistry
+
+# yapf conflicts with isort for this docstring
+# yapf: disable
+"""
+This script can be used to serialize and
+deserialize vLLM models. These models can be loaded using tensorizer directly
+to the GPU extremely quickly. Tensor encryption and decryption is also
+supported, although libsodium must be installed to use it. Install
+vllm with tensorizer support using `pip install vllm[tensorizer]`.
+
+To serialize a model, you can run something like this:
+
+python tensorize_vllm_model_for_testing.py \
+ --model EleutherAI/gpt-j-6B \
+ --dtype float16 \
+ serialize \
+ --serialized-directory s3://my-bucket/ \
+ --suffix vllm
+
+Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
+and saves it to your S3 bucket. A local directory can also be used.
+
+You can also encrypt the model weights with a randomly-generated key by
+providing a `--keyfile` argument.
+
+To deserialize a model, you can run something like this:
+
+python tensorize_vllm_model_for_testing.py \
+ --model EleutherAI/gpt-j-6B \
+ --dtype float16 \
+ deserialize \
+ --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+
+Which downloads the model tensors from your S3 bucket and deserializes them.
+To provide S3 credentials, you can provide `--s3-access-key-id` and
+`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this script,
+the OpenAI entrypoint, as arguments for LLM(), or as environment variables
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
+
+
+You can also provide a `--keyfile` argument to decrypt the model weights if
+they were serialized with encryption.
+
+For more information on the available arguments, run
+`python tensorize_vllm_model_for_testing.py --help`.
+"""
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="An example script that can be used to serialize and "
+ "deserialize vLLM models. These models "
+ "can be loaded using tensorizer directly to the GPU "
+ "extremely quickly. Tensor encryption and decryption is "
+ "also supported, although libsodium must be installed to "
+ "use it.")
+ parser = TensorizerArgs.add_cli_args(EngineArgs.add_cli_args(parser))
+ subparsers = parser.add_subparsers(dest='command')
+
+ serialize_parser = subparsers.add_parser(
+ 'serialize', help="Serialize a model to `--serialized-directory`")
+
+ serialize_parser.add_argument(
+ "--suffix",
+ type=str,
+ required=False,
+ help=(
+ "The suffix to append to the serialized model directory, which is "
+ "used to construct the location of the serialized model tensors, "
+ "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
+ "`--suffix` is `v1`, the serialized model tensors will be "
+ "saved to "
+ "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
+ "If none is provided, a random UUID will be used."))
+ serialize_parser.add_argument(
+ "--serialized-directory",
+ type=str,
+ required=True)
+
+ serialize_parser.add_argument(
+ "--keyfile",
+ type=str,
+ required=False,
+ help=("Encrypt the model weights with a randomly-generated binary key,"
+ " and save the key at this path"))
+
+ deserialize_parser = subparsers.add_parser(
+ 'deserialize',
+ help=("Deserialize a model from `--path-to-tensors`"
+ " to verify it can be loaded and used."))
+
+ deserialize_parser.add_argument(
+ "--path-to-tensors",
+ type=str,
+ required=True,
+ help="The local path or S3 URI to the model tensors to deserialize. ")
+
+ deserialize_parser.add_argument(
+ "--keyfile",
+ type=str,
+ required=False,
+ help=("Path to a binary key to use to decrypt the model weights,"
+ " if the model was serialized with encryption"))
+
+ return parser.parse_args()
+
+
+def make_model_contiguous(model):
+ # Ensure tensors are saved in memory contiguously
+ for param in model.parameters():
+ param.data = param.data.contiguous()
+
+
+def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+ architectures = getattr(config, "architectures", [])
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch)
+ if model_cls is not None:
+ return model_cls
+ raise ValueError(
+ f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def serialize():
+ eng_args_dict = {f.name: getattr(args, f.name) for f in
+ dataclasses.fields(EngineArgs)}
+ engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+ engine = LLMEngine.from_engine_args(engine_args)
+
+ model = (engine.model_executor.driver_worker.
+ model_runner.model)
+
+ encryption_params = EncryptionParams.random() if keyfile else None
+ if keyfile:
+ with _write_stream(keyfile) as stream:
+ stream.write(encryption_params.key)
+
+ with _write_stream(model_path) as stream:
+ serializer = TensorSerializer(stream, encryption=encryption_params)
+ serializer.write_module(model)
+ serializer.close()
+
+ print("Serialization complete. Model tensors saved to", model_path)
+ if keyfile:
+ print("Key saved to", keyfile)
+
+
+def deserialize():
+ config = AutoConfig.from_pretrained(model_ref)
+
+ with no_init_or_tensor():
+ model_class = _get_vllm_model_architecture(config)
+ model = model_class(config)
+
+ before_mem = get_mem_usage()
+ start = time.time()
+
+ if keyfile:
+ with _read_stream(keyfile) as stream:
+ key = stream.read()
+ decryption_params = DecryptionParams.from_key(key)
+ tensorizer_args.deserializer_params['encryption'] = \
+ decryption_params
+
+ with (_read_stream(model_path)) as stream, TensorDeserializer(
+ stream, **tensorizer_args.deserializer_params) as deserializer:
+ deserializer.load_into_module(model)
+ end = time.time()
+
+ # Brag about how fast we are.
+ total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+ duration = end - start
+ per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+ after_mem = get_mem_usage()
+ print(
+ f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+ )
+ print(f"Memory usage before: {before_mem}")
+ print(f"Memory usage after: {after_mem}")
+
+ return model
+
+
+args = parse_args()
+
+s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
+ or None)
+s3_secret_access_key = (args.s3_secret_access_key
+ or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+
+s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+
+_read_stream, _write_stream = (partial(
+ stream_io.open_stream,
+ mode=mode,
+ s3_access_key_id=s3_access_key_id,
+ s3_secret_access_key=s3_secret_access_key,
+ s3_endpoint=s3_endpoint,
+) for mode in ("rb", "wb+"))
+
+model_ref = args.model
+
+model_name = model_ref.split("/")[1]
+
+os.environ["MASTER_ADDR"] = "127.0.0.1"
+os.environ["MASTER_PORT"] = "8080"
+
+torch.distributed.init_process_group(world_size=1, rank=0)
+initialize_model_parallel()
+
+keyfile = args.keyfile if args.keyfile else None
+
+if args.command == "serialize":
+ input_dir = args.serialized_directory.rstrip('/')
+ suffix = args.suffix if args.suffix else uuid.uuid4().hex
+ base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
+ model_path = f"{base_path}/model.tensors"
+ serialize()
+elif args.command == "deserialize":
+ tensorizer_args = TensorizerArgs.from_cli_args(args)
+ model_path = args.path_to_tensors
+ deserialize()
+else:
+ raise ValueError("Either serialize or deserialize must be specified.")
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
new file mode 100644
index 0000000000000..a97cc0b3706b4
--- /dev/null
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -0,0 +1,327 @@
+import gc
+import json
+import os
+import subprocess
+from unittest.mock import MagicMock, patch
+
+import openai
+import pytest
+import ray
+import torch
+
+from tests.entrypoints.test_openai_server import ServerRunner
+from vllm import SamplingParams
+from vllm.model_executor.model_loader.tensorizer import (
+ EncryptionParams, TensorizerConfig, TensorSerializer,
+ is_vllm_serialized_tensorizer, load_with_tensorizer, open_stream)
+
+prompts = [
+ "Hello, my name is",
+ "The president of the United States is",
+ "The capital of France is",
+ "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, seed=0)
+
+model_ref = "facebook/opt-125m"
+tensorize_model_for_testing_script = os.path.join(
+ os.path.dirname(__file__), "tensorize_vllm_model_for_testing.py")
+
+
+def is_curl_installed():
+ try:
+ subprocess.check_call(['curl', '--version'])
+ return True
+ except (subprocess.CalledProcessError, FileNotFoundError):
+ return False
+
+
+@pytest.fixture(autouse=True)
+def tensorizer_config():
+ config = TensorizerConfig(tensorizer_uri="vllm", vllm_tensorized=True)
+ return config
+
+
+@patch('vllm.model_executor.model_loader.tensorizer.TensorizerAgent')
+def test_load_with_tensorizer(mock_agent, tensorizer_config):
+ mock_linear_method = MagicMock()
+ mock_agent_instance = mock_agent.return_value
+ mock_agent_instance.deserialize.return_value = MagicMock()
+
+ result = load_with_tensorizer(tensorizer_config,
+ linear_method=mock_linear_method)
+
+ mock_agent.assert_called_once_with(tensorizer_config,
+ linear_method=mock_linear_method)
+ mock_agent_instance.deserialize.assert_called_once()
+ assert result == mock_agent_instance.deserialize.return_value
+
+
+def test_is_vllm_model_with_vllm_in_uri(tensorizer_config):
+ tensorizer_config.vllm_tensorized = True
+
+ result = is_vllm_serialized_tensorizer(tensorizer_config)
+
+ assert result is True
+
+
+def test_is_vllm_model_without_vllm_in_uri(tensorizer_config):
+ tensorizer_config.vllm_tensorized = False
+
+ result = is_vllm_serialized_tensorizer(tensorizer_config)
+
+ assert result is False
+
+
+def test_deserialized_vllm_model_has_same_outputs(vllm_runner, tmp_path):
+ vllm_model = vllm_runner(model_ref)
+ model_path = tmp_path / (model_ref + ".tensors")
+ outputs = vllm_model.generate(prompts, sampling_params)
+ model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+ model_runner.model)
+ with open_stream(model_path, "wb+") as stream:
+ serializer = TensorSerializer(stream)
+ serializer.write_module(model)
+ del vllm_model, model
+ gc.collect()
+ torch.cuda.empty_cache()
+ loaded_vllm_model = vllm_runner(
+ model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(tensorizer_uri=model_path,
+ num_readers=1,
+ vllm_tensorized=True),
+ )
+ deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+
+ # Assumes SamplingParams being seeded ensures the outputs are deterministic
+ assert outputs == deserialized_outputs
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_can_deserialize_s3(vllm_runner):
+ model_ref = "EleutherAI/pythia-1.4b"
+ tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
+
+ loaded_hf_model = vllm_runner(model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri=tensorized_path,
+ num_readers=1,
+ vllm_tensorized=False,
+ s3_endpoint="object.ord1.coreweave.com",
+ ))
+
+ deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+
+ assert deserialized_outputs
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_deserialized_encrypted_vllm_model_has_same_outputs(
+ vllm_runner, tmp_path):
+ vllm_model = vllm_runner(model_ref)
+ model_path = tmp_path / (model_ref + ".tensors")
+ key_path = tmp_path / (model_ref + ".key")
+ outputs = vllm_model.generate(prompts, sampling_params)
+ model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+ model_runner.model)
+
+ encryption_params = EncryptionParams.random()
+ with open_stream(model_path, "wb+") as stream:
+ serializer = TensorSerializer(stream, encryption=encryption_params)
+ serializer.write_module(model)
+ with open_stream(key_path, "wb+") as stream:
+ stream.write(encryption_params.key)
+ del vllm_model, model
+ gc.collect()
+ torch.cuda.empty_cache()
+ loaded_vllm_model = vllm_runner(model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri=model_path,
+ encryption_keyfile=key_path,
+ num_readers=1,
+ vllm_tensorized=True))
+
+ deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+
+ # Assumes SamplingParams being seeded ensures the outputs are deterministic
+ assert outputs == deserialized_outputs
+
+
+def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
+ tmp_path):
+ hf_model = hf_runner(model_ref)
+ model_path = tmp_path / (model_ref + ".tensors")
+ max_tokens = 50
+ outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+ with open_stream(model_path, "wb+") as stream:
+ serializer = TensorSerializer(stream)
+ serializer.write_module(hf_model.model)
+ del hf_model
+ gc.collect()
+ torch.cuda.empty_cache()
+ loaded_hf_model = vllm_runner(model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri=model_path,
+ num_readers=1,
+ vllm_tensorized=False))
+
+ deserialized_outputs = loaded_hf_model.generate_greedy(
+ prompts, max_tokens=max_tokens)
+
+ assert outputs == deserialized_outputs
+
+
+def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
+ from huggingface_hub import snapshot_download
+
+ from examples.multilora_inference import (create_test_prompts,
+ process_requests)
+
+ model_ref = "meta-llama/Llama-2-7b-hf"
+ lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
+ test_prompts = create_test_prompts(lora_path)
+
+ # Serialize model before deserializing and binding LoRA adapters
+ vllm_model = vllm_runner(model_ref)
+ model_path = tmp_path / (model_ref + ".tensors")
+ model = (vllm_model.model.llm_engine.model_executor.driver_worker.
+ model_runner.model)
+ with open_stream(model_path, "wb+") as stream:
+ serializer = TensorSerializer(stream)
+ serializer.write_module(model)
+ del vllm_model, model
+ gc.collect()
+ torch.cuda.empty_cache()
+ loaded_vllm_model = vllm_runner(
+ model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri=model_path,
+ num_readers=1,
+ vllm_tensorized=True,
+ ),
+ enable_lora=True,
+ max_loras=1,
+ max_lora_rank=8,
+ max_cpu_loras=2,
+ max_num_seqs=50,
+ max_model_len=1000,
+ )
+ process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+
+ assert loaded_vllm_model
+
+
+def test_load_without_tensorizer_load_format(vllm_runner):
+ with pytest.raises(ValueError):
+ vllm_runner(model_ref,
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri="test", vllm_tensorized=False))
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_tensorize_vllm_model(tmp_path):
+ # Test serialize command
+ serialize_args = [
+ "python3", tensorize_model_for_testing_script, "--model", model_ref,
+ "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
+ "--suffix", "tests"
+ ]
+ result = subprocess.run(serialize_args, capture_output=True, text=True)
+ print(result.stdout) # Print the output of the serialize command
+
+ assert result.returncode == 0, (f"Serialize command failed with output:"
+ f"\n{result.stdout}\n{result.stderr}")
+
+ path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
+
+ # Test deserialize command
+ deserialize_args = [
+ "python3", tensorize_model_for_testing_script, "--model", model_ref,
+ "--dtype", "float16", "deserialize", "--path-to-tensors",
+ path_to_tensors
+ ]
+ result = subprocess.run(deserialize_args, capture_output=True, text=True)
+ assert result.returncode == 0, (f"Deserialize command failed with output:"
+ f"\n{result.stdout}\n{result.stderr}")
+
+
+@pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
+def test_openai_apiserver_with_tensorizer(tmp_path):
+ ## Serialize model
+ serialize_args = [
+ "python3", tensorize_model_for_testing_script, "--model", model_ref,
+ "--dtype", "float16", "serialize", "--serialized-directory", tmp_path,
+ "--suffix", "tests"
+ ]
+ result = subprocess.run(serialize_args, capture_output=True, text=True)
+ print(result.stdout) # Print the output of the serialize command
+
+ assert result.returncode == 0, (f"Serialize command failed with output:"
+ f"\n{result.stdout}\n{result.stderr}")
+
+ path_to_tensors = f"{tmp_path}/vllm/{model_ref}/tests/model.tensors"
+ model_loader_extra_config = {
+ "tensorizer_uri": path_to_tensors,
+ "vllm_tensorized": True
+ }
+
+ ## Start OpenAI API server
+ openai_args = [
+ "--model", model_ref, "--dtype", "float16", "--load-format",
+ "tensorizer", "--model-loader-extra-config",
+ json.dumps(model_loader_extra_config), "--port", "8000"
+ ]
+
+ server = ServerRunner.remote(openai_args)
+
+ assert ray.get(server.ready.remote())
+ print("Server ready.")
+
+ client = openai.OpenAI(
+ base_url="http://localhost:8000/v1",
+ api_key="token-abc123",
+ )
+ completion = client.completions.create(model=model_ref,
+ prompt="Hello, my name is",
+ max_tokens=5,
+ temperature=0.0)
+
+ assert completion.id is not None
+ assert completion.choices is not None and len(completion.choices) == 1
+ assert completion.choices[0].text is not None and len(
+ completion.choices[0].text) >= 5
+ assert completion.choices[0].finish_reason == "length"
+ assert completion.usage == openai.types.CompletionUsage(
+ completion_tokens=5, prompt_tokens=6, total_tokens=11)
+
+
+def test_raise_value_error_on_invalid_load_format(vllm_runner):
+ with pytest.raises(ValueError):
+ vllm_runner(model_ref,
+ load_format="safetensors",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri="test", vllm_tensorized=False))
+
+
+def test_tensorizer_with_tp(vllm_runner):
+ with pytest.raises(ValueError):
+ model_ref = "EleutherAI/pythia-1.4b"
+ tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
+
+ vllm_runner(
+ model_ref,
+ load_format="tensorizer",
+ model_loader_extra_config=TensorizerConfig(
+ tensorizer_uri=tensorized_path,
+ num_readers=1,
+ vllm_tensorized=False,
+ s3_endpoint="object.ord1.coreweave.com",
+ ),
+ tensor_parallel_size=2,
+ )
diff --git a/tests/test_config.py b/tests/test_config.py
index 13a9f76212679..19db10630bbae 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -11,8 +11,6 @@ def test_get_sliding_window():
"Qwen/Qwen1.5-7B",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
@@ -30,8 +28,6 @@ def test_get_sliding_window():
"mistralai/Mistral-7B-v0.1",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
diff --git a/tests/test_logger.py b/tests/test_logger.py
new file mode 100644
index 0000000000000..601f72b50811c
--- /dev/null
+++ b/tests/test_logger.py
@@ -0,0 +1,27 @@
+import os
+import sys
+import tempfile
+
+from vllm.logger import enable_trace_function_call
+
+
+def f1(x):
+ return f2(x)
+
+
+def f2(x):
+ return x
+
+
+def test_trace_function_call():
+ fd, path = tempfile.mkstemp()
+ cur_dir = os.path.dirname(__file__)
+ enable_trace_function_call(path, cur_dir)
+ f1(1)
+ with open(path, 'r') as f:
+ content = f.read()
+
+ assert "f1" in content
+ assert "f2" in content
+ sys.settrace(None)
+ os.remove(path)
diff --git a/tests/test_logits_processor.py b/tests/test_logits_processor.py
index fe321520114f7..5bb93ca74855b 100644
--- a/tests/test_logits_processor.py
+++ b/tests/test_logits_processor.py
@@ -37,7 +37,12 @@ def _prepare_test(
1e-2,
dtype=input_tensor.dtype)
logits_processor = MockLogitsProcessor(32000, 0.5, fake_logits)
- model_runner = ModelRunner(None, None, None, None, None)
+ model_runner = ModelRunner(model_config=None,
+ parallel_config=None,
+ scheduler_config=None,
+ device_config=None,
+ load_config=None,
+ lora_config=None)
return input_tensor, fake_logits, logits_processor, model_runner
diff --git a/tests/worker/test_model_runner.py b/tests/worker/test_model_runner.py
index dcaae4af4a6f8..59bed2ce0dad3 100644
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -12,7 +12,12 @@ def test_prepare_prompt(batch_size):
100000,
100000,
enable_chunked_prefill=False)
- model_runner = ModelRunner(None, None, scheduler_config, None, None)
+ model_runner = ModelRunner(model_config=None,
+ parallel_config=None,
+ scheduler_config=scheduler_config,
+ device_config=None,
+ load_config=None,
+ lora_config=None)
model_runner.set_block_size(16)
prompt_lens = []
@@ -118,8 +123,6 @@ def test_prepare_decode_cuda_graph(batch_size):
"facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
@@ -129,8 +132,12 @@ def test_prepare_decode_cuda_graph(batch_size):
100000,
100000,
enable_chunked_prefill=False)
- model_runner = ModelRunner(model_config, None, scheduler_config, None,
- None)
+ model_runner = ModelRunner(model_config=model_config,
+ parallel_config=None,
+ scheduler_config=scheduler_config,
+ device_config=None,
+ load_config=None,
+ lora_config=None)
model_runner.set_block_size(16)
prompt_lens = []
@@ -205,14 +212,17 @@ def test_empty_seq_group():
"facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
enforce_eager=False,
)
- model_runner = ModelRunner(model_config, None, None, None, None)
+ model_runner = ModelRunner(model_config=model_config,
+ parallel_config=None,
+ scheduler_config=None,
+ device_config=None,
+ load_config=None,
+ lora_config=None)
model_runner.set_block_size(16)
seq_group_metadata_list = []
input_tokens, input_positions, attn_metadata, _, _, _, slot_mapping = (
@@ -251,8 +261,6 @@ def mock_get_process_group_ranks(group=None):
"facebook/opt-125m",
tokenizer_mode="auto",
trust_remote_code=False,
- download_dir=None,
- load_format="dummy",
seed=0,
dtype="float16",
revision=None,
@@ -262,11 +270,12 @@ def mock_get_process_group_ranks(group=None):
100000,
100000,
enable_chunked_prefill=True)
- model_runner = ModelRunner(model_config,
- None,
- scheduler_config,
- None,
- None,
+ model_runner = ModelRunner(model_config=model_config,
+ parallel_config=None,
+ scheduler_config=scheduler_config,
+ device_config=None,
+ load_config=None,
+ lora_config=None,
is_driver_worker=True)
model_runner.set_block_size(16)
diff --git a/tests/worker/test_swap.py b/tests/worker/test_swap.py
index 8edb1cf05c08e..1804cf78d8003 100644
--- a/tests/worker/test_swap.py
+++ b/tests/worker/test_swap.py
@@ -23,6 +23,7 @@ def test_swap() -> None:
scheduler_config=engine_config.scheduler_config,
device_config=engine_config.device_config,
cache_config=engine_config.cache_config,
+ load_config=engine_config.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
diff --git a/vllm/__init__.py b/vllm/__init__.py
index 3585c0944f7e3..56cf5c01644fc 100644
--- a/vllm/__init__.py
+++ b/vllm/__init__.py
@@ -3,8 +3,8 @@
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_ray_cluster
from vllm.entrypoints.llm import LLM
+from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.model_executor.models import ModelRegistry
from vllm.outputs import CompletionOutput, RequestOutput
from vllm.sampling_params import SamplingParams
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index a0837a20875fe..508d35656eb00 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Dict, Optional, Tuple
import torch
@@ -153,6 +153,28 @@ def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
size_n, size_k)
+# aqlm
+def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
+ codebooks: torch.Tensor, scales: torch.Tensor,
+ codebook_partition_sizes: torch.Tensor,
+ bias: Optional[torch.Tensor]) -> torch.Tensor:
+ return vllm_ops.aqlm_gemm(input, codes, codebooks, scales,
+ codebook_partition_sizes, bias)
+
+
+def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
+ codebook_partition_sizes: torch.Tensor) -> torch.Tensor:
+ return vllm_ops.aqlm_dequant(codes, codebooks, codebook_partition_sizes)
+
+
+# fp8
+def scaled_fp8_quant(input: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ scale = torch.zeros(1, device=input.device, dtype=torch.float32)
+ output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
+ vllm_ops.scaled_fp8_quant(output, input, scale)
+ return output, scale
+
+
# moe
def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
block_size: int, sorted_token_ids: torch.Tensor,
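A minimal usage sketch of the new scaled_fp8_quant wrapper, assuming vLLM's CUDA extension is built, a GPU is available, and the installed PyTorch exposes torch.float8_e4m3fn:

import torch
from vllm import _custom_ops as ops

x = torch.randn(16, 1024, dtype=torch.float16, device="cuda")
x_fp8, scale = ops.scaled_fp8_quant(x)
# The wrapper allocates the fp8 output and a single-element float32 scale.
assert x_fp8.dtype == torch.float8_e4m3fn
assert scale.shape == (1,) and scale.dtype == torch.float32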
diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py
index 7a4ccecf702f4..be747c9900368 100644
--- a/vllm/attention/backends/abstract.py
+++ b/vllm/attention/backends/abstract.py
@@ -116,7 +116,7 @@ def forward(
key: torch.Tensor,
value: torch.Tensor,
kv_cache: torch.Tensor,
- attn_metadata: AttentionMetadata[AttentionMetadataPerStage],
+ attn_metadata: AttentionMetadata,
kv_scale: float,
) -> torch.Tensor:
raise NotImplementedError
diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index e55435cd2c947..7c5863a030ff5 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -154,25 +154,30 @@ def __init__(
f"Head size {head_size} is not supported by PagedAttention. "
f"Supported head sizes are: {suppored_head_sizes}.")
- self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
+ self.use_naive_attn = False
# NOTE: Allow for switching between Triton and CK. Defaulting to triton.
self.use_triton_flash_attn = (os.environ.get(
"VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
- if self.use_naive_attn:
- # AMD Radeon 7900 series (gfx1100) currently does not support
- # xFormers nor FlashAttention. As a temporary workaround, we use
- # naive PyTorch implementation of attention.
- self.attn_fuc = _naive_attention()
- logger.debug("Using naive attention in ROCmBackend")
- elif self.use_triton_flash_attn:
+ if self.use_triton_flash_attn:
from vllm.attention.ops.triton_flash_attention import ( # noqa: F401
triton_attention)
self.attn_func = triton_attention
logger.debug("Using Triton FA in ROCmBackend")
else:
- from flash_attn import flash_attn_varlen_func # noqa: F401
- self.attn_func = flash_attn_varlen_func
- logger.debug("Using CK FA in ROCmBackend")
+ # if not using triton, navi3x does not use flash-attn either
+ if torch.cuda.get_device_capability()[0] == 11:
+ self.use_naive_attn = True
+ else:
+ try:
+ from flash_attn import flash_attn_varlen_func # noqa: F401
+ self.attn_func = flash_attn_varlen_func
+ logger.debug("Using CK FA in ROCmBackend")
+ except ModuleNotFoundError:
+ self.use_naive_attn = True
+
+ if self.use_naive_attn:
+ self.attn_func = _naive_attention
+ logger.debug("Using naive attention in ROCmBackend")
def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
"""torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
@@ -243,17 +248,18 @@ def forward(
if prefill_meta := attn_metadata.prefill_metadata:
# Prompt run.
+ assert prefill_meta.prompt_lens is not None
if kv_cache is None or prefill_meta.block_tables.numel() == 0:
# triton attention
# When block_tables are not filled, it means q and k are the
# prompt, and they have the same length.
- if self.use_naive_attn or self.use_triton_flash_attn:
+ if self.use_triton_flash_attn or self.use_naive_attn:
if self.num_kv_heads != self.num_heads:
# Interleave for MQA workaround.
key = self.repeat_kv(key, self.num_queries_per_kv)
value = self.repeat_kv(value, self.num_queries_per_kv)
if self.use_naive_attn:
- out = self.attn_fuc(
+ out = self.attn_func(
query,
key,
value,
@@ -334,26 +340,21 @@ def _naive_attention(
prompt_lens: List[int],
scale: float,
) -> torch.Tensor:
- num_tokens = query.shape[0]
output = torch.empty_like(query)
start = 0
for _, prompt_len in enumerate(prompt_lens):
end = start + prompt_len
out = _naive_masked_attention(
- query[None, start:end],
- key[None, start:end],
- value[None, start:end],
+ query[start:end],
+ key[start:end],
+ value[start:end],
scale,
)
# TODO(woosuk): Unnecessary copy. Optimize.
output[start:end].copy_(out)
start += prompt_len
- # Using view got RuntimeError: view size is not compatible
- # with input tensor's size and stride (at least one
- # dimension spans across two contiguous subspaces).
- # Use reshape instead.
- return output.reshape(num_tokens, -1)
+ return output
def _naive_masked_attention(
@@ -362,14 +363,13 @@ def _naive_masked_attention(
value: torch.Tensor,
scale: float,
) -> torch.Tensor:
- seq_len, _, _ = query.shape
+ seq_len, head_size, head_dim = query.shape
attn_mask = torch.triu(torch.ones(seq_len,
seq_len,
dtype=query.dtype,
device=query.device),
diagonal=1)
attn_mask = attn_mask * torch.finfo(query.dtype).min
-
attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
attn_weights = attn_weights + attn_mask.float()
attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
diff --git a/vllm/attention/backends/torch_sdpa.py b/vllm/attention/backends/torch_sdpa.py
index d21b54b16db4b..55a7ce59ac6e0 100644
--- a/vllm/attention/backends/torch_sdpa.py
+++ b/vllm/attention/backends/torch_sdpa.py
@@ -106,7 +106,7 @@ def forward(
key: torch.Tensor,
value: torch.Tensor,
kv_cache: Optional[torch.Tensor],
- attn_metadata: TorchSDPAMetadata,
+ attn_metadata: TorchSDPAMetadata, # type: ignore
kv_scale: float,
) -> torch.Tensor:
"""Forward pass with torch SDPA and PagedAttention.
@@ -136,6 +136,7 @@ def forward(
kv_scale)
if attn_metadata.is_prompt:
+ assert attn_metadata.prompt_lens is not None
if (kv_cache is None or attn_metadata.block_tables.numel() == 0):
if self.num_kv_heads != self.num_heads:
key = key.repeat_interleave(self.num_queries_per_kv, dim=1)
diff --git a/vllm/attention/backends/xformers.py b/vllm/attention/backends/xformers.py
index b745a04a143b4..572a4dc79a719 100644
--- a/vllm/attention/backends/xformers.py
+++ b/vllm/attention/backends/xformers.py
@@ -288,6 +288,7 @@ def _run_memory_efficient_xformers_forward(
value: shape = [num_prefill_tokens, num_kv_heads, head_size]
attn_metadata: Metadata for attention.
"""
+ assert attn_metadata.prompt_lens is not None
original_query = query
if self.num_kv_heads != self.num_heads:
# GQA/MQA requires the shape [B, M, G, H, K].
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
index 70f09224f1cf6..4896cf3909c6e 100644
--- a/vllm/attention/ops/prefix_prefill.py
+++ b/vllm/attention/ops/prefix_prefill.py
@@ -47,7 +47,8 @@ def _fwd_kernel(
stride_v_cache_bl,
num_queries_per_kv: int,
BLOCK_M: tl.constexpr,
- BLOCK_DMODEL: tl.constexpr,
+ BLOCK_DMODEL: tl.constexpr, # head size
+ BLOCK_DMODEL_PADDED: tl.constexpr, # head size padded to a power of 2
BLOCK_N: tl.constexpr,
):
cur_batch = tl.program_id(0)
@@ -59,26 +60,30 @@ def _fwd_kernel(
cur_batch_ctx_len = tl.load(B_Ctxlen + cur_batch)
cur_batch_seq_len = tl.load(B_Seqlen + cur_batch)
cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
+ cur_batch_query_len = cur_batch_seq_len - cur_batch_ctx_len
block_start_loc = BLOCK_M * start_m
# initialize offsets
offs_n = tl.arange(0, BLOCK_N)
- offs_d = tl.arange(0, BLOCK_DMODEL)
+ offs_d = tl.arange(0, BLOCK_DMODEL_PADDED)
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
off_q = (
(cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs +
cur_head * stride_qh + offs_d[None, :] * stride_qd)
- q = tl.load(
- Q + off_q,
- mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len,
- other=0.0)
+ dim_mask = tl.where(
+ tl.arange(0, BLOCK_DMODEL_PADDED) < BLOCK_DMODEL, 1, 0).to(tl.int1)
+
+ q = tl.load(Q + off_q,
+ mask=dim_mask[None, :] &
+ (offs_m[:, None] < cur_batch_query_len),
+ other=0.0)
# # initialize pointer to m and l
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
- acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL_PADDED], dtype=tl.float32)
for start_n in range(0, cur_batch_ctx_len, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
@@ -99,7 +104,8 @@ def _fwd_kernel(
offs_d[None, :] * stride_v_cache_d +
(start_n + offs_n[:, None]) % block_size * stride_v_cache_bl)
k = tl.load(K_cache + off_k,
- mask=(start_n + offs_n[None, :]) < cur_batch_ctx_len,
+ mask=dim_mask[:, None] &
+ ((start_n + offs_n[None, :]) < cur_batch_ctx_len),
other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -126,7 +132,8 @@ def _fwd_kernel(
acc = acc * acc_scale[:, None]
# update acc
v = tl.load(V_cache + off_v,
- mask=(start_n + offs_n[:, None]) < cur_batch_ctx_len,
+ mask=dim_mask[None, :] &
+ ((start_n + offs_n[:, None]) < cur_batch_ctx_len),
other=0.0)
p = p.to(v.dtype)
@@ -142,16 +149,15 @@ def _fwd_kernel(
k_ptrs = K + off_k
v_ptrs = V + off_v
- block_mask = tl.where(
- block_start_loc < cur_batch_seq_len - cur_batch_ctx_len, 1, 0)
+ block_mask = tl.where(block_start_loc < cur_batch_query_len, 1, 0)
for start_n in range(0, block_mask * (start_m + 1) * BLOCK_M, BLOCK_N):
start_n = tl.multiple_of(start_n, BLOCK_N)
# -- compute qk ----
k = tl.load(k_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_kbs,
- mask=(start_n + offs_n[None, :]) <
- cur_batch_seq_len - cur_batch_ctx_len,
+ mask=dim_mask[:, None] &
+ ((start_n + offs_n[None, :]) < cur_batch_query_len),
other=0.0)
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
@@ -179,8 +185,8 @@ def _fwd_kernel(
# update acc
v = tl.load(v_ptrs +
(cur_batch_in_all_start_index + start_n) * stride_vbs,
- mask=(start_n + offs_n[:, None]) <
- cur_batch_seq_len - cur_batch_ctx_len,
+ mask=dim_mask[None, :] &
+ ((start_n + offs_n[:, None]) < cur_batch_query_len),
other=0.0)
p = p.to(v.dtype)
@@ -195,7 +201,8 @@ def _fwd_kernel(
out_ptrs = Out + off_o
tl.store(out_ptrs,
acc,
- mask=offs_m[:, None] < cur_batch_seq_len - cur_batch_ctx_len)
+ mask=dim_mask[None, :] &
+ (offs_m[:, None] < cur_batch_query_len))
return
@triton.jit
@@ -636,7 +643,8 @@ def context_attention_fwd(q,
# shape constraints
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
assert Lq == Lk and Lk == Lv
- assert Lk in {16, 32, 64, 128}
+ # round up Lk to a power of 2 - this is required for Triton block size
+ Lk_padded = 2**((Lk - 1).bit_length())
sm_scale = 1.0 / (Lq**0.5)
batch, head = b_seq_len.shape[0], q.shape[1]
@@ -646,6 +654,7 @@ def context_attention_fwd(q,
num_warps = 8 if Lk <= 64 else 8
if alibi_slopes is not None:
+ assert Lk == Lk_padded
_fwd_kernel_alibi[grid](
q,
k,
@@ -738,6 +747,7 @@ def context_attention_fwd(q,
num_queries_per_kv=num_queries_per_kv,
BLOCK_M=BLOCK,
BLOCK_DMODEL=Lk,
+ BLOCK_DMODEL_PADDED=Lk_padded,
BLOCK_N=BLOCK,
num_warps=num_warps,
num_stages=1,
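A worked example of the padding rule introduced above: Triton block dimensions must be powers of two, so non-power-of-two head sizes are padded up and the extra lanes are masked out via dim_mask:

def pad_to_pow2(lk: int) -> int:
    # Same expression as Lk_padded above.
    return 2 ** ((lk - 1).bit_length())

assert [pad_to_pow2(lk) for lk in (16, 32, 64, 96, 128)] == [16, 32, 64, 128, 128]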
diff --git a/vllm/attention/ops/triton_flash_attention.py b/vllm/attention/ops/triton_flash_attention.py
index 87cf30cbef79a..e160411859f0b 100644
--- a/vllm/attention/ops/triton_flash_attention.py
+++ b/vllm/attention/ops/triton_flash_attention.py
@@ -415,7 +415,11 @@ def attn_fwd(
return
is_mqa = hq != hk
- off_h_k = off_h_q % hk if is_mqa else off_h_q
+ if is_mqa: # noqa: SIM108
+ off_h_k = off_h_q % hk
+ else:
+ off_h_k = off_h_q
+
n_extra_tokens = 0
if seqlen_k < BLOCK_N:
n_extra_tokens = BLOCK_N - seqlen_k
diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 554e802cd5513..7cc17f21dcd0e 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -25,7 +25,7 @@ class _Backend(enum.Enum):
def get_attn_backend(dtype: torch.dtype) -> Type[AttentionBackend]:
backend = _which_attn_to_use(dtype)
if backend == _Backend.FLASH_ATTN:
- logger.info("Using FlashAttention backend.")
+ logger.info("Using FlashAttention-2 backend.")
from vllm.attention.backends.flash_attn import ( # noqa: F401
FlashAttentionBackend)
return FlashAttentionBackend
@@ -62,12 +62,12 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
# NVIDIA GPUs.
if torch.cuda.get_device_capability()[0] < 8:
# Volta and Turing NVIDIA GPUs.
- logger.info("Cannot use FlashAttention backend for Volta and Turing "
+ logger.info("Cannot use FlashAttention-2 backend for Volta and Turing "
"GPUs.")
return _Backend.XFORMERS
if dtype not in (torch.float16, torch.bfloat16):
- logger.info("Cannot use FlashAttention backend for dtype other than "
+ logger.info("Cannot use FlashAttention-2 backend for dtype other than "
"torch.float16 or torch.bfloat16.")
return _Backend.XFORMERS
@@ -75,8 +75,8 @@ def _which_attn_to_use(dtype: torch.dtype) -> _Backend:
import flash_attn # noqa: F401
except ImportError:
logger.info(
- "Cannot use FlashAttention backend because the flash_attn package "
- "is not found. Please install it for better performance.")
+ "Cannot use FlashAttention-2 backend because the flash_attn "
+ "package is not found. Please install it for better performance.")
return _Backend.XFORMERS
backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
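A sketch of overriding the automatic choice, assuming the VLLM_ATTENTION_BACKEND constant above names an environment variable of the same name whose value matches a _Backend member; on a machine where FlashAttention-2 would otherwise be picked, this forces xFormers:

import os
import torch

os.environ["VLLM_ATTENTION_BACKEND"] = "XFORMERS"

from vllm.attention.selector import get_attn_backend
backend_cls = get_attn_backend(torch.float16)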
diff --git a/vllm/config.py b/vllm/config.py
index 744fecdc7c64f..7f1bb70274e3d 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,14 +1,15 @@
import enum
import json
import os
-from dataclasses import dataclass, fields
-from typing import TYPE_CHECKING, ClassVar, Optional, Union
+from dataclasses import dataclass, field, fields
+from typing import TYPE_CHECKING, ClassVar, List, Optional, Union
import torch
from packaging.version import Version
from transformers import PretrainedConfig
from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.transformers_utils.config import get_config, get_hf_text_config
from vllm.utils import (get_cpu_memory, get_nvcc_cuda_version, is_cpu, is_hip,
is_neuron)
@@ -16,8 +17,14 @@
if TYPE_CHECKING:
from ray.util.placement_group import PlacementGroup
+ from vllm.model_executor.model_loader.loader import BaseModelLoader
+
logger = init_logger(__name__)
+# If true, will load models from ModelScope instead of Hugging Face Hub.
+VLLM_USE_MODELSCOPE = os.environ.get("VLLM_USE_MODELSCOPE",
+ "False").lower() == "true"
+
_GB = 1 << 30
@@ -31,18 +38,6 @@ class ModelConfig:
available, and "slow" will always use the slow tokenizer.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
- download_dir: Directory to download and load the weights, default to the
- default cache directory of huggingface.
- load_format: The format of the model weights to load:
- "auto" will try to load the weights in the safetensors format and
- fall back to the pytorch bin format if safetensors format is
- not available.
- "pt" will load the weights in the pytorch bin format.
- "safetensors" will load the weights in the safetensors format.
- "npcache" will load the weights in pytorch format and store
- a numpy cache to speed up the loading.
- "dummy" will initialize the weights with random values, which is
- mainly for profiling.
dtype: Data type for model weights and activations. The "auto" option
will use FP16 precision for FP32 and FP16 models, and BF16 precision
for BF16 models.
@@ -62,8 +57,8 @@ class ModelConfig:
weights. If None, we assume the model weights are not quantized.
quantization_param_path: Path to JSON file containing scaling factors.
Used to load KV cache scaling factors into the model when KV cache
- type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
- be used to load activation and weight scaling factors when the
+ type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
+ be used to load activation and weight scaling factors when the
model dtype is FP8_E4M3 on ROCm.
enforce_eager: Whether to enforce eager execution. If True, we will
disable CUDA graph and always execute the model in eager mode.
@@ -71,6 +66,8 @@ class ModelConfig:
max_context_len_to_capture: Maximum context len covered by CUDA graphs.
When a sequence has context length larger than this, we fall back
to eager mode.
+ skip_tokenizer_init: If true, skip initialization of tokenizer and
+ detokenizer.
"""
def __init__(
@@ -79,8 +76,6 @@ def __init__(
tokenizer: str,
tokenizer_mode: str,
trust_remote_code: bool,
- download_dir: Optional[str],
- load_format: str,
dtype: Union[str, torch.dtype],
seed: int,
revision: Optional[str] = None,
@@ -94,13 +89,12 @@ def __init__(
enforce_eager: bool = False,
max_context_len_to_capture: Optional[int] = None,
max_logprobs: int = 5,
+ skip_tokenizer_init: bool = False,
) -> None:
self.model = model
self.tokenizer = tokenizer
self.tokenizer_mode = tokenizer_mode
self.trust_remote_code = trust_remote_code
- self.download_dir = download_dir
- self.load_format = load_format
self.seed = seed
self.revision = revision
self.code_revision = code_revision
@@ -112,22 +106,7 @@ def __init__(
self.enforce_eager = enforce_eager
self.max_context_len_to_capture = max_context_len_to_capture
self.max_logprobs = max_logprobs
-
- if os.environ.get("VLLM_USE_MODELSCOPE", "False").lower() == "true":
- # download model from ModelScope hub,
- # lazy import so that modelscope is not required for normal use.
- # pylint: disable=C.
- from modelscope.hub.snapshot_download import snapshot_download
-
- if not os.path.exists(model):
- model_path = snapshot_download(model_id=model,
- cache_dir=download_dir,
- revision=revision)
- else:
- model_path = model
- self.model = model_path
- self.download_dir = model_path
- self.tokenizer = model_path
+ self.skip_tokenizer_init = skip_tokenizer_init
self.hf_config = get_config(self.model, trust_remote_code, revision,
code_revision)
@@ -135,43 +114,13 @@ def __init__(
self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
self.max_model_len = _get_and_verify_max_len(self.hf_text_config,
max_model_len)
- self._verify_load_format()
- self._verify_tokenizer_mode()
+ if not self.skip_tokenizer_init:
+ self._verify_tokenizer_mode()
self._verify_quantization()
# UPSTREAM SYNC: keep sparsity
self._verify_sparsity()
self._verify_cuda_graph()
- def _verify_load_format(self) -> None:
- load_format = self.load_format.lower()
- supported_load_format = [
- "auto", "pt", "safetensors", "npcache", "dummy"
- ]
- rocm_not_supported_load_format = []
- if load_format not in supported_load_format:
- raise ValueError(
- f"Unknown load format: {self.load_format}. Must be one of "
- "'auto', 'pt', 'safetensors', 'npcache', or 'dummy'.")
- if is_hip() and load_format in rocm_not_supported_load_format:
- rocm_supported_load_format = [
- f for f in supported_load_format
- if (f not in rocm_not_supported_load_format)
- ]
- raise ValueError(
- f"load format '{load_format}' is not supported in ROCm. "
- f"Supported load format are "
- f"{rocm_supported_load_format}")
-
- # TODO: Remove this check once HF updates the pt weights of Mixtral.
- architectures = getattr(self.hf_config, "architectures", [])
- # architectures can be None instead of []
- if architectures and "MixtralForCausalLM" in architectures \
- and load_format == "pt":
- raise ValueError(
- "Currently, the 'pt' format is not supported for Mixtral. "
- "Please use the 'safetensors' format instead. ")
- self.load_format = load_format
-
def _verify_tokenizer_mode(self) -> None:
tokenizer_mode = self.tokenizer_mode.lower()
if tokenizer_mode not in ["auto", "slow"]:
@@ -207,8 +156,8 @@ def _verify_sparsity(self) -> None:
f"({self.sparsity}).")
def _verify_quantization(self) -> None:
- supported_quantization = ["awq", "gptq", "squeezellm", "marlin"]
- rocm_not_supported_quantization = ["awq", "marlin"]
+ supported_quantization = [*QUANTIZATION_METHODS]
+ rocm_supported_quantization = ["gptq", "squeezellm"]
if self.quantization is not None:
self.quantization = self.quantization.lower()
@@ -244,7 +193,7 @@ def _verify_quantization(self) -> None:
f"Unknown quantization method: {self.quantization}. Must "
f"be one of {supported_quantization}.")
if is_hip(
- ) and self.quantization in rocm_not_supported_quantization:
+ ) and self.quantization not in rocm_supported_quantization:
raise ValueError(
f"{self.quantization} quantization is currently not "
f"supported in ROCm.")
@@ -449,7 +398,7 @@ def verify_with_parallel_config(
@dataclass
class TokenizerPoolConfig:
"""Configuration for the tokenizer pool.
-
+
Args:
pool_size: Number of tokenizer workers in the pool.
pool_type: Type of the pool.
@@ -473,9 +422,9 @@ def create_config(
tokenizer_pool_extra_config: Optional[Union[str, dict]]
) -> Optional["TokenizerPoolConfig"]:
"""Create a TokenizerPoolConfig from the given parameters.
-
+
If tokenizer_pool_size is 0, return None.
-
+
Args:
tokenizer_pool_size: Number of tokenizer workers in the pool.
tokenizer_pool_type: Type of the pool.
@@ -498,6 +447,65 @@ def create_config(
return tokenizer_pool_config
+class LoadFormat(str, enum.Enum):
+ AUTO = "auto"
+ PT = "pt"
+ SAFETENSORS = "safetensors"
+ NPCACHE = "npcache"
+ DUMMY = "dummy"
+ TENSORIZER = "tensorizer"
+
+
+@dataclass
+class LoadConfig:
+ """
+ download_dir: Directory to download and load the weights, default to the
+ default cache directory of huggingface.
+ load_format: The format of the model weights to load:
+ "auto" will try to load the weights in the safetensors format and
+ fall back to the pytorch bin format if safetensors format is
+ not available.
+ "pt" will load the weights in the pytorch bin format.
+ "safetensors" will load the weights in the safetensors format.
+ "npcache" will load the weights in pytorch format and store
+ a numpy cache to speed up the loading.
+ "dummy" will initialize the weights with random values, which is
+ mainly for profiling.
+ "tensorizer" will use CoreWeave's tensorizer library for
+ fast weight loading.
+ """
+
+ load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
+ download_dir: Optional[str] = None
+ model_loader_extra_config: Optional[Union[str, dict]] = field(
+ default_factory=dict)
+
+ def __post_init__(self):
+ model_loader_extra_config = self.model_loader_extra_config or {}
+ if isinstance(model_loader_extra_config, str):
+ self.model_loader_extra_config = json.loads(
+ model_loader_extra_config)
+ self._verify_load_format()
+
+ def _verify_load_format(self) -> None:
+ if not isinstance(self.load_format, str):
+ return
+
+ load_format = self.load_format.lower()
+ self.load_format = LoadFormat(load_format)
+
+ rocm_not_supported_load_format: List[str] = []
+ if is_hip() and load_format in rocm_not_supported_load_format:
+ rocm_supported_load_format = [
+ f for f in LoadFormat.__members__
+ if (f not in rocm_not_supported_load_format)
+ ]
+ raise ValueError(
+ f"load format '{load_format}' is not supported in ROCm. "
+ f"Supported load formats are "
+ f"{rocm_supported_load_format}")
+
+
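A small sketch of the new LoadConfig in isolation (illustrative values): the load_format string is normalized to the LoadFormat enum, and a JSON string passed as model_loader_extra_config is parsed in __post_init__:

from vllm.config import LoadConfig, LoadFormat

cfg = LoadConfig(
    load_format="tensorizer",
    model_loader_extra_config='{"tensorizer_uri": "model.tensors", "vllm_tensorized": true}',
)
assert cfg.load_format is LoadFormat.TENSORIZER
assert cfg.model_loader_extra_config["vllm_tensorized"] is True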
class ParallelConfig:
"""Configuration for the distributed execution.
@@ -687,6 +695,9 @@ def maybe_create_spec_config(
target_dtype: str,
speculative_model: Optional[str],
num_speculative_tokens: Optional[int],
+ speculative_max_model_len: Optional[int],
+ enable_chunked_prefill: bool,
+ use_v2_block_manager: bool,
) -> Optional["SpeculativeConfig"]:
"""Create a SpeculativeConfig if possible, else return None.
@@ -704,6 +715,15 @@ def maybe_create_spec_config(
model, if provided.
num_speculative_tokens (Optional[int]): The number of speculative
tokens, if provided.
+ speculative_max_model_len (Optional[int]): The maximum model len of
+ the speculative model. Used when testing the ability to skip
+ speculation for some sequences.
+ enable_chunked_prefill (bool): Whether vLLM is configured to use
+ chunked prefill or not. Used for raising an error since it's not
+ yet compatible with spec decode.
+ use_v2_block_manager (bool): Whether vLLM is configured to use the
+ v2 block manager or not. Used for raising an error since the v2
+ block manager is required with spec decode.
Returns:
Optional["SpeculativeConfig"]: An instance of SpeculativeConfig if
@@ -719,26 +739,36 @@ def maybe_create_spec_config(
"num_speculative_tokens to be provided, but found "
f"{speculative_model=} and {num_speculative_tokens=}.")
+ assert (speculative_model is not None
+ and num_speculative_tokens is not None)
+
+ if enable_chunked_prefill:
+ raise ValueError(
+ "Speculative decoding and chunked prefill are "
+ f"currently mutually exclusive ({enable_chunked_prefill=}).")
+
+ if not use_v2_block_manager:
+ raise ValueError(
+ "Speculative decoding requires usage of the V2 "
+ "block manager. Enable it with --use-v2-block-manager.")
+
# TODO: The user should be able to specify revision/quantization/max
# model len for the draft model. It is not currently supported.
draft_revision = None
draft_code_revision = None
draft_quantization = None
- draft_max_model_len = None
draft_model_config = ModelConfig(
model=speculative_model,
tokenizer=target_model_config.tokenizer,
tokenizer_mode=target_model_config.tokenizer_mode,
trust_remote_code=target_model_config.trust_remote_code,
- download_dir=target_model_config.download_dir,
- load_format=target_model_config.load_format,
dtype=target_model_config.dtype,
seed=target_model_config.seed,
revision=draft_revision,
code_revision=draft_code_revision,
tokenizer_revision=target_model_config.tokenizer_revision,
- max_model_len=draft_max_model_len,
+ max_model_len=None,
quantization=draft_quantization,
enforce_eager=target_model_config.enforce_eager,
max_context_len_to_capture=target_model_config.
@@ -746,6 +776,13 @@ def maybe_create_spec_config(
max_logprobs=target_model_config.max_logprobs,
)
+ draft_model_config.max_model_len = (
+ SpeculativeConfig._maybe_override_draft_max_model_len(
+ speculative_max_model_len,
+ draft_model_config.max_model_len,
+ target_model_config.max_model_len,
+ ))
+
draft_parallel_config = (
SpeculativeConfig.create_draft_parallel_config(
target_parallel_config))
@@ -756,6 +793,41 @@ def maybe_create_spec_config(
num_speculative_tokens,
)
+ @staticmethod
+ def _maybe_override_draft_max_model_len(
+ speculative_max_model_len: Optional[int],
+ draft_max_model_len: int,
+ target_max_model_len: int,
+ ) -> int:
+ """Determine the max sequence len for the draft model. This is usually
+ the draft_max_model_len, but may be the target_max_model_len if it is
+ less than the draft_max_model_len, or may be speculative_max_model_len
+ if it is specified.
+
+ This is necessary so that sequences do not exceed the capacity of the
+ draft model or the target model.
+
+ speculative_max_model_len is mainly used for testing that sequences can
+ skip speculation.
+ """
+
+ if speculative_max_model_len is not None:
+
+ if speculative_max_model_len > draft_max_model_len:
+ raise ValueError(f"{speculative_max_model_len=} cannot be "
+ f"larger than {draft_max_model_len=}")
+
+ if speculative_max_model_len > target_max_model_len:
+ raise ValueError(f"{speculative_max_model_len=} cannot be "
+ f"larger than {target_max_model_len=}")
+
+ return speculative_max_model_len
+
+ return min(
+ draft_max_model_len,
+ target_max_model_len,
+ )
+
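A quick check of the override rule documented above (illustrative numbers): without an explicit speculative_max_model_len the draft length is capped by the target length, and an explicit value wins as long as it fits both:

from vllm.config import SpeculativeConfig

f = SpeculativeConfig._maybe_override_draft_max_model_len
assert f(None, draft_max_model_len=2048, target_max_model_len=4096) == 2048
assert f(None, draft_max_model_len=4096, target_max_model_len=2048) == 2048
assert f(1024, draft_max_model_len=2048, target_max_model_len=4096) == 1024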
@staticmethod
def create_draft_parallel_config(
target_parallel_config: ParallelConfig) -> ParallelConfig:
@@ -1024,7 +1096,7 @@ def _get_and_verify_max_len(
derived_max_model_len = default_max_len
rope_scaling = getattr(hf_config, "rope_scaling", None)
- if rope_scaling is not None:
+ if rope_scaling is not None and rope_scaling["type"] != "su":
assert "factor" in rope_scaling
scaling_factor = rope_scaling["factor"]
if rope_scaling["type"] == "yarn":
@@ -1033,7 +1105,7 @@ def _get_and_verify_max_len(
derived_max_model_len *= scaling_factor
if max_model_len is None:
- max_model_len = derived_max_model_len
+ max_model_len = int(derived_max_model_len)
elif max_model_len > derived_max_model_len:
# Some models might have a separate key for specifying model_max_length
# that will be bigger than derived_max_model_len. We compare user input
@@ -1052,6 +1124,21 @@ def _get_and_verify_max_len(
return int(max_model_len)
+@dataclass
+class DecodingConfig:
+ """Dataclass which contains the decoding strategy of the engine"""
+
+ # Which guided decoding algo to use. 'outlines' / 'lm-format-enforcer'
+ guided_decoding_backend: str = 'outlines'
+
+ def __post_init__(self):
+ valid_guided_backends = ['outlines', 'lm-format-enforcer']
+ backend = self.guided_decoding_backend
+ if backend not in valid_guided_backends:
+ raise ValueError(f"Invalid guided_decoding_backend '{backend},"
+ f"must be one of {valid_guided_backends}")
+
+
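A short sketch of the new DecodingConfig: it defaults to the 'outlines' backend and rejects anything outside the valid set:

import pytest
from vllm.config import DecodingConfig

assert DecodingConfig().guided_decoding_backend == "outlines"
with pytest.raises(ValueError):
    DecodingConfig(guided_decoding_backend="not-a-backend")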
@dataclass(frozen=True)
class EngineConfig:
"""Dataclass which contains all engine-related configuration. This
@@ -1063,9 +1150,11 @@ class EngineConfig:
parallel_config: ParallelConfig
scheduler_config: SchedulerConfig
device_config: DeviceConfig
+ load_config: LoadConfig
lora_config: Optional[LoRAConfig]
vision_language_config: Optional[VisionLanguageConfig]
speculative_config: Optional[SpeculativeConfig]
+ decoding_config: Optional[DecodingConfig]
def __post_init__(self):
"""Verify configs are valid & consistent with each other.
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index ba061bbc4fbcb..f1b65b2514f76 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -104,7 +104,7 @@ def append_token_ids(self,
token_ids (List[int]): The sequence of token IDs to be appended.
"""
assert self._is_allocated
- assert token_ids, "can't append empty token ids"
+ assert self._blocks is not None
self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
num_lookahead_slots)
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 50c70533c4fbc..f11234a0bf2dd 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -99,7 +99,7 @@ def __init__(
refcounter: RefCounter,
allocator: BlockAllocator,
):
- self._copy_on_writes = defaultdict(list)
+ self._copy_on_writes: Dict[BlockId, List[BlockId]] = defaultdict(list)
self._refcounter = refcounter
self._allocator = allocator
@@ -138,6 +138,8 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
prev_block=block.prev_block).block_id
# Track src/dst copy.
+ assert src_block_id is not None
+ assert block_id is not None
self._copy_on_writes[src_block_id].append(block_id)
return block_id
@@ -180,6 +182,6 @@ def recurse(block: Block, lst: List[Block]) -> None:
recurse(block.prev_block, lst)
lst.append(block)
- all_blocks = []
+ all_blocks: List[Block] = []
recurse(last_block, all_blocks)
return all_blocks
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index fbceacf0ec417..50ce922118124 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -52,8 +52,7 @@ def __call__(
class BlockAllocator(ABC):
@abstractmethod
- def allocate_mutable(self, prev_block: Optional[Block],
- device: Device) -> Block:
+ def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
pass
@abstractmethod
@@ -98,8 +97,7 @@ class NoFreeBlocksError(ValueError):
class DeviceAwareBlockAllocator(BlockAllocator):
@abstractmethod
- def allocate_mutable(self, prev_block: Optional[Block],
- device: Device) -> Block:
+ def allocate_mutable(self, prev_block: Optional[Block]) -> Block:
pass
@abstractmethod
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index e7e3b4dc1e9b4..be093922b84f2 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -2,7 +2,9 @@
from abc import ABC, abstractmethod
from itertools import count, takewhile
from os.path import commonprefix
-from typing import Dict, List, Optional, Set
+from typing import Dict, List, Optional
+from typing import Sequence as GenericSequence
+from typing import Set
from vllm.block import BlockTable, PhysicalTokenBlock
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
@@ -231,10 +233,10 @@ def __init__(
if self.enable_caching:
logger.info("Automatic prefix caching is enabled.")
- self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
- num_gpu_blocks)
- self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
- num_cpu_blocks)
+ self.gpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+ Device.GPU, block_size, num_gpu_blocks)
+ self.cpu_allocator: BlockAllocatorBase = CachedBlockAllocator(
+ Device.CPU, block_size, num_cpu_blocks)
else:
self.gpu_allocator = UncachedBlockAllocator(
Device.GPU, block_size, num_gpu_blocks)
@@ -588,7 +590,8 @@ def get_all_computed_blocks(self, seq: Sequence) -> List[int]:
for b in takewhile(lambda b: b.computed, block_table[:-1])
]
- def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+ def get_common_computed_block_ids(
+ self, seqs: List[Sequence]) -> GenericSequence[int]:
"""Return the block ids that are common for a given sequence group.
Used in prefill (can skip prefill of some blocks).
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index 813e71ad883b2..6339a6baf4161 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -1,5 +1,6 @@
"""A block manager that manages token blocks."""
from typing import Dict, List, Optional
+from typing import Sequence as GenericSequence
from vllm.core.block.block_table import BlockTable
from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
@@ -205,7 +206,8 @@ def mark_blocks_as_computed(self, seq_group: SequenceGroup):
# as computed.
self.block_allocator.mark_blocks_as_computed()
- def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+ def get_common_computed_block_ids(
+ self, seqs: List[Sequence]) -> GenericSequence[int]:
"""Determine which blocks for which we skip prefill.
With prefix caching we can skip prefill for previously-generated blocks.
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 711536bcc97be..56c2c5995c38b 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -1,6 +1,7 @@
import enum
from abc import ABC, abstractmethod
from typing import Dict, List
+from typing import Sequence as GenericSequence
from vllm.sequence import Sequence, SequenceGroup
@@ -103,7 +104,8 @@ def access_all_blocks_in_seq(
pass
@abstractmethod
- def get_common_computed_block_ids(self, seqs: List[Sequence]) -> List[int]:
+ def get_common_computed_block_ids(
+ self, seqs: List[Sequence]) -> GenericSequence[int]:
pass
@abstractmethod
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 411ef494fd93d..99f7a34d336a4 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -42,8 +42,8 @@ class SchedulingBudget:
"""
token_budget: int
max_num_seqs: int
- _requeset_ids_num_batched_tokens: Set[int] = field(default_factory=set)
- _requeset_ids_num_curr_seqs: Set[int] = field(default_factory=set)
+ _requeset_ids_num_batched_tokens: Set[str] = field(default_factory=set)
+ _requeset_ids_num_curr_seqs: Set[str] = field(default_factory=set)
_num_batched_tokens: int = 0
_num_curr_seqs: int = 0
@@ -133,7 +133,7 @@ def is_empty(self) -> bool:
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
and not self.blocks_to_swap_out and not self.blocks_to_copy)
- def _sort_by_lora_ids(self) -> bool:
+ def _sort_by_lora_ids(self):
self.scheduled_seq_groups = sorted(
self.scheduled_seq_groups,
key=lambda g: (g.seq_group.lora_int_id, g.seq_group.request_id))
@@ -297,7 +297,6 @@ def num_decoding_tokens_per_seq(self) -> int:
def add_seq_group(self, seq_group: SequenceGroup) -> None:
# Add sequence groups to the waiting queue.
- logger.debug(f"add_seq_group {seq_group.request_id}")
self.waiting.append(seq_group)
def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
@@ -337,7 +336,8 @@ def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
self.free_seq(seq)
def has_unfinished_seqs(self) -> bool:
- return self.waiting or self.running or self.swapped
+ return len(self.waiting) != 0 or len(self.running) != 0 or len(
+ self.swapped) != 0
def get_num_unfinished_seq_groups(self) -> int:
return len(self.waiting) + len(self.running) + len(self.swapped)
@@ -395,16 +395,16 @@ def _schedule_running(
# We can have up to 1 running prefill at any given time in running
# queue, which means we can guarantee chunk size is at least 1.
assert num_running_tokens != 0
- num_running_seqs = seq_group.get_max_num_running_seqs()
running_queue.popleft()
while not self._can_append_slots(seq_group):
budget.subtract_num_batched_tokens(seq_group.request_id,
num_running_tokens)
+ num_running_seqs = seq_group.get_max_num_running_seqs()
budget.subtract_num_seqs(seq_group.request_id,
num_running_seqs)
if curr_loras is not None and seq_group.lora_int_id > 0:
- curr_loras.pop(seq_group.lora_int_id)
+ curr_loras.remove(seq_group.lora_int_id)
if running_queue:
# Preempt the lowest-priority sequence groups.
@@ -426,7 +426,6 @@ def _schedule_running(
swapped_out.append(seq_group)
break
else:
- logger.debug(f"append slot for {seq_group}")
self._append_slots(seq_group, blocks_to_copy)
is_prefill = seq_group.is_prefill()
if is_prefill:
@@ -440,7 +439,13 @@ def _schedule_running(
token_chunk_size=1))
budget.add_num_batched_tokens(seq_group.request_id,
num_running_tokens)
- budget.add_num_seqs(seq_group.request_id, num_running_seqs)
+ # OPTIMIZATION: Note that get_max_num_running_seqs is
+ # expensive. For the default scheduling chase where
+ # enable_chunking is False, num_seqs are updated before running
+ # this method, so we don't have to update it again here.
+ if enable_chunking:
+ num_running_seqs = seq_group.get_max_num_running_seqs()
+ budget.add_num_seqs(seq_group.request_id, num_running_seqs)
if curr_loras is not None and seq_group.lora_int_id > 0:
curr_loras.add(seq_group.lora_int_id)
@@ -496,7 +501,7 @@ def _schedule_swapped(
now = time.time()
swapped_queue = policy.sort_by_priority(now, swapped_queue)
- leftover_swapped = deque()
+ leftover_swapped: Deque[SequenceGroup] = deque()
while swapped_queue:
seq_group = swapped_queue[0]
@@ -507,7 +512,9 @@ def _schedule_swapped(
lora_int_id = 0
if self.lora_enabled:
lora_int_id = seq_group.lora_int_id
- if (lora_int_id > 0 and lora_int_id not in curr_loras
+ assert curr_loras is not None
+ assert self.lora_config is not None
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
and len(curr_loras) >= self.lora_config.max_loras):
# We don't have a space for another LoRA, so
# we ignore this request for now.
@@ -593,7 +600,7 @@ def _schedule_prefills(
# Copy the queue so that the input queue is not modified.
waiting_queue = deque([s for s in waiting_queue])
- leftover_waiting_sequences = deque()
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
while self._passed_delay(time.time()) and waiting_queue:
seq_group = waiting_queue[0]
@@ -635,6 +642,8 @@ def _schedule_prefills(
lora_int_id = 0
if self.lora_enabled:
lora_int_id = seq_group.lora_int_id
+ assert curr_loras is not None
+ assert self.lora_config is not None
if (self.lora_enabled and lora_int_id > 0
and lora_int_id not in curr_loras
and len(curr_loras) >= self.lora_config.max_loras):
@@ -654,7 +663,7 @@ def _schedule_prefills(
if curr_loras is not None and lora_int_id > 0:
curr_loras.add(lora_int_id)
waiting_queue.popleft()
- self._allocate_and_set_running(seq_group, num_new_tokens)
+ self._allocate_and_set_running(seq_group)
seq_groups.append(
ScheduledSequenceGroup(seq_group=seq_group,
token_chunk_size=num_new_tokens))
@@ -674,7 +683,7 @@ def _schedule_prefills(
def _schedule_default(self) -> SchedulerOutputs:
"""Schedule queued requests.
- The current policy is designed to opimimize the throughput. First,
+ The current policy is designed to optimize the throughput. First,
it batches as many prefill requests as possible. And it schedules
decodes. If there's a pressure on GPU memory, decode requests can
be swapped or preempted.
@@ -757,9 +766,7 @@ def _schedule_default(self) -> SchedulerOutputs:
blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
swapped_in.blocks_to_copy),
ignored_seq_groups=prefills.ignored_seq_groups,
- num_lookahead_slots=(prefills.num_lookahead_slots +
- running_scheduled.num_lookahead_slots +
- swapped_in.num_lookahead_slots),
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
)
def _schedule_chunked_prefill(self):
@@ -780,7 +787,7 @@ def _schedule_chunked_prefill(self):
token_budget=self.scheduler_config.max_num_batched_tokens,
max_num_seqs=self.scheduler_config.max_num_seqs,
)
- curr_loras = set()
+ curr_loras: Set[int] = set()
remaining_waiting, prefills = (self.waiting,
SchedulerPrefillOutputs.create_empty())
@@ -845,9 +852,7 @@ def _schedule_chunked_prefill(self):
blocks_to_copy=merge_dicts(running_scheduled.blocks_to_copy,
swapped_in.blocks_to_copy),
ignored_seq_groups=prefills.ignored_seq_groups,
- num_lookahead_slots=(prefills.num_lookahead_slots +
- running_scheduled.num_lookahead_slots +
- swapped_in.num_lookahead_slots),
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
)
def _schedule(self) -> SchedulerOutputs:
@@ -878,27 +883,6 @@ def _can_swap_in(self, seq_group: SequenceGroup) -> bool:
num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
)
- def _can_append_slots(self, seq_group: SequenceGroup) -> bool:
- """Determine whether or not we have enough space in the KV cache to
- continue generation of the sequence group.
- """
- # Appending slots only occurs in decoding.
- is_prefill = False
-
- return self.block_manager.can_append_slots(
- seq_group=seq_group,
- num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
- )
-
- def _can_swap_in(self, seq_group: SequenceGroup) -> bool:
- # Swapping in is considered decode.
- is_prefill = False
-
- return self.block_manager.can_swap_in(
- seq_group=seq_group,
- num_lookahead_slots=self._get_num_lookahead_slots(is_prefill),
- )
-
def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
# Schedule sequence groups.
# This function call changes the internal states of the scheduler
@@ -972,8 +956,7 @@ def free_finished_seq_groups(self) -> None:
self.running = deque(seq_group for seq_group in self.running
if not seq_group.is_finished())
- def _allocate_and_set_running(self, seq_group: SequenceGroup,
- num_new_tokens: int) -> None:
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
self.block_manager.allocate(seq_group)
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
seq.status = SequenceStatus.RUNNING
@@ -1108,7 +1091,7 @@ def _get_num_lookahead_slots(self, is_prefill: bool) -> int:
def _get_num_new_tokens(self, seq_group: SequenceGroup,
status: SequenceStatus, enable_chunking: bool,
- budget: SchedulingBudget) -> Tuple[int, bool]:
+ budget: SchedulingBudget) -> int:
"""Get the next new tokens to compute for a given sequence group
that's in a given `status`.
diff --git a/vllm/distributed/communication_op.py b/vllm/distributed/communication_op.py
index 1004d626b6a4b..a3e93691a1e8e 100644
--- a/vllm/distributed/communication_op.py
+++ b/vllm/distributed/communication_op.py
@@ -1,5 +1,5 @@
from collections import namedtuple
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
import torch
from torch.distributed import ProcessGroup
@@ -144,7 +144,7 @@ def broadcast_tensor_dict(
tensor_dict: Optional[Dict[Any, Union[torch.Tensor, Any]]] = None,
src: int = 0,
group: Optional[ProcessGroup] = None,
-) -> Dict[Any, Union[torch.Tensor, Any]]:
+) -> Optional[Dict[Any, Union[torch.Tensor, Any]]]:
"""Broadcast the input tensor dictionary."""
group = group or torch.distributed.group.WORLD
ranks = torch.distributed.get_process_group_ranks(group)
@@ -157,10 +157,10 @@ def broadcast_tensor_dict(
rank = torch.distributed.get_rank()
if rank == src:
+ metadata_list: List[Tuple[Any, Any]] = []
assert isinstance(
tensor_dict,
dict), (f"Expecting a dictionary, got {type(tensor_dict)}")
- metadata_list = []
for key, value in tensor_dict.items():
if isinstance(value, torch.Tensor):
assert value.is_cuda, (
@@ -190,10 +190,10 @@ def broadcast_tensor_dict(
torch.distributed.broadcast_object_list(recv_metadata_list,
src=src,
group=group)
- metadata_list = recv_metadata_list[0]
+ assert recv_metadata_list[0] is not None
tensor_dict = {}
async_handles = []
- for key, value in metadata_list:
+ for key, value in recv_metadata_list[0]:
if isinstance(value, TensorMetadata):
tensor = torch.empty(value.size,
dtype=value.dtype,
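
The typing changes above touch the metadata path of `broadcast_tensor_dict`. As a rough, self-contained illustration of that pattern (the source rank broadcasts a list of `(key, metadata-or-value)` pairs first, then the tensor payloads), here is a simplified sketch built only on `torch.distributed` primitives; it is not the vLLM implementation and glosses over TensorMetadata, async handles, and non-CUDA values.

# Simplified sketch of the broadcast-dict pattern; assumes torch.distributed is
# initialized with a CUDA-capable backend (e.g. nccl) and one GPU per rank.
from typing import Any, Dict, List, Optional, Tuple
import torch
import torch.distributed as dist

def broadcast_dict_sketch(tensor_dict: Optional[Dict[Any, Any]] = None,
                          src: int = 0) -> Dict[Any, Any]:
    rank = dist.get_rank()
    if rank == src:
        metadata: List[Tuple[Any, Any]] = []
        for key, value in tensor_dict.items():
            if isinstance(value, torch.Tensor):
                # Send only shape/dtype here; the payload is broadcast below.
                metadata.append((key, (value.size(), value.dtype)))
            else:
                metadata.append((key, value))
        container: List[Any] = [metadata]
    else:
        container = [None]
    dist.broadcast_object_list(container, src=src)
    assert container[0] is not None
    result: Dict[Any, Any] = {}
    for key, value in container[0]:
        if isinstance(value, tuple):  # (size, dtype) placeholder for a tensor
            tensor = (tensor_dict[key] if rank == src else
                      torch.empty(value[0], dtype=value[1], device="cuda"))
            dist.broadcast(tensor, src=src)
            result[key] = tensor
        else:
            result[key] = value
    return result
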
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index 84238d2e46076..9dbb427d91ff1 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -1,5 +1,6 @@
+import os
from contextlib import contextmanager
-from typing import Optional
+from typing import Any, List, Optional
import torch
import torch.distributed as dist
@@ -17,7 +18,7 @@
logger = init_logger(__name__)
-_CA_HANDLE = None
+_CA_HANDLE: Optional["CustomAllreduce"] = None
_IS_CAPTURING = False
_SUPPORTED_WORLD_SIZES = [2, 4, 6, 8]
@@ -42,19 +43,39 @@ def init_custom_ar() -> None:
" disable_custom_all_reduce=True explicitly.", world_size,
str(_SUPPORTED_WORLD_SIZES))
return
- if not _can_p2p(rank, world_size):
+ num_dev = torch.cuda.device_count()
+ # note: num_dev can be larger than world_size if we're only using the
+ # first few GPUs
+ if num_dev < world_size:
logger.warn(
- "Custom allreduce is disabled because your platform lacks GPU P2P"
- " capability or P2P test failed. To silence this warning, specify"
- " disable_custom_all_reduce=True explicitly.")
+ "Cannot test GPU P2P because not all GPUs are visible to the "
+ "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
+ " is set.")
return
- full_nvlink = _is_full_nvlink(rank, world_size)
+ # test NVLink first; this filters out most of the cases
+ # where custom allreduce is not supported
+ if "CUDA_VISIBLE_DEVICES" in os.environ:
+ device_ids = list(
+ map(int, os.environ["CUDA_VISIBLE_DEVICES"].split(",")))
+ else:
+ device_ids = list(range(num_dev))
+ # this checks hardware and driver support for NVLink
+ full_nvlink = _is_full_nvlink(device_ids)
if world_size > 2 and not full_nvlink:
logger.warn(
"Custom allreduce is disabled because it's not supported on more"
" than two PCIe-only GPUs. To silence this warning, specify"
" disable_custom_all_reduce=True explicitly.")
return
+ # test P2P capability; this checks software/CUDA-runtime support.
+ # This is expensive to compute the first time,
+ # so we cache the result.
+ if not _can_p2p(rank, world_size):
+ logger.warn(
+ "Custom allreduce is disabled because your platform lacks GPU P2P"
+ " capability or P2P test failed. To silence this warning, specify"
+ " disable_custom_all_reduce=True explicitly.")
+ return
_CA_HANDLE = CustomAllreduce(rank, world_size, full_nvlink)
@@ -96,7 +117,7 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
ca_handle = get_handle()
# when custom allreduce is disabled, this will be None
if ca_handle is None:
- return
+ return None
if is_capturing():
if torch.cuda.is_current_stream_capturing():
if ca_handle.should_custom_ar(input):
@@ -114,6 +135,8 @@ def custom_all_reduce(input: torch.Tensor) -> Optional[torch.Tensor]:
if ca_handle.should_custom_ar(input):
return ca_handle.all_reduce_unreg(input)
+ return None
+
@contextmanager
def _nvml():
@@ -124,59 +147,41 @@ def _nvml():
pynvml.nvmlShutdown()
-# query if the set of gpus are fully connected by nvlink (1 hop)
@_nvml()
-def _is_full_nvlink(rank, world_size):
- handle = pynvml.nvmlDeviceGetHandleByIndex(rank)
- for i in range(world_size):
- if i != rank:
- try:
- link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i)
- if not link_state:
+def _is_full_nvlink(device_ids: List[int]) -> bool:
+ """
+ Query whether the set of GPUs is fully connected by NVLink (1 hop).
+ Note that `pynvml` is not affected by `CUDA_VISIBLE_DEVICES`,
+ so it works on real physical device ids.
+ """
+ handles = [pynvml.nvmlDeviceGetHandleByIndex(i) for i in device_ids]
+ for i, handle in enumerate(handles):
+ for j, peer_handle in enumerate(handles):
+ if i < j:
+ try:
+ p2p_status = pynvml.nvmlDeviceGetP2PStatus(
+ handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK)
+ if p2p_status != pynvml.NVML_P2P_STATUS_OK:
+ return False
+ except pynvml.NVMLError as error:
+ logger.error(
+ "NVLink detection failed. This is normal if your"
+ " machine has no NVLink equipped.",
+ exc_info=error)
return False
- except pynvml.NVMLError as error:
- logger.info(
- f"NVLink detection failed with message \"{str(error)}\". "
- "This is normal if your machine has no NVLink equipped")
- return False
return True
def _can_p2p(rank: int, world_size: int) -> bool:
- num_dev = torch.cuda.device_count()
- # note: num dev can be larger than world_size if we're only using
- # first few GPUs
- if num_dev < world_size:
- logger.warn(
- "Cannot test GPU P2P because not all GPUs are visible to the "
- "current process. This might be the case if 'CUDA_VISIBLE_DEVICES'"
- " is set.")
- return False
+ from vllm.distributed.utils import gpu_p2p_access_check
for i in range(world_size):
if i == rank:
continue
- if not torch.cuda.can_device_access_peer(rank, i):
- return False
- # on some platforms, P2P support might be buggy and we need
- # additional checks. See also:
- # https://github.com/vllm-project/vllm/issues/2728
- if not _can_actually_p2p(rank, i):
+ if not gpu_p2p_access_check(rank, i):
return False
return True
-# code partly borrowed from
-# https://github.com/turboderp/exllamav2/blob/1c67f97f3d2a968605a9c31ab791a05c85bb7879/exllamav2/compat.py#L10
-# License: MIT
-def _can_actually_p2p(idx_a, idx_b):
- dev_i = f"cuda:{idx_a}"
- dev_j = f"cuda:{idx_b}"
- a = torch.randn(5, device=dev_i) + 123.0
- b = a.to(dev_j)
- c = b.to(dev_i)
- return torch.all(a == c)
-
-
class CustomAllreduce:
# max_size: max supported allreduce size
@@ -221,14 +226,14 @@ def _get_ipc_meta(self, inp: torch.Tensor):
return self._gather_ipc_meta(shard_data)
def _gather_ipc_meta(self, shard_data):
- all_data = [None] * self.world_size
+ all_data: List[Optional[Any]] = [None] * self.world_size
dist.all_gather_object(all_data, shard_data)
handles = []
offsets = []
for i in range(len(all_data)):
- handles.append(all_data[i][0])
- offsets.append(all_data[i][1])
+ handles.append(all_data[i][0]) # type: ignore
+ offsets.append(all_data[i][1]) # type: ignore
return handles, offsets
def register_buffer(self, inp: torch.Tensor):
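
The hunks above reorder the eligibility checks for the custom allreduce: GPU visibility first, then the cheap NVML NVLink topology query, then the expensive (now cached) P2P check. Below is a condensed, illustrative view of that gating order; `nvlink_fully_connected` and `p2p_ok` stand in for `_is_full_nvlink` and `gpu_p2p_access_check` and are passed in rather than imported.

# Condensed view of the check ordering after this change (illustrative only).
import os
from typing import Callable, List
import torch

def custom_allreduce_eligible(
        rank: int, world_size: int,
        nvlink_fully_connected: Callable[[List[int]], bool],
        p2p_ok: Callable[[int, int], bool]) -> bool:
    num_dev = torch.cuda.device_count()
    if num_dev < world_size:
        # Cannot even run the P2P test when not all GPUs are visible.
        return False
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        device_ids = [int(d) for d in
                      os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
    else:
        device_ids = list(range(num_dev))
    # Cheap hardware/driver check first: NVLink topology via NVML.
    if world_size > 2 and not nvlink_fully_connected(device_ids):
        return False
    # Then the expensive software/CUDA-runtime P2P check, which is cached.
    return all(p2p_ok(rank, i) for i in range(world_size) if i != rank)
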
diff --git a/vllm/distributed/device_communicators/pynccl.py b/vllm/distributed/device_communicators/pynccl.py
index 0a8bb860efa1c..e922beba44bfa 100644
--- a/vllm/distributed/device_communicators/pynccl.py
+++ b/vllm/distributed/device_communicators/pynccl.py
@@ -20,50 +20,37 @@
# variable in the code.
import ctypes
-import datetime
-import glob
-import os
+import platform
+from typing import Optional, Union
# ===================== import region =====================
import torch
import torch.distributed as dist
-from torch.distributed import ReduceOp
+from torch.distributed import ProcessGroup, ReduceOp
+from vllm.distributed.parallel_state import get_cpu_world_group, get_local_rank
from vllm.logger import init_logger
+from vllm.utils import find_nccl_library, nccl_integrity_check
logger = init_logger(__name__)
-so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
-
-# check if we have vllm-managed nccl
-vllm_nccl_path = None
-if torch.version.cuda is not None:
- cuda_major = torch.version.cuda.split(".")[0]
- path = os.path.expanduser(
- f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
- files = glob.glob(path)
- vllm_nccl_path = files[0] if files else None
-
-# manually load the nccl library
-if so_file:
- logger.info(
- f"Loading nccl from environment variable VLLM_NCCL_SO_PATH={so_file}")
-else:
- if torch.version.cuda is not None:
- so_file = vllm_nccl_path or "libnccl.so.2"
- elif torch.version.hip is not None:
- so_file = "librccl.so.1"
- else:
- raise ValueError("NCCL only supports CUDA and ROCm backends.")
- logger.info(f"Loading nccl from library {so_file}")
+so_file = find_nccl_library()
try:
+ # load the library in another process.
+ # if it core dumps, it will not crash the current process
+ nccl_integrity_check(so_file)
nccl = ctypes.CDLL(so_file)
except Exception as e:
logger.error(
f"Failed to load NCCL library from {so_file} ."
"It is expected if you are not running on NVIDIA/AMD GPUs."
- "Otherwise please set the environment variable VLLM_NCCL_SO_PATH"
+ "Otherwise, the nccl library might not exist, be corrupted "
+ f"or it does not support the current platform {platform.platform()}."
+ f"One solution is to download libnccl2 version 2.18 from "
+ f"https://developer.download.nvidia.com/compute/cuda/repos/ "
+ f"and extract the libnccl.so.2 file. If you already have the "
+ f"library, please set the environment variable VLLM_NCCL_SO_PATH"
" to point to the correct nccl library path.")
raise e
@@ -73,6 +60,18 @@
ncclResult_t = ctypes.c_int
+_c_ncclGetErrorString = nccl.ncclGetErrorString
+_c_ncclGetErrorString.restype = ctypes.c_char_p
+_c_ncclGetErrorString.argtypes = [ncclResult_t]
+
+
+def NCCL_CHECK(result: ncclResult_t) -> None:
+ if result != 0:
+ error_str = _c_ncclGetErrorString(result)
+ error_str = error_str.decode("utf-8")
+ raise RuntimeError(f"NCCL error: {error_str}")
+
+
# equivalent to c declaration:
# ncclResult_t ncclGetVersion(int *version);
_c_ncclGetVersion = nccl.ncclGetVersion
@@ -82,8 +81,7 @@
def ncclGetVersion() -> str:
version = ctypes.c_int()
- result = _c_ncclGetVersion(ctypes.byref(version))
- assert result == 0
+ NCCL_CHECK(_c_ncclGetVersion(ctypes.byref(version)))
# something like 21903 --> "2.19.3"
version_str = str(version.value)
major = version_str[0].lstrip("0")
@@ -105,8 +103,7 @@ class NcclUniqueId(ctypes.Structure):
def ncclGetUniqueId() -> NcclUniqueId:
unique_id = NcclUniqueId()
- result = _c_ncclGetUniqueId(ctypes.byref(unique_id))
- assert result == 0
+ NCCL_CHECK(_c_ncclGetUniqueId(ctypes.byref(unique_id)))
return unique_id
@@ -121,9 +118,10 @@ def ncclGetUniqueId() -> NcclUniqueId:
ctypes.POINTER(ctypes.c_void_p), ctypes.c_int, NcclUniqueId, ctypes.c_int
]
+ncclDataType_t = ctypes.c_int
-# enums
-class ncclDataType_t(ctypes.c_int):
+
+class ncclDataTypeEnum:
ncclInt8 = 0
ncclChar = 0
ncclUint8 = 1
@@ -142,7 +140,7 @@ class ncclDataType_t(ctypes.c_int):
ncclNumTypes = 10
@classmethod
- def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t':
+ def from_torch(cls, dtype: torch.dtype) -> int:
if dtype == torch.int8:
return cls.ncclInt8
if dtype == torch.uint8:
@@ -162,7 +160,10 @@ def from_torch(cls, dtype: torch.dtype) -> 'ncclDataType_t':
raise ValueError(f"Unsupported dtype: {dtype}")
-class ncclRedOp_t(ctypes.c_int):
+ncclRedOp_t = ctypes.c_int
+
+
+class ncclRedOpTypeEnum:
ncclSum = 0
ncclProd = 1
ncclMax = 2
@@ -171,7 +172,7 @@ class ncclRedOp_t(ctypes.c_int):
ncclNumOps = 5
@classmethod
- def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t':
+ def from_torch(cls, op: ReduceOp) -> int:
if op == ReduceOp.SUM:
return cls.ncclSum
if op == ReduceOp.PRODUCT:
@@ -194,8 +195,8 @@ def from_torch(cls, op: ReduceOp) -> 'ncclRedOp_t':
_c_ncclAllReduce = nccl.ncclAllReduce
_c_ncclAllReduce.restype = ctypes.c_int
_c_ncclAllReduce.argtypes = [
- ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclDataType_t,
- ncclRedOp_t, ctypes.c_void_p, ctypes.c_void_p
+ ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ncclRedOp_t,
+ ncclDataType_t, ctypes.c_void_p, ctypes.c_void_p
]
# equivalent to c declaration:
@@ -209,66 +210,73 @@ class NCCLCommunicator:
def __init__(
self,
- backend=None,
- init_method=None,
- timeout=datetime.timedelta(seconds=10),
- world_size: int = -1,
- rank: int = -1,
- store=None,
- group_name: str = "",
- pg_options=None,
- local_rank: int = -1,
+ group: Optional[ProcessGroup] = None,
+ device: Optional[Union[int, str, torch.device]] = None,
):
- if not dist.is_initialized():
- backend = backend or "nccl"
- assert backend == 'nccl', (
- "only use nccl backend for starting the NCCL communicator")
- dist.init_process_group(backend=backend,
- init_method=init_method,
- timeout=timeout,
- world_size=world_size,
- rank=rank,
- store=store,
- group_name=group_name,
- pg_options=pg_options)
- self.rank = dist.get_rank()
- self.world_size = dist.get_world_size()
- if local_rank == -1:
- local_rank = self.rank
- self.local_rank = local_rank
- # don't use these args, as they can be -1
- # use `self.rank`, `self.local_rank` and `self.world_size` instead
- del world_size, rank, local_rank
- torch.cuda.set_device(self.local_rank)
+ """
+ Args:
+ group: the process group to work on. If None, it will use the
+ default process group.
+ device: the device to bind the NCCLCommunicator to. If None,
+ it will be bound to f"cuda:{local_rank}".
+ It is the caller's responsibility to make sure each communicator
+ is bound to a unique device.
+ """
+ assert dist.is_initialized()
+ group = get_cpu_world_group() if group is None else group
+ assert dist.get_backend(group) != dist.Backend.NCCL, (
+ "NCCLCommunicator should be attached to a non-NCCL group.")
+ self.group = group
+ self.rank = dist.get_rank(group)
+ self.world_size = dist.get_world_size(group)
if self.rank == 0:
self.unique_id = ncclGetUniqueId()
else:
self.unique_id = NcclUniqueId()
- tensor = torch.ByteTensor(list(self.unique_id.internal)).cuda(
- self.local_rank)
- dist.broadcast(tensor, src=0)
- byte_list = tensor.cpu().tolist()
+ tensor = torch.ByteTensor(list(self.unique_id.internal))
+ dist.broadcast(tensor, src=0, group=group)
+ byte_list = tensor.tolist()
for i, byte in enumerate(byte_list):
self.unique_id.internal[i] = byte
self.comm = ctypes.c_void_p()
- result = _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
- self.unique_id, self.rank)
- assert result == 0
- self.stream = torch.cuda.Stream(device=f"cuda:{self.local_rank}")
+ if device is None:
+ local_rank = get_local_rank()
+ device = torch.device(f"cuda:{local_rank}")
+ elif isinstance(device, int):
+ device = torch.device(f"cuda:{device}")
+ elif isinstance(device, str):
+ device = torch.device(device)
+ # now `device` is a `torch.device` object
+ assert isinstance(device, torch.device)
+ self.device = device
+ # nccl communicator and stream will use this device
+ # `torch.cuda.device` is a context manager that changes the
+ # current cuda device to the specified one
+ with torch.cuda.device(device):
+ NCCL_CHECK(
+ _c_ncclCommInitRank(ctypes.byref(self.comm), self.world_size,
+ self.unique_id, self.rank))
+ self.stream = torch.cuda.Stream()
def all_reduce(self,
tensor: torch.Tensor,
op: ReduceOp = ReduceOp.SUM,
stream=None):
+ # nccl communicator created on a specific device
+ # will only work on tensors on the same device
+ # otherwise it will cause "illegal memory access"
+ assert tensor.device == self.device, (
+ f"this nccl communicator is created to work on {self.device}, "
+ f"but the input tensor is on {tensor.device}")
if stream is None:
stream = self.stream
- result = _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
- ctypes.c_void_p(tensor.data_ptr()),
- tensor.numel(),
- ncclDataType_t.from_torch(tensor.dtype),
- ncclRedOp_t.from_torch(op), self.comm,
- ctypes.c_void_p(stream.cuda_stream))
- assert result == 0
+ NCCL_CHECK(
+ _c_ncclAllReduce(ctypes.c_void_p(tensor.data_ptr()),
+ ctypes.c_void_p(tensor.data_ptr()),
+ tensor.numel(),
+ ncclDataTypeEnum.from_torch(tensor.dtype),
+ ncclRedOpTypeEnum.from_torch(op), self.comm,
+ ctypes.c_void_p(stream.cuda_stream)))
def __del__(self):
# `dist` module might have been already destroyed
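
The refactored `NCCLCommunicator` above no longer bootstraps its own process group; it attaches to an existing non-NCCL group and binds to one device. A minimal usage sketch follows, under the assumption that the script is launched with `torchrun` (which sets `LOCAL_RANK`) and that each rank owns one GPU; treat it as illustrative rather than a supported entry point.

# Hypothetical usage; run with e.g. `torchrun --nproc-per-node=2 demo.py`.
import os
import torch
import torch.distributed as dist
from vllm.distributed.device_communicators.pynccl import NCCLCommunicator

dist.init_process_group(backend="gloo")   # non-NCCL group for bootstrapping
local_rank = int(os.environ["LOCAL_RANK"])
torch.cuda.set_device(local_rank)

# Bind the communicator to this rank's GPU; the unique id is exchanged over gloo.
comm = NCCLCommunicator(group=dist.group.WORLD, device=local_rank)

x = torch.ones(8, device=f"cuda:{local_rank}")
# Reduce on the current stream to avoid cross-stream ordering issues.
comm.all_reduce(x, stream=torch.cuda.current_stream())
torch.cuda.synchronize()
print(x)   # each element now equals the world size
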
diff --git a/vllm/distributed/device_communicators/pynccl_utils.py b/vllm/distributed/device_communicators/pynccl_utils.py
index aeb73015733d1..a717fddb695ba 100644
--- a/vllm/distributed/device_communicators/pynccl_utils.py
+++ b/vllm/distributed/device_communicators/pynccl_utils.py
@@ -2,7 +2,7 @@
from typing import Optional
import torch
-from torch.distributed import ReduceOp
+from torch.distributed import ProcessGroup, ReduceOp
from vllm.logger import init_logger
@@ -30,28 +30,24 @@ def is_initialized() -> bool:
def set_pynccl_stream(stream: torch.cuda.Stream):
"""Set the cuda stream for communication"""
try:
+ assert comm is not None
comm.stream = stream
yield
finally:
pass
-def init_process_group(world_size: int,
- rank: int,
- init_method: str,
- local_rank: int = -1) -> None:
+def init_process_group(group: Optional[ProcessGroup] = None) -> None:
assert not is_initialized()
global comm
logger.info(f"vLLM is using nccl=={ncclGetVersion()}")
- comm = NCCLCommunicator(init_method=init_method,
- world_size=world_size,
- local_rank=local_rank,
- rank=rank)
+ comm = NCCLCommunicator(group=group)
def all_reduce(input_: torch.Tensor, op=ReduceOp.SUM) -> None:
"""All-reduces the input tensor across the process group."""
assert input_.is_cuda, f"{input_} should be a cuda tensor"
+ assert comm is not None
comm.all_reduce(input_, op)
@@ -62,8 +58,9 @@ def destroy_process_group() -> None:
def get_world_size() -> int:
"""Returns the world size."""
+ assert comm is not None
return comm.world_size
-def get_nccl_backend():
+def get_nccl_backend() -> Optional["NCCLCommunicator"]:
return comm
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 9fceffe7cb88b..515f2212511b7 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -4,10 +4,15 @@
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
"""Tensor and pipeline parallel groups."""
import contextlib
+import os
from typing import Optional
import torch
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
# Tensor model parallel group that the current rank belongs to.
_TENSOR_MODEL_PARALLEL_GROUP = None
# Pipeline model parallel group that the current rank belongs to.
@@ -37,6 +42,13 @@
# source rank when broadcasting from the first or last pipeline stage.
_PIPELINE_GLOBAL_RANKS = None
+_LOCAL_RANK = -1
+
+
+def get_local_rank():
+ global _LOCAL_RANK
+ return _LOCAL_RANK
+
def init_distributed_environment(
world_size: int = -1,
@@ -45,6 +57,8 @@ def init_distributed_environment(
local_rank: int = -1,
backend: str = "nccl",
):
+ logger.debug(f"{world_size=} {rank=} {local_rank=} "
+ f"{distributed_init_method=} {backend=}")
if not torch.distributed.is_initialized():
assert distributed_init_method is not None, (
"distributed_init_method must be provided when initializing "
@@ -60,6 +74,13 @@ def init_distributed_environment(
ranks = list(range(torch.distributed.get_world_size()))
_CPU_WORLD_GROUP = torch.distributed.new_group(ranks=ranks,
backend="gloo")
+ # set the local rank
+ # local_rank is not available in torch ProcessGroup,
+ # see https://github.com/pytorch/pytorch/issues/122816
+ if local_rank == -1 and distributed_init_method == "env://":
+ local_rank = int(os.environ['LOCAL_RANK'])
+ global _LOCAL_RANK
+ _LOCAL_RANK = local_rank
def initialize_model_parallel(
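
The `_LOCAL_RANK` bookkeeping above only auto-detects the local rank when the `env://` init method is used, i.e. when the launcher exports `LOCAL_RANK`. A tiny illustration of where that value comes from (not vLLM code):

# With `torchrun --nproc-per-node=4 worker.py`, each worker process receives
# LOCAL_RANK in its environment; init_distributed_environment picks it up when
# no explicit local_rank argument is given and the init method is "env://".
import os

local_rank = int(os.environ.get("LOCAL_RANK", "0"))
device = f"cuda:{local_rank}"   # the GPU this worker should bind to
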
diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py
index 0cd420c8e11b5..e0a871ebe1756 100644
--- a/vllm/distributed/utils.py
+++ b/vllm/distributed/utils.py
@@ -2,9 +2,18 @@
# Adapted from
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-from typing import Sequence
+import json
+import os
+from typing import Dict, Optional, Sequence
import torch
+import torch.distributed as dist
+
+from vllm.logger import init_logger
+
+from .parallel_state import get_cpu_world_group, get_local_rank
+
+logger = init_logger(__name__)
def ensure_divisibility(numerator, denominator):
@@ -46,3 +55,79 @@ def split_tensor_along_last_dim(
return tuple(chunk.contiguous() for chunk in tensor_list)
return tensor_list
+
+
+# code partly borrowed from
+# https://github.com/turboderp/exllamav2/blob/1c67f97f3d2a968605a9c31ab791a05c85bb7879/exllamav2/compat.py#L10
+# License: MIT
+def _can_actually_p2p(idx_a, idx_b):
+ dev_i = f"cuda:{idx_a}"
+ dev_j = f"cuda:{idx_b}"
+ a = torch.randn(5, device=dev_i) + 123.0
+ b = a.to(dev_j)
+ c = b.to(dev_i)
+ return torch.all(a == c).cpu().item()
+
+
+# Why do we need this cache?
+# 1. we can have runtime checks for P2P access, where every process checks
+# P2P access to all other GPUs. Unfortunately, the test may create many
+# (world_size * world_size) CUDA contexts and reduce the memory available
+# for the model. See https://github.com/vllm-project/vllm/issues/3821
+# 2. alternatively, we can have a P2P map that is generated by the master
+# process and broadcast to all other processes. This still requires
+# #world_size CUDA contexts, belonging to the master process, on each GPU.
+# 3. we can have a cache file that records the P2P access status. The first
+# time the master process checks the P2P access, it will generate the cache
+# file, at the cost of #world_size CUDA contexts. Later on, all processes
+# can read the cache file to check the P2P access status without creating
+# any additional CUDA contexts.
+# Note that the cache file is suffixed by CUDA_VISIBLE_DEVICES, so that we
+# can have different cache files for different CUDA_VISIBLE_DEVICES settings,
+# e.g. used by different vllm engines. The device id in the cache file is a
+# **local** device id, i.e. from 0 to num_dev-1, where num_dev is the number
+# of visible devices in the vllm engine.
+_gpu_p2p_access_cache: Optional[Dict[str, bool]] = None
+
+
+def gpu_p2p_access_check(i: int, j: int) -> bool:
+ """Check if GPU i can access GPU j."""
+
+ # if the cache variable is already calculated,
+ # read from the cache instead of checking it again
+ global _gpu_p2p_access_cache
+ if _gpu_p2p_access_cache is not None:
+ return _gpu_p2p_access_cache[f"{i}->{j}"]
+
+ is_distributed = dist.is_initialized()
+
+ num_dev = torch.cuda.device_count()
+ cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
+ if cuda_visible_devices is None:
+ cuda_visible_devices = ",".join(str(i) for i in range(num_dev))
+ path = os.path.expanduser(
+ f"~/.config/vllm/gpu_p2p_access_cache_for_{cuda_visible_devices}.json")
+ os.makedirs(os.path.dirname(path), exist_ok=True)
+ if (not is_distributed or get_local_rank() == 0) \
+ and (not os.path.exists(path)):
+ # only the local master process (with local_rank == 0) can
+ # enter this block to calculate the cache
+ logger.info(f"generating GPU P2P access cache for in {path}")
+ cache = {}
+ for _i in range(num_dev):
+ for _j in range(num_dev):
+ # on some platforms, P2P support might be buggy and we need
+ # additional checks. See also:
+ # https://github.com/vllm-project/vllm/issues/2728
+ cache[f"{_i}->{_j}"] = torch.cuda.can_device_access_peer(
+ _i, _j) and _can_actually_p2p(_i, _j)
+ with open(path, "w") as f:
+ json.dump(cache, f, indent=4)
+ if is_distributed:
+ cpu_world_group = get_cpu_world_group()
+ dist.barrier(cpu_world_group)
+ logger.info(f"reading GPU P2P access cache from {path}")
+ with open(path, "r") as f:
+ cache = json.load(f)
+ _gpu_p2p_access_cache = cache
+ return _gpu_p2p_access_cache[f"{i}->{j}"]
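
The cache written by `gpu_p2p_access_check` is a flat JSON map keyed by local device-id pairs and suffixed by `CUDA_VISIBLE_DEVICES`. For a hypothetical two-GPU host with `CUDA_VISIBLE_DEVICES=0,1`, the file `~/.config/vllm/gpu_p2p_access_cache_for_0,1.json` would contain something like the dict below (actual values depend on the platform):

# Hypothetical contents of the P2P access cache on a 2-GPU host; keys are
# "i->j" for *local* device ids, values say whether GPU i can access GPU j.
example_cache = {
    "0->0": True, "0->1": True,
    "1->0": True, "1->1": True,
}

# Once loaded, a check is a plain dictionary lookup:
i, j = 0, 1
can_access = example_cache[f"{i}->{j}"]
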
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 12ef26cf83180..ae623b6f2600a 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -5,10 +5,11 @@
from dataclasses import dataclass
from typing import Optional
-from vllm.config import (CacheConfig, DeviceConfig, EngineConfig, LoRAConfig,
- ModelConfig, ParallelConfig, SchedulerConfig,
- SpeculativeConfig, TokenizerPoolConfig,
- VisionLanguageConfig)
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig,
+ EngineConfig, LoadConfig, LoRAConfig, ModelConfig,
+ ParallelConfig, SchedulerConfig, SpeculativeConfig,
+ TokenizerPoolConfig, VisionLanguageConfig)
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
from vllm.utils import str_to_int_tuple
@@ -17,6 +18,7 @@ class EngineArgs:
"""Arguments for vLLM engine."""
model: str
tokenizer: Optional[str] = None
+ skip_tokenizer_init: bool = False
tokenizer_mode: str = 'auto'
trust_remote_code: bool = False
download_dir: Optional[str] = None
@@ -61,19 +63,21 @@ class EngineArgs:
ray_workers_use_nsight: bool = False
num_gpu_blocks_override: Optional[int] = None
num_lookahead_slots: int = 0
+ model_loader_extra_config: Optional[dict] = None
# Related to Vision-language models such as llava
image_input_type: Optional[str] = None
image_token_id: Optional[int] = None
image_input_shape: Optional[str] = None
image_feature_size: Optional[int] = None
-
scheduler_delay_factor: float = 0.0
enable_chunked_prefill: bool = False
+ guided_decoding_backend: str = 'outlines'
# Speculative decoding configuration.
speculative_model: Optional[str] = None
num_speculative_tokens: Optional[int] = None
+ speculative_max_model_len: Optional[int] = None
def __post_init__(self):
if self.tokenizer is None:
@@ -84,72 +88,79 @@ def add_cli_args(
parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
"""Shared CLI arguments for vLLM engine."""
- # NOTE: If you update any of the arguments below, please also
- # make sure to update docs/source/models/engine_args.rst
-
# Model arguments
parser.add_argument(
'--model',
type=str,
default='facebook/opt-125m',
- help='name or path of the huggingface model to use')
+ help='Name or path of the huggingface model to use.')
parser.add_argument(
'--tokenizer',
type=str,
default=EngineArgs.tokenizer,
- help='name or path of the huggingface tokenizer to use')
+ help='Name or path of the huggingface tokenizer to use.')
+ parser.add_argument(
+ '--skip-tokenizer-init',
+ action='store_true',
+ help='Skip initialization of tokenizer and detokenizer')
parser.add_argument(
'--revision',
type=str,
default=None,
- help='the specific model version to use. It can be a branch '
+ help='The specific model version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
parser.add_argument(
'--code-revision',
type=str,
default=None,
- help='the specific revision to use for the model code on '
+ help='The specific revision to use for the model code on '
'Hugging Face Hub. It can be a branch name, a tag name, or a '
'commit id. If unspecified, will use the default version.')
parser.add_argument(
'--tokenizer-revision',
type=str,
default=None,
- help='the specific tokenizer version to use. It can be a branch '
+ help='The specific tokenizer version to use. It can be a branch '
'name, a tag name, or a commit id. If unspecified, will use '
'the default version.')
- parser.add_argument('--tokenizer-mode',
- type=str,
- default=EngineArgs.tokenizer_mode,
- choices=['auto', 'slow'],
- help='tokenizer mode. "auto" will use the fast '
- 'tokenizer if available, and "slow" will '
- 'always use the slow tokenizer.')
+ parser.add_argument(
+ '--tokenizer-mode',
+ type=str,
+ default=EngineArgs.tokenizer_mode,
+ choices=['auto', 'slow'],
+ help='The tokenizer mode.\n\n* "auto" will use the '
+ 'fast tokenizer if available.\n* "slow" will '
+ 'always use the slow tokenizer.')
parser.add_argument('--trust-remote-code',
action='store_true',
- help='trust remote code from huggingface')
+ help='Trust remote code from huggingface.')
parser.add_argument('--download-dir',
type=str,
default=EngineArgs.download_dir,
- help='directory to download and load the weights, '
+ help='Directory to download and load the weights, '
'default to the default cache dir of '
- 'huggingface')
+ 'huggingface.')
parser.add_argument(
'--load-format',
type=str,
default=EngineArgs.load_format,
- choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
- help='The format of the model weights to load. '
- '"auto" will try to load the weights in the safetensors format '
+ choices=[
+ 'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
+ ],
+ help='The format of the model weights to load.\n\n'
+ '* "auto" will try to load the weights in the safetensors format '
'and fall back to the pytorch bin format if safetensors format '
- 'is not available. '
- '"pt" will load the weights in the pytorch bin format. '
- '"safetensors" will load the weights in the safetensors format. '
- '"npcache" will load the weights in pytorch format and store '
- 'a numpy cache to speed up the loading. '
- '"dummy" will initialize the weights with random values, '
- 'which is mainly for profiling.')
+ 'is not available.\n'
+ '* "pt" will load the weights in the pytorch bin format.\n'
+ '* "safetensors" will load the weights in the safetensors format.\n'
+ '* "npcache" will load the weights in pytorch format and store '
+ 'a numpy cache to speed up the loading.\n'
+ '* "dummy" will initialize the weights with random values, '
+ 'which is mainly for profiling.\n'
+ '* "tensorizer" will load the weights using tensorizer from '
+ 'CoreWeave which assumes tensorizer_uri is set to the location of '
+ 'the serialized weights.')
parser.add_argument(
'--dtype',
type=str,
@@ -157,10 +168,14 @@ def add_cli_args(
choices=[
'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
],
- help='data type for model weights and activations. '
- 'The "auto" option will use FP16 precision '
- 'for FP32 and FP16 models, and BF16 precision '
- 'for BF16 models.')
+ help='Data type for model weights and activations.\n\n'
+ '* "auto" will use FP16 precision for FP32 and FP16 models, and '
+ 'BF16 precision for BF16 models.\n'
+ '* "half" for FP16. Recommended for AWQ quantization.\n'
+ '* "float16" is the same as "half".\n'
+ '* "bfloat16" for a balance between precision and range.\n'
+ '* "float" is shorthand for FP32 precision.\n'
+ '* "float32" for FP32 precision.')
parser.add_argument(
'--kv-cache-dtype',
type=str,
@@ -169,7 +184,7 @@ def add_cli_args(
help='Data type for kv cache storage. If "auto", will use model '
'data type. FP8_E5M2 (without scaling) is only supported on cuda '
'version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
- 'supported for common inference criteria. ')
+ 'supported for common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
@@ -180,51 +195,63 @@ def add_cli_args(
'default to 1.0, which may cause accuracy issues. '
'FP8_E5M2 (without scaling) is only supported on cuda version '
'greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead '
- 'supported for common inference criteria. ')
+ 'supported for common inference criteria.')
parser.add_argument('--max-model-len',
type=int,
default=EngineArgs.max_model_len,
- help='model context length. If unspecified, '
- 'will be automatically derived from the model.')
+ help='Model context length. If unspecified, will '
+ 'be automatically derived from the model config.')
+ parser.add_argument(
+ '--guided-decoding-backend',
+ type=str,
+ default='outlines',
+ choices=['outlines', 'lm-format-enforcer'],
+ help='Which engine will be used for guided decoding'
+ ' (JSON schema / regex etc) by default. Currently supports '
+ 'https://github.com/outlines-dev/outlines and '
+ 'https://github.com/noamgat/lm-format-enforcer.'
+ ' Can be overridden per request via guided_decoding_backend'
+ ' parameter.')
# Parallel arguments
parser.add_argument('--worker-use-ray',
action='store_true',
- help='use Ray for distributed serving, will be '
- 'automatically set when using more than 1 GPU')
+ help='Use Ray for distributed serving, will be '
+ 'automatically set when using more than 1 GPU.')
parser.add_argument('--pipeline-parallel-size',
'-pp',
type=int,
default=EngineArgs.pipeline_parallel_size,
- help='number of pipeline stages')
+ help='Number of pipeline stages.')
parser.add_argument('--tensor-parallel-size',
'-tp',
type=int,
default=EngineArgs.tensor_parallel_size,
- help='number of tensor parallel replicas')
+ help='Number of tensor parallel replicas.')
parser.add_argument(
'--max-parallel-loading-workers',
type=int,
default=EngineArgs.max_parallel_loading_workers,
- help='load model sequentially in multiple batches, '
+ help='Load model sequentially in multiple batches, '
'to avoid RAM OOM when using tensor '
- 'parallel and large models')
+ 'parallel and large models.')
parser.add_argument(
'--ray-workers-use-nsight',
action='store_true',
- help='If specified, use nsight to profile ray workers')
+ help='If specified, use nsight to profile Ray workers.')
# KV cache arguments
parser.add_argument('--block-size',
type=int,
default=EngineArgs.block_size,
- choices=[8, 16, 32, 128],
- help='token block size')
+ choices=[8, 16, 32],
+ help='Token block size for contiguous chunks of '
+ 'tokens.')
parser.add_argument('--enable-prefix-caching',
action='store_true',
- help='Enables automatic prefix caching')
+ help='Enables automatic prefix caching.')
parser.add_argument('--use-v2-block-manager',
action='store_true',
- help='Use BlockSpaceMangerV2')
+ help='Use BlockSpaceManagerV2.')
parser.add_argument(
'--num-lookahead-slots',
type=int,
@@ -237,18 +264,19 @@ def add_cli_args(
parser.add_argument('--seed',
type=int,
default=EngineArgs.seed,
- help='random seed')
+ help='Random seed for operations.')
parser.add_argument('--swap-space',
type=int,
default=EngineArgs.swap_space,
- help='CPU swap space size (GiB) per GPU')
+ help='CPU swap space size (GiB) per GPU.')
parser.add_argument(
'--gpu-memory-utilization',
type=float,
default=EngineArgs.gpu_memory_utilization,
- help='the fraction of GPU memory to be used for '
- 'the model executor, which can range from 0 to 1.'
- 'If unspecified, will use the default value of 0.9.')
+ help='The fraction of GPU memory to be used for the model '
+ 'executor, which can range from 0 to 1. For example, a value of '
+ '0.5 would imply 50%% GPU memory utilization. If unspecified, '
+ 'will use the default value of 0.9.')
parser.add_argument(
'--num-gpu-blocks-override',
type=int,
@@ -258,26 +286,26 @@ def add_cli_args(
parser.add_argument('--max-num-batched-tokens',
type=int,
default=EngineArgs.max_num_batched_tokens,
- help='maximum number of batched tokens per '
- 'iteration')
+ help='Maximum number of batched tokens per '
+ 'iteration.')
parser.add_argument('--max-num-seqs',
type=int,
default=EngineArgs.max_num_seqs,
- help='maximum number of sequences per iteration')
+ help='Maximum number of sequences per iteration.')
parser.add_argument(
'--max-logprobs',
type=int,
default=EngineArgs.max_logprobs,
- help=('max number of log probs to return logprobs is specified in'
- ' SamplingParams'))
+ help=('Max number of log probs to return when logprobs is specified in'
+ ' SamplingParams.'))
parser.add_argument('--disable-log-stats',
action='store_true',
- help='disable logging statistics')
+ help='Disable logging statistics.')
# Quantization settings.
parser.add_argument('--quantization',
'-q',
type=str,
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=EngineArgs.quantization,
help='Method used to quantize the weights. If '
'None, we first check the `quantization_config` '
@@ -304,13 +332,13 @@ def add_cli_args(
parser.add_argument('--max-context-len-to-capture',
type=int,
default=EngineArgs.max_context_len_to_capture,
- help='maximum context length covered by CUDA '
+ help='Maximum context length covered by CUDA '
'graphs. When a sequence has context length '
'larger than this, we fall back to eager mode.')
parser.add_argument('--disable-custom-all-reduce',
action='store_true',
default=EngineArgs.disable_custom_all_reduce,
- help='See ParallelConfig')
+ help='See ParallelConfig.')
parser.add_argument('--tokenizer-pool-size',
type=int,
default=EngineArgs.tokenizer_pool_size,
@@ -403,21 +431,39 @@ def add_cli_args(
'--enable-chunked-prefill',
action='store_true',
help='If set, the prefill requests can be chunked based on the '
- 'max_num_batched_tokens')
+ 'max_num_batched_tokens.')
parser.add_argument(
'--speculative-model',
type=str,
- default=None,
+ default=EngineArgs.speculative_model,
help=
'The name of the draft model to be used in speculative decoding.')
parser.add_argument(
'--num-speculative-tokens',
type=int,
- default=None,
+ default=EngineArgs.num_speculative_tokens,
help='The number of speculative tokens to sample from '
- 'the draft model in speculative decoding')
+ 'the draft model in speculative decoding.')
+
+ parser.add_argument(
+ '--speculative-max-model-len',
+ type=int,
+ default=EngineArgs.speculative_max_model_len,
+ help='The maximum sequence length supported by the '
+ 'draft model. Sequences over this length will skip '
+ 'speculation.')
+
+ parser.add_argument('--model-loader-extra-config',
+ type=str,
+ default=EngineArgs.model_loader_extra_config,
+ help='Extra config for model loader. '
+ 'This will be passed to the model loader '
+ 'corresponding to the chosen load_format. '
+ 'This should be a JSON string that will be '
+ 'parsed into a dictionary.')
+
return parser
@classmethod
@@ -435,21 +481,20 @@ def create_engine_config(self, ) -> EngineConfig:
self.tokenizer,
self.tokenizer_mode,
self.trust_remote_code,
- self.download_dir,
- self.load_format,
self.dtype,
self.seed,
self.revision,
self.code_revision,
self.tokenizer_revision,
self.max_model_len,
+ # UPSTREAM SYNC: keep sparsity argument
self.quantization,
self.quantization_param_path,
- # UPSTREAM SYNC: keep sparsity argument
self.sparsity,
self.enforce_eager,
self.max_context_len_to_capture,
- self.max_logprobs)
+ self.max_logprobs,
+ self.skip_tokenizer_init)
cache_config = CacheConfig(self.block_size,
self.gpu_memory_utilization,
self.swap_space, self.kv_cache_dtype,
@@ -472,6 +517,9 @@ def create_engine_config(self, ) -> EngineConfig:
target_dtype=self.dtype,
speculative_model=self.speculative_model,
num_speculative_tokens=self.num_speculative_tokens,
+ speculative_max_model_len=self.speculative_max_model_len,
+ enable_chunked_prefill=self.enable_chunked_prefill,
+ use_v2_block_manager=self.use_v2_block_manager,
)
scheduler_config = SchedulerConfig(
@@ -493,6 +541,12 @@ def create_engine_config(self, ) -> EngineConfig:
max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
and self.max_cpu_loras > 0 else None) if self.enable_lora else None
+ load_config = LoadConfig(
+ load_format=self.load_format,
+ download_dir=self.download_dir,
+ model_loader_extra_config=self.model_loader_extra_config,
+ )
+
if self.image_input_type:
if (not self.image_token_id or not self.image_input_shape
or not self.image_feature_size):
@@ -509,6 +563,9 @@ def create_engine_config(self, ) -> EngineConfig:
else:
vision_language_config = None
+ decoding_config = DecodingConfig(
+ guided_decoding_backend=self.guided_decoding_backend)
+
return EngineConfig(model_config=model_config,
cache_config=cache_config,
parallel_config=parallel_config,
@@ -516,7 +573,9 @@ def create_engine_config(self, ) -> EngineConfig:
device_config=device_config,
lora_config=lora_config,
vision_language_config=vision_language_config,
- speculative_config=speculative_config)
+ speculative_config=speculative_config,
+ load_config=load_config,
+ decoding_config=decoding_config)
@dataclass
@@ -527,20 +586,31 @@ class AsyncEngineArgs(EngineArgs):
max_log_len: Optional[int] = None
@staticmethod
- def add_cli_args(
- parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
- parser = EngineArgs.add_cli_args(parser)
+ def add_cli_args(parser: argparse.ArgumentParser,
+ async_args_only: bool = False) -> argparse.ArgumentParser:
+ if not async_args_only:
+ parser = EngineArgs.add_cli_args(parser)
parser.add_argument('--engine-use-ray',
action='store_true',
- help='use Ray to start the LLM engine in a '
+ help='Use Ray to start the LLM engine in a '
'separate process as the server process.')
parser.add_argument('--disable-log-requests',
action='store_true',
- help='disable logging requests')
+ help='Disable logging requests.')
parser.add_argument('--max-log-len',
type=int,
default=None,
- help='max number of prompt characters or prompt '
- 'ID numbers being printed in log. '
- 'Default: unlimited.')
+ help='Max number of prompt characters or prompt '
+ 'ID numbers being printed in log.'
+ '\n\nDefault: Unlimited')
return parser
+
+
+# These functions are used by sphinx to build the documentation
+def _engine_args_parser():
+ return EngineArgs.add_cli_args(argparse.ArgumentParser())
+
+
+def _async_engine_args_parser():
+ return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
+ async_args_only=True)
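
The two helper parsers above exist so Sphinx can render the CLI reference, but the same `add_cli_args` path is what entry points use at runtime. A short sketch of wiring it up end to end; `EngineArgs.from_cli_args` is assumed to be available (it is used by vLLM's own entry points), and the flag values are arbitrary examples.

# Sketch: build an argparse parser from EngineArgs, then turn the parsed
# namespace into engine configs.
import argparse
from vllm.engine.arg_utils import EngineArgs

parser = argparse.ArgumentParser(description="vLLM engine demo")
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--guided-decoding-backend", "outlines"])

engine_args = EngineArgs.from_cli_args(args)      # assumed helper
engine_config = engine_args.create_engine_config()
print(engine_config.decoding_config.guided_decoding_backend)  # "outlines"
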
diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index f610495135121..4b007d71e9cfc 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -2,15 +2,15 @@
import os
import time
from functools import partial
-from typing import (AsyncIterator, Callable, Dict, Iterable, List, Optional,
- Set, Tuple, Type, Union)
+from typing import (Any, AsyncIterator, Callable, Dict, Iterable, List,
+ Optional, Set, Tuple, Type, Union)
from transformers import PreTrainedTokenizer
from vllm.config import ModelConfig
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.llm_engine import LLMEngine
-from vllm.engine.ray_utils import initialize_ray_cluster, ray
+from vllm.executor.ray_utils import initialize_ray_cluster, ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
@@ -52,7 +52,7 @@ class AsyncStream:
def __init__(self, request_id: str) -> None:
self.request_id = request_id
- self._queue = asyncio.Queue()
+ self._queue: asyncio.Queue = asyncio.Queue()
self._finished = False
def put(self, item: Union[RequestOutput, Exception]) -> None:
@@ -217,7 +217,15 @@ async def step_async(self) -> List[RequestOutput]:
else:
output = []
- return self._process_model_outputs(output, scheduler_outputs)
+ request_outputs = self._process_model_outputs(
+ output, scheduler_outputs.scheduled_seq_groups,
+ scheduler_outputs.ignored_seq_groups)
+
+ # Log stats.
+ if self.log_stats:
+ self.stat_logger.log(self._get_stats(scheduler_outputs))
+
+ return request_outputs
async def encode_request_async(
self,
@@ -310,15 +318,17 @@ def __init__(self,
self.max_log_len = max_log_len
self.engine = self._init_engine(*args, **kwargs)
- self.background_loop = None
+ self.background_loop: Optional[asyncio.Future] = None
# We need to keep a reference to unshielded
# task as well to prevent it from being garbage
# collected
- self._background_loop_unshielded = None
+ self._background_loop_unshielded: Optional[asyncio.Task[Any]] = None
self.start_engine_loop = start_engine_loop
- self._request_tracker: Optional[RequestTracker] = None
self._errored_with: Optional[BaseException] = None
+ # Lazy initialized fields
+ self._request_tracker: RequestTracker
+
@classmethod
def from_engine_args(
cls,
@@ -331,10 +341,14 @@ def from_engine_args(
engine_config = engine_args.create_engine_config()
if engine_config.device_config.device_type == "neuron":
- raise NotImplementedError("Neuron is not supported for "
- "async engine yet.")
- elif (engine_config.parallel_config.worker_use_ray
- or engine_args.engine_use_ray):
+ from vllm.executor.neuron_executor import NeuronExecutorAsync
+ executor_class = NeuronExecutorAsync
+ elif engine_config.device_config.device_type == "cpu":
+ assert not engine_config.parallel_config.worker_use_ray, (
+ "Ray is not supported with the CPU backend.")
+ from vllm.executor.cpu_executor import CPUExecutorAsync
+ executor_class = CPUExecutorAsync
+ elif engine_config.parallel_config.worker_use_ray:
initialize_ray_cluster(engine_config.parallel_config)
from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
executor_class = RayGPUExecutorAsync
@@ -360,11 +374,13 @@ def from_engine_args(
@property
def is_running(self) -> bool:
return (self.background_loop is not None
+ and self._background_loop_unshielded is not None
and not self._background_loop_unshielded.done())
@property
def is_stopped(self) -> bool:
- return self.errored or (self.background_loop is not None
+ return self.errored or (self.background_loop is not None and
+ self._background_loop_unshielded is not None
and self._background_loop_unshielded.done())
@property
@@ -380,7 +396,7 @@ def _error_callback(self, exc: Exception) -> None:
async def get_tokenizer(self) -> "PreTrainedTokenizer":
if self.engine_use_ray:
- return await self.engine.get_tokenizer.remote()
+ return await self.engine.get_tokenizer.remote() # type: ignore
else:
return self.engine.get_tokenizer()
@@ -410,8 +426,8 @@ def _init_engine(self, *args,
else:
# FIXME(woosuk): This is a bit hacky. Be careful when changing the
# order of the arguments.
- cache_config = args[1]
- parallel_config = args[2]
+ cache_config = kwargs["cache_config"]
+ parallel_config = kwargs["parallel_config"]
if parallel_config.tensor_parallel_size == 1:
num_gpus = cache_config.gpu_memory_utilization
else:
@@ -433,7 +449,8 @@ async def engine_step(self) -> bool:
# TODO: Maybe add add_request_batch to reduce Ray overhead
try:
if self.engine_use_ray:
- await self.engine.add_request.remote(**new_request)
+ await self.engine.add_request.remote( # type: ignore
+ **new_request)
else:
await self.engine.add_request_async(**new_request)
except ValueError as e:
@@ -448,7 +465,7 @@ async def engine_step(self) -> bool:
await self._engine_abort(finished_requests)
if self.engine_use_ray:
- request_outputs = await self.engine.step.remote()
+ request_outputs = await self.engine.step.remote() # type: ignore
else:
request_outputs = await self.engine.step_async()
@@ -461,7 +478,7 @@ async def engine_step(self) -> bool:
async def _engine_abort(self, request_ids: Iterable[str]):
if self.engine_use_ray:
- await self.engine.abort_request.remote(request_ids)
+ await self.engine.abort_request.remote(request_ids) # type: ignore
else:
self.engine.abort_request(request_ids)
@@ -524,11 +541,12 @@ async def add_request(
arrival_time = time.time()
if self.engine_use_ray:
- prompt_token_ids = await self.engine.encode_request_async.remote(
- request_id=request_id,
- prompt=prompt,
- prompt_token_ids=prompt_token_ids,
- lora_request=lora_request)
+ prompt_token_ids = await (
+ self.engine.encode_request_async.remote( # type: ignore
+ request_id=request_id,
+ prompt=prompt,
+ prompt_token_ids=prompt_token_ids,
+ lora_request=lora_request))
else:
prompt_token_ids = await self.engine.encode_request_async(
request_id=request_id,
@@ -675,13 +693,13 @@ def _abort(self, request_id: str) -> None:
async def get_model_config(self) -> ModelConfig:
"""Get the model configuration of the vLLM engine."""
if self.engine_use_ray:
- return await self.engine.get_model_config.remote()
+ return await self.engine.get_model_config.remote() # type: ignore
else:
return self.engine.get_model_config()
async def do_log_stats(self) -> None:
if self.engine_use_ray:
- await self.engine.do_log_stats.remote()
+ await self.engine.do_log_stats.remote() # type: ignore
else:
self.engine.do_log_stats()
@@ -694,7 +712,7 @@ async def check_health(self) -> None:
if self.engine_use_ray:
try:
- await self.engine.check_health.remote()
+ await self.engine.check_health.remote() # type: ignore
except ray.exceptions.RayActorError as e:
raise RuntimeError("Engine is dead.") from e
else:
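
For completeness, here is a hedged sketch of driving `AsyncLLMEngine` directly, outside the API server, using the `from_engine_args` path shown above. The `generate` call signature (prompt, sampling params, request id) matches this version of the file but should be checked against the release in use.

# Hedged sketch; not an officially documented entry point.
import asyncio
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams

async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="facebook/opt-125m"))
    params = SamplingParams(max_tokens=32)
    final = None
    # generate() yields incremental RequestOutput objects for this request id.
    async for output in engine.generate("San Francisco is a", params,
                                        request_id="demo-0"):
        final = output
    if final is not None:
        print(final.outputs[0].text)

if __name__ == "__main__":
    asyncio.run(main())
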
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index 15d7eabbf9de6..f04462db54ef2 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -1,24 +1,28 @@
import time
-from typing import Iterable, List, Optional, Tuple, Type, Union
+from typing import Iterable, List, Optional, Type, Union
-from transformers import PreTrainedTokenizer
+from transformers import GenerationConfig, PreTrainedTokenizer
import vllm
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, SpeculativeConfig,
+from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoadConfig,
+ LoRAConfig, ModelConfig, ParallelConfig,
+ SchedulerConfig, SpeculativeConfig,
VisionLanguageConfig)
from vllm.core.scheduler import Scheduler, SchedulerOutputs
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.metrics import StatLogger, Stats
-from vllm.engine.ray_utils import initialize_ray_cluster
+from vllm.engine.output_processor.interfaces import (
+ SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.engine.output_processor.util import create_output_by_sequence_group
from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.ray_utils import initialize_ray_cluster
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.sequence import (MultiModalData, SamplerOutput, Sequence,
- SequenceGroup, SequenceGroupOutput, SequenceOutput,
- SequenceStatus)
+ SequenceGroup, SequenceStage)
from vllm.transformers_utils.detokenizer import Detokenizer
from vllm.transformers_utils.tokenizer_group import (BaseTokenizerGroup,
get_tokenizer_group)
@@ -30,6 +34,17 @@
_LOCAL_LOGGING_INTERVAL_SEC = 5
+def _load_generation_config_dict(model_config: ModelConfig):
+ try:
+ return GenerationConfig.from_pretrained(
+ model_config.model,
+ revision=model_config.revision,
+ ).to_diff_dict()
+ except OSError:
+ # Not found.
+ return {}
+
+
class LLMEngine:
"""An LLM engine that receives requests and generates texts.
@@ -71,9 +86,11 @@ def __init__(
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
+ load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
speculative_config: Optional[SpeculativeConfig],
+ decoding_config: Optional[DecodingConfig],
executor_class: Type[ExecutorBase],
log_stats: bool,
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
@@ -83,14 +100,15 @@ def __init__(
f"model={model_config.model!r}, "
f"speculative_config={speculative_config!r}, "
f"tokenizer={model_config.tokenizer!r}, "
+ f"skip_tokenizer_init={model_config.skip_tokenizer_init}, "
f"tokenizer_mode={model_config.tokenizer_mode}, "
f"revision={model_config.revision}, "
f"tokenizer_revision={model_config.tokenizer_revision}, "
f"trust_remote_code={model_config.trust_remote_code}, "
f"dtype={model_config.dtype}, "
f"max_seq_len={model_config.max_model_len}, "
- f"download_dir={model_config.download_dir!r}, "
- f"load_format={model_config.load_format}, "
+ f"download_dir={load_config.download_dir!r}, "
+ f"load_format={load_config.load_format}, "
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
f"disable_custom_all_reduce="
f"{parallel_config.disable_custom_all_reduce}, "
@@ -101,6 +119,7 @@ def __init__(
f"kv_cache_dtype={cache_config.cache_dtype}, "
f"quantization_param_path={model_config.quantization_param_path}, "
f"device_config={device_config.device}, "
+ f"decoding_config={decoding_config!r}, "
f"seed={model_config.seed})")
# TODO(woosuk): Print more configs in debug mode.
@@ -112,11 +131,21 @@ def __init__(
self.scheduler_config = scheduler_config
self.device_config = device_config
self.speculative_config = speculative_config
+ self.load_config = load_config
+ self.decoding_config = decoding_config or DecodingConfig()
self.log_stats = log_stats
- self._init_tokenizer()
- self.detokenizer = Detokenizer(self.tokenizer)
+ if not self.model_config.skip_tokenizer_init:
+ self.tokenizer: BaseTokenizerGroup
+ self._init_tokenizer()
+ self.detokenizer = Detokenizer(self.tokenizer)
+ else:
+ self.detokenizer = None
+ self.tokenizer = None
+
self.seq_counter = Counter()
+ self.generation_config_fields = _load_generation_config_dict(
+ model_config)
self.model_executor = executor_class(
model_config=model_config,
@@ -127,6 +156,7 @@ def __init__(
lora_config=lora_config,
vision_language_config=vision_language_config,
speculative_config=speculative_config,
+ load_config=load_config,
)
self._initialize_kv_caches()
@@ -166,9 +196,10 @@ def __init__(
parallel_config.disable_custom_all_reduce,
})
- # Ping the tokenizer to ensure liveness if it runs in a
- # different process.
- self.tokenizer.ping()
+ if self.tokenizer:
+ # Ping the tokenizer to ensure liveness if it runs in a
+ # different process.
+ self.tokenizer.ping()
# Create the scheduler.
# NOTE: the cache_config here have been updated with the numbers of
@@ -182,6 +213,21 @@ def __init__(
labels=dict(model_name=model_config.model))
self.stat_logger.info("cache_config", self.cache_config)
+ # Create sequence output processor, e.g. for beam search or
+ # speculative decoding.
+ self.output_processor = (
+ SequenceGroupOutputProcessor.create_output_processor(
+ self.scheduler_config,
+ self.detokenizer,
+ self.scheduler,
+ self.seq_counter,
+ self.get_tokenizer_for_seq,
+ stop_checker=StopChecker(
+ self.scheduler_config.max_model_len,
+ self.get_tokenizer_for_seq,
+ ),
+ ))
+
def _initialize_kv_caches(self) -> None:
"""Initialize the KV cache in the worker(s).
@@ -260,7 +306,7 @@ def _init_tokenizer(self, **tokenizer_init_kwargs):
trust_remote_code=self.model_config.trust_remote_code,
revision=self.model_config.tokenizer_revision)
init_kwargs.update(tokenizer_init_kwargs)
- self.tokenizer: BaseTokenizerGroup = get_tokenizer_group(
+ self.tokenizer = get_tokenizer_group(
self.parallel_config.tokenizer_pool_config, **init_kwargs)
def _verify_args(self) -> None:
@@ -357,8 +403,13 @@ def add_request(
# Create the sequences.
block_size = self.cache_config.block_size
seq_id = next(self.seq_counter)
- eos_token_id = self.tokenizer.get_lora_tokenizer(
- lora_request).eos_token_id
+ eos_token_id = None
+ if self.tokenizer:
+ eos_token_id = self.tokenizer.get_lora_tokenizer(
+ lora_request).eos_token_id
+ else:
+ logger.warning("Use None for EOS token id because tokenizer is "
+ "not initialized")
seq = Sequence(seq_id, prompt, prompt_token_ids, block_size,
eos_token_id, lora_request)
@@ -368,6 +419,8 @@ def add_request(
# inject the eos token id into the sampling_params to support min_tokens
# processing
sampling_params.eos_token_id = seq.eos_token_id
+ sampling_params.update_from_generation_config(
+ self.generation_config_fields)
# Create the sequence group.
seq_group = SequenceGroup(request_id, [seq], sampling_params,
@@ -407,240 +460,35 @@ def has_unfinished_requests(self) -> bool:
"""Returns True if there are unfinished requests."""
return self.scheduler.has_unfinished_seqs()
- def _check_beam_search_early_stopping(
- self,
- early_stopping: Union[bool, str],
- sampling_params: SamplingParams,
- best_running_seq: Sequence,
- current_worst_seq: Sequence,
- ) -> bool:
- assert sampling_params.use_beam_search
- length_penalty = sampling_params.length_penalty
- if early_stopping is True:
- return True
-
- current_worst_score = current_worst_seq.get_beam_search_score(
- length_penalty=length_penalty,
- eos_token_id=current_worst_seq.eos_token_id)
- if early_stopping is False:
- highest_attainable_score = best_running_seq.get_beam_search_score(
- length_penalty=length_penalty,
- eos_token_id=best_running_seq.eos_token_id)
- else:
- assert early_stopping == "never"
- if length_penalty > 0.0:
- # If length_penalty > 0.0, beam search will prefer longer
- # sequences. The highest attainable score calculation is
- # based on the longest possible sequence length in this case.
- max_possible_length = max(
- best_running_seq.get_prompt_len() +
- sampling_params.max_tokens,
- self.scheduler_config.max_model_len)
- highest_attainable_score = (
- best_running_seq.get_beam_search_score(
- length_penalty=length_penalty,
- eos_token_id=best_running_seq.eos_token_id,
- seq_len=max_possible_length))
- else:
- # Otherwise, beam search will prefer shorter sequences. The
- # highest attainable score calculation is based on the current
- # sequence length.
- highest_attainable_score = (
- best_running_seq.get_beam_search_score(
- length_penalty=length_penalty,
- eos_token_id=best_running_seq.eos_token_id))
- return current_worst_score >= highest_attainable_score
-
- def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
- outputs: SequenceGroupOutput) -> None:
-
- # Process prompt logprobs
- prompt_logprobs = outputs.prompt_logprobs
- if prompt_logprobs is not None and seq_group.sampling_params.detokenize:
- self.detokenizer.decode_prompt_logprobs_inplace(
- seq_group, prompt_logprobs)
- seq_group.prompt_logprobs = prompt_logprobs
-
- # Process samples
- samples = outputs.samples
- parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
- existing_finished_seqs = seq_group.get_finished_seqs()
- parent_child_dict = {
- parent_seq.seq_id: []
- for parent_seq in parent_seqs
- }
- for sample in samples:
- parent_child_dict[sample.parent_seq_id].append(sample)
- # List of (child, parent)
- child_seqs: List[Tuple[Sequence, Sequence]] = []
-
- # Process the child samples for each parent sequence
- for parent in parent_seqs:
- child_samples: List[SequenceOutput] = parent_child_dict[
- parent.seq_id]
- if len(child_samples) == 0:
- # This parent sequence has no children samples. Remove
- # the parent sequence from the sequence group since it will
- # not be used in the future iterations.
- parent.status = SequenceStatus.FINISHED_ABORTED
- seq_group.remove(parent.seq_id)
- self.scheduler.free_seq(parent)
- continue
- # Fork the parent sequence if there are multiple child samples.
- for child_sample in child_samples[:-1]:
- new_child_seq_id = next(self.seq_counter)
- child = parent.fork(new_child_seq_id)
- child.append_token_id(child_sample.output_token,
- child_sample.logprobs)
- child_seqs.append((child, parent))
- # Continue the parent sequence for the last child sample.
- # We reuse the parent sequence here to reduce redundant memory
- # copies, especially when using non-beam search sampling methods.
- last_child_sample = child_samples[-1]
- parent.append_token_id(last_child_sample.output_token,
- last_child_sample.logprobs)
- child_seqs.append((parent, parent))
-
- for seq, _ in child_seqs:
- if seq_group.sampling_params.detokenize:
- new_char_count = self.detokenizer.decode_sequence_inplace(
- seq, seq_group.sampling_params)
- else:
- new_char_count = 0
- self._check_stop(seq, new_char_count, seq_group.sampling_params)
-
- # Non-beam search case
- if not seq_group.sampling_params.use_beam_search:
- # For newly created child sequences, add them to the sequence group
- # and fork them in block manager if they are not finished.
- for seq, parent in child_seqs:
- if seq is not parent:
- seq_group.add(seq)
- if not seq.is_finished():
- self.scheduler.fork_seq(parent, seq)
-
- # Free the finished and selected parent sequences' memory in block
- # manager. Keep them in the sequence group as candidate output.
- # NOTE: we need to fork the new sequences before freeing the
- # old sequences.
- for seq, parent in child_seqs:
- if seq is parent and seq.is_finished():
- self.scheduler.free_seq(seq)
- return
-
- # Beam search case
- # Select the child sequences to keep in the sequence group.
- selected_child_seqs = []
- unselected_child_seqs = []
- beam_width = seq_group.sampling_params.best_of
- length_penalty = seq_group.sampling_params.length_penalty
-
- # Select the newly finished sequences with the highest scores
- # to replace existing finished sequences.
- # Tuple of (seq, parent, is_new)
- existing_finished_seqs = [(seq, None, False)
- for seq in existing_finished_seqs]
- new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
- if seq.is_finished()]
- all_finished_seqs = existing_finished_seqs + new_finished_seqs
- # Sort the finished sequences by their scores.
- all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
- length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
- reverse=True)
- for seq, parent, is_new in all_finished_seqs[:beam_width]:
- if is_new:
- # A newly generated child sequence finishes and has a high
- # score, so we will add it into the sequence group.
- selected_child_seqs.append((seq, parent))
- for seq, parent, is_new in all_finished_seqs[beam_width:]:
- if is_new:
- # A newly generated child sequence finishes but has a low
- # score, so we will not add it into the sequence group.
- # Additionally, if this sequence is a continuation of a
- # parent sequence, we will need remove the parent sequence
- # from the sequence group.
- unselected_child_seqs.append((seq, parent))
- else:
- # An existing finished sequence has a low score, so we will
- # remove it from the sequence group.
- seq_group.remove(seq.seq_id)
-
- # select the top beam_width sequences from the running
- # sequences for the next iteration to continue the beam
- # search.
- running_child_seqs = [(seq, parent) for seq, parent in child_seqs
- if not seq.is_finished()]
- # Sort the running sequences by their scores.
- running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
- length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
- reverse=True)
-
- # Check if we can stop the beam search.
- if len(running_child_seqs) == 0:
- # No running sequences, stop the beam search.
- stop_beam_search = True
- elif len(all_finished_seqs) < beam_width:
- # Not enough finished sequences, continue the beam search.
- stop_beam_search = False
- else:
- # Check the early stopping criteria
- best_running_seq = running_child_seqs[0][0]
- current_worst_seq = all_finished_seqs[beam_width - 1][0]
- stop_beam_search = self._check_beam_search_early_stopping(
- seq_group.sampling_params.early_stopping,
- seq_group.sampling_params, best_running_seq, current_worst_seq)
-
- if stop_beam_search:
- # Stop the beam search and remove all the running sequences from
- # the sequence group.
- unselected_child_seqs.extend(running_child_seqs)
- else:
- # Continue the beam search and select the top beam_width sequences
- # to continue the beam search.
- selected_child_seqs.extend(running_child_seqs[:beam_width])
- # The remaining running sequences will not be used in the next
- # iteration. Again, if these sequences are continuations of
- # parent sequences, we will need to remove the parent sequences
- # from the sequence group.
- unselected_child_seqs.extend(running_child_seqs[beam_width:])
-
- # For newly created child sequences, add them to the sequence group
- # and fork them in block manager if they are not finished.
- for seq, parent in selected_child_seqs:
- if seq is not parent:
- seq_group.add(seq)
- if not seq.is_finished():
- self.scheduler.fork_seq(parent, seq)
-
- # Free the finished and selected parent sequences' memory in block
- # manager. Keep them in the sequence group as candidate output.
- for seq, parent in selected_child_seqs:
- if seq is parent and seq.is_finished():
- self.scheduler.free_seq(seq)
-
- # Remove the unselected parent sequences from the sequence group and
- # free their memory in block manager.
- for seq, parent in unselected_child_seqs:
- if seq is parent:
- # Remove the parent sequence if it is not selected for next
- # iteration
- seq_group.remove(seq.seq_id)
- self.scheduler.free_seq(seq)
-
def _process_model_outputs(
- self, output: SamplerOutput,
- scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
+ self, output: List[SamplerOutput],
+ scheduled_seq_groups: List[SequenceGroup],
+ ignored_seq_groups: List[SequenceGroup]) -> List[RequestOutput]:
+ """Apply the model output to the sequences in the scheduled seq groups.
+
+ Returns RequestOutputs that can be returned to the client.
+ """
+
now = time.time()
+
+ # Organize outputs by [sequence group][step] instead of
+ # [step][sequence group].
+ output_by_sequence_group = create_output_by_sequence_group(
+ sampler_outputs=output, num_seq_groups=len(scheduled_seq_groups))
+
# Update the scheduled sequence groups with the model outputs.
- scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
- for scheduled_seq_group, outputs in zip(scheduled_seq_groups, output):
+ for scheduled_seq_group, outputs in zip(scheduled_seq_groups,
+ output_by_sequence_group):
seq_group = scheduled_seq_group.seq_group
seq_group.update_num_computed_tokens(
scheduled_seq_group.token_chunk_size)
- # If uncomputed tokens > 0, it means prefill is chunked.
- # We don't need to process outputs in that case.
- if seq_group.get_num_uncomputed_tokens() == 0:
- self._process_sequence_group_outputs(seq_group, outputs)
+
+ # If all sequences in the sequence group are in DECODE, then we can
+ # process the output tokens. Otherwise, they are (chunked) prefill
+ # samples and should not be processed.
+ stages = [seq.data._stage for seq in seq_group.seqs_dict.values()]
+ if all(stage == SequenceStage.DECODE for stage in stages):
+ self.output_processor.process_outputs(seq_group, outputs)
# Free the finished sequence groups.
self.scheduler.free_finished_seq_groups()
@@ -652,13 +500,9 @@ def _process_model_outputs(
seq_group.maybe_set_first_token_time(now)
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)
- for seq_group in scheduler_outputs.ignored_seq_groups:
+ for seq_group in ignored_seq_groups:
request_output = RequestOutput.from_seq_group(seq_group)
request_outputs.append(request_output)
-
- # Log stats.
- if self.log_stats:
- self.stat_logger.log(self._get_stats(scheduler_outputs))
return request_outputs
def step(self) -> List[RequestOutput]:
@@ -716,22 +560,42 @@ def step(self) -> List[RequestOutput]:
if not scheduler_outputs.is_empty():
output = self.model_executor.execute_model(
- seq_group_metadata_list, scheduler_outputs.blocks_to_swap_in,
- scheduler_outputs.blocks_to_swap_out,
- scheduler_outputs.blocks_to_copy)
+ seq_group_metadata_list=seq_group_metadata_list,
+ blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
+ blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
+ blocks_to_copy=scheduler_outputs.blocks_to_copy,
+ num_lookahead_slots=scheduler_outputs.num_lookahead_slots)
else:
output = []
- return self._process_model_outputs(output, scheduler_outputs)
+ request_outputs = self._process_model_outputs(
+ output, scheduler_outputs.scheduled_seq_groups,
+ scheduler_outputs.ignored_seq_groups)
+
+ # Log stats.
+ if self.log_stats:
+ self.stat_logger.log(
+ self._get_stats(scheduler_outputs, model_output=output))
+
+ return request_outputs
def do_log_stats(self) -> None:
"""Forced log when no requests active."""
if self.log_stats:
self.stat_logger.log(self._get_stats(scheduler_outputs=None))
- def _get_stats(self,
- scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
- """Get Stats to be Logged to Prometheus."""
+ def _get_stats(
+ self,
+ scheduler_outputs: Optional[SchedulerOutputs],
+ model_output: Optional[List[SamplerOutput]] = None) -> Stats:
+ """Get Stats to be Logged to Prometheus.
+
+ Args:
+ scheduler_outputs: Optional, used to populate metrics related to
+                the scheduled batch.
+ model_output: Optional, used to emit speculative decoding metrics
+ which are created by the workers.
+ """
now = time.time()
# KV Cache Usage in %.
@@ -788,6 +652,14 @@ def _get_stats(self,
time_to_first_tokens = time_last_iters if prompt_run else []
time_per_output_tokens = [] if prompt_run else time_last_iters
+ # Spec decode, if enabled, emits specialized metrics from the worker in
+ # sampler output.
+ if model_output and (model_output[0].spec_decode_worker_metrics
+ is not None):
+ spec_decode_metrics = model_output[0].spec_decode_worker_metrics
+ else:
+ spec_decode_metrics = None
+
return Stats(
now=now,
num_running=num_running,
@@ -800,89 +672,9 @@ def _get_stats(self,
time_to_first_tokens=time_to_first_tokens,
time_per_output_tokens=time_per_output_tokens,
time_e2e_requests=time_e2e_requests,
+ spec_decode_metrics=spec_decode_metrics,
)
- def _check_stop(self, seq: Sequence, new_char_count: int,
- sampling_params: SamplingParams) -> None:
- """Stop the finished sequences.
-
- new_char_count is the number of chars added to the
- sequence's output text for the newly generated token
- """
-
- # Check if the minimum number of tokens has been generated yet;
- # skip the stop string/token checks if not
- if seq.get_output_len() < sampling_params.min_tokens:
- return
-
- # Check if the sequence has generated the EOS token.
- if ((not sampling_params.ignore_eos)
- and seq.get_last_token_id() == seq.eos_token_id):
- seq.status = SequenceStatus.FINISHED_STOPPED
- return
-
- # Check if a stop token was encountered.
- # This assumes a single token produced per step.
- last_token_id = seq.get_last_token_id()
- if last_token_id in sampling_params.stop_token_ids:
- if new_char_count and (
- not sampling_params.include_stop_str_in_output):
- # Remove last token
- seq.output_text = seq.output_text[:-new_char_count]
- seq.status = SequenceStatus.FINISHED_STOPPED
- seq.stop_reason = last_token_id
- return
-
- # Check if any stop strings are matched.
- stop_str = self._check_stop_strings(seq, new_char_count,
- sampling_params)
- if stop_str is not None:
- seq.status = SequenceStatus.FINISHED_STOPPED
- seq.stop_reason = stop_str
- return
-
- # Check if the sequence has reached max_model_len.
- if seq.get_len() > self.scheduler_config.max_model_len:
- seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
- return
-
- # Check if the sequence has reached max_tokens.
- if seq.get_output_len() == sampling_params.max_tokens:
- seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
- return
-
- @staticmethod
- def _check_stop_strings(seq: Sequence, new_char_count: int,
- sampling_params: SamplingParams) -> Optional[str]:
- """Check if any stop strings are matched and truncate sequence
- output text accordingly.
-
- Returns the stop string if matched or else None.
- """
- if not new_char_count:
- return None
-
- for stop_str in sampling_params.stop:
- stop_string_len = len(stop_str)
- # Avoid searching already-searched text.
- stop_index = seq.output_text.find(
- stop_str, -new_char_count - stop_string_len)
- if stop_index == -1:
- continue
-
- if sampling_params.include_stop_str_in_output:
- # Truncate to end of stop string.
- stop_index += stop_string_len
- if stop_index >= len(seq.output_text):
- # No truncation required.
- return stop_str
-
- # Truncate the output text to either the beginning
- # or end of the stop string.
- seq.output_text = seq.output_text[:stop_index]
- return stop_str
- return None
-
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_executor.add_lora(lora_request)
diff --git a/vllm/engine/metrics.py b/vllm/engine/metrics.py
index 02560907a1282..25e96f6c7eaf7 100644
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@@ -1,6 +1,6 @@
import time
from dataclasses import dataclass
-from typing import Dict, List, Protocol
+from typing import TYPE_CHECKING, Dict, List, Optional, Protocol
import numpy as np
from prometheus_client import (REGISTRY, Counter, Gauge, Histogram, Info,
@@ -8,6 +8,9 @@
from vllm.logger import init_logger
+if TYPE_CHECKING:
+ from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
+
logger = init_logger(__name__)
disable_created_metrics()
@@ -118,6 +121,8 @@ class Stats:
time_per_output_tokens: List[float]
time_e2e_requests: List[float]
+ spec_decode_metrics: Optional["SpecDecodeWorkerMetrics"] = None
+
class SupportsMetricsInfo(Protocol):
@@ -130,7 +135,7 @@ class StatLogger:
def __init__(self, local_interval: float, labels: Dict[str, str]) -> None:
# Metadata for logging locally.
- self.last_local_log = time.monotonic()
+ self.last_local_log = time.time()
self.local_interval = local_interval
# Tracked stats over current local logging interval.
@@ -235,3 +240,19 @@ def log(self, stats: Stats) -> None:
self.num_prompt_tokens = []
self.num_generation_tokens = []
self.last_local_log = stats.now
+
+ if stats.spec_decode_metrics is not None:
+ logger.info(
+ self._format_spec_decode_metrics_str(
+ stats.spec_decode_metrics))
+
+ def _format_spec_decode_metrics_str(
+ self, metrics: "SpecDecodeWorkerMetrics") -> str:
+
+ return ("Speculative metrics: "
+ f"Draft acceptance rate: {metrics.draft_acceptance_rate:.3f}, "
+ f"System efficiency: {metrics.system_efficiency:.3f}, "
+ f"Number of speculative tokens: {metrics.num_spec_tokens}, "
+ f"Number of accepted tokens: {metrics.accepted_tokens}, "
+                f"Number of draft tokens: {metrics.draft_tokens}, "
+                f"Number of emitted tokens: {metrics.emitted_tokens}.")
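
For reference, a minimal standalone sketch of the log line produced by the new _format_spec_decode_metrics_str above. The dataclass is a hypothetical stand-in for SpecDecodeWorkerMetrics containing only the fields the formatter reads; the values are illustrative.

from dataclasses import dataclass

@dataclass
class FakeSpecDecodeMetrics:  # hypothetical stand-in, not vLLM's class
    draft_acceptance_rate: float
    system_efficiency: float
    num_spec_tokens: int
    accepted_tokens: int
    draft_tokens: int
    emitted_tokens: int

def format_spec_decode_metrics(m: FakeSpecDecodeMetrics) -> str:
    # Mirrors the formatter added above.
    return ("Speculative metrics: "
            f"Draft acceptance rate: {m.draft_acceptance_rate:.3f}, "
            f"System efficiency: {m.system_efficiency:.3f}, "
            f"Number of speculative tokens: {m.num_spec_tokens}, "
            f"Number of accepted tokens: {m.accepted_tokens}, "
            f"Number of draft tokens: {m.draft_tokens}, "
            f"Number of emitted tokens: {m.emitted_tokens}.")

print(format_spec_decode_metrics(
    FakeSpecDecodeMetrics(0.72, 0.65, 5, 36, 50, 41)))
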
diff --git a/vllm/engine/output_processor/__init__.py b/vllm/engine/output_processor/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/vllm/engine/output_processor/interfaces.py b/vllm/engine/output_processor/interfaces.py
new file mode 100644
index 0000000000000..f307ea4da3011
--- /dev/null
+++ b/vllm/engine/output_processor/interfaces.py
@@ -0,0 +1,70 @@
+from abc import ABC, abstractmethod
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.sequence import Sequence, SequenceGroup, SequenceGroupOutput
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+
+class SequenceGroupOutputProcessor(ABC):
+ """Interface for logic that processes new token ids in sequence groups,
+ managing detokenization, stop checking, and freeing/forking sequences with
+ the scheduler.
+
+ This is highly coupled with the LLMEngine and should be seen as an extension
+ of it. The logic is separated to simplify the LLMEngine class and allow
+ separate implementations for single-step decoding (which supports beam
+ search sequence forking) and multi-step decoding (which does not support
+ beam search, but does support speculative decoding).
+ """
+
+ @staticmethod
+ def create_output_processor(
+ scheduler_config: SchedulerConfig,
+ detokenizer: Detokenizer,
+ scheduler: Scheduler,
+ seq_counter: Counter,
+ get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+ stop_checker: "StopChecker",
+ ):
+ """Create an output processor.
+
+ This returns a single-step output processor if num_lookahead_slots is
+ zero, else returns a multi-step output processor.
+ """
+ if scheduler_config.num_lookahead_slots == 0:
+ # Importing here to avoid cycle.
+ from vllm.engine.output_processor.single_step import (
+ SingleStepOutputProcessor)
+ return SingleStepOutputProcessor(
+ scheduler_config,
+ detokenizer,
+ scheduler,
+ seq_counter,
+ stop_checker,
+ )
+ else:
+ # Importing here to avoid cycle.
+ from vllm.engine.output_processor.multi_step import (
+ MultiStepOutputProcessor)
+ return MultiStepOutputProcessor(
+ detokenizer,
+ scheduler,
+ seq_counter,
+ get_tokenizer_for_seq,
+ stop_checker,
+ )
+
+ @abstractmethod
+ def process_outputs(self, sequence_group: SequenceGroup,
+ outputs: List[SequenceGroupOutput]) -> None:
+ """Process new token ids for the sequence group. Handles logic such as
+ detokenization, stop checking, and freeing/forking sequences in the
+ scheduler.
+ """
+ pass
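
A minimal sketch (no vLLM imports) of the dispatch rule in create_output_processor above: zero lookahead slots selects the single-step processor, anything else selects the multi-step processor used with speculative decoding. The config class here is a hypothetical stand-in.

from dataclasses import dataclass

@dataclass
class FakeSchedulerConfig:  # hypothetical stand-in for vllm.config.SchedulerConfig
    num_lookahead_slots: int

def choose_processor_kind(config: FakeSchedulerConfig) -> str:
    # Zero lookahead slots -> at most one token per step -> single-step
    # processor (beam search supported); otherwise multi-step processor.
    return "single-step" if config.num_lookahead_slots == 0 else "multi-step"

assert choose_processor_kind(FakeSchedulerConfig(num_lookahead_slots=0)) == "single-step"
assert choose_processor_kind(FakeSchedulerConfig(num_lookahead_slots=4)) == "multi-step"
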
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
new file mode 100644
index 0000000000000..39e99d06ed875
--- /dev/null
+++ b/vllm/engine/output_processor/multi_step.py
@@ -0,0 +1,127 @@
+from typing import Callable, List
+
+from transformers import PreTrainedTokenizer
+
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+ SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Logprob, Sequence, SequenceGroup,
+ SequenceGroupOutput, SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+logger = init_logger(__name__)
+
+
+class MultiStepOutputProcessor(SequenceGroupOutputProcessor):
+ """SequenceGroupOutputProcessor which handles logic related to
+ detokenization and stopping conditions. It specializes to "multi-step
+ decoding", where vLLM's worker may generate multiple tokens per invocation.
+ This is currently mutually exclusive with advanced sampling techniques like
+ beam search, which motivates the separation of this logic from the single
+ step output processor.
+
+ This class is responsible for things such as correctly appending all new
+ token ids to their sequence, detokenizing new token ids, truncating new
+ output tokens after an eos token, and correctly handling the case where the
+ number of new output tokens per sequence differs in a single batch.
+ """
+
+ def __init__(
+ self,
+ detokenizer: Detokenizer,
+ scheduler: Scheduler,
+ seq_counter: Counter,
+ get_tokenizer_for_seq: Callable[[Sequence], PreTrainedTokenizer],
+ stop_checker: StopChecker,
+ ):
+ self.detokenizer = detokenizer
+ self.scheduler = scheduler
+ self.seq_counter = seq_counter
+ self.get_tokenizer_for_seq = get_tokenizer_for_seq
+ self.stop_checker = stop_checker
+
+ def process_outputs(self, sequence_group: SequenceGroup,
+ outputs: List[SequenceGroupOutput]) -> None:
+ """Append new tokens in the outputs to sequences in the sequence group.
+
+        This only supports sequence groups of size 1. It supports more than
+        one new token per sequence.
+
+ This applies logic like stop condition checking and detokenization,
+ including freeing finished sequences. It also handles cases where there
+ are tokens emitted after the EOS token.
+ """
+ seqs = sequence_group.get_seqs(status=SequenceStatus.RUNNING)
+
+ assert seqs, "expected running sequences"
+ assert len(seqs) == 1, (
+ "Beam search not supported in multi-step decoding.")
+ seq = seqs[0]
+
+ # Since there's only one sequence per sequence group, we can take the
+ # first sample.
+ samples = [outputs[step].samples[0] for step in range(len(outputs))]
+
+        # -1 means the output token is not valid (e.g. due to spec decode
+ # rejecting tokens).
+ valid_samples = [
+ sample for sample in samples if sample.output_token != -1
+ ]
+ assert valid_samples
+
+ self._process_seq_outputs(seq, valid_samples,
+ sequence_group.sampling_params)
+
+ def _process_seq_outputs(self, seq: Sequence,
+ valid_samples: List[SequenceOutput],
+ sampling_params: SamplingParams) -> None:
+ output_token_ids = [sample.output_token for sample in valid_samples]
+
+ # Truncate to max_tokens if necessary.
+ remaining_tokens = sampling_params.max_tokens - (seq.get_output_len() +
+ len(output_token_ids))
+ if remaining_tokens < 0:
+ valid_samples = valid_samples[:remaining_tokens]
+ output_token_ids = output_token_ids[:remaining_tokens]
+
+ # Truncate any tokens after EOS. This is required as spec decode
+ # generates a fixed number of tokens without evaluating stopping
+ # conditions within the block. This can cause an eos token to be
+ # unintentionally ignored.
+ if not sampling_params.ignore_eos:
+ eos_token_id = self.get_tokenizer_for_seq(seq).eos_token_id
+ # Avoiding .index calls as exception throwing in the happy path
+ # is expensive.
+ for i in range(len(output_token_ids)):
+ if output_token_ids[i] == eos_token_id:
+ output_token_ids = output_token_ids[:i + 1]
+ valid_samples = valid_samples[:i + 1]
+ break
+
+ # Incrementally append tokens to the sequence, as if we had only one new
+ # token.
+ for output_token_id in output_token_ids:
+ seq.append_token_id(
+ token_id=output_token_id,
+ # TODO emit logprobs in multi-step decoding.
+ logprobs={output_token_id: Logprob(0.0)},
+ )
+
+ new_char_count = 0
+ if sampling_params.detokenize:
+ new_char_count = self.detokenizer.decode_sequence_inplace(
+ seq, sampling_params)
+
+ self.stop_checker.maybe_stop_sequence(
+ seq,
+ new_char_count=new_char_count,
+ sampling_params=sampling_params)
+ if seq.is_finished():
+ break
+
+ if seq.is_finished():
+ self.scheduler.free_seq(seq)
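
A standalone sketch of the two truncation rules applied in _process_seq_outputs above, using plain token ids instead of SequenceOutput objects; the token ids, max_tokens, and eos_token_id below are illustrative only.

from typing import List

def truncate_new_tokens(new_token_ids: List[int], current_output_len: int,
                        max_tokens: int, eos_token_id: int,
                        ignore_eos: bool = False) -> List[int]:
    # Rule 1: never let the total output exceed max_tokens.
    remaining = max_tokens - (current_output_len + len(new_token_ids))
    if remaining < 0:
        new_token_ids = new_token_ids[:remaining]  # negative slice drops excess
    # Rule 2: drop anything after the first EOS token (speculative decoding
    # may have generated past it), keeping the EOS token itself.
    if not ignore_eos:
        for i, token_id in enumerate(new_token_ids):
            if token_id == eos_token_id:
                new_token_ids = new_token_ids[:i + 1]
                break
    return new_token_ids

# 3 tokens already emitted, max_tokens=6, EOS id 2:
print(truncate_new_tokens([5, 7, 2, 9, 9], current_output_len=3,
                          max_tokens=6, eos_token_id=2))  # -> [5, 7, 2]
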
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
new file mode 100644
index 0000000000000..7e9d652446703
--- /dev/null
+++ b/vllm/engine/output_processor/single_step.py
@@ -0,0 +1,278 @@
+from typing import Dict, List, Tuple, Union
+
+from vllm.config import SchedulerConfig
+from vllm.core.scheduler import Scheduler
+from vllm.engine.output_processor.interfaces import (
+ SequenceGroupOutputProcessor)
+from vllm.engine.output_processor.stop_checker import StopChecker
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import (Sequence, SequenceGroup, SequenceGroupOutput,
+ SequenceOutput, SequenceStatus)
+from vllm.transformers_utils.detokenizer import Detokenizer
+from vllm.utils import Counter
+
+logger = init_logger(__name__)
+
+
+class SingleStepOutputProcessor(SequenceGroupOutputProcessor):
+ """SequenceGroupOutputProcessor which handles "output processing" logic,
+ which happens after the model returns generated token ids and before
+ scheduling of the next batch. Output processing logic includes
+ detokenization, and determining if a sequence is finished (e.g. via max len
+ or eos token).
+
+ The SingleStepOutputProcessor is specialized to the case where the model
+ emits at most a single token per invocation, which precludes configurations
+ such as speculative decoding or multi-step decoding. This enables beam
+ search sampling, which requires forking/finishing/freeing sequences in a way
+ that is currently difficult to schedule multiple steps ahead of time.
+ """
+
+ def __init__(
+ self,
+ scheduler_config: SchedulerConfig,
+ detokenizer: Detokenizer,
+ scheduler: Scheduler,
+ seq_counter: Counter,
+ stop_checker: StopChecker,
+ ):
+ self.scheduler_config = scheduler_config
+ self.detokenizer = detokenizer
+ self.scheduler = scheduler
+ self.seq_counter = seq_counter
+ self.stop_checker = stop_checker
+
+ def process_outputs(self, sequence_group: SequenceGroup,
+ outputs: List[SequenceGroupOutput]) -> None:
+ """Append all new tokens to sequences in the sequence group. Fork any
+ surviving beam candidates; free any unsurviving ones.
+
+ Invokes detokenizer to detokenize new tokens, and also marks sequences
+ as finished if they meet stop conditions.
+ """
+ assert (len(outputs) == 1
+ ), f"{type(self)} does not support multiple outputs per step"
+ return self._process_sequence_group_outputs(sequence_group, outputs[0])
+
+ def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
+ outputs: SequenceGroupOutput) -> None:
+
+ # Process prompt logprobs
+ prompt_logprobs = outputs.prompt_logprobs
+ if prompt_logprobs is not None and \
+ seq_group.sampling_params.detokenize and self.detokenizer:
+ self.detokenizer.decode_prompt_logprobs_inplace(
+ seq_group, prompt_logprobs)
+ seq_group.prompt_logprobs = prompt_logprobs
+
+ # Process samples
+ samples = outputs.samples
+ parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
+ existing_finished_seqs = seq_group.get_finished_seqs()
+ parent_child_dict: Dict[int, List[SequenceOutput]] = {
+ parent_seq.seq_id: []
+ for parent_seq in parent_seqs
+ }
+ for sample in samples:
+ parent_child_dict[sample.parent_seq_id].append(sample)
+ # List of (child, parent)
+ child_seqs: List[Tuple[Sequence, Sequence]] = []
+
+ # Process the child samples for each parent sequence
+ for parent in parent_seqs:
+ child_samples: List[SequenceOutput] = parent_child_dict[
+ parent.seq_id]
+ if len(child_samples) == 0:
+                # This parent sequence has no child samples. Remove
+                # the parent sequence from the sequence group since it will
+                # not be used in future iterations.
+ parent.status = SequenceStatus.FINISHED_ABORTED
+ seq_group.remove(parent.seq_id)
+ self.scheduler.free_seq(parent)
+ continue
+ # Fork the parent sequence if there are multiple child samples.
+ for child_sample in child_samples[:-1]:
+ new_child_seq_id: int = next(self.seq_counter)
+ child = parent.fork(new_child_seq_id)
+ child.append_token_id(child_sample.output_token,
+ child_sample.logprobs)
+ child_seqs.append((child, parent))
+ # Continue the parent sequence for the last child sample.
+ # We reuse the parent sequence here to reduce redundant memory
+ # copies, especially when using non-beam search sampling methods.
+ last_child_sample = child_samples[-1]
+ parent.append_token_id(last_child_sample.output_token,
+ last_child_sample.logprobs)
+ child_seqs.append((parent, parent))
+
+ for seq, _ in child_seqs:
+ if seq_group.sampling_params.detokenize and self.detokenizer:
+ new_char_count = self.detokenizer.decode_sequence_inplace(
+ seq, seq_group.sampling_params)
+ else:
+ new_char_count = 0
+ self.stop_checker.maybe_stop_sequence(seq, new_char_count,
+ seq_group.sampling_params)
+
+ # Non-beam search case
+ if not seq_group.sampling_params.use_beam_search:
+ # For newly created child sequences, add them to the sequence group
+ # and fork them in block manager if they are not finished.
+ for seq, parent in child_seqs:
+ if seq is not parent:
+ seq_group.add(seq)
+ if not seq.is_finished():
+ self.scheduler.fork_seq(parent, seq)
+
+ # Free the finished and selected parent sequences' memory in block
+ # manager. Keep them in the sequence group as candidate output.
+ # NOTE: we need to fork the new sequences before freeing the
+ # old sequences.
+ for seq, parent in child_seqs:
+ if seq is parent and seq.is_finished():
+ self.scheduler.free_seq(seq)
+ return
+
+ # Beam search case
+ # Select the child sequences to keep in the sequence group.
+ selected_child_seqs = []
+ unselected_child_seqs = []
+ beam_width = seq_group.sampling_params.best_of
+ length_penalty = seq_group.sampling_params.length_penalty
+
+ # Select the newly finished sequences with the highest scores
+ # to replace existing finished sequences.
+ # Tuple of (seq, parent, is_new)
+ existing_finished_seqs = [(seq, None, False)
+ for seq in existing_finished_seqs]
+ new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs
+ if seq.is_finished()]
+ all_finished_seqs = existing_finished_seqs + new_finished_seqs
+ # Sort the finished sequences by their scores.
+ all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+ reverse=True)
+ for seq, parent, is_new in all_finished_seqs[:beam_width]:
+ if is_new:
+ # A newly generated child sequence finishes and has a high
+ # score, so we will add it into the sequence group.
+ selected_child_seqs.append((seq, parent))
+ for seq, parent, is_new in all_finished_seqs[beam_width:]:
+ if is_new:
+ # A newly generated child sequence finishes but has a low
+ # score, so we will not add it into the sequence group.
+ # Additionally, if this sequence is a continuation of a
+                # parent sequence, we will need to remove the parent sequence
+ # from the sequence group.
+ unselected_child_seqs.append((seq, parent))
+ else:
+ # An existing finished sequence has a low score, so we will
+ # remove it from the sequence group.
+ seq_group.remove(seq.seq_id)
+
+ # select the top beam_width sequences from the running
+ # sequences for the next iteration to continue the beam
+ # search.
+ running_child_seqs = [(seq, parent) for seq, parent in child_seqs
+ if not seq.is_finished()]
+ # Sort the running sequences by their scores.
+ running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
+ length_penalty=length_penalty, eos_token_id=x[0].eos_token_id),
+ reverse=True)
+
+ # Check if we can stop the beam search.
+ if len(running_child_seqs) == 0:
+ # No running sequences, stop the beam search.
+ stop_beam_search = True
+ elif len(all_finished_seqs) < beam_width:
+ # Not enough finished sequences, continue the beam search.
+ stop_beam_search = False
+ else:
+ # Check the early stopping criteria
+ best_running_seq = running_child_seqs[0][0]
+ current_worst_seq = all_finished_seqs[beam_width - 1][0]
+ stop_beam_search = self._check_beam_search_early_stopping(
+ seq_group.sampling_params.early_stopping,
+ seq_group.sampling_params, best_running_seq, current_worst_seq)
+
+ if stop_beam_search:
+ # Stop the beam search and remove all the running sequences from
+ # the sequence group.
+ unselected_child_seqs.extend(running_child_seqs)
+ else:
+ # Continue the beam search and select the top beam_width sequences
+ # to continue the beam search.
+ selected_child_seqs.extend(running_child_seqs[:beam_width])
+ # The remaining running sequences will not be used in the next
+ # iteration. Again, if these sequences are continuations of
+ # parent sequences, we will need to remove the parent sequences
+ # from the sequence group.
+ unselected_child_seqs.extend(running_child_seqs[beam_width:])
+
+ # For newly created child sequences, add them to the sequence group
+ # and fork them in block manager if they are not finished.
+ for seq, parent in selected_child_seqs:
+ if seq is not parent:
+ seq_group.add(seq)
+ if not seq.is_finished():
+ self.scheduler.fork_seq(parent, seq)
+
+ # Free the finished and selected parent sequences' memory in block
+ # manager. Keep them in the sequence group as candidate output.
+ for seq, parent in selected_child_seqs:
+ if seq is parent and seq.is_finished():
+ self.scheduler.free_seq(seq)
+
+ # Remove the unselected parent sequences from the sequence group and
+ # free their memory in block manager.
+ for seq, parent in unselected_child_seqs:
+ if seq is parent:
+ # Remove the parent sequence if it is not selected for next
+ # iteration
+ seq_group.remove(seq.seq_id)
+ self.scheduler.free_seq(seq)
+
+ def _check_beam_search_early_stopping(
+ self,
+ early_stopping: Union[bool, str],
+ sampling_params: SamplingParams,
+ best_running_seq: Sequence,
+ current_worst_seq: Sequence,
+ ) -> bool:
+ assert sampling_params.use_beam_search
+ length_penalty = sampling_params.length_penalty
+ if early_stopping is True:
+ return True
+
+ current_worst_score = current_worst_seq.get_beam_search_score(
+ length_penalty=length_penalty,
+ eos_token_id=current_worst_seq.eos_token_id)
+ if early_stopping is False:
+ highest_attainable_score = best_running_seq.get_beam_search_score(
+ length_penalty=length_penalty,
+ eos_token_id=best_running_seq.eos_token_id)
+ else:
+ assert early_stopping == "never"
+ if length_penalty > 0.0:
+ # If length_penalty > 0.0, beam search will prefer longer
+ # sequences. The highest attainable score calculation is
+ # based on the longest possible sequence length in this case.
+ max_possible_length = max(
+ best_running_seq.get_prompt_len() +
+ sampling_params.max_tokens,
+ self.scheduler_config.max_model_len)
+ highest_attainable_score = (
+ best_running_seq.get_beam_search_score(
+ length_penalty=length_penalty,
+ eos_token_id=best_running_seq.eos_token_id,
+ seq_len=max_possible_length))
+ else:
+ # Otherwise, beam search will prefer shorter sequences. The
+ # highest attainable score calculation is based on the current
+ # sequence length.
+ highest_attainable_score = (
+ best_running_seq.get_beam_search_score(
+ length_penalty=length_penalty,
+ eos_token_id=best_running_seq.eos_token_id))
+ return current_worst_score >= highest_attainable_score
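
A minimal sketch of the finished-candidate selection above: candidates are ranked by a length-penalized beam score and only the top beam_width are retained. The score values here are plain placeholders, not vLLM's get_beam_search_score.

from typing import List, Tuple

def select_top_beams(finished: List[Tuple[str, float]],
                     beam_width: int) -> List[str]:
    # finished: (sequence_id, score) pairs; higher score is better.
    ranked = sorted(finished, key=lambda x: x[1], reverse=True)
    return [seq_id for seq_id, _ in ranked[:beam_width]]

print(select_top_beams([("a", 0.1), ("b", 0.7), ("c", 0.4)], beam_width=2))
# -> ['b', 'c']
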
diff --git a/vllm/engine/output_processor/stop_checker.py b/vllm/engine/output_processor/stop_checker.py
new file mode 100644
index 0000000000000..66deb9b591746
--- /dev/null
+++ b/vllm/engine/output_processor/stop_checker.py
@@ -0,0 +1,101 @@
+from typing import Callable, Optional
+
+from transformers import PreTrainedTokenizer
+
+from vllm.sampling_params import SamplingParams
+from vllm.sequence import Sequence, SequenceStatus
+
+
+class StopChecker:
+ """LLMEngine helper class which separates out the logic involving stop
+ checking. This checks things such as: whether the eos token was emitted,
+ whether the max_tokens has been consumed, whether a stop string has been
+ emitted, or if we have exceeded the max model len.
+ """
+
+ def __init__(self, max_model_len: int,
+ get_tokenizer_for_seq: Callable[[Sequence],
+ PreTrainedTokenizer]):
+ self.max_model_len = max_model_len
+ self.get_tokenizer_for_seq = get_tokenizer_for_seq
+
+ def maybe_stop_sequence(self, seq: Sequence, new_char_count: int,
+ sampling_params: SamplingParams) -> None:
+ """Stop the finished sequences.
+
+ new_char_count is the number of chars added to the
+ sequence's output text for the newly generated token
+ """
+
+ # Check if the minimum number of tokens has been generated yet;
+ # skip the stop string/token checks if not
+ if seq.get_output_len() < sampling_params.min_tokens:
+ return
+
+ # Check if the sequence has generated the EOS token.
+ if ((not sampling_params.ignore_eos)
+ and seq.get_last_token_id() == seq.eos_token_id):
+ seq.status = SequenceStatus.FINISHED_STOPPED
+ return
+
+ # Check if a stop token was encountered.
+ # This assumes a single token produced per step.
+ last_token_id = seq.get_last_token_id()
+ if last_token_id in sampling_params.stop_token_ids:
+ if new_char_count and (
+ not sampling_params.include_stop_str_in_output):
+ # Remove last token
+ seq.output_text = seq.output_text[:-new_char_count]
+ seq.status = SequenceStatus.FINISHED_STOPPED
+ seq.stop_reason = last_token_id
+ return
+
+ # Check if any stop strings are matched.
+ stop_str = self._check_stop_strings(seq, new_char_count,
+ sampling_params)
+ if stop_str is not None:
+ seq.status = SequenceStatus.FINISHED_STOPPED
+ seq.stop_reason = stop_str
+ return
+
+ # Check if the sequence has reached max_model_len.
+ if seq.get_len() > self.max_model_len:
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+ return
+
+ # Check if the sequence has reached max_tokens.
+ if seq.get_output_len() == sampling_params.max_tokens:
+ seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
+ return
+
+ @staticmethod
+ def _check_stop_strings(seq: Sequence, new_char_count: int,
+ sampling_params: SamplingParams) -> Optional[str]:
+ """Check if any stop strings are matched and truncate sequence
+ output text accordingly.
+
+ Returns the stop string if matched or else None.
+ """
+ if not new_char_count:
+ return None
+
+ for stop_str in sampling_params.stop:
+ stop_string_len = len(stop_str)
+ # Avoid searching already-searched text.
+ stop_index = seq.output_text.find(
+ stop_str, -new_char_count - stop_string_len)
+ if stop_index == -1:
+ continue
+
+ if sampling_params.include_stop_str_in_output:
+ # Truncate to end of stop string.
+ stop_index += stop_string_len
+ if stop_index >= len(seq.output_text):
+ # No truncation required.
+ return stop_str
+
+ # Truncate the output text to either the beginning
+ # or end of the stop string.
+ seq.output_text = seq.output_text[:stop_index]
+ return stop_str
+ return None
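
A standalone sketch of the search window used by _check_stop_strings above: only the newly decoded characters, plus enough preceding characters to catch a stop string that straddles the boundary, are searched (str.find accepts a negative start offset, interpreted like slicing). The helper name and example text are illustrative.

from typing import Optional

def truncate_at_stop_string(output_text: str, new_char_count: int,
                            stop_str: str,
                            include_stop_str_in_output: bool = False
                            ) -> Optional[str]:
    # Returns the truncated text if stop_str is found in the newly decoded
    # region, else None.
    if not new_char_count:
        return None
    # Search only the new characters plus a stop-string-sized overlap.
    stop_index = output_text.find(stop_str, -new_char_count - len(stop_str))
    if stop_index == -1:
        return None
    if include_stop_str_in_output:
        stop_index += len(stop_str)
    return output_text[:stop_index]

# The last two characters "##" were newly decoded, completing the stop string.
print(truncate_at_stop_string("Hello ###", new_char_count=2, stop_str="###"))
# -> 'Hello '
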
diff --git a/vllm/engine/output_processor/util.py b/vllm/engine/output_processor/util.py
new file mode 100644
index 0000000000000..d076fee8c2a36
--- /dev/null
+++ b/vllm/engine/output_processor/util.py
@@ -0,0 +1,18 @@
+from typing import List
+
+from vllm.sequence import SamplerOutput
+
+
+def create_output_by_sequence_group(sampler_outputs: List[SamplerOutput],
+ num_seq_groups: int):
+ """Helper method which transforms a 2d list organized by
+ [step][sequence group] into [sequence group][step].
+ """
+ output_by_sequence_group: List[List[SamplerOutput]] = [
+ [] for _ in range(num_seq_groups)
+ ]
+ for step in sampler_outputs:
+ for i, sequence_group_output in enumerate(step):
+ output_by_sequence_group[i].append(sequence_group_output)
+
+ return output_by_sequence_group
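
An illustration of the transposition performed by create_output_by_sequence_group above, using plain strings in place of SamplerOutput entries.

steps = [
    ["g0_t0", "g1_t0"],  # step 0: one entry per sequence group
    ["g0_t1", "g1_t1"],  # step 1
]
num_seq_groups = 2
by_group = [[] for _ in range(num_seq_groups)]
for step in steps:
    for i, entry in enumerate(step):
        by_group[i].append(entry)
print(by_group)  # [['g0_t0', 'g0_t1'], ['g1_t0', 'g1_t1']]
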
diff --git a/vllm/entrypoints/api_server.py b/vllm/entrypoints/api_server.py
index 2a47eae112c12..587142adb9c6b 100644
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@@ -47,6 +47,7 @@ async def generate(request: Request) -> Response:
sampling_params = SamplingParams(**request_dict)
request_id = random_uuid()
+ assert engine is not None
results_generator = engine.generate(prompt, sampling_params, request_id)
# Streaming case
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index c5261d6e4556c..8ae1cda8ccda7 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -32,6 +32,9 @@ class LLM:
tokenizer: The name or path of a HuggingFace Transformers tokenizer.
tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
if available, and "slow" will always use the slow tokenizer.
+ skip_tokenizer_init: If true, skip initialization of tokenizer and
+            detokenizer. Expects valid prompt_token_ids and None for prompt
+ from the input.
trust_remote_code: Trust remote code (e.g., from HuggingFace) when
downloading the model and tokenizer.
tensor_parallel_size: The number of GPUs to use for distributed
@@ -42,10 +45,11 @@ class LLM:
However, if the `torch_dtype` in the config is `float32`, we will
use `float16` instead.
quantization: The method used to quantize the model weights. Currently,
- we support "awq", "gptq" and "squeezellm". If None, we first check
- the `quantization_config` attribute in the model config file. If
- that is None, we assume the model weights are not quantized and use
- `dtype` to determine the data type of the weights.
+ we support "awq", "gptq", "squeezellm", and "fp8" (experimental).
+ If None, we first check the `quantization_config` attribute in the
+ model config file. If that is None, we assume the model weights are
+ not quantized and use `dtype` to determine the data type of
+ the weights.
sparsity: The format of the sparse model weights. Currently,
we support "sparse_w16a16". If None, we first check the `sparsity`
attribute in the model config file. If that is None, we assume the
@@ -80,6 +84,7 @@ def __init__(
model: str,
tokenizer: Optional[str] = None,
tokenizer_mode: str = "auto",
+ skip_tokenizer_init: bool = False,
trust_remote_code: bool = False,
tensor_parallel_size: int = 1,
dtype: str = "auto",
@@ -93,7 +98,7 @@ def __init__(
swap_space: int = 4,
enforce_eager: bool = False,
max_context_len_to_capture: int = 8192,
- disable_custom_all_reduce: bool = True,
+ disable_custom_all_reduce: bool = False,
**kwargs,
) -> None:
if "disable_log_stats" not in kwargs:
@@ -102,6 +107,7 @@ def __init__(
model=model,
tokenizer=tokenizer,
tokenizer_mode=tokenizer_mode,
+ skip_tokenizer_init=skip_tokenizer_init,
trust_remote_code=trust_remote_code,
tensor_parallel_size=tensor_parallel_size,
dtype=dtype,
@@ -135,7 +141,8 @@ def set_tokenizer(
def generate(
self,
prompts: Optional[Union[str, List[str]]] = None,
- sampling_params: Optional[SamplingParams] = None,
+ sampling_params: Optional[Union[SamplingParams,
+ List[SamplingParams]]] = None,
prompt_token_ids: Optional[List[List[int]]] = None,
use_tqdm: bool = True,
lora_request: Optional[LoRARequest] = None,
@@ -150,7 +157,10 @@ def generate(
Args:
prompts: A list of prompts to generate completions for.
sampling_params: The sampling parameters for text generation. If
- None, we use the default sampling parameters.
+ None, we use the default sampling parameters.
+ When it is a single value, it is applied to every prompt.
+                When it is a list, the list must have the same length as the
+                prompts and is paired one-to-one with the prompts.
prompt_token_ids: A list of token IDs for the prompts. If None, we
use the tokenizer to convert the prompts to token IDs.
use_tqdm: Whether to use tqdm to display the progress bar.
@@ -164,6 +174,10 @@ def generate(
if prompts is None and prompt_token_ids is None:
raise ValueError("Either prompts or prompt_token_ids must be "
"provided.")
+ if self.llm_engine.model_config.skip_tokenizer_init \
+ and prompts is not None:
+ raise ValueError("prompts must be None if skip_tokenizer_init "
+ "is True")
if isinstance(prompts, str):
# Convert a single prompt to a list.
prompts = [prompts]
@@ -171,23 +185,33 @@ def generate(
and len(prompts) != len(prompt_token_ids)):
raise ValueError("The lengths of prompts and prompt_token_ids "
"must be the same.")
+
+ if prompts is not None:
+ num_requests = len(prompts)
+ else:
+ assert prompt_token_ids is not None
+ num_requests = len(prompt_token_ids)
+
if sampling_params is None:
# Use default sampling params.
sampling_params = SamplingParams()
+ elif isinstance(sampling_params,
+ list) and len(sampling_params) != num_requests:
+ raise ValueError("The lengths of prompts and sampling_params "
+ "must be the same.")
if multi_modal_data:
multi_modal_data.data = multi_modal_data.data.to(torch.float16)
# Add requests to the engine.
- num_requests = len(prompts) if prompts is not None else len(
- prompt_token_ids)
for i in range(num_requests):
prompt = prompts[i] if prompts is not None else None
token_ids = None if prompt_token_ids is None else prompt_token_ids[
i]
self._add_request(
prompt,
- sampling_params,
+ sampling_params[i]
+ if isinstance(sampling_params, list) else sampling_params,
token_ids,
lora_request=lora_request,
# Get ith image while maintaining the batch dim.
@@ -236,4 +260,4 @@ def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
# This is necessary because some requests may be finished earlier than
# its previous requests.
outputs = sorted(outputs, key=lambda x: int(x.request_id))
- return outputs
+ return outputs
\ No newline at end of file
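
A usage sketch for the per-prompt sampling parameters added to LLM.generate above; the model name and parameter values are illustrative, not part of this change.

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
prompts = ["San Francisco is a", "The capital of France is"]
params = [
    SamplingParams(temperature=0.0, max_tokens=16),  # greedy for prompt 0
    SamplingParams(temperature=0.8, max_tokens=32),  # sampled for prompt 1
]
# The list of SamplingParams must have the same length as the prompts and is
# paired one-to-one with them.
outputs = llm.generate(prompts, params)
for out in outputs:
    print(out.outputs[0].text)
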
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 32282bfd8d12b..37d76b8e74055 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -18,6 +18,7 @@
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.cli_args import make_arg_parser
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+ ChatCompletionResponse,
CompletionRequest, ErrorResponse)
from vllm.entrypoints.openai.serving_chat import OpenAIServingChat
from vllm.entrypoints.openai.serving_completion import OpenAIServingCompletion
@@ -26,8 +27,8 @@
TIMEOUT_KEEP_ALIVE = 5 # seconds
-openai_serving_chat: OpenAIServingChat = None
-openai_serving_completion: OpenAIServingCompletion = None
+openai_serving_chat: OpenAIServingChat
+openai_serving_completion: OpenAIServingCompletion
logger = init_logger(__name__)
@@ -95,6 +96,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
return StreamingResponse(content=generator,
media_type="text/event-stream")
else:
+ assert isinstance(generator, ChatCompletionResponse)
return JSONResponse(content=generator.model_dump())
@@ -150,18 +152,18 @@ async def authentication(request: Request, call_next):
logger.info(f"args: {args}")
if args.served_model_name is not None:
- served_model = args.served_model_name
+ served_model_names = args.served_model_name
else:
- served_model = args.model
+ served_model_names = [args.model]
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
- openai_serving_chat = OpenAIServingChat(engine, served_model,
+ openai_serving_chat = OpenAIServingChat(engine, served_model_names,
args.response_role,
args.lora_modules,
args.chat_template)
openai_serving_completion = OpenAIServingCompletion(
- engine, served_model, args.lora_modules)
+ engine, served_model_names, args.lora_modules)
app.root_path = args.root_path
uvicorn.run(app,
diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py
index cc71931b97955..5c361b4d184ee 100644
--- a/vllm/entrypoints/openai/cli_args.py
+++ b/vllm/entrypoints/openai/cli_args.py
@@ -54,11 +54,15 @@ def make_arg_parser():
help="If provided, the server will require this key "
"to be presented in the header.")
parser.add_argument("--served-model-name",
+ nargs="+",
type=str,
default=None,
- help="The model name used in the API. If not "
- "specified, the model name will be the same as "
- "the huggingface name.")
+ help="The model name(s) used in the API. If multiple "
+ "names are provided, the server will respond to any "
+ "of the provided names. The model name in the model "
+ "field of a response will be the first name in this "
+ "list. If not specified, the model name will be the "
+ "same as the `--model` argument.")
parser.add_argument(
"--lora-modules",
type=str,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index f94d22d279cc4..d9763d024eb83 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -4,7 +4,8 @@
from typing import Dict, List, Literal, Optional, Union
import torch
-from pydantic import BaseModel, Field, conint, model_validator
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Annotated
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
@@ -30,7 +31,7 @@ class ModelPermission(BaseModel):
allow_fine_tuning: bool = False
organization: str = "*"
group: Optional[str] = None
- is_blocking: str = False
+ is_blocking: bool = False
class ModelCard(BaseModel):
@@ -56,7 +57,7 @@ class UsageInfo(BaseModel):
class ResponseFormat(BaseModel):
# type must be "json_object" or "text"
- type: str = Literal["text", "json_object"]
+ type: Literal["text", "json_object"]
class ChatCompletionRequest(BaseModel):
@@ -133,6 +134,12 @@ class ChatCompletionRequest(BaseModel):
description=(
"If specified, the output will follow the context free grammar."),
)
+ guided_decoding_backend: Optional[str] = Field(
+ default=None,
+ description=(
+ "If specified, will override the default guided decoding backend "
+            "of the server for this specific request. If set, must be one of "
+ "'outlines' / 'lm-format-enforcer'"))
# doc: end-chat-completion-extra-params
@@ -146,6 +153,7 @@ def to_sampling_params(self) -> SamplingParams:
def logit_bias_logits_processor(
token_ids: List[int],
logits: torch.Tensor) -> torch.Tensor:
+ assert self.logit_bias is not None
for token_id, bias in self.logit_bias.items():
# Clamp the bias between -100 and 100 per OpenAI API spec
bias = min(100, max(-100, bias))
@@ -207,7 +215,7 @@ class CompletionRequest(BaseModel):
logit_bias: Optional[Dict[str, float]] = None
logprobs: Optional[int] = None
max_tokens: Optional[int] = 16
- n: Optional[int] = 1
+ n: int = 1
presence_penalty: Optional[float] = 0.0
seed: Optional[int] = None
stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
@@ -229,7 +237,7 @@ class CompletionRequest(BaseModel):
min_tokens: Optional[int] = 0
skip_special_tokens: Optional[bool] = True
spaces_between_special_tokens: Optional[bool] = True
- truncate_prompt_tokens: Optional[conint(ge=1)] = None
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
# doc: end-completion-sampling-params
# doc: begin-completion-extra-params
@@ -265,6 +273,12 @@ class CompletionRequest(BaseModel):
description=(
"If specified, the output will follow the context free grammar."),
)
+ guided_decoding_backend: Optional[str] = Field(
+ default=None,
+ description=(
+ "If specified, will override the default guided decoding backend "
+ "of the server for this specific request. If set, must be one of "
+ "'outlines' / 'lm-format-enforcer'"))
# doc: end-completion-extra-params
@@ -277,6 +291,7 @@ def to_sampling_params(self):
def logit_bias_logits_processor(
token_ids: List[int],
logits: torch.Tensor) -> torch.Tensor:
+ assert self.logit_bias is not None
for token_id, bias in self.logit_bias.items():
# Clamp the bias between -100 and 100 per OpenAI API spec
bias = min(100, max(-100, bias))
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index a03c5dc88108f..2ff335eb71073 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -24,12 +24,12 @@ class OpenAIServingChat(OpenAIServing):
def __init__(self,
engine: AsyncLLMEngine,
- served_model: str,
+ served_model_names: List[str],
response_role: str,
lora_modules: Optional[List[LoRA]] = None,
chat_template=None):
super().__init__(engine=engine,
- served_model=served_model,
+ served_model_names=served_model_names,
lora_modules=lora_modules)
self.response_role = response_role
self._load_chat_template(chat_template)
@@ -68,9 +68,13 @@ async def create_chat_completion(
request, prompt=prompt)
sampling_params = request.to_sampling_params()
lora_request = self._maybe_get_lora(request)
+ decoding_config = self.engine.engine.decoding_config
+ guided_decoding_backend = request.guided_decoding_backend \
+ or decoding_config.guided_decoding_backend
guided_decode_logits_processor = (
await get_guided_decoding_logits_processor(
- request, await self.engine.get_tokenizer()))
+ guided_decoding_backend, request, await
+ self.engine.get_tokenizer()))
if guided_decode_logits_processor:
if sampling_params.logits_processors is None:
sampling_params.logits_processors = []
@@ -105,18 +109,18 @@ async def chat_completion_stream_generator(
result_generator: AsyncIterator[RequestOutput], request_id: str
) -> Union[ErrorResponse, AsyncGenerator[str, None]]:
- model_name = request.model
+ model_name = self.served_model_names[0]
created_time = int(time.time())
chunk_object_type = "chat.completion.chunk"
first_iteration = True
# Send response for each token for each request.n (index)
+ assert request.n is not None
previous_texts = [""] * request.n
previous_num_tokens = [0] * request.n
finish_reason_sent = [False] * request.n
try:
async for res in result_generator:
- res: RequestOutput
# We need to do it here, because if there are exceptions in
# the result_generator, it needs to be sent as the FIRST
# response (by the try...catch).
@@ -247,7 +251,7 @@ async def chat_completion_full_generator(
result_generator: AsyncIterator[RequestOutput],
request_id: str) -> Union[ErrorResponse, ChatCompletionResponse]:
- model_name = request.model
+ model_name = self.served_model_names[0]
created_time = int(time.time())
final_res: RequestOutput = None
@@ -315,23 +319,30 @@ async def chat_completion_full_generator(
return response
def _load_chat_template(self, chat_template):
+ tokenizer = self.tokenizer
+
if chat_template is not None:
try:
with open(chat_template, "r") as f:
- self.tokenizer.chat_template = f.read()
- except OSError:
+ tokenizer.chat_template = f.read()
+ except OSError as e:
+ JINJA_CHARS = "{}\n"
+ if not any(c in chat_template for c in JINJA_CHARS):
+ msg = (f"The supplied chat template ({chat_template}) "
+ f"looks like a file path, but it failed to be "
+ f"opened. Reason: {e}")
+ raise ValueError(msg) from e
+
# If opening a file fails, set chat template to be args to
# ensure we decode so our escape are interpreted correctly
- self.tokenizer.chat_template = codecs.decode(
+ tokenizer.chat_template = codecs.decode(
chat_template, "unicode_escape")
logger.info(
- f"Using supplied chat template:\n{self.tokenizer.chat_template}"
- )
- elif self.tokenizer.chat_template is not None:
+ f"Using supplied chat template:\n{tokenizer.chat_template}")
+ elif tokenizer.chat_template is not None:
logger.info(
- f"Using default chat template:\n{self.tokenizer.chat_template}"
- )
+ f"Using default chat template:\n{tokenizer.chat_template}")
else:
logger.warning(
"No chat template provided. Chat API will not work.")
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index e24aa2489a80f..211b2e0424c3e 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -53,10 +53,10 @@ class OpenAIServingCompletion(OpenAIServing):
def __init__(self,
engine: AsyncLLMEngine,
- served_model: str,
+ served_model_names: List[str],
lora_modules: Optional[List[LoRA]] = None):
super().__init__(engine=engine,
- served_model=served_model,
+ served_model_names=served_model_names,
lora_modules=lora_modules)
async def create_completion(self, request: CompletionRequest,
@@ -79,7 +79,7 @@ async def create_completion(self, request: CompletionRequest,
return self.create_error_response(
"suffix is not currently supported")
- model_name = request.model
+ model_name = self.served_model_names[0]
request_id = f"cmpl-{random_uuid()}"
created_time = int(time.time())
@@ -88,9 +88,13 @@ async def create_completion(self, request: CompletionRequest,
try:
sampling_params = request.to_sampling_params()
lora_request = self._maybe_get_lora(request)
+ decoding_config = self.engine.engine.decoding_config
+ guided_decoding_backend = request.guided_decoding_backend \
+ or decoding_config.guided_decoding_backend
guided_decode_logit_processor = (
await get_guided_decoding_logits_processor(
- request, await self.engine.get_tokenizer()))
+ guided_decoding_backend, request, await
+ self.engine.get_tokenizer()))
if guided_decode_logit_processor is not None:
if sampling_params.logits_processors is None:
sampling_params.logits_processors = []
@@ -181,6 +185,7 @@ async def completion_stream_generator(
model_name: str,
num_prompts: int,
) -> AsyncGenerator[str, None]:
+ assert request.n is not None
previous_texts = [""] * request.n * num_prompts
previous_num_tokens = [0] * request.n * num_prompts
has_echoed = [False] * request.n * num_prompts
@@ -198,6 +203,7 @@ async def completion_stream_generator(
# TODO(simon): optimize the performance by avoiding full
# text O(n^2) sending.
+ assert request.max_tokens is not None
if request.echo and request.max_tokens == 0:
# only return the prompt
delta_text = res.prompt
@@ -275,7 +281,7 @@ def request_output_to_completion_response(
created_time: int,
model_name: str,
) -> CompletionResponse:
- choices = []
+ choices: List[CompletionResponseChoice] = []
num_prompt_tokens = 0
num_generated_tokens = 0
for final_res in final_res_batch:
@@ -285,6 +291,7 @@ def request_output_to_completion_response(
prompt_text = final_res.prompt
for output in final_res.outputs:
+ assert request.max_tokens is not None
if request.echo and request.max_tokens == 0:
token_ids = prompt_token_ids
top_logprobs = prompt_logprobs
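The completion path now resolves the guided decoding backend per request, falling back to the engine-level decoding config when the request leaves it unset. A tiny sketch of that precedence rule with stand-in classes (the real objects live in `vllm.config` and the OpenAI protocol module):

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class DecodingConfigStub:            # stand-in for the engine-level config
    guided_decoding_backend: str = "outlines"

@dataclass
class CompletionRequestStub:         # stand-in; only the field used here
    guided_decoding_backend: Optional[str] = None

def pick_backend(request: CompletionRequestStub,
                 config: DecodingConfigStub) -> str:
    # Per-request value wins; fall back to the engine default when unset.
    return request.guided_decoding_backend or config.guided_decoding_backend

assert pick_backend(CompletionRequestStub(), DecodingConfigStub()) == "outlines"
assert pick_backend(CompletionRequestStub("lm-format-enforcer"),
                    DecodingConfigStub()) == "lm-format-enforcer"
```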
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 78b951c8ba161..e89d497f436b7 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -4,7 +4,9 @@
from http import HTTPStatus
from typing import Dict, List, Optional, Tuple, Union
-from pydantic import conint
+from pydantic import Field
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+from typing_extensions import Annotated
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
@@ -29,10 +31,10 @@ class OpenAIServing:
def __init__(self,
engine: AsyncLLMEngine,
- served_model: str,
+ served_model_names: List[str],
lora_modules=Optional[List[LoRA]]):
self.engine = engine
- self.served_model = served_model
+ self.served_model_names = served_model_names
if lora_modules is None:
self.lora_requests = []
else:
@@ -45,7 +47,8 @@ def __init__(self,
]
self.max_model_len = 0
- self.tokenizer = None
+ # Lazy initialized
+ self.tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
try:
event_loop = asyncio.get_running_loop()
@@ -68,6 +71,7 @@ async def _post_init(self):
self.tokenizer = get_tokenizer(
engine_model_config.tokenizer,
tokenizer_mode=engine_model_config.tokenizer_mode,
+ tokenizer_revision=engine_model_config.tokenizer_revision,
trust_remote_code=engine_model_config.trust_remote_code,
truncation_side="left")
@@ -82,13 +86,14 @@ async def _post_init(self):
async def show_available_models(self) -> ModelList:
"""Show available models. Right now we only have one model."""
model_cards = [
- ModelCard(id=self.served_model,
- root=self.served_model,
+ ModelCard(id=served_model_name,
+ root=self.served_model_names[0],
permission=[ModelPermission()])
+ for served_model_name in self.served_model_names
]
lora_cards = [
ModelCard(id=lora.lora_name,
- root=self.served_model,
+ root=self.served_model_names[0],
permission=[ModelPermission()])
for lora in self.lora_requests
]
@@ -98,7 +103,7 @@ async def show_available_models(self) -> ModelList:
def _create_logprobs(
self,
token_ids: List[int],
- top_logprobs: Optional[List[Optional[Dict[int, Logprob]]]] = None,
+ top_logprobs: List[Optional[Dict[int, Logprob]]],
num_output_top_logprobs: Optional[int] = None,
initial_text_offset: int = 0,
) -> LogProbs:
@@ -114,6 +119,7 @@ def _create_logprobs(
token = self.tokenizer.decode(token_id)
logprobs.tokens.append(token)
logprobs.token_logprobs.append(None)
+ assert logprobs.top_logprobs is not None
logprobs.top_logprobs.append(None)
else:
token_logprob = step_top_logprobs[token_id].logprob
@@ -122,8 +128,11 @@ def _create_logprobs(
logprobs.token_logprobs.append(token_logprob)
if num_output_top_logprobs:
+ assert logprobs.top_logprobs is not None
logprobs.top_logprobs.append({
- p.decoded_token: p.logprob
+ # Convert float("-inf") to the
+ # JSON-serializable float that OpenAI uses
+ p.decoded_token: max(p.logprob, -9999.0)
for i, p in step_top_logprobs.items()
} if step_top_logprobs else None)
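The clamp to -9999.0 exists because IEEE infinities are not valid JSON: strict encoders reject them and lenient ones emit a non-standard token. A quick illustration with the standard library, matching the sentinel mentioned in the comment above:

```python
import json

neg_inf = float("-inf")

# By default json.dumps emits the non-standard token "-Infinity"; in strict
# mode it refuses outright.
print(json.dumps(neg_inf))                  # -Infinity (not valid JSON)
try:
    json.dumps(neg_inf, allow_nan=False)
except ValueError as exc:
    print("strict encoder:", exc)

# Clamping as the hunk above does keeps the payload valid JSON.
print(json.dumps(max(neg_inf, -9999.0)))    # -9999.0
```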
@@ -158,18 +167,18 @@ def create_streaming_error_response(
return json_str
async def _check_model(self, request) -> Optional[ErrorResponse]:
- if request.model == self.served_model:
- return
+ if request.model in self.served_model_names:
+ return None
if request.model in [lora.lora_name for lora in self.lora_requests]:
- return
+ return None
return self.create_error_response(
message=f"The model `{request.model}` does not exist.",
err_type="NotFoundError",
status_code=HTTPStatus.NOT_FOUND)
def _maybe_get_lora(self, request) -> Optional[LoRARequest]:
- if request.model == self.served_model:
- return
+ if request.model in self.served_model_names:
+ return None
for lora in self.lora_requests:
if request.model == lora.lora_name:
return lora
@@ -181,7 +190,7 @@ def _validate_prompt_and_tokenize(
request: Union[ChatCompletionRequest, CompletionRequest],
prompt: Optional[str] = None,
prompt_ids: Optional[List[int]] = None,
- truncate_prompt_tokens: Optional[conint(ge=1)] = None
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None
) -> Tuple[List[int], str]:
if not (prompt or prompt_ids):
raise ValueError("Either prompt or prompt_ids should be provided.")
@@ -205,6 +214,12 @@ def _validate_prompt_and_tokenize(
token_num = len(input_ids)
if request.max_tokens is None:
+ if token_num >= self.max_model_len:
+ raise ValueError(
+ f"This model's maximum context length is "
+ f"{self.max_model_len} tokens. However, you requested "
+ f"{token_num} tokens in the messages, "
+ f"Please reduce the length of the messages.", )
request.max_tokens = self.max_model_len - token_num
if token_num + request.max_tokens > self.max_model_len:
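`conint(ge=1)` is replaced with `Annotated[int, Field(ge=1)]`, which expresses the same lower bound but is a real type that static checkers understand. A small isolated sketch of the constraint, assuming a recent pydantic (field and model names are illustrative):

```python
from typing import Optional

from pydantic import BaseModel, Field, ValidationError
from typing_extensions import Annotated

class TruncationParams(BaseModel):
    # Same constraint conint(ge=1) used to express, but mypy-friendly.
    truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None

print(TruncationParams(truncate_prompt_tokens=4).truncate_prompt_tokens)  # 4
try:
    TruncationParams(truncate_prompt_tokens=0)
except ValidationError as exc:
    print("rejected:", exc.errors()[0]["msg"])
```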
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index eda4e8989c163..8d6a1fff91fd8 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -1,38 +1,28 @@
import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Set, Tuple
import torch
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig)
-from vllm.executor.executor_base import ExecutorBase
+from vllm.config import CacheConfig, ModelConfig, SchedulerConfig
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
-from vllm.utils import get_distributed_init_method, get_ip, get_open_port
+from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
+ make_async)
logger = init_logger(__name__)
class CPUExecutor(ExecutorBase):
- def __init__(self, model_config: ModelConfig, cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- lora_config: Optional[LoRAConfig], *args, **kwargs) -> None:
- assert device_config.device_type == "cpu"
- assert lora_config is None, "cpu backend doesn't support LoRA"
- model_config = _verify_and_get_model_config(model_config)
- cache_config = _verify_and_get_cache_config(cache_config)
- scheduler_config = _verify_and_get_scheduler_config(scheduler_config)
-
- self.model_config = model_config
- self.cache_config = cache_config
- self.lora_config = lora_config
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
+ def _init_executor(self) -> None:
+ assert self.device_config.device_type == "cpu"
+ assert self.lora_config is None, "cpu backend doesn't support LoRA"
+ self.model_config = _verify_and_get_model_config(self.model_config)
+ self.cache_config = _verify_and_get_cache_config(self.cache_config)
+ self.scheduler_config = _verify_and_get_scheduler_config(
+ self.scheduler_config)
# Instantiate the worker and load the model to CPU.
self._init_worker()
@@ -51,17 +41,19 @@ def _init_worker(self):
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
+ load_config=self.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=True,
)
self.driver_worker.init_device()
self.driver_worker.load_model()
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
@@ -74,14 +66,18 @@ def initialize_cache(self, num_gpu_blocks: int,
# NOTE: We log here to avoid multiple logs when number of workers is
# greater than one. We could log in the engine, but not all executors
# have GPUs.
- logger.info(f"# CPU blocks: {num_cpu_blocks}")
+ # NOTE: `cpu block` for CPU backend is located on CPU memory but is
+ # referred to as `gpu block`, because we want to reuse the existing block
+ # management procedure.
+ logger.info(f"# CPU blocks: {num_gpu_blocks}")
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
def execute_model(self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+ blocks_to_copy: Dict[int, List[int]],
+ num_lookahead_slots: int) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=blocks_to_swap_in,
@@ -96,7 +92,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id)
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
def check_health(self) -> None:
@@ -105,6 +101,28 @@ def check_health(self) -> None:
return
+class CPUExecutorAsync(CPUExecutor, ExecutorAsyncBase):
+
+ async def execute_model_async(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ blocks_to_swap_in: Dict[int, int],
+ blocks_to_swap_out: Dict[int, int],
+ blocks_to_copy: Dict[int, List[int]],
+ ) -> SamplerOutput:
+ output = await make_async(self.driver_worker.execute_model)(
+ seq_group_metadata_list=seq_group_metadata_list,
+ blocks_to_swap_in=blocks_to_swap_in,
+ blocks_to_swap_out=blocks_to_swap_out,
+ blocks_to_copy=blocks_to_copy)
+ return output
+
+ async def check_health_async(self) -> None:
+ # CPUExecutor will always be healthy as long as
+ # it's running.
+ return
+
+
def _verify_and_get_model_config(config: ModelConfig) -> ModelConfig:
if config.dtype == torch.float16:
logger.warning("float16 is not supported on CPU, casting to bfloat16.")
diff --git a/vllm/executor/executor_base.py b/vllm/executor/executor_base.py
index c18edd75d7a4d..1839b5603ff3e 100644
--- a/vllm/executor/executor_base.py
+++ b/vllm/executor/executor_base.py
@@ -1,9 +1,9 @@
from abc import ABC, abstractmethod
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Set, Tuple
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, SpeculativeConfig,
- VisionLanguageConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+ ModelConfig, ParallelConfig, SchedulerConfig,
+ SpeculativeConfig, VisionLanguageConfig)
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
@@ -16,7 +16,6 @@ class ExecutorBase(ABC):
that can execute the model on multiple devices.
"""
- @abstractmethod
def __init__(
self,
model_config: ModelConfig,
@@ -24,14 +23,29 @@ def __init__(
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
+ load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
vision_language_config: Optional[VisionLanguageConfig],
speculative_config: Optional[SpeculativeConfig],
) -> None:
- raise NotImplementedError
+ self.model_config = model_config
+ self.cache_config = cache_config
+ self.lora_config = lora_config
+ self.load_config = load_config
+ self.parallel_config = parallel_config
+ self.scheduler_config = scheduler_config
+ self.device_config = device_config
+ self.vision_language_config = vision_language_config
+ self.speculative_config = speculative_config
+
+ self._init_executor()
+
+ @abstractmethod
+ def _init_executor(self) -> None:
+ pass
@abstractmethod
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available blocks for the GPU KV cache and
swappable CPU KV cache.
@@ -39,7 +53,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
ExecutorBase may require modification of the result, e.g. to ensure the
selected cache sizes are compatible with all workers.
- Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+ Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
are blocks that are "active" on the device and can be appended to.
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
appended to.
@@ -58,8 +72,9 @@ def execute_model(self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
- """Executes one model step on the given sequences."""
+ blocks_to_copy: Dict[int, List[int]],
+ num_lookahead_slots: int) -> List[SamplerOutput]:
+ """Executes at least one model step on the given sequences."""
raise NotImplementedError
@abstractmethod
@@ -71,7 +86,7 @@ def remove_lora(self, lora_id: int) -> bool:
raise NotImplementedError
@abstractmethod
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
raise NotImplementedError
@abstractmethod
@@ -94,8 +109,7 @@ async def execute_model_async(
"""Executes one model step on the given sequences."""
raise NotImplementedError
- @abstractmethod
async def check_health_async(self) -> None:
"""Checks if the executor is healthy. If not, it should raise an
exception."""
- raise NotImplementedError
+ self.check_health()
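The refactor turns `ExecutorBase.__init__` into a template method: the base class stores every config and then calls a single abstract hook, `_init_executor`, that each backend overrides. Reduced to a standalone sketch with placeholder config types:

```python
from abc import ABC, abstractmethod
from dataclasses import dataclass

@dataclass
class FakeDeviceConfig:            # placeholder for the real config objects
    device_type: str = "cpu"

class MiniExecutorBase(ABC):
    """Base __init__ does the bookkeeping; subclasses only initialize."""

    def __init__(self, device_config: FakeDeviceConfig) -> None:
        self.device_config = device_config
        self._init_executor()      # subclass hook runs after configs are set

    @abstractmethod
    def _init_executor(self) -> None:
        ...

class MiniCPUExecutor(MiniExecutorBase):
    def _init_executor(self) -> None:
        assert self.device_config.device_type == "cpu"
        self.driver_worker = object()   # stand-in for worker construction

executor = MiniCPUExecutor(FakeDeviceConfig())
print(type(executor.driver_worker).__name__)   # object
```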
diff --git a/vllm/executor/gpu_executor.py b/vllm/executor/gpu_executor.py
index 80ca5cb7367c5..d413a7d27ff37 100644
--- a/vllm/executor/gpu_executor.py
+++ b/vllm/executor/gpu_executor.py
@@ -1,8 +1,5 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Set, Tuple
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, SpeculativeConfig,
- VisionLanguageConfig)
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
@@ -15,32 +12,18 @@
class GPUExecutor(ExecutorBase):
- def __init__(
- self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- lora_config: Optional[LoRAConfig],
- vision_language_config: Optional[VisionLanguageConfig],
- speculative_config: Optional[SpeculativeConfig],
- ) -> None:
- self.model_config = model_config
- self.cache_config = cache_config
- self.lora_config = lora_config
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- self.vision_language_config = vision_language_config
-
- assert (not speculative_config
- ), "Speculative decoding not yet supported for GPU backend"
-
- # Instantiate the worker and load the model to GPU.
- self._init_worker()
-
- def _init_worker(self):
+ def _init_executor(self) -> None:
+ """Initialize the worker and load the model.
+
+ If speculative decoding is enabled, we instead create the speculative
+ worker.
+ """
+ if self.speculative_config is None:
+ self._init_non_spec_worker()
+ else:
+ self._init_spec_worker()
+
+ def _init_non_spec_worker(self):
# Lazy import the Worker to avoid importing torch.cuda/xformers
# before CUDA_VISIBLE_DEVICES is set in the Worker
from vllm.worker.worker import Worker
@@ -56,6 +39,7 @@ def _init_worker(self):
scheduler_config=self.scheduler_config,
device_config=self.device_config,
cache_config=self.cache_config,
+ load_config=self.load_config,
local_rank=0,
rank=0,
distributed_init_method=distributed_init_method,
@@ -66,7 +50,61 @@ def _init_worker(self):
self.driver_worker.init_device()
self.driver_worker.load_model()
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def _init_spec_worker(self):
+ """Initialize a SpecDecodeWorker, using a draft model for proposals.
+ """
+ assert self.speculative_config is not None
+
+ from vllm.spec_decode.multi_step_worker import MultiStepWorker
+ from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker
+ from vllm.worker.worker import Worker
+
+ distributed_init_method = get_distributed_init_method(
+ get_ip(), get_open_port())
+
+ target_worker = Worker(
+ model_config=self.model_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config,
+ device_config=self.device_config,
+ cache_config=self.cache_config,
+ load_config=self.load_config,
+ local_rank=0,
+ rank=0,
+ distributed_init_method=distributed_init_method,
+ lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
+ is_driver_worker=True,
+ )
+
+ draft_worker = MultiStepWorker(
+ model_config=self.speculative_config.draft_model_config,
+ parallel_config=self.speculative_config.draft_parallel_config,
+ scheduler_config=self.scheduler_config,
+ device_config=self.device_config,
+ cache_config=self.cache_config,
+ # TODO allow draft-model specific load config.
+ load_config=self.load_config,
+ local_rank=0,
+ rank=0,
+ distributed_init_method=distributed_init_method,
+ lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
+ is_driver_worker=True,
+ )
+
+ spec_decode_worker = SpecDecodeWorker.from_workers(
+ proposer_worker=draft_worker, scorer_worker=target_worker)
+
+ assert self.parallel_config.world_size == 1, (
+ "GPUExecutor only supports single GPU.")
+
+ self.driver_worker = spec_decode_worker
+
+ # Load model handled in spec decode worker.
+ self.driver_worker.init_device()
+
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
@@ -83,16 +121,20 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks) -> None:
self.driver_worker.initialize_cache(num_gpu_blocks, num_cpu_blocks)
- def execute_model(self,
- seq_group_metadata_list: List[SequenceGroupMetadata],
- blocks_to_swap_in: Dict[int, int],
- blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+ def execute_model(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ blocks_to_swap_in: Dict[int, int],
+ blocks_to_swap_out: Dict[int, int],
+ blocks_to_copy: Dict[int, List[int]],
+ num_lookahead_slots: int,
+ ) -> List[SamplerOutput]:
output = self.driver_worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
+ num_lookahead_slots=num_lookahead_slots,
)
return output
@@ -104,7 +146,7 @@ def remove_lora(self, lora_id: int) -> bool:
assert lora_id > 0, "lora_id must be greater than 0."
return self.driver_worker.remove_lora(lora_id)
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
def check_health(self) -> None:
@@ -128,8 +170,3 @@ async def execute_model_async(
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy)
return output
-
- async def check_health_async(self) -> None:
- # GPUExecutor will always be healthy as long as
- # it's running.
- return
diff --git a/vllm/executor/neuron_executor.py b/vllm/executor/neuron_executor.py
index 57436a85cfa27..5a137d1bdcb3b 100644
--- a/vllm/executor/neuron_executor.py
+++ b/vllm/executor/neuron_executor.py
@@ -1,35 +1,20 @@
-from typing import Dict, List, Optional
+from typing import Dict, List, Set, Tuple
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, SpeculativeConfig,
- VisionLanguageConfig)
-from vllm.executor.executor_base import ExecutorBase
+from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import make_async
logger = init_logger(__name__)
class NeuronExecutor(ExecutorBase):
- def __init__(
- self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- lora_config: Optional[LoRAConfig],
- vision_language_config: Optional[VisionLanguageConfig],
- speculative_config: Optional[SpeculativeConfig],
- ) -> None:
- self.model_config = model_config
- assert lora_config is None, "LoRA is not supported for Neuron backend."
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- assert (not speculative_config
+ def _init_executor(self) -> None:
+ assert (self.lora_config is
+ None), "LoRA is not supported for Neuron backend."
+ assert (not self.speculative_config
), "Speculative decoding not yet supported for Neuron backend."
# Instantiate the worker and load the model to the device.
@@ -43,11 +28,12 @@ def _init_worker(self):
self.parallel_config,
self.scheduler_config,
self.device_config,
+ self.cache_config,
)
self.driver_worker.init_device()
self.driver_worker.load_model()
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks by invoking the
underlying worker.
"""
@@ -63,10 +49,13 @@ def execute_model(self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+ blocks_to_copy: Dict[int, List[int]],
+ num_lookahead_slots: int) -> List[SamplerOutput]:
assert (blocks_to_swap_in == {} and blocks_to_swap_out == {}
and blocks_to_copy == {}), (
"Cache operations are not supported for Neuron backend.")
+ assert num_lookahead_slots == 0, (
+ "lookahead not supported for Neuron backend.")
output = self.driver_worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list)
@@ -78,10 +67,29 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
def remove_lora(self, lora_id: int) -> bool:
return self.driver_worker.remove_lora(lora_id)
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
return self.driver_worker.list_loras()
def check_health(self) -> None:
# NeuronExecutor will always be healthy as long as
# it's running.
return
+
+
+class NeuronExecutorAsync(NeuronExecutor, ExecutorAsyncBase):
+
+ async def execute_model_async(
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ blocks_to_swap_in: Dict[int, int],
+ blocks_to_swap_out: Dict[int, int],
+ blocks_to_copy: Dict[int, List[int]],
+ ) -> SamplerOutput:
+ output = await make_async(self.driver_worker.execute_model)(
+ seq_group_metadata_list=seq_group_metadata_list, )
+ return output
+
+ async def check_health_async(self) -> None:
+ # NeuronExecutor will always be healthy as long as
+ # it's running.
+ return
diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index 6c0ccd7e64c90..14b3f803782c6 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -1,20 +1,17 @@
import asyncio
-import copy
import os
import pickle
from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from itertools import islice, repeat
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, SpeculativeConfig,
- VisionLanguageConfig)
-from vllm.engine.ray_utils import RayWorkerVllm, ray
from vllm.executor.executor_base import ExecutorAsyncBase, ExecutorBase
+from vllm.executor.ray_utils import RayWorkerWrapper, ray
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
from vllm.utils import (get_distributed_init_method, get_ip, get_open_port,
- make_async, set_cuda_visible_devices)
+ get_vllm_instance_id, make_async)
if ray is not None:
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -32,25 +29,8 @@
class RayGPUExecutor(ExecutorBase):
- def __init__(
- self,
- model_config: ModelConfig,
- cache_config: CacheConfig,
- parallel_config: ParallelConfig,
- scheduler_config: SchedulerConfig,
- device_config: DeviceConfig,
- lora_config: Optional[LoRAConfig],
- vision_language_config: Optional[VisionLanguageConfig],
- speculative_config: Optional[SpeculativeConfig],
- ) -> None:
- self.model_config = model_config
- self.cache_config = cache_config
- self.lora_config = lora_config
- self.parallel_config = parallel_config
- self.scheduler_config = scheduler_config
- self.device_config = device_config
- self.vision_language_config = vision_language_config
- assert (not speculative_config
+ def _init_executor(self) -> None:
+ assert (not self.speculative_config
), "Speculative decoding not yet supported for RayGPU backend."
assert self.parallel_config.worker_use_ray
@@ -68,6 +48,21 @@ def __init__(
if USE_RAY_COMPILED_DAG:
self.forward_dag = self._compiled_ray_dag()
+ def _configure_ray_workers_use_nsight(self,
+ ray_remote_kwargs) -> Dict[str, Any]:
+ # If nsight profiling is enabled, we need to set the profiling
+ # configuration for the ray workers as runtime env.
+ runtime_env = ray_remote_kwargs.setdefault("runtime_env", {})
+ runtime_env.update({
+ "nsight": {
+ "t": "cuda,cudnn,cublas",
+ "o": "'worker_process_%p'",
+ "cuda-graph-trace": "node",
+ }
+ })
+
+ return ray_remote_kwargs
+
def _init_workers_ray(self, placement_group: "PlacementGroup",
**ray_remote_kwargs):
if self.parallel_config.tensor_parallel_size == 1:
@@ -79,9 +74,13 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
# The driver dummy worker does not actually use any resources.
# It holds the resource for the driver worker.
- self.driver_dummy_worker: RayWorkerVllm = None
+ self.driver_dummy_worker: Optional[RayWorkerWrapper] = None
# The remaining workers are the actual ray actors.
- self.workers: List[RayWorkerVllm] = []
+ self.workers: List[RayWorkerWrapper] = []
+
+ if self.parallel_config.ray_workers_use_nsight:
+ ray_remote_kwargs = self._configure_ray_workers_use_nsight(
+ ray_remote_kwargs)
# Create the workers.
driver_ip = get_ip()
@@ -98,13 +97,22 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
num_gpus=num_gpus,
scheduling_strategy=scheduling_strategy,
**ray_remote_kwargs,
- )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
+ )(RayWorkerWrapper).remote(
+ worker_module_name="vllm.worker.worker",
+ worker_class_name="Worker",
+ trust_remote_code=self.model_config.trust_remote_code,
+ )
worker_ip = ray.get(worker.get_node_ip.remote())
if worker_ip == driver_ip and self.driver_dummy_worker is None:
# If the worker is on the same node as the driver, we use it
# as the resource holder for the driver process.
self.driver_dummy_worker = worker
+ self.driver_worker = RayWorkerWrapper(
+ worker_module_name="vllm.worker.worker",
+ worker_class_name="Worker",
+ trust_remote_code=self.model_config.trust_remote_code,
+ )
else:
# Else, added to the list of workers.
self.workers.append(worker)
@@ -116,79 +124,59 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
"GPU node.")
# Get the set of GPU IDs used on each node.
- driver_node_id, driver_gpu_ids = ray.get(
- self.driver_dummy_worker.get_node_and_gpu_ids.remote())
- worker_node_and_gpu_ids = ray.get(
- [worker.get_node_and_gpu_ids.remote() for worker in self.workers])
+ worker_node_and_gpu_ids = self._run_workers("get_node_and_gpu_ids",
+ use_dummy_driver=True)
node_workers = defaultdict(list)
node_gpus = defaultdict(list)
- node_workers[driver_node_id].append(0)
- node_gpus[driver_node_id].extend(driver_gpu_ids)
- for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids,
- start=1):
+ for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids):
node_workers[node_id].append(i)
node_gpus[node_id].extend(gpu_ids)
for node_id, gpu_ids in node_gpus.items():
node_gpus[node_id] = sorted(gpu_ids)
- # Set CUDA_VISIBLE_DEVICES for the driver and workers.
- set_cuda_visible_devices(node_gpus[driver_node_id])
- for worker, (node_id, _) in zip(self.workers, worker_node_and_gpu_ids):
- worker.set_cuda_visible_devices.remote(node_gpus[node_id])
+ VLLM_INSTANCE_ID = get_vllm_instance_id()
+
+ # Set environment variables for the driver and workers.
+ all_args_to_update_environment_variables = [({
+ "CUDA_VISIBLE_DEVICES":
+ ",".join(map(str, node_gpus[node_id])),
+ "VLLM_INSTANCE_ID":
+ VLLM_INSTANCE_ID,
+ "VLLM_TRACE_FUNCTION":
+ os.getenv("VLLM_TRACE_FUNCTION", "0"),
+ }, ) for (node_id, _) in worker_node_and_gpu_ids]
+ self._run_workers("update_environment_variables",
+ all_args=all_args_to_update_environment_variables)
distributed_init_method = get_distributed_init_method(
driver_ip, get_open_port())
- # Lazy import the Worker to avoid importing torch.cuda/xformers
- # before CUDA_VISIBLE_DEVICES is set in the Worker
- from vllm.worker.worker import Worker
-
- model_config = copy.deepcopy(self.model_config)
- parallel_config = copy.deepcopy(self.parallel_config)
- scheduler_config = copy.deepcopy(self.scheduler_config)
- device_config = copy.deepcopy(self.device_config)
- lora_config = copy.deepcopy(self.lora_config)
- cache_config = copy.deepcopy(self.cache_config)
- vision_language_config = copy.deepcopy(self.vision_language_config)
-
- # Initialize the actual workers with the Worker class.
- for rank, (worker, (node_id, _)) in enumerate(
- zip(self.workers, worker_node_and_gpu_ids),
- start=1,
- ):
+ def collect_arg_helper_func(**kwargs):
+ # avoid writing `{"name": value}` manually
+ return kwargs
+
+ # Initialize the actual workers inside worker wrapper.
+ init_worker_all_kwargs = []
+ for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids):
local_rank = node_workers[node_id].index(rank)
- worker.init_worker.remote(
- lambda rank=rank, local_rank=local_rank: Worker(
- model_config=model_config,
- parallel_config=parallel_config,
- scheduler_config=scheduler_config,
- device_config=device_config,
- cache_config=cache_config,
+ init_worker_all_kwargs.append(
+ collect_arg_helper_func(
+ model_config=self.model_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config,
+ device_config=self.device_config,
+ cache_config=self.cache_config,
+ load_config=self.load_config,
local_rank=local_rank,
rank=rank,
distributed_init_method=distributed_init_method,
- lora_config=lora_config,
- vision_language_config=vision_language_config,
+ lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
+ is_driver_worker=rank == 0,
))
-
- # Initialize the driver worker with the Worker class.
- driver_rank = 0
- driver_local_rank = node_workers[driver_node_id].index(driver_rank)
- self.driver_worker = Worker(
- model_config=self.model_config,
- parallel_config=self.parallel_config,
- scheduler_config=self.scheduler_config,
- device_config=self.device_config,
- cache_config=self.cache_config,
- local_rank=driver_local_rank,
- rank=driver_rank,
- distributed_init_method=distributed_init_method,
- lora_config=self.lora_config,
- vision_language_config=self.vision_language_config,
- is_driver_worker=True,
- )
+ self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs)
self._run_workers("init_device")
self._run_workers(
@@ -197,7 +185,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
max_parallel_loading_workers,
)
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks.
This invokes `determine_num_available_blocks` on each worker and takes
@@ -205,7 +193,7 @@ def determine_num_available_blocks(self) -> tuple[int, int]:
compatible with all workers.
Returns:
- - tuple[num_gpu_blocks, num_cpu_blocks]
+ - Tuple[num_gpu_blocks, num_cpu_blocks]
"""
# Get the maximum number of blocks that can be allocated on GPU and CPU.
num_blocks = self._run_workers("determine_num_available_blocks", )
@@ -240,7 +228,8 @@ def execute_model(self,
seq_group_metadata_list: List[SequenceGroupMetadata],
blocks_to_swap_in: Dict[int, int],
blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
+ blocks_to_copy: Dict[int, List[int]],
+ num_lookahead_slots: int = 0) -> SamplerOutput:
all_outputs = self._run_workers(
"execute_model",
driver_kwargs={
@@ -269,45 +258,70 @@ def remove_lora(self, lora_id: int) -> bool:
lora_id=lora_id,
)
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
return self._run_workers("list_loras")
def _run_workers(
self,
method: str,
*args,
- driver_args: Optional[List[Any]] = None,
+ driver_args: Optional[Tuple[Any, ...]] = None,
driver_kwargs: Optional[Dict[str, Any]] = None,
+ all_args: Optional[List[Tuple[Any, ...]]] = None,
+ all_kwargs: Optional[List[Dict[str, Any]]] = None,
+ use_dummy_driver: bool = False,
max_concurrent_workers: Optional[int] = None,
use_ray_compiled_dag: bool = False,
**kwargs,
) -> Any:
- """Runs the given method on all workers."""
+ """Runs the given method on all workers. Can be used in the following
+ ways:
+
+ - args/kwargs: All workers share the same args/kwargs
+ - args/kwargs and driver_args/driver_kwargs: Driver worker has
+ different args
+ - all_args/all_kwargs: args/kwargs for each worker are specified
+ individually
+ """
if max_concurrent_workers:
raise NotImplementedError(
"max_concurrent_workers is not supported yet.")
+ if driver_args is None:
+ driver_args = args if all_args is None else all_args[0]
+ if driver_kwargs is None:
+ driver_kwargs = kwargs if all_kwargs is None else all_kwargs[0]
+
+ count = len(self.workers)
+ all_worker_args = repeat(args, count) if all_args is None \
+ else islice(all_args, 1, None)
+ all_worker_kwargs = repeat(kwargs, count) if all_kwargs is None \
+ else islice(all_kwargs, 1, None)
+
if use_ray_compiled_dag:
# Right now, compiled DAG can only accept a single
# input. TODO(sang): Fix it.
+ assert self.forward_dag is not None
output_channels = self.forward_dag.execute(1)
else:
# Start the ray workers first.
ray_worker_outputs = [
- worker.execute_method.remote(method, *args, **kwargs)
- for worker in self.workers
+ worker.execute_method.remote(method, *worker_args,
+ **worker_kwargs)
+ for (worker, worker_args, worker_kwargs
+ ) in zip(self.workers, all_worker_args, all_worker_kwargs)
]
- if driver_args is None:
- driver_args = args
- if driver_kwargs is None:
- driver_kwargs = kwargs
-
# Start the driver worker after all the ray workers.
- driver_worker_output = getattr(self.driver_worker,
- method)(*driver_args, **driver_kwargs)
-
+ if not use_dummy_driver:
+ driver_worker_output = self.driver_worker.execute_method(
+ method, *driver_args, **driver_kwargs)
+ else:
+ assert self.driver_dummy_worker is not None
+ driver_worker_output = ray.get(
+ self.driver_dummy_worker.execute_method.remote(
+ method, *driver_args, **driver_kwargs))
# Get the results of the ray workers.
if self.workers:
if use_ray_compiled_dag:
@@ -340,8 +354,9 @@ def _compiled_ray_dag(self):
# a dummy value for now. It will be fixed soon.
with InputNode() as input_data:
forward_dag = MultiOutputNode([
- worker.execute_model_compiled_dag_remote.bind(input_data)
- for worker in self.workers
+ worker.execute_model_compiled_dag_remote.
+ bind( # type: ignore[attr-defined]
+ input_data) for worker in self.workers
])
return forward_dag.experimental_compile()
@@ -365,11 +380,15 @@ def _check_if_any_actor_is_dead(self):
class RayGPUExecutorAsync(RayGPUExecutor, ExecutorAsyncBase):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.driver_executor = make_async(self.driver_worker.execute_method)
+
async def _run_workers_async(
self,
method: str,
*args,
- driver_args: Optional[List[Any]] = None,
+ driver_args: Optional[Tuple[Any, ...]] = None,
driver_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> Any:
@@ -381,9 +400,8 @@ async def _run_workers_async(
if driver_kwargs is None:
driver_kwargs = kwargs
- # Run the driver worker asynchronously.
- driver_executor = make_async(getattr(self.driver_worker, method))
- coros.append(driver_executor(*driver_args, **driver_kwargs))
+ coros.append(
+ self.driver_executor(method, *driver_args, **driver_kwargs))
# Run the ray workers asynchronously.
for worker in self.workers:
@@ -411,7 +429,3 @@ async def execute_model_async(
# Only the driver worker returns the sampling results.
output = all_outputs[0]
return output
-
- async def check_health_async(self) -> None:
- """Raises an error if engine is unhealthy."""
- self._check_if_any_actor_is_dead()
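The new `_run_workers` splits arguments the same way for every calling convention: the driver consumes index 0 of `all_args`/`all_kwargs` and the Ray workers get the remainder via `islice`, while the shared-args path hands everyone the same tuple via `repeat`. The splitting logic in isolation, with worker execution mocked out:

```python
from itertools import islice, repeat
from typing import Any, List, Optional, Tuple

def split_worker_args(
    num_workers: int,
    args: Tuple[Any, ...] = (),
    all_args: Optional[List[Tuple[Any, ...]]] = None,
) -> Tuple[Tuple[Any, ...], List[Tuple[Any, ...]]]:
    """Return (driver_args, per_worker_args) the way _run_workers slices them."""
    driver_args = args if all_args is None else all_args[0]
    per_worker_args = list(repeat(args, num_workers)) if all_args is None \
        else list(islice(all_args, 1, None))
    return driver_args, per_worker_args

# Shared args: the driver and every worker see the same tuple.
print(split_worker_args(2, args=("ping",)))
# Per-worker args: index 0 goes to the driver, the rest to the workers.
print(split_worker_args(2, all_args=[("rank0",), ("rank1",), ("rank2",)]))
```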
diff --git a/vllm/engine/ray_utils.py b/vllm/executor/ray_utils.py
similarity index 76%
rename from vllm/engine/ray_utils.py
rename to vllm/executor/ray_utils.py
index 70d5c9b1fae05..febae42b84549 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/executor/ray_utils.py
@@ -3,47 +3,26 @@
from vllm.config import ParallelConfig
from vllm.logger import init_logger
-from vllm.utils import get_ip, is_hip, set_cuda_visible_devices
+from vllm.utils import get_ip, is_hip
+from vllm.worker.worker_base import WorkerWrapperBase
logger = init_logger(__name__)
try:
import ray
- class RayWorkerVllm:
+ class RayWorkerWrapper(WorkerWrapperBase):
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
- def __init__(self, init_cached_hf_modules=False) -> None:
- if init_cached_hf_modules:
- from transformers.dynamic_module_utils import init_hf_modules
- init_hf_modules()
- self.worker = None
+ def __init__(self, *args, **kwargs) -> None:
+ super().__init__(*args, **kwargs)
# Since the compiled DAG runs a main execution
# in a different thread that calls cuda.set_device.
# The flag indicates whether set_device is called on
# that thread.
self.compiled_dag_cuda_device_set = False
- def init_worker(self, worker_init_fn):
- self.worker = worker_init_fn()
-
- def __getattr__(self, name):
- return getattr(self.worker, name)
-
- def execute_method(self, method, *args, **kwargs):
- try:
- executor = getattr(self, method)
- return executor(*args, **kwargs)
- except Exception as e:
- # exceptions in ray worker may cause deadlock
- # see https://github.com/vllm-project/vllm/issues/3455
- # print the error and inform the user to solve the error
- msg = (f"Error executing method {method}. "
- "This might cause deadlock in distributed execution.")
- logger.exception(msg)
- raise e
-
def get_node_ip(self) -> str:
return get_ip()
@@ -52,9 +31,6 @@ def get_node_and_gpu_ids(self) -> Tuple[str, List[int]]:
gpu_ids = ray.get_gpu_ids()
return node_id, gpu_ids
- def set_cuda_visible_devices(self, device_ids) -> None:
- set_cuda_visible_devices(device_ids)
-
def execute_model_compiled_dag_remote(self, ignored):
"""Used only when compiled DAG is enabled."""
import torch
@@ -70,8 +46,8 @@ def execute_model_compiled_dag_remote(self, ignored):
logger.warning(f"Failed to import Ray with {e!r}. "
"For distributed inference, please install Ray with "
"`pip install ray`.")
- ray = None
- RayWorkerVllm = None
+ ray = None # type: ignore
+ RayWorkerWrapper = None # type: ignore
def initialize_ray_cluster(
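`RayWorkerWrapper` now defers to `WorkerWrapperBase`, which takes a module and class name and only imports and instantiates the worker later, after Ray has set `CUDA_VISIBLE_DEVICES` for the actor. A rough standalone sketch of that lazy-construction idea (this is not the actual `WorkerWrapperBase` API, and the demo uses a stdlib class so it runs anywhere):

```python
import importlib
from typing import Any

class LazyWorkerWrapper:
    """Illustrative only: defer importing the worker class until init_worker()."""

    def __init__(self, worker_module_name: str, worker_class_name: str) -> None:
        self.worker_module_name = worker_module_name
        self.worker_class_name = worker_class_name
        self.worker: Any = None

    def init_worker(self, *args: Any, **kwargs: Any) -> None:
        # The heavy import happens here, after environment variables such as
        # CUDA_VISIBLE_DEVICES have been set for this process.
        module = importlib.import_module(self.worker_module_name)
        worker_cls = getattr(module, self.worker_class_name)
        self.worker = worker_cls(*args, **kwargs)

    def execute_method(self, method: str, *args: Any, **kwargs: Any) -> Any:
        return getattr(self.worker, method)(*args, **kwargs)

wrapper = LazyWorkerWrapper("collections", "Counter")
wrapper.init_worker("banana")
print(wrapper.execute_method("most_common", 1))   # [('a', 3)]
```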
diff --git a/vllm/logger.py b/vllm/logger.py
index af9575085ef37..341fc473585d7 100644
--- a/vllm/logger.py
+++ b/vllm/logger.py
@@ -1,9 +1,11 @@
# Adapted from
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
+import datetime
import logging
import os
import sys
+from functools import partial
from typing import Optional
VLLM_CONFIGURE_LOGGING = int(os.getenv("VLLM_CONFIGURE_LOGGING", "1"))
@@ -65,3 +67,67 @@ def init_logger(name: str):
logger.addHandler(_default_handler)
logger.propagate = False
return logger
+
+
+logger = init_logger(__name__)
+
+
+def _trace_calls(log_path, root_dir, frame, event, arg=None):
+ if event in ['call', 'return']:
+ # Extract the filename, line number, function name, and the code object
+ filename = frame.f_code.co_filename
+ lineno = frame.f_lineno
+ func_name = frame.f_code.co_name
+ if not filename.startswith(root_dir):
+ # only log the functions in the vllm root_dir
+ return
+ # Log every function call or return
+ try:
+ last_frame = frame.f_back
+ if last_frame is not None:
+ last_filename = last_frame.f_code.co_filename
+ last_lineno = last_frame.f_lineno
+ last_func_name = last_frame.f_code.co_name
+ else:
+ # initial frame
+ last_filename = ""
+ last_lineno = 0
+ last_func_name = ""
+ with open(log_path, 'a') as f:
+ if event == 'call':
+ f.write(f"{datetime.datetime.now()} Call to"
+ f" {func_name} in {filename}:{lineno}"
+ f" from {last_func_name} in {last_filename}:"
+ f"{last_lineno}\n")
+ else:
+ f.write(f"{datetime.datetime.now()} Return from"
+ f" {func_name} in {filename}:{lineno}"
+ f" to {last_func_name} in {last_filename}:"
+ f"{last_lineno}\n")
+ except NameError:
+ # modules are deleted during shutdown
+ pass
+ return partial(_trace_calls, log_path, root_dir)
+
+
+def enable_trace_function_call(log_file_path: str,
+ root_dir: Optional[str] = None):
+ """
+ Enable tracing of every function call in code under `root_dir`.
+ This is useful for debugging hangs or crashes.
+ `log_file_path` is the path to the log file.
+ `root_dir` is the root directory of the code to trace. If None, it is the
+ vllm root directory.
+
+ Note that tracing is enabled per thread: only threads that call this
+ function have the trace enabled; other threads are unaffected.
+ """
+ logger.warning(
+ "VLLM_TRACE_FUNCTION is enabled. It will record every"
+ " function executed by Python. This will slow down the code. It "
+ "is suggested to be used for debugging hang or crashes only.")
+ logger.info(f"Trace frame log is saved to {log_file_path}")
+ if root_dir is None:
+ # by default, this is the vllm root directory
+ root_dir = os.path.dirname(os.path.dirname(__file__))
+ sys.settrace(partial(_trace_calls, log_file_path, root_dir))
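`VLLM_TRACE_FUNCTION` hooks `sys.settrace` to log every call and return under the vllm tree, which is why the Ray executor now forwards that variable to its workers. The same mechanism in miniature, as a self-contained script (the traced function and the in-memory log are made up for the example):

```python
import sys
from datetime import datetime
from functools import partial

def _trace(log_lines, frame, event, arg=None):
    # Record call/return events for functions defined in this file only.
    if event in ("call", "return") and frame.f_code.co_filename == __file__:
        log_lines.append(f"{datetime.now()} {event} {frame.f_code.co_name} "
                         f"at line {frame.f_lineno}")
    return partial(_trace, log_lines)

def traced_workload(x: int) -> int:
    return x + 1

lines = []
sys.settrace(partial(_trace, lines))   # enables tracing for this thread only
traced_workload(41)
sys.settrace(None)                     # always disable when done
print("\n".join(lines))
```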
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 4b9653de73a88..98e74168002c4 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -32,14 +32,17 @@
def _get_lora_device(base_layer: nn.Module) -> torch.device:
# code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
"""Returns the device for where to place the LoRA tensors."""
+ # UnquantizedLinear
if hasattr(base_layer, "weight"):
return base_layer.weight.device
- if hasattr(base_layer, "linear_weights") and isinstance(
- base_layer.linear_weights, dict):
- values = list(base_layer.linear_weights.values())
- if len(values) and isinstance(values[0], torch.Tensor):
- return values[0].device
- raise ValueError(f"Unsupported base layer: {base_layer}")
+ # GPTQ/AWQ/SqueezeLLM
+ elif hasattr(base_layer, "qweight"):
+ return base_layer.qweight.device
+ # marlin
+ elif hasattr(base_layer, "B"):
+ return base_layer.B.device
+ else:
+ raise ValueError(f"Unsupported base layer: {base_layer}")
def _apply_lora(
@@ -173,6 +176,8 @@ class VocabParallelEmbeddingWithLoRA(BaseLayerWithLoRA):
def __init__(self, base_layer: VocabParallelEmbedding) -> None:
super().__init__()
self.base_layer = base_layer
+ self.embeddings_slice: Optional[Tuple[int, int]]
+ self.embeddings_weights: Optional[torch.Tensor]
def create_lora_weights(
self,
@@ -230,9 +235,10 @@ def create_lora_weights(
self.lora_a_stacked.shape[0] * self.lora_a_stacked.shape[1],
self.lora_a_stacked.shape[2],
)
- self.indices: Optional[torch.Tensor] = None
- self.indices_len: Optional[List[int]] = None
- self.embeddings_indices = None
+ # Lazily initialized.
+ self.indices: torch.Tensor
+ self.indices_len: List[int]
+ self.embeddings_indices: torch.Tensor
def reset_lora(self, index: int):
self.lora_a_stacked[index] = 0
@@ -264,6 +270,7 @@ def set_lora(
self.embeddings_tensors.shape[1],
self.embeddings_tensors.shape[2]
)[self.embeddings_slice[0]:self.embeddings_slice[1]]
+ assert self.embeddings_weights is not None
self.embeddings_weights[:embeddings.shape[0]].copy_(embeddings)
def set_mapping(
@@ -340,11 +347,12 @@ def create_lora_weights(
dtype=lora_config.lora_dtype,
device=self.device,
)
-
- self.indices: Optional[torch.Tensor] = None
- self.indices_len: Optional[List[int]] = None
self.output_dim = self.lora_b_stacked.shape[2]
+ # lazily initialized.
+ self.indices: torch.Tensor
+ self.indices_len: List[int]
+
def reset_lora(self, index: int):
self.lora_a_stacked[index] = 0
self.lora_b_stacked[index] = 0
@@ -472,8 +480,9 @@ def create_lora_weights(
device=self.device,
) for _ in range(n_slices))
- self.indices: Optional[torch.Tensor] = None
self.output_dim = self.lora_b_stacked[0].shape[2]
+ # Lazily initialized.
+ self.indices: torch.Tensor
def reset_lora(self, index: int):
self.lora_a_stacked[0][index] = 0
@@ -687,7 +696,8 @@ def create_lora_weights(
self.kv_proj_shard_size)
self.packed_indices: Optional[torch.Tensor] = None
self.standard_indices: Optional[torch.Tensor] = None
- self.indices_len: Optional[List[int]] = None
+ # lazily initialized.
+ self.indices_len: List[int]
def reset_lora(self, index: int):
self.lora_a_stacked[0][index] = 0
@@ -811,8 +821,9 @@ def create_lora_weights(
dtype=lora_config.lora_dtype,
device=self.device,
)
- self.indices: Optional[torch.Tensor] = None
- self.indices_len: Optional[List[int]] = None
+ # Lazily initialized
+ self.indices: torch.Tensor
+ self.indices_len: List[int]
def reset_lora(self, index: int):
self.lora_a_stacked[index] = 0
@@ -988,9 +999,10 @@ def create_lora_weights(
dtype=self.dtype,
device=self.device,
)
- self.indices = None
- self.indices_padded = None
- self.indices_len = None
+ # Lazily initialized.
+ self.indices: torch.Tensor
+ self.indices_len: List[int]
+ self.indices_padded: torch.Tensor
def reset_lora(self, index: int):
self.lora_a_stacked[index] = 0
diff --git a/vllm/lora/lora.py b/vllm/lora/lora.py
index 21c2196eb2739..d7794aa7cd35c 100644
--- a/vllm/lora/lora.py
+++ b/vllm/lora/lora.py
@@ -33,7 +33,7 @@ def __init__(
def optimize(self) -> "LoRALayerWeights":
"""Optimize the LoRA by merging the scaling into lora_b."""
if self.scaling == 1:
- return
+ return self
self.lora_b *= self.scaling
self.scaling = 1
return self
@@ -97,9 +97,9 @@ def __init__(
self,
module_name: str,
rank: int,
- lora_alphas: List[int],
- lora_a: List[torch.Tensor],
- lora_b: List[torch.Tensor],
+ lora_alphas: List[Optional[int]],
+ lora_a: List[Optional[torch.Tensor]],
+ lora_b: List[Optional[torch.Tensor]],
scaling: Optional[List[float]] = None,
) -> None:
super().__init__(
@@ -108,17 +108,20 @@ def __init__(
lora_alpha=0,
lora_a=lora_a,
lora_b=lora_b,
- scaling=scaling,
+ scaling=scaling, # type: ignore
embeddings_tensor=None,
)
self.lora_alphas = lora_alphas
if scaling is None:
- self.scaling = [
- lora_alpha / self.rank for lora_alpha in self.lora_alphas
+ self.scaling = [ # type: ignore
+ lora_alpha / self.rank # type: ignore # noqa
+ for lora_alpha in self.lora_alphas
]
@classmethod
- def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights":
+ def pack(
+ cls, loras: List[Optional["LoRALayerWeights"]]
+ ) -> "PackedLoRALayerWeights":
"""Pack a list of LoRAs into a single LoRA.
If LoRA is None, it signifies that the submodule does not have a LoRA.
@@ -136,16 +139,19 @@ def pack(cls, loras: List["LoRALayerWeights"]) -> "PackedLoRALayerWeights":
[lora.lora_alpha if lora is not None else None for lora in loras],
[lora.lora_a if lora is not None else None for lora in loras],
[lora.lora_b if lora is not None else None for lora in loras],
- scaling=[1 if lora is not None else None for lora in loras])
+ scaling=[
+ 1 if lora is not None else None # type: ignore
+ for lora in loras
+ ])
return obj
def optimize(self) -> "PackedLoRALayerWeights":
"""Optimize the LoRA by merging the scaling into lora_b."""
for i in range(len(self.lora_b)):
- if self.scaling[i] == 1 or self.lora_b[i] is None:
+ if self.scaling[i] == 1 or self.lora_b[i] is None: # type: ignore
continue
- self.lora_b[i] *= self.scaling[i]
- self.scaling[i] = 1
+ self.lora_b[i] *= self.scaling[i] # type: ignore
+ self.scaling[i] = 1 # type: ignore
return self
@property
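Besides the type annotations, `optimize()` now returns `self` on the early-exit path too, so chained calls behave the same whether or not scaling was already folded in. The optimization itself multiplies `scaling` into `lora_b` once so the forward pass can skip the runtime multiply; a quick numeric check of that identity (shapes are arbitrary, torch assumed available as elsewhere in vLLM):

```python
import torch

rank, in_dim, out_dim, scaling = 4, 8, 6, 0.5
lora_a = torch.randn(in_dim, rank)
lora_b = torch.randn(rank, out_dim)
x = torch.randn(2, in_dim)

before = (x @ lora_a @ lora_b) * scaling   # scaling applied every forward pass
lora_b_folded = lora_b * scaling           # what optimize() does once
after = x @ lora_a @ lora_b_folded         # scaling is now effectively 1

print(torch.allclose(before, after, atol=1e-6))   # True
```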
diff --git a/vllm/lora/models.py b/vllm/lora/models.py
index 62f1502458008..c249497a4d893 100644
--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -3,7 +3,7 @@
import math
import os
import re
-from typing import Callable, Dict, Hashable, List, Optional, Tuple, Type
+from typing import Callable, Dict, List, Optional, Tuple, Type
import safetensors.torch
import torch
@@ -53,44 +53,46 @@ def convert_mapping(
embeddings.
indices_len: List of lengths of the above tensors.
"""
- indices = list(mapping.index_mapping).copy()
- embedding_indices = indices.copy()
- lora_indices = indices.copy()
- prompt_mapping = [
+ index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
+ embedding_indices = index_mapping_indices.copy()
+ lora_indices = index_mapping_indices.copy()
+ prompt_mapping: List[int] = [
lora_index_to_id.index(x) if x > 0 else -1
for x in mapping.prompt_mapping
]
lora_idx = None
- for i in range(len(indices)):
+ for i in range(len(index_mapping_indices)):
# TODO index can be slow. optimize
- lora_idx = (lora_index_to_id.index(indices[i])
- if indices[i] > 0 else -1)
- embedding_indices[i] = lora_idx if indices[i] > 0 else 0
- indices[i] = i
+ lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
+ if index_mapping_indices[i] > 0 else -1)
+ embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
+ index_mapping_indices[i] = i
lora_indices[i] = lora_idx
- indices = torch.tensor([indices, lora_indices, embedding_indices],
- dtype=torch.long,
- device="cuda")
- prompt_mapping = torch.tensor(prompt_mapping,
- device="cuda",
- dtype=torch.long)
+ indices = torch.tensor(
+ [index_mapping_indices, lora_indices, embedding_indices],
+ dtype=torch.long,
+ device="cuda")
+ prompt_mapping_tensor = torch.tensor(prompt_mapping,
+ device="cuda",
+ dtype=torch.long)
embeddings_indices = torch.stack([
indices[2] * extra_vocab_size,
indices[2] * (vocab_size + extra_vocab_size)
])
embeddings_indices[embeddings_indices == -1] = max_loras - 1
base_indices = indices[1]
- sampler_indices = prompt_mapping
+ sampler_indices = prompt_mapping_tensor
sampler_indices_padded = sampler_indices.clone()
sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
sampler_indices_padded = (
torch.arange(
0, len(sampler_indices_padded), device="cuda", dtype=torch.long) +
(sampler_indices_padded * len(sampler_indices_padded)))
- indices_len = (base_indices.shape[-1], sampler_indices.shape[-1],
- sampler_indices_padded.shape[-1],
- embeddings_indices.shape[-1])
+ indices_len = [
+ base_indices.shape[-1], sampler_indices.shape[-1],
+ sampler_indices_padded.shape[-1], embeddings_indices.shape[-1]
+ ]
return (base_indices, sampler_indices, sampler_indices_padded,
embeddings_indices, indices_len)
@@ -149,6 +151,7 @@ def from_lora_tensors(
if module_name not in loras:
lora_embeddings_tensor = None
if embeddings:
+ assert embedding_modules is not None
embeddings_module = next(
(k for k in embedding_modules if k in module_name),
None)
@@ -171,6 +174,7 @@ def from_lora_tensors(
else:
loras[module_name].lora_b = tensor.to(device=device,
dtype=dtype).t()
+ assert embedding_padding_modules is not None
if any(name in module_name
for name in embedding_padding_modules
) and target_embedding_padding is not None:
@@ -212,7 +216,9 @@ def from_local_checkpoint(
target_modules = config["target_modules"]
unexpected_modules = []
for module in target_modules:
- if module not in expected_lora_modules:
+ # Be compatible with fully qualified module names, e.g. layers.11.self_attn.k_proj
+ part_name = module.split(".")[-1]
+ if part_name not in expected_lora_modules:
unexpected_modules.append(module)
# loaded lora's target modules must be a subset of expected_lora_modules
if unexpected_modules:
@@ -293,11 +299,10 @@ def __init__(
self.max_num_batched_tokens,
dtype=torch.long,
device="cuda")
- self.offsets = []
# 4 is the number of indices tensors defined above
# base_indices, sampler_indices, sampler_indices_padded,
# embeddings_indices
- self.indices_len = [None] * 4
+ self.indices_len: List[Optional[int]] = [None] * 4
self.model: nn.Module = model
if hasattr(self.model, "supported_lora_modules"):
@@ -310,7 +315,7 @@ def __init__(
self._registered_loras: Dict[int, LoRAModel] = {}
# Dict instead of a Set for compatibility with LRUCache.
self._active_loras: Dict[int, None] = {}
- self._last_mapping = None
+ self._last_mapping: Optional[LoRAMapping] = None
self._create_lora_modules()
self.model.lora_manager = self
@@ -368,7 +373,7 @@ def deactivate_lora(self, lora_id: int) -> bool:
return True
return False
- def _add_lora(self, lora: LoRAModel) -> bool:
+ def _add_lora(self, lora: LoRAModel):
self._create_merged_loras_inplace(lora)
self._registered_loras[lora.id] = lora
@@ -416,7 +421,7 @@ def list_loras(self) -> Dict[int, LoRAModel]:
def get_lora(self, lora_id: int) -> Optional[LoRAModel]:
return self._registered_loras.get(lora_id, None)
- def remove_all_loras(self) -> bool:
+ def remove_all_loras(self):
"""Remove all LoRAModels from the manager."""
self._registered_loras.clear()
self.lora_index_to_id = [None] * self.lora_slots
@@ -465,6 +470,7 @@ def create_dummy_lora(
continue
parts = module_name.split(".")
if module_name not in self.packed_modules:
+ assert embedding_modules is not None
if parts[-1] in embedding_modules:
input_dim = (module.base_layer.org_vocab_size +
self.lora_config.lora_extra_vocab_size if
@@ -498,7 +504,7 @@ def create_dummy_lora(
else:
parts = module_name.split(".")
replacements = self.packed_modules_mapping[parts[-1]]
- subloras = []
+ subloras: List[Optional["LoRALayerWeights"]] = []
for i, r in enumerate(replacements):
lora = LoRALayerWeights.create_dummy_lora_weights(
module_name + "." + r,
@@ -536,7 +542,7 @@ def _register_packed_modules(self, module_full_name: str) -> None:
def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
for module_name, new_module_names in self.packed_modules.items():
- replacement_loras = []
+ replacement_loras: List[Optional[LoRALayerWeights]] = []
has_replacement = False
for r in new_module_names:
lora = lora_model.get_lora(r)
@@ -555,12 +561,12 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None:
class LoRALRUCache(LRUCache[LoRAModel]):
- def __init__(self, capacity: int, deactivate_lora_fn: Callable[[Hashable],
- None]):
+ def __init__(self, capacity: int, deactivate_lora_fn: Callable[[int],
+ bool]):
super().__init__(capacity)
self.deactivate_lora_fn = deactivate_lora_fn
- def _on_remove(self, key: Hashable, value: LoRAModel):
+ def _on_remove(self, key: int, value: LoRAModel):
logger.debug(f"Removing LoRA. int id: {key}")
self.deactivate_lora_fn(key)
return super()._on_remove(key, value)
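`from_local_checkpoint` now compares only the last dot-separated component of each entry in the adapter's `target_modules`, so configs that list fully qualified names such as `layers.11.self_attn.k_proj` are accepted whenever the leaf name is supported. The check in isolation:

```python
from typing import List

def find_unexpected_modules(target_modules: List[str],
                            expected_lora_modules: List[str]) -> List[str]:
    """Return adapter target modules whose leaf name is not supported."""
    unexpected = []
    for module in target_modules:
        part_name = module.split(".")[-1]   # "layers.11.self_attn.k_proj" -> "k_proj"
        if part_name not in expected_lora_modules:
            unexpected.append(module)
    return unexpected

expected = ["q_proj", "k_proj", "v_proj", "o_proj"]
print(find_unexpected_modules(
    ["layers.11.self_attn.k_proj", "q_proj", "mlp.gate_proj"], expected))
# ['mlp.gate_proj'] -- only the leaf name has to match
```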
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index a0868defbd3ca..ec3c10c591a18 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,5 +1,5 @@
from abc import ABC, abstractmethod, abstractproperty
-from typing import Any, Dict, List, Optional, Set, Type
+from typing import Any, Dict, List, Set, Type
import torch
@@ -37,7 +37,7 @@ def create_lora_manager(
...
@abstractmethod
- def set_active_loras(self, lora_requests: List[LoRARequest],
+ def set_active_loras(self, lora_requests: Set[LoRARequest],
lora_mapping: LoRAMapping) -> None:
...
@@ -54,7 +54,7 @@ def remove_lora(self, lora_id: int) -> bool:
...
@abstractmethod
- def remove_all_loras(self) -> bool:
+ def remove_all_loras(self):
...
@abstractmethod
@@ -81,10 +81,11 @@ def __init__(
embedding_padding_modules: List[str],
lora_model_cls: Type[LoRAModel] = LoRAModel,
):
- self._lora_manager: Optional[LoRAModelManager] = None
self._lora_model_cls = lora_model_cls
self.embedding_modules = embedding_modules
self.embedding_padding_modules = embedding_padding_modules
+ # Lazily initialized by create_lora_manager.
+ self._lora_manager: LoRAModelManager
super().__init__(max_num_seqs, max_num_batched_tokens, vocab_size,
lora_config, device)
@@ -104,15 +105,15 @@ def create_lora_manager(
lora_config=self.lora_config,
lora_manager_cls=self._lora_manager_cls,
)
- self._lora_manager: LoRAModelManager = lora_manager
+ self._lora_manager = lora_manager
return lora_manager.model
- def set_active_loras(self, lora_requests: List[LoRARequest],
+ def set_active_loras(self, lora_requests: Set[LoRARequest],
lora_mapping: LoRAMapping) -> None:
self._apply_loras(lora_requests)
self._lora_manager.set_lora_mapping(lora_mapping)
- def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+ def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
loras_that_exist = self.list_loras()
loras_map = {
lora_request.lora_int_id: lora_request
@@ -188,7 +189,7 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
def remove_lora(self, lora_id: int) -> bool:
return self._lora_manager.remove_lora(lora_id)
- def remove_all_loras(self) -> bool:
+ def remove_all_loras(self):
self._lora_manager.remove_all_loras()
def list_loras(self) -> Set[int]:
@@ -217,10 +218,10 @@ def create_lora_manager(
lora_config=self.lora_config,
max_num_batched_tokens=self.max_num_batched_tokens,
)
- self._lora_manager: LRUCacheLoRAModelManager = lora_manager
+ self._lora_manager = lora_manager
return lora_manager.model
- def _apply_loras(self, lora_requests: List[LoRARequest]) -> None:
+ def _apply_loras(self, lora_requests: Set[LoRARequest]) -> None:
loras_map = {
lora_request.lora_int_id: lora_request
for lora_request in lora_requests if lora_request
@@ -237,12 +238,14 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
if lora_request.lora_int_id not in self.list_loras():
# Remove before we load the new lora to save memory
if len(self._lora_manager) + 1 > self._lora_manager.capacity:
+ assert isinstance(self._lora_manager, LRUCacheLoRAModelManager)
self._lora_manager.remove_oldest_lora()
lora = self._load_lora(lora_request)
loaded = self._lora_manager.add_lora(lora)
else:
# If the lora is already loaded, just touch it to
# update its position in the caches
- loaded = self._lora_manager.get_lora(lora_request.lora_int_id)
+ loaded = self._lora_manager.get_lora(
+ lora_request.lora_int_id) is not None
self._lora_manager.activate_lora(lora_request.lora_int_id)
return loaded
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
new file mode 100644
index 0000000000000..0558d6c95d97b
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -0,0 +1,25 @@
+from typing import Optional, Union
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+ CompletionRequest)
+from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (
+ get_lm_format_enforcer_guided_decoding_logits_processor)
+from vllm.model_executor.guided_decoding.outlines_decoding import (
+ get_outlines_guided_decoding_logits_processor)
+from vllm.sampling_params import LogitsProcessor
+
+
+async def get_guided_decoding_logits_processor(
+ guided_decoding_backend: str, request: Union[CompletionRequest,
+ ChatCompletionRequest],
+ tokenizer) -> Optional[LogitsProcessor]:
+ if guided_decoding_backend == 'outlines':
+ return await get_outlines_guided_decoding_logits_processor(
+ request, tokenizer)
+ if guided_decoding_backend == 'lm-format-enforcer':
+ return await get_lm_format_enforcer_guided_decoding_logits_processor(
+ request, tokenizer)
+
+ raise ValueError(
+ f"Unknown guided decoding backend '{guided_decoding_backend}'. "
+        "Must be one of 'outlines', 'lm-format-enforcer'")
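A hypothetical usage sketch of the dispatcher added above. The model name and request fields are stand-ins; `guided_choice` is one of the guidance fields this PR reads off the request, and the call is awaited because both backends build their processors asynchronously.

import asyncio

from transformers import AutoTokenizer

from vllm.entrypoints.openai.protocol import CompletionRequest
from vllm.model_executor.guided_decoding import (
    get_guided_decoding_logits_processor)


async def main():
    # Hypothetical tokenizer/model; any HF tokenizer should work here.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    request = CompletionRequest(model="gpt2",
                                prompt="Answer yes or no:",
                                guided_choice=["yes", "no"])
    # Dispatches to the outlines backend; returns None if the request
    # carries no guidance parameters, raises ValueError for unknown backends.
    processor = await get_guided_decoding_logits_processor(
        "outlines", request, tokenizer)
    print(processor)


asyncio.run(main())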
diff --git a/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
new file mode 100644
index 0000000000000..0d74a5f8e81ff
--- /dev/null
+++ b/vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py
@@ -0,0 +1,69 @@
+from functools import lru_cache
+from json import loads as json_loads
+from typing import Optional, Union
+
+from lmformatenforcer import (CharacterLevelParser, JsonSchemaParser,
+ RegexParser, StringParser,
+ TokenEnforcerTokenizerData, UnionParser)
+from lmformatenforcer.integrations.vllm import (
+ build_vllm_logits_processor, build_vllm_token_enforcer_tokenizer_data)
+from pydantic import BaseModel
+from transformers import PreTrainedTokenizerBase
+
+from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
+ CompletionRequest)
+from vllm.model_executor.guided_decoding.outlines_decoding import (
+ get_outlines_guided_decoding_logits_processor)
+from vllm.sampling_params import LogitsProcessor
+
+
+async def get_lm_format_enforcer_guided_decoding_logits_processor(
+ request: Union[CompletionRequest, ChatCompletionRequest],
+ tokenizer) -> Optional[LogitsProcessor]:
+ """
+ Given an OpenAI-compatible request, check for guided decoding parameters
+ and get the necessary logits processor for the given guide.
+    The token enforcer tokenizer data is cached per tokenizer, so repeated
+    requests with the same tokenizer do not rebuild it.
+ """
+
+ tokenizer_data = _cached_build_vllm_token_enforcer_tokenizer_data(
+ tokenizer)
+ character_level_parser: CharacterLevelParser
+ if request.guided_json:
+ schema = _normalize_json_schema_object(request.guided_json)
+ character_level_parser = JsonSchemaParser(schema)
+ elif request.guided_choice:
+ character_level_parser = UnionParser(
+ [StringParser(choice) for choice in request.guided_choice])
+ elif request.guided_regex:
+ character_level_parser = RegexParser(request.guided_regex)
+ elif request.guided_grammar:
+        # CFG grammar is not supported by LMFE; fall back to outlines
+ return await get_outlines_guided_decoding_logits_processor(
+ request, tokenizer)
+ elif (request.response_format is not None
+ and request.response_format.type == "json_object"):
+ character_level_parser = JsonSchemaParser(
+ None) # None means any json object
+ else:
+ return None
+
+ logits_processor = build_vllm_logits_processor(tokenizer_data,
+ character_level_parser)
+ return logits_processor
+
+
+def _normalize_json_schema_object(schema: Union[str, dict, BaseModel]) -> dict:
+ if isinstance(schema, str):
+ return json_loads(schema)
+ if isinstance(schema, dict):
+ return schema
+ if isinstance(schema, BaseModel):
+ return schema.model_json_schema()
+
+
+@lru_cache
+def _cached_build_vllm_token_enforcer_tokenizer_data(
+ tokenizer: PreTrainedTokenizerBase) -> TokenEnforcerTokenizerData:
+ return build_vllm_token_enforcer_tokenizer_data(tokenizer)
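To illustrate the schema normalization helper above, a small self-contained sketch with a hypothetical pydantic model; the three accepted input forms (JSON string, dict, pydantic model) all collapse to the same plain dict. The helper body is re-stated locally so the snippet runs on its own.

import json

from pydantic import BaseModel


class Answer(BaseModel):  # hypothetical schema used for guided JSON
    city: str
    population: int


def normalize(schema):
    # Same three cases as _normalize_json_schema_object above.
    if isinstance(schema, str):
        return json.loads(schema)
    if isinstance(schema, dict):
        return schema
    if isinstance(schema, BaseModel):
        return schema.model_json_schema()


as_model = Answer(city="Paris", population=2_100_000)
as_dict = Answer.model_json_schema()
as_str = json.dumps(as_dict)
assert normalize(as_model) == normalize(as_dict) == normalize(as_str)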
diff --git a/vllm/model_executor/guided_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
similarity index 92%
rename from vllm/model_executor/guided_decoding.py
rename to vllm/model_executor/guided_decoding/outlines_decoding.py
index 8e710f1ac2b53..53efebb604048 100644
--- a/vllm/model_executor/guided_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -12,9 +12,8 @@
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
CompletionRequest)
-from vllm.model_executor.guided_logits_processors import (CFGLogitsProcessor,
- JSONLogitsProcessor,
- RegexLogitsProcessor)
+from vllm.model_executor.guided_decoding.outlines_logits_processors import (
+ CFGLogitsProcessor, JSONLogitsProcessor, RegexLogitsProcessor)
class GuidedDecodingMode(Enum):
@@ -54,9 +53,9 @@ class GuidedDecodingMode(Enum):
global_thread_pool = None # used for generating logits processor fsm
-async def get_guided_decoding_logits_processor(
+async def get_outlines_guided_decoding_logits_processor(
request: Union[CompletionRequest, ChatCompletionRequest],
- tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor]:
+ tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
"""
Given an OpenAI-compatible request, check for guided decoding parameters
and get the necessary logits processor for the given guide.
@@ -85,7 +84,7 @@ async def get_guided_decoding_logits_processor(
def _get_guide_and_mode(
request: Union[CompletionRequest, ChatCompletionRequest]
-) -> Tuple[str, GuidedDecodingMode]:
+) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]:
if request.guided_json:
json = request.guided_json
diff --git a/vllm/model_executor/guided_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
similarity index 67%
rename from vllm/model_executor/guided_logits_processors.py
rename to vllm/model_executor/guided_decoding/outlines_logits_processors.py
index 035fe00037328..25ab5bf8b6a9c 100644
--- a/vllm/model_executor/guided_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -13,13 +13,15 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import copy
import json
import math
from collections import defaultdict
+from functools import lru_cache
from typing import Callable, DefaultDict, Dict, List, Optional, Union
import torch
-from outlines.fsm.fsm import CFGFSM, RegexFSM
+from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
from outlines.fsm.json_schema import build_regex_from_schema
from pydantic import BaseModel
from transformers import PreTrainedTokenizerBase
@@ -27,49 +29,9 @@
class BaseLogitsProcessor:
- def adapt_tokenizer(self, tokenizer: PreTrainedTokenizerBase):
- """Adapt vLLM's tokenizer to use to compile the FSM.
-
- The API of Outlines tokenizers is slightly different to that of
- `transformers`. The decoder of outlines, returns a list whereas
- the decode of vLLM returns an str. To sync the vLLM decoder with
- outlines internal api, the decoder should be adapted. In addition
- we need to handle the missing spaces to Llama's tokenizer to be
- able to compile FSMs for this model.
-
- """
- if getattr(tokenizer, "_outlines_adapted", False):
- return tokenizer
-
- tokenizer.vocabulary = tokenizer.get_vocab()
- tokenizer.special_tokens = set(tokenizer.all_special_tokens)
-
- def convert_token_to_string(token: str) -> str:
- from transformers.file_utils import SPIECE_UNDERLINE
-
- string = tokenizer.convert_tokens_to_string([token])
-
- # A hack to handle missing spaces to HF's Llama tokenizers
- if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
- return " " + string
-
- return string
-
- def change_decoder(
- decoder: Callable[[List[int]], str]
- ) -> Callable[[List[int]], List[str]]:
- """Sync vLLM's decoder with the outlines by returning list."""
-
- def new_decoder(inp_tokens: List[int]) -> List[str]:
- return [decoder(inp_tokens)]
-
- return new_decoder
-
- tokenizer.convert_token_to_string = convert_token_to_string
- tokenizer.decode = change_decoder(tokenizer.decode)
- setattr(tokenizer, "_outlines_adapted", True) # noqa: B010
-
- return tokenizer
+ def __init__(self):
+        # Child classes should initialize self.fsm in their __init__.
+ self.fsm: FSM
def init_state(self):
"""Initialize the FSM states."""
@@ -78,7 +40,6 @@ def init_state(self):
def __call__(self, input_ids: List[int],
scores: torch.Tensor) -> torch.Tensor:
"""Use the FSM to bias the logits before sampling the next token."""
-
seq_id = hash(tuple(input_ids))
if len(input_ids) == 0:
@@ -96,7 +57,6 @@ def __call__(self, input_ids: List[int],
device=scores.device)
mask[allowed_tokens] = 0
scores.add_(mask)
-
return scores
@@ -113,7 +73,7 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
The model's tokenizer
"""
- tokenizer = self.adapt_tokenizer(tokenizer)
+ tokenizer = _adapt_tokenizer(tokenizer)
fsm = RegexFSM(regex_string, tokenizer)
self.fsm = fsm
@@ -167,6 +127,59 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
The model's tokenizer
"""
- tokenizer = self.adapt_tokenizer(tokenizer)
+ tokenizer = _adapt_tokenizer(tokenizer)
fsm = CFGFSM(cfg, tokenizer)
self.fsm = fsm
+
+ def init_state(self):
+ """Initialize state with a CFGFSM copy."""
+ super().init_state()
+ self.fsm = self.fsm.copy()
+
+
+@lru_cache
+def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
+    """Adapt vLLM's tokenizer so it can be used to compile the FSM.
+
+    The API of Outlines tokenizers is slightly different from that of
+    `transformers`: the Outlines decoder returns a list, whereas vLLM's
+    decode returns a str. To keep the vLLM decoder in sync with the
+    Outlines internal API, the decoder is adapted here. In addition, we
+    handle the missing spaces in Llama's tokenizer so that FSMs can be
+    compiled for this model.
+    """
+ if getattr(tokenizer, "_outlines_adapted", False):
+ return tokenizer
+
+ tokenizer = copy.deepcopy(tokenizer)
+
+ tokenizer.vocabulary = tokenizer.get_vocab()
+ tokenizer.special_tokens = set(tokenizer.all_special_tokens)
+
+ def convert_token_to_string(token: str) -> str:
+ from transformers.file_utils import SPIECE_UNDERLINE
+
+ string = tokenizer.convert_tokens_to_string([token])
+
+        # A hack to handle missing spaces in HF's Llama tokenizers
+ if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
+ return " " + string
+
+ return string
+
+ def change_decoder(
+ decoder: Callable[[List[int]],
+ str]) -> Callable[[List[int]], List[str]]:
+        """Sync vLLM's decoder with outlines by returning a list."""
+
+ def new_decoder(inp_tokens: List[int]) -> List[str]:
+ return [decoder(inp_tokens)]
+
+ return new_decoder
+
+ tokenizer.convert_token_to_string = convert_token_to_string
+ tokenizer.decode = change_decoder(tokenizer.decode)
+ setattr(tokenizer, "_outlines_adapted", True) # noqa: B010
+
+ return tokenizer
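For readers unfamiliar with outlines-style guided decoding, a minimal, library-free sketch of the masking pattern that BaseLogitsProcessor.__call__ applies above: allowed token ids get a 0 additive mask, everything else gets -inf before sampling. The FSM and the allowed-token set here are hypothetical stand-ins.

import math

import torch

vocab_size = 8
scores = torch.randn(vocab_size)

# Hypothetical set of token ids the (stand-in) FSM allows in this state.
allowed_tokens = [1, 4, 5]

mask = torch.full((vocab_size, ), -math.inf)
mask[allowed_tokens] = 0
scores = scores + mask

# Only the allowed token ids keep finite scores after masking.
print(torch.isfinite(scores).nonzero(as_tuple=True)[0].tolist())  # [1, 4, 5]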
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
new file mode 100644
index 0000000000000..2ad07bf79a25c
--- /dev/null
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -0,0 +1,146 @@
+{
+ "1": {
+ "BLOCK_SIZE_M": 16,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "2": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "4": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 64,
+ "BLOCK_SIZE_K": 64,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "8": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "16": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "24": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "32": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "48": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "64": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "96": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 32,
+ "BLOCK_SIZE_K": 256,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 4,
+ "num_stages": 4
+ },
+ "128": {
+ "BLOCK_SIZE_M": 64,
+ "BLOCK_SIZE_N": 128,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "256": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 1,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "512": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 16,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1024": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "1536": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "2048": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "3072": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 32,
+ "num_warps": 8,
+ "num_stages": 4
+ },
+ "4096": {
+ "BLOCK_SIZE_M": 128,
+ "BLOCK_SIZE_N": 256,
+ "BLOCK_SIZE_K": 128,
+ "GROUP_SIZE_M": 64,
+ "num_warps": 8,
+ "num_stages": 4
+ }
+}
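For reference, the integer keys in the tuning file above are the token counts the kernel configuration was benchmarked for, and the file name itself comes from get_config_file_name (changed later in this diff). A small sketch of how that name is composed, with the device name hard-coded so it runs without a GPU.

from typing import Optional


def config_file_name(E: int, N: int, dtype: Optional[str],
                     device_name: str = "NVIDIA_H100_80GB_HBM3") -> str:
    # Mirrors get_config_file_name below, but takes the device name as an
    # argument instead of querying torch.cuda.
    dtype_selector = "" if not dtype else f",dtype={dtype}"
    return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"


# Produces the file name of the JSON added above.
print(config_file_name(8, 7168, "float8"))
# E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json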
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 377b6588dbf47..ac7c30e2a9727 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -21,6 +21,8 @@ def fused_moe_kernel(
a_ptr,
b_ptr,
c_ptr,
+ a_scale_ptr,
+ b_scale_ptr,
topk_weights_ptr,
sorted_token_ids_ptr,
expert_ids_ptr,
@@ -49,6 +51,7 @@ def fused_moe_kernel(
MUL_ROUTED_WEIGHT: tl.constexpr,
top_k: tl.constexpr,
compute_type: tl.constexpr,
+ use_fp8: tl.constexpr,
):
"""
Implements the fused computation for a Mixture of Experts (MOE) using
@@ -111,6 +114,10 @@ def fused_moe_kernel(
b_ptrs = b_ptr + off_experts * stride_be + (offs_k[:, None] * stride_bk +
offs_bn[None, :] * stride_bn)
+ if use_fp8:
+ a_scale = tl.load(a_scale_ptr)
+ b_scale = tl.load(b_scale_ptr + off_experts)
+
# -----------------------------------------------------------
# Iterate to compute a block of the C matrix.
# We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
@@ -129,7 +136,10 @@ def fused_moe_kernel(
mask=offs_k[:, None] < K - k * BLOCK_SIZE_K,
other=0.0)
# We accumulate along the K dimension.
- accumulator += tl.dot(a, b)
+ if use_fp8:
+ accumulator = tl.dot(a, b, acc=accumulator)
+ else:
+ accumulator += tl.dot(a, b)
# Advance the ptrs to the next K block.
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
@@ -140,7 +150,10 @@ def fused_moe_kernel(
other=0)
accumulator = accumulator * moe_weight[:, None]
- accumulator = accumulator.to(compute_type)
+ if use_fp8:
+ accumulator = (accumulator * a_scale * b_scale).to(compute_type)
+ else:
+ accumulator = accumulator.to(compute_type)
# -----------------------------------------------------------
# Write back the block of the output
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
@@ -207,15 +220,24 @@ def moe_align_block_size(
def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
- topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+ B_scale: torch.Tensor, topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
sorted_token_ids: torch.Tensor,
expert_ids: torch.Tensor,
num_tokens_post_padded: torch.Tensor,
mul_routed_weight: bool, top_k: int,
- config: Dict[str, Any]) -> None:
+ config: Dict[str, Any], compute_type: tl.dtype,
+ use_fp8: bool) -> None:
assert topk_weights.stride(1) == 1
assert sorted_token_ids.stride(0) == 1
+ if not use_fp8:
+ A_scale = None
+ assert B_scale is None
+ else:
+ A, A_scale = ops.scaled_fp8_quant(A)
+ assert B_scale is not None
+
grid = lambda META: (triton.cdiv(sorted_token_ids.shape[0], META[
'BLOCK_SIZE_M']) * triton.cdiv(B.shape[1], META['BLOCK_SIZE_N']), )
@@ -223,6 +245,8 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
A,
B,
C,
+ A_scale,
+ B_scale,
topk_weights,
sorted_token_ids,
expert_ids,
@@ -240,18 +264,21 @@ def invoke_fused_moe_kernel(A: torch.Tensor, B: torch.Tensor, C: torch.Tensor,
C.stride(2),
MUL_ROUTED_WEIGHT=mul_routed_weight,
top_k=top_k,
- compute_type=tl.bfloat16 if A.dtype == torch.bfloat16 else tl.float16,
+ compute_type=compute_type,
+ use_fp8=use_fp8,
**config,
)
-def get_config_file_name(E: int, N: int) -> str:
+def get_config_file_name(E: int, N: int, dtype: Optional[str]) -> str:
device_name = torch.cuda.get_device_name().replace(" ", "_")
- return f"E={E},N={N},device_name={device_name}.json"
+ dtype_selector = "" if not dtype else f",dtype={dtype}"
+ return f"E={E},N={N},device_name={device_name}{dtype_selector}.json"
@functools.lru_cache
-def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]:
+def get_moe_configs(E: int, N: int,
+ dtype: Optional[str]) -> Optional[Dict[int, Any]]:
"""
Return optimized configurations for the fused MoE kernel.
@@ -263,7 +290,7 @@ def get_moe_configs(E: int, N: int) -> Optional[Dict[int, Any]]:
# First look up if an optimized configuration is available in the configs
# directory
- json_file_name = get_config_file_name(E, N)
+ json_file_name = get_config_file_name(E, N, dtype)
config_file_path = os.path.join(
os.path.dirname(os.path.realpath(__file__)), "configs", json_file_name)
@@ -288,6 +315,9 @@ def fused_moe(
renormalize: bool,
inplace: bool = False,
override_config: Optional[Dict[str, Any]] = None,
+ use_fp8: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -305,6 +335,12 @@ def fused_moe(
Defaults to False.
- override_config (Optional[Dict[str, Any]]): Optional override
for the kernel configuration.
+ - use_fp8 (bool): If True, use fp8 arithmetic to compute the inner
+ products for w1 and w2. Defaults to False.
+ - w1_scale (Optional[torch.Tensor]): Optional scale to be used for
+ w1.
+ - w2_scale (Optional[torch.Tensor]): Optional scale to be used for
+ w2.
Returns:
- torch.Tensor: The output tensor after applying the MoE layer.
@@ -358,7 +394,8 @@ def fused_moe(
config = override_config
else:
# First try to load optimal config from the file
- configs = get_moe_configs(E, w2.shape[2])
+ configs = get_moe_configs(E, w2.shape[2],
+ "float8" if use_fp8 else None)
if configs:
# If an optimal configuration map has been found, look up the
@@ -394,17 +431,37 @@ def fused_moe(
sorted_token_ids, expert_ids, num_tokens_post_padded = moe_align_block_size(
topk_ids, config['BLOCK_SIZE_M'], E)
- invoke_fused_moe_kernel(hidden_states, w1, intermediate_cache1,
- topk_weights, topk_ids, sorted_token_ids,
- expert_ids, num_tokens_post_padded, False,
- topk_ids.shape[1], config)
+ invoke_fused_moe_kernel(hidden_states,
+ w1,
+ intermediate_cache1,
+ w1_scale,
+ topk_weights,
+ topk_ids,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ False,
+ topk_ids.shape[1],
+ config,
+ compute_type=tl.float16,
+ use_fp8=use_fp8)
ops.silu_and_mul(intermediate_cache2, intermediate_cache1.view(-1, N))
- invoke_fused_moe_kernel(intermediate_cache2, w2, intermediate_cache3,
- topk_weights, topk_ids, sorted_token_ids,
- expert_ids, num_tokens_post_padded, True, 1,
- config)
+ invoke_fused_moe_kernel(intermediate_cache2,
+ w2,
+ intermediate_cache3,
+ w2_scale,
+ topk_weights,
+ topk_ids,
+ sorted_token_ids,
+ expert_ids,
+ num_tokens_post_padded,
+ True,
+ 1,
+ config,
+ compute_type=tl.float16,
+ use_fp8=use_fp8)
if inplace:
return torch.sum(intermediate_cache3.view(*intermediate_cache3.shape),
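As a sanity check on the fp8 path above, a minimal pure-PyTorch sketch of the scaling algebra the kernel applies: the matmul runs on per-tensor-scaled operands and the accumulator is multiplied by a_scale * b_scale afterwards. The actual cast to float8 and the Triton details are omitted here.

import torch


def per_tensor_scale(t: torch.Tensor):
    # Scale so values fit the fp8 e4m3 range; dequant is q * scale.
    finfo = torch.finfo(torch.float8_e4m3fn)
    scale = t.abs().max().clamp(min=1e-12) / finfo.max
    return t / scale, scale


a, b = torch.randn(4, 16), torch.randn(16, 8)
qa, a_scale = per_tensor_scale(a)
qb, b_scale = per_tensor_scale(b)

# Mirrors `accumulator * a_scale * b_scale` in fused_moe_kernel.
approx = (qa @ qb) * a_scale * b_scale
print((approx - a @ b).abs().max())  # ~0 up to floating-point error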
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index c1590a1cdabd5..b46cc52df9bf9 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -3,6 +3,7 @@
import torch
import torch.nn.functional as F
+from torch import nn
from torch.nn.parameter import Parameter
from vllm.distributed import (divide, get_tensor_model_parallel_rank,
@@ -32,12 +33,22 @@ class LinearMethodBase(ABC):
@abstractmethod
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int, input_size: int,
+ output_partition_sizes: List[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
- """Create weights for a linear layer.
-
- The weights will be set as attributes of the layer."""
+ """Create weights for a linear layer.
+ The weights will be set as attributes of the layer.
+
+ Args:
+ layer: The layer that is using the LinearMethodBase factory.
+ input_size_per_partition: Size of the weight input dim on rank X.
+ output_partition_sizes: Sizes of the output dim of each logical
+ weight on rank X. E.g., output_partition_sizes for QKVLinear
+                is a list containing the widths of Wq, Wk, Wv on rank X.
+ input_size: Size of the input dim of the weight across all ranks.
+ output_size: Size of the output dim of the weight across all ranks.
+ params_dtype: Datatype of the parameters.
+ """
raise NotImplementedError
@abstractmethod
@@ -50,6 +61,13 @@ def apply_weights(self,
Expects create_weights to have been called before on the layer."""
raise NotImplementedError
+ def process_weights_after_loading(self, layer: nn.Module) -> None:
+ """Process the weight after loading.
+
+        This can be used, for example, to transpose weights for computation.
+ """
+ return
+
class UnquantizedLinearMethod(LinearMethodBase):
"""Linear method without quantization.
@@ -64,9 +82,10 @@ def __init__(self, separate_bias_add: bool = False):
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int, input_size: int,
+ output_partition_sizes: List[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
+ output_size_per_partition = sum(output_partition_sizes)
weight = Parameter(torch.empty(output_size_per_partition,
input_size_per_partition,
dtype=params_dtype),
@@ -121,7 +140,7 @@ def __init__(
linear_method = UnquantizedLinearMethod()
self.linear_method = linear_method
self.linear_method.create_weights(self, self.input_size,
- self.output_size, self.input_size,
+ [self.output_size], self.input_size,
self.output_size, self.params_dtype)
if bias:
self.bias = Parameter(
@@ -155,6 +174,8 @@ class ColumnParallelLinear(torch.nn.Module):
skip adding bias but instead return it.
params_dtype: Data type for the parameters.
linear_method: (Maybe quantized) linear method.
+        output_sizes: list of output sizes packed into one output; e.g., for
+            QKV the list has three entries (q, k, v).
"""
def __init__(
@@ -166,6 +187,7 @@ def __init__(
skip_bias_add: bool = False,
params_dtype: Optional[torch.dtype] = None,
linear_method: Optional[LinearMethodBase] = None,
+ output_sizes: Optional[List[int]] = None,
):
super().__init__()
@@ -182,10 +204,12 @@ def __init__(
self.params_dtype = params_dtype
if linear_method is None:
linear_method = UnquantizedLinearMethod()
+ if output_sizes is None:
+ output_sizes = [output_size]
self.linear_method = linear_method
self.linear_method.create_weights(self,
self.input_size,
- self.output_size_per_partition,
+ [x // tp_size for x in output_sizes],
self.input_size,
self.output_size,
self.params_dtype,
@@ -268,14 +292,17 @@ def __init__(
tp_size = get_tensor_model_parallel_world_size()
assert all(output_size % tp_size == 0 for output_size in output_sizes)
super().__init__(input_size, sum(output_sizes), bias, gather_output,
- skip_bias_add, params_dtype, linear_method)
+ skip_bias_add, params_dtype, linear_method,
+ self.output_sizes)
def weight_loader(self,
param: Parameter,
loaded_weight: torch.Tensor,
loaded_shard_id: Optional[int] = None):
+
param_data = param.data
output_dim = getattr(param, "output_dim", None)
+ is_metadata = getattr(param, "is_metadata", False)
if loaded_shard_id is None:
# Loaded weight is already packed.
if output_dim is None:
@@ -328,6 +355,11 @@ def weight_loader(self,
start_idx = tp_rank * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
+ elif is_metadata:
+ # metadata indicates fixed size concatenated along dim 0
+ shard_size = loaded_weight.shape[0]
+ shard_offset = loaded_shard_id * shard_size
+ param_data = param_data.narrow(0, shard_offset, shard_size)
else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
@@ -407,8 +439,14 @@ def __init__(
input_size = self.hidden_size
output_size = (self.num_heads +
2 * self.num_kv_heads) * tp_size * self.head_size
+ output_sizes = [
+ self.num_heads * tp_size * self.head_size,
+ self.num_kv_heads * tp_size * self.head_size,
+ self.num_kv_heads * tp_size * self.head_size
+ ]
+
super().__init__(input_size, output_size, bias, False, skip_bias_add,
- params_dtype, linear_method)
+ params_dtype, linear_method, output_sizes)
def weight_loader(self,
param: Parameter,
@@ -416,6 +454,7 @@ def weight_loader(self,
loaded_shard_id: Optional[str] = None):
param_data = param.data
output_dim = getattr(param, "output_dim", None)
+ is_metadata = getattr(param, "is_metadata", False)
if loaded_shard_id is None:
# Loaded weight is already packed.
@@ -483,6 +522,12 @@ def weight_loader(self,
start_idx = shard_id * shard_size
loaded_weight = loaded_weight.narrow(output_dim, start_idx,
shard_size)
+ elif is_metadata:
+ # metadata indicates fixed size concatenated along dim 0
+ shard_size = loaded_weight.shape[0]
+ shard_index = ["q", "k", "v"].index(loaded_shard_id)
+ param_data = param_data.narrow(0, shard_index * shard_size,
+ shard_size)
else:
ignore_warning = getattr(param, "ignore_warning", False)
if not ignore_warning:
@@ -559,7 +604,7 @@ def __init__(
self.linear_method = linear_method
self.linear_method.create_weights(self,
self.input_size_per_partition,
- self.output_size,
+ [self.output_size],
self.input_size,
self.output_size,
self.params_dtype,
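To make the new output_partition_sizes plumbing concrete, a small sketch with hypothetical head counts showing the per-rank sizes a packed QKV projection ends up handing to create_weights; ColumnParallelLinear divides the packed output_sizes by the tensor-parallel size.

# Hypothetical model dimensions to illustrate per-rank partition sizes.
num_heads, num_kv_heads, head_size, tp_size = 32, 8, 128, 4

output_sizes = [
    num_heads * head_size,     # Wq width across all ranks
    num_kv_heads * head_size,  # Wk width across all ranks
    num_kv_heads * head_size,  # Wv width across all ranks
]
output_partition_sizes = [x // tp_size for x in output_sizes]
print(output_partition_sizes)       # [1024, 256, 256]
print(sum(output_partition_sizes))  # per-rank output_size_per_partition: 1536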
diff --git a/vllm/model_executor/layers/ops/sample.py b/vllm/model_executor/layers/ops/sample.py
index a19e9461f41f7..d08ae6064aa2a 100644
--- a/vllm/model_executor/layers/ops/sample.py
+++ b/vllm/model_executor/layers/ops/sample.py
@@ -29,8 +29,8 @@ def _multi_split_sample(
sampled_tokens_size: Tuple[int, int],
sampled_logprobs_size: Tuple[int, int],
sample_indices: torch.Tensor,
+ logprobs: torch.Tensor,
*,
- logprobs: Optional[torch.Tensor] = None,
modify_greedy_probs: bool = False,
save_logprobs: bool = False,
):
@@ -167,6 +167,7 @@ def sample(
sampled_logprobs_size = (0, 0)
logprobs = probs
+ assert logprobs is not None
if _save_modified_probs:
sampled_modified_probs_size = sampled_tokens_size
else:
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index ad988d48755b0..a525add458499 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,14 +1,18 @@
from typing import Type
+from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
from vllm.model_executor.layers.quantization.awq import AWQConfig
from vllm.model_executor.layers.quantization.base_config import (
QuantizationConfig)
+from vllm.model_executor.layers.quantization.fp8 import FP8Config
from vllm.model_executor.layers.quantization.gptq import GPTQConfig
from vllm.model_executor.layers.quantization.marlin import MarlinConfig
from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
-_QUANTIZATION_CONFIG_REGISTRY = {
+QUANTIZATION_METHODS = {
+ "aqlm": AQLMConfig,
"awq": AWQConfig,
+ "fp8": FP8Config,
"gptq": GPTQConfig,
"squeezellm": SqueezeLLMConfig,
"marlin": MarlinConfig,
@@ -16,12 +20,13 @@
def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
- if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
+ if quantization not in QUANTIZATION_METHODS:
raise ValueError(f"Invalid quantization method: {quantization}")
- return _QUANTIZATION_CONFIG_REGISTRY[quantization]
+ return QUANTIZATION_METHODS[quantization]
__all__ = [
"QuantizationConfig",
"get_quantization_config",
+ "QUANTIZATION_METHODS",
]
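A brief usage sketch of the renamed registry. The names come from the diff above; the lookup-and-construct flow shown here is a typical caller pattern, and the empty dict passed to from_config is valid for FP8Config specifically because it takes no extra fields.

from vllm.model_executor.layers.quantization import (QUANTIZATION_METHODS,
                                                     get_quantization_config)

print(sorted(QUANTIZATION_METHODS))  # ['aqlm', 'awq', 'fp8', 'gptq', ...]

# Unknown names raise ValueError; known names return the config class.
fp8_cls = get_quantization_config("fp8")
print(fp8_cls.from_config({}))  # an FP8Config instance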
diff --git a/vllm/model_executor/layers/quantization/aqlm.py b/vllm/model_executor/layers/quantization/aqlm.py
new file mode 100644
index 0000000000000..b48c6e1702be4
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/aqlm.py
@@ -0,0 +1,373 @@
+# Supports AQLM compression, see https://github.com/Vahe1994/AQLM
+# and https://arxiv.org/pdf/2401.06118.pdf
+
+import math
+from typing import Any, Dict, List, Optional
+
+import torch
+import torch.nn.functional as F
+from torch.nn.parameter import Parameter
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+ set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+
+
+def get_int_dtype(nbits: int) -> torch.dtype:
+ if nbits <= 8:
+ return torch.int8
+ if nbits <= 16:
+ return torch.int16
+ if nbits <= 32:
+ return torch.int32
+ if nbits <= 64:
+ return torch.int64
+ raise ValueError(f"No dtype available for {nbits}-bit codebooks")
+
+
+@torch.inference_mode()
+def unpack_int_data(data: torch.IntTensor, nbits: int) -> torch.IntTensor:
+ return data.to(torch.int64) % (2**nbits)
+
+
+def dequantize_weight(codes: torch.Tensor,
+ codebooks: torch.Tensor,
+ scales: Optional[torch.Tensor] = None) -> torch.Tensor:
+ """
+ Decode float weights from quantization codes. Differentiable.
+ :param codes: tensor of integer quantization codes, shape
+ [*dims, num_out_groups, num_in_groups, num_codebooks]
+ :param codebooks: tensor of vectors for each quantization code,
+ [num_codebooks, codebook_size, out_group_size, in_group_size]
+ :param scales: weight will be multiplied by this factor, must be
+        broadcastable with
+ [*dims, out_groups, num_in_groups, out_group_size, in_group_size]
+ :return: reconstructed weight tensor of shape
+ [*dims, num_in_groups*group_size]
+ """
+ num_out_groups, num_in_groups, num_codebooks = codes.shape[-3:]
+ num_codebooks, codebook_size, out_group_size, in_group_size = \
+ codebooks.shape
+ out_features = num_out_groups * out_group_size
+ in_features = num_in_groups * in_group_size
+ codebook_offsets = torch.arange(
+ 0, num_codebooks * codebook_size, codebook_size,
+ device=codes.device) # shape: [num_codebooks]
+ reconstructed_weight_flat = F.embedding_bag(
+ codes.flatten(0, -2) + codebook_offsets,
+ codebooks.flatten(0, 1).flatten(-2, -1),
+ mode="sum"
+ ) # [prod(dims) * num_out_groups * num_in_groups, out_group_size
+ # * in_group_size]
+
+ reconstructed_weight_groupwise = reconstructed_weight_flat.view(
+ list(codes.shape[:-3]) +
+ [num_out_groups, num_in_groups, out_group_size, in_group_size])
+ if scales is not None:
+ reconstructed_weight_groupwise = reconstructed_weight_groupwise.mul(
+ scales)
+ return reconstructed_weight_groupwise.swapaxes(
+ -3, -2).reshape(list(codes.shape[:-3]) + [out_features, in_features])
+
+
+def dequantize_gemm(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+ dequantized_weight = dequantize_weight(
+ unpack_int_data(codes, codebooks.shape[1].bit_length() - 1),
+ codebooks,
+ scales,
+ )
+ return F.linear(input, dequantized_weight, bias)
+
+
+# Generic dequantization, slow but flexible.
+def generic_dequantize_gemm(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+ output_shape = input.shape[:-1] + (scales.shape[0], )
+ output = torch.empty(output_shape, dtype=input.dtype, device=input.device)
+ num_outputs = len(output_partition_sizes)
+
+ # break the inputs and codebooks apart then combine the outputs.
+ # Surprisingly (to me) this is faster than doing 3 de-quants and 1 big
+ # multiply at the end.
+ num_codebooks = codebooks.shape[0] // num_outputs
+ assert (scales.shape[0] == codes.shape[0])
+ assert (sum(output_partition_sizes) == scales.shape[0])
+ output_offset = 0
+ codebooks_offset = 0
+ for output_size in output_partition_sizes:
+ shard_output = dequantize_gemm(
+ input, codes.narrow(0, output_offset, output_size),
+ codebooks.narrow(0, codebooks_offset, num_codebooks),
+ scales.narrow(0, output_offset, output_size), None
+ if bias is None else bias.narrow(0, output_offset, output_size))
+
+ output_slice = output.narrow(-1, output_offset, output_size)
+ assert (output_slice.shape == shard_output.shape)
+ output_slice.copy_(shard_output)
+ output_offset += output_size
+ codebooks_offset += num_codebooks
+ return output
+
+
+# Optimized dequantize/decompression kernels; the 1x16 and 2x8 formats run
+# roughly 6x and 9x faster than the generic version above, respectively.
+def optimized_dequantize_gemm(
+ input: torch.Tensor, # [..., in_features]
+ codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
+ codebooks: torch.
+ Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
+ scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
+ output_partition_sizes: torch.IntTensor,
+ bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+ weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+ if bias is None:
+ # scaling the output is fastest, so we do that when possible.
+ output = F.linear(input, weights, bias)
+ orig_shape = output.shape
+ flattened_output = output.view(-1, output.size(-1))
+ f_scales = scales.view(-1, scales.shape[0])
+ b_scales = f_scales.expand(flattened_output.shape[0], -1)
+ flattened_output *= b_scales
+ return output.view(orig_shape)
+ else:
+ b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+ -1, weights.shape[1])
+ weights *= b_scales
+ return F.linear(input, weights, bias)
+
+
+class AQLMConfig(QuantizationConfig):
+ """Config class for AQLM.
+
+ Reference: https://github.com/Vahe1994/AQLM
+ """
+
+ def __init__(
+ self,
+ in_group_size: int,
+ nbits_per_codebook: int,
+ num_codebooks: int,
+ out_group_size: int,
+ ) -> None:
+ self.in_group_size = in_group_size
+ self.nbits_per_codebook = nbits_per_codebook
+ self.num_codebooks = num_codebooks
+ self.out_group_size = out_group_size
+
+ # out_group_size > 1 is untested, and probably won't work as-is.
+ assert (self.out_group_size == 1)
+ self.pack_factor = (self.in_group_size * self.out_group_size)
+
+ def __repr__(self) -> str:
+ return (f"AQLMConfig(in_group_size={self.in_group_size}, "
+ f"nbits_per_codebook={self.nbits_per_codebook}, "
+ f"num_codebooks={self.num_codebooks}, "
+ f"out_group_size={self.out_group_size})")
+
+ @classmethod
+ def get_name(cls) -> str:
+ return "aqlm"
+
+ @classmethod
+ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+ return [torch.half]
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ return 70
+
+ @classmethod
+ def get_config_filenames(cls) -> List[str]:
+ return [] # no extra configs.
+
+ @classmethod
+ def from_config(cls, config: Dict[str, Any]) -> "AQLMConfig":
+ in_group_size = cls.get_from_keys(config, ["in_group_size"])
+ nbits_per_codebook = cls.get_from_keys(config, ["nbits_per_codebook"])
+ num_code_books = cls.get_from_keys(config, ["num_codebooks"])
+ out_group_size = cls.get_from_keys(config, ["out_group_size"])
+ return cls(in_group_size, nbits_per_codebook, num_code_books,
+ out_group_size)
+
+ def get_linear_method(self) -> "AQLMLinearMethod":
+ return AQLMLinearMethod(self)
+
+ def get_scaled_act_names(self) -> List[str]:
+ return []
+
+
+class AQLMLinearMethod(LinearMethodBase):
+ """Linear method for AQLM.
+
+ Args:
+ quant_config: The AQLM quantization config.
+ """
+
+ def __init__(self, quant_config: AQLMConfig):
+ self.quant_config = quant_config
+
+ def create_weights(self, layer: torch.nn.Module,
+ input_size_per_partition: int,
+ output_partition_sizes: List[int], input_size: int,
+ output_size: int, params_dtype: torch.dtype,
+ **extra_weight_attrs):
+ del output_size # Unused.
+ del input_size # Unused.
+
+ if params_dtype != torch.half:
+ raise ValueError("Only half is currently supported by aqlm")
+ if input_size_per_partition % self.quant_config.in_group_size != 0:
+ raise ValueError(
+ "The input size is not aligned with the quantized "
+ "weight shape. This can be caused by too large "
+ "tensor parallel size.")
+
+ output_size_per_partition = sum(output_partition_sizes)
+ if output_size_per_partition % self.quant_config.out_group_size != 0:
+ raise ValueError(
+ "The output size is not aligned with the quantized "
+ "weight shape. This can be caused by too large "
+ "tensor parallel size.")
+
+ codes = Parameter(
+ torch.empty(
+ # There could actually be two pack factors, one along input and
+ # one along output, but we don't currently support
+ # out_group_size, and only the one along output needs to be
+ # marked with "packed_dim" in order for QKVLinear to work.
+ output_size_per_partition,
+ input_size_per_partition // self.quant_config.pack_factor,
+ self.quant_config.num_codebooks,
+ dtype=get_int_dtype(self.quant_config.nbits_per_codebook),
+ ),
+ requires_grad=False,
+ )
+
+ set_weight_attrs(
+ codes,
+ {
+ "input_dim": 1,
+ "output_dim": 0,
+ "packed_dim": 1,
+ "pack_factor": self.quant_config.pack_factor,
+ },
+ )
+
+ codebooks = Parameter(
+ torch.empty(
+ self.quant_config.num_codebooks * len(output_partition_sizes),
+ 2**self.quant_config.nbits_per_codebook,
+ self.quant_config.out_group_size,
+ self.quant_config.in_group_size,
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ set_weight_attrs(
+ codebooks,
+ {
+ # metadata indicates fixed size concatenated along dim 0
+ "is_metadata":
+ True,
+ "output_partition_sizes":
+ torch.tensor(output_partition_sizes, device='cpu'),
+ },
+ )
+
+ scales = Parameter(
+ torch.empty(
+ (
+ output_size_per_partition //
+ self.quant_config.out_group_size,
+ 1,
+ 1,
+ 1,
+ ),
+ dtype=params_dtype,
+ ),
+ requires_grad=False,
+ )
+ set_weight_attrs(
+ scales,
+ {
+ "output_dim": 0,
+ "packed_dim": 0,
+ "pack_factor": self.quant_config.out_group_size
+ },
+ )
+
+ layer.register_parameter("codes", codes)
+ set_weight_attrs(codes, extra_weight_attrs)
+ layer.register_parameter("codebooks", codebooks)
+ set_weight_attrs(codebooks, extra_weight_attrs)
+ layer.register_parameter("scales", scales)
+ set_weight_attrs(scales, extra_weight_attrs)
+
+ def apply_weights(
+ self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ ) -> torch.Tensor:
+ codebooks = layer.codebooks
+ codes = layer.codes
+ scales = layer.scales
+ output_partition_sizes = getattr(codebooks, "output_partition_sizes",
+ None)
+
+ nbooks = codes.shape[2]
+ ingroups = codebooks.shape[3]
+ outgroups = codebooks.shape[2]
+ bits = codebooks.shape[1]
+
+ # We support these formats with dedicated gemm and decompression
+ # kernels.
+ if ingroups == 8 and outgroups == 1 and (
+ (bits == 256 and nbooks == 2) or (bits == 65536 and nbooks == 1)):
+
+ # thresholds determined by timings on an A6000, one GPU
+ use_gemv = math.prod(x.shape[:-1]) <= 6
+
+ return ops.aqlm_gemm(
+ x,
+ codes,
+ codebooks,
+ scales,
+ output_partition_sizes,
+ bias,
+ ) if use_gemv else optimized_dequantize_gemm(
+ x,
+ codes,
+ codebooks,
+ scales,
+ output_partition_sizes,
+ bias,
+ )
+
+        # Fall back for all other (unoptimized) formats.
+ return generic_dequantize_gemm(
+ x,
+ codes,
+ codebooks,
+ scales,
+ output_partition_sizes,
+ bias,
+ )
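To illustrate the codebook layout AQLM relies on, a self-contained sketch of the dequantization math used in dequantize_weight, with tiny hypothetical sizes (two 4-bit codebooks, input groups of 8 features). It re-states the embedding-bag lookup locally rather than importing the new module.

import torch
import torch.nn.functional as F

# Hypothetical tiny sizes for illustration.
num_codebooks, codebook_size = 2, 16   # 4-bit codebooks
out_group_size, in_group_size = 1, 8
num_out_groups, num_in_groups = 4, 3   # reconstructed weight is 4 x 24

codes = torch.randint(0, codebook_size,
                      (num_out_groups, num_in_groups, num_codebooks))
codebooks = torch.randn(num_codebooks, codebook_size, out_group_size,
                        in_group_size)
scales = torch.randn(num_out_groups, 1, 1, 1)

# Offset codes so each codebook indexes its own slice of the flattened table.
offsets = torch.arange(0, num_codebooks * codebook_size, codebook_size)
flat = F.embedding_bag(codes.flatten(0, -2) + offsets,
                       codebooks.flatten(0, 1).flatten(-2, -1),
                       mode="sum")
weight = flat.view(num_out_groups, num_in_groups, out_group_size,
                   in_group_size).mul(scales)
weight = weight.swapaxes(-3, -2).reshape(num_out_groups * out_group_size,
                                         num_in_groups * in_group_size)
print(weight.shape)  # torch.Size([4, 24])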
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 98651aed8be0e..4f75134ee1889 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -81,7 +81,7 @@ def __init__(self, quant_config: AWQConfig):
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int, input_size: int,
+ output_partition_sizes: List[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
if input_size_per_partition % self.quant_config.group_size != 0:
@@ -89,6 +89,8 @@ def create_weights(self, layer: torch.nn.Module,
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
+
+ output_size_per_partition = sum(output_partition_sizes)
if output_size_per_partition % self.quant_config.pack_factor != 0:
raise ValueError(
"The output size is not aligned with the quantized "
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
new file mode 100644
index 0000000000000..01e494c870e71
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -0,0 +1,139 @@
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+from torch.nn import Module
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.layers.linear import (LinearMethodBase,
+ set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+ QuantizationConfig)
+
+
+class FP8Config(QuantizationConfig):
+ """Config class for FP8."""
+
+ @classmethod
+ def get_name(cls) -> str:
+ return "fp8"
+
+ @classmethod
+ def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+ return [torch.bfloat16, torch.half]
+
+ @classmethod
+ def get_min_capability(cls) -> int:
+ # TODO: PyTorch 2.3.0+ is required to run FP8 on
+ # SM 89 (e.g. Ada) GPUs. Specifically, this PR has to
+ # be included: https://github.com/pytorch/pytorch/pull/118881
+ return 90
+
+ @classmethod
+ def get_config_filenames(cls) -> List[str]:
+ return []
+
+ @classmethod
+ def from_config(cls, config: Dict[str, Any]) -> "FP8Config":
+ return cls()
+
+ def get_linear_method(self) -> "Fp8LinearMethod":
+ return Fp8LinearMethod(self)
+
+ def get_scaled_act_names(self) -> List[str]:
+ return []
+
+
+class Fp8LinearMethod(LinearMethodBase):
+ """Linear method for FP8.
+ We now support common FP16/BF16 model checkpoints ONLY. The weight
+ scaling factor will be initialized after the model weights are loaded.
+
+ Limitations:
+ 1. Only support per-tensor quantization due to torch._scaled_mm support.
+ 2. Only support float8_e4m3fn data type due to the limitation of
+ torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)
+
+ Args:
+ quant_config: The quantization config.
+ """
+
+ def __init__(self, quant_config: FP8Config):
+ self.quant_config = quant_config
+
+ def create_weights(
+ self,
+ layer: torch.nn.Module,
+ input_size_per_partition: int,
+ output_partition_sizes: List[int],
+ input_size: int,
+ output_size: int,
+ params_dtype: torch.dtype,
+ **extra_weight_attrs,
+ ):
+ output_size_per_partition = sum(output_partition_sizes)
+ weight = Parameter(torch.empty(output_size_per_partition,
+ input_size_per_partition,
+ dtype=params_dtype),
+ requires_grad=False)
+ layer.register_parameter("weight", weight)
+ set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+ set_weight_attrs(weight, extra_weight_attrs)
+
+ w_scale = Parameter(
+ torch.empty(1, dtype=torch.float32),
+ requires_grad=False,
+ )
+ layer.register_parameter("weight_scaling_factor", w_scale)
+
+ def process_weights_after_loading(self, layer: Module) -> None:
+ # Although the linear_method is propagated to all layers,
+ # only linear layers invoke "create_weights". So we check
+        # whether "weight_scaling_factor" is registered to determine
+ # whether the layer is a linear layer that requires quantization.
+ if not hasattr(layer, "weight_scaling_factor"):
+ return
+
+ qweight, weight_scale = per_tensor_quantize(layer.weight)
+ # torch._scaled_mm requires column-major in the second
+ # input (weight), so we transpose the quantized weight.
+ layer.weight = Parameter(qweight.t(), requires_grad=False)
+ layer.weight_scaling_factor.data.copy_(weight_scale)
+
+ def apply_weights(self,
+ layer: torch.nn.Module,
+ x: torch.Tensor,
+ bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+ qinput, x_scale = per_tensor_quantize(x)
+ output, _ = torch._scaled_mm(
+ qinput,
+ layer.weight,
+ out_dtype=x.dtype,
+ scale_a=x_scale,
+ scale_b=layer.weight_scaling_factor,
+ bias=bias,
+ )
+ return output
+
+
+def per_tensor_quantize(
+        tensor: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Quantize a tensor using per-tensor static scaling factor.
+
+ Args:
+ tensor: The input tensor.
+ """
+ finfo = torch.finfo(torch.float8_e4m3fn)
+ # Calculate the scale as dtype max divided by absmax.
+ # Since .abs() creates a new tensor, we use aminmax to get
+ # the min and max first and then calculate the absmax.
+ min_val, max_val = tensor.aminmax()
+ amax = min_val.abs().max(max_val.abs())
+ scale = finfo.max / amax.clamp(min=1e-12)
+    # Scale and clamp the tensor to bring it into the representable range
+    # of the float8 data type (the default cast is unsaturated).
+ qweight = (tensor * scale).clamp(min=finfo.min, max=finfo.max)
+ # Return both float8 data and the inverse scale (as float),
+    # as both are required as inputs to torch._scaled_mm.
+ qweight = qweight.to(torch.float8_e4m3fn)
+ scale = scale.float().reciprocal()
+ return qweight, scale
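A standalone sketch of the per-tensor quantization round trip implemented above, useful for eyeballing the fp8 error. It mirrors per_tensor_quantize rather than importing it and assumes a PyTorch build with float8_e4m3fn support.

import torch


def per_tensor_quantize_sketch(t: torch.Tensor):
    # Scale to the fp8 e4m3 range, then return the quantized tensor plus
    # the inverse scale needed for dequantization.
    finfo = torch.finfo(torch.float8_e4m3fn)
    amax = t.abs().max().clamp(min=1e-12)
    scale = finfo.max / amax
    q = (t * scale).clamp(min=finfo.min, max=finfo.max).to(torch.float8_e4m3fn)
    return q, scale.float().reciprocal()


x = torch.randn(4, 8, dtype=torch.float16)
q, inv_scale = per_tensor_quantize_sketch(x)
# Dequantizing recovers the original values up to fp8 rounding error.
x_hat = q.to(torch.float16) * inv_scale
print((x - x_hat).abs().max())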
diff --git a/vllm/model_executor/layers/quantization/gptq.py b/vllm/model_executor/layers/quantization/gptq.py
index f370b94a210ee..92a5cdb9af928 100644
--- a/vllm/model_executor/layers/quantization/gptq.py
+++ b/vllm/model_executor/layers/quantization/gptq.py
@@ -91,7 +91,7 @@ def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int,
+ output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
@@ -103,6 +103,7 @@ def create_weights(
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
+ output_size_per_partition = sum(output_partition_sizes)
if (output_size_per_partition % self.quant_config.pack_factor.numerator
!= 0):
raise ValueError(
diff --git a/vllm/model_executor/layers/quantization/marlin.py b/vllm/model_executor/layers/quantization/marlin.py
index bf0500f1155a1..00c3c404c2d7a 100644
--- a/vllm/model_executor/layers/quantization/marlin.py
+++ b/vllm/model_executor/layers/quantization/marlin.py
@@ -93,7 +93,7 @@ def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int,
+ output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
@@ -106,6 +106,7 @@ def create_weights(
f"The params dtype must be float16, but got {params_dtype}")
# Validate output_size_per_partition
+ output_size_per_partition = sum(output_partition_sizes)
if output_size_per_partition % self.quant_config.min_n_threads != 0:
raise ValueError(
f"Weight output_size_per_partition = "
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 661ff9c55d0d1..cc44447d347b8 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -70,7 +70,7 @@ def __init__(self, quant_config: SqueezeLLMConfig):
def create_weights(self, layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int, input_size: int,
+ output_partition_sizes: List[int], input_size: int,
output_size: int, params_dtype: torch.dtype,
**extra_weight_attrs):
if input_size_per_partition % self.quant_config.pack_factor != 0:
@@ -78,6 +78,8 @@ def create_weights(self, layer: torch.nn.Module,
"The input size is not aligned with the quantized "
"weight shape. This can be caused by too large "
"tensor parallel size.")
+
+ output_size_per_partition = sum(output_partition_sizes)
qweight = Parameter(
torch.empty(
input_size_per_partition // self.quant_config.pack_factor,
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index ecd2bd0fce3a3..5edbbf2c70a49 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -144,6 +144,7 @@ def _batch_modified_rejection_sampling(
recovered_probs = self._get_recovered_probs(
target_probs, draft_probs).reshape(batch_size * k, vocab_size)
+ # NOTE: the recovered_probs are overwritten by this method.
recovered_token_ids = _multinomial(recovered_probs,
num_samples=1).reshape(
batch_size, k)
@@ -307,6 +308,12 @@ def _create_output(
output_with_bonus_tokens[:, -1] = torch.where(output[:, -1] != -1,
bonus_token_ids, -1)
+        # We disable bonus tokens because they corrupt the KV cache of
+        # proposal methods that require a KV cache. This can be fixed by
+        # "prefilling" the bonus token in the proposer. The following issue
+        # tracks the fix: https://github.com/vllm-project/vllm/issues/4212
+ output_with_bonus_tokens[:, -1] = -1
+
# Fill the recovered token ids.
output.mul_(~after_false_mask).add_(
recovered_token_ids.mul(after_false_mask))
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index 6519781c8a8eb..b8361af61ae3f 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -108,7 +108,8 @@ def _forward(
query_pass = query[..., self.rotary_dim:]
key_pass = key[..., self.rotary_dim:]
- self.cos_sin_cache = self.cos_sin_cache.to(positions.device)
+ self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to(
+ positions.device)
cos_sin = self.cos_sin_cache[torch.add(positions, offsets)
if offsets is not None else positions]
cos, sin = cos_sin.chunk(2, dim=-1)
@@ -337,6 +338,114 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
return cache
+class Phi3SuScaledRotaryEmbedding(nn.Module):
+ """Phi3 family of models scaled rotary embedding.
+
+ Based on the original RotaryEmbedding implementation.
+ """
+
+ def __init__(
+ self,
+ head_size: int,
+ rotary_dim: int,
+ max_position_embeddings: int,
+ original_max_position_embeddings: int,
+ base: int,
+ is_neox_style: bool,
+ short_factor: List[float],
+ long_factor: List[float],
+ short_mscale: float = 1.1,
+ long_mscale: float = 1.225,
+ ):
+ super().__init__()
+
+ if rotary_dim != head_size:
+ raise ValueError(
+ f"`Phi3SuScaledRotaryEmbedding` does not support rotary_dim != \
+ head_size ({rotary_dim}!={head_size}).")
+ if is_neox_style is False:
+ raise ValueError(
+ "`Phi3SuScaledRotaryEmbedding` only supports neox_style.")
+
+ self.head_size = head_size
+ self.max_position_embeddings = max_position_embeddings
+ self.original_max_position_embeddings = original_max_position_embeddings
+ self.base = base
+ self.short_factor = short_factor
+ self.long_factor = long_factor
+ self.short_mscale = short_mscale
+ self.long_mscale = long_mscale
+
+ short_cache = self._compute_cos_sin_cache(
+ original_max_position_embeddings, short_factor, short_mscale)
+ short_cache = short_cache.to(torch.get_default_dtype())
+ self.register_buffer("short_cos_sin_cache",
+ short_cache,
+ persistent=False)
+
+ long_cache = self._compute_cos_sin_cache(max_position_embeddings,
+ long_factor, long_mscale)
+ long_cache = long_cache.to(torch.get_default_dtype())
+ self.register_buffer("long_cos_sin_cache",
+ long_cache,
+ persistent=False)
+
+ long_short_cache = torch.cat(
+ [self.short_cos_sin_cache, self.long_cos_sin_cache], dim=0)
+ self.register_buffer("long_short_cos_sin_cache",
+ long_short_cache,
+ persistent=False)
+
+ def _compute_inv_freq(self, rescale_factors: List[float]) -> torch.Tensor:
+ rescale_factors = torch.tensor(rescale_factors, dtype=torch.float32)
+ inv_freq = 1.0 / (rescale_factors * (self.base**(torch.arange(
+ 0, self.head_size, 2, dtype=torch.float) / self.head_size)))
+ return inv_freq
+
+ def _compute_cos_sin_cache(
+ self,
+ max_position_embeddings: int,
+ rescale_factors: List[float],
+ mscale: float,
+ ) -> torch.Tensor:
+ inv_freq = self._compute_inv_freq(rescale_factors)
+ t = torch.arange(max_position_embeddings, dtype=torch.float)
+ freqs = torch.einsum("i,j -> ij", t, inv_freq)
+ cos = freqs.cos() * mscale
+ sin = freqs.sin() * mscale
+ cache = torch.cat((cos, sin), dim=-1)
+ return cache
+
+ def forward(
+ self,
+ positions: torch.Tensor,
+ query: torch.Tensor,
+ key: torch.Tensor,
+ offsets: Optional[torch.Tensor] = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ query = query.view(*query.shape[:-1], -1, self.head_size)
+ key = key.view(*key.shape[:-1], -1, self.head_size)
+
+ k = self.original_max_position_embeddings
+ long_prompt_offset = (torch.any(positions > k).float() *
+ torch.full_like(positions, k)).long()
+ idx = (torch.add(positions, long_prompt_offset)
+ if long_prompt_offset is not None else positions)
+ self.long_short_cos_sin_cache = self.long_short_cos_sin_cache.to(
+ idx.device)
+ idx = torch.add(idx, offsets) if offsets is not None else idx
+ cos_sin = torch.index_select(self.long_short_cos_sin_cache, 0, idx)
+
+ cos, sin = cos_sin.chunk(2, dim=-1)
+ cos = cos.repeat(1, 2).unsqueeze(-2)
+ sin = sin.repeat(1, 2).unsqueeze(-2)
+
+ query = query * cos + _rotate_neox(query) * sin
+ key = key * cos + _rotate_neox(key) * sin
+
+ return query.flatten(-2), key.flatten(-2)
+
+
_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
@@ -348,17 +457,26 @@ def get_rope(
is_neox_style: bool = True,
rope_scaling: Optional[Dict[str, Any]] = None,
) -> RotaryEmbedding:
+ if rope_scaling is not None:
+        # Convert any list values to tuples so that rope_scaling can be
+        # used as part of a hashable cache key.
+ rope_scaling_tuple = {
+ k: tuple(v) if isinstance(v, list) else v
+ for k, v in rope_scaling.items()
+ }
+ rope_scaling_args = tuple(rope_scaling_tuple.items())
+ else:
+ rope_scaling_args = None
key = (head_size, rotary_dim, max_position, base, is_neox_style,
- tuple(rope_scaling.items()) if rope_scaling is not None else None)
+ rope_scaling_args)
if key in _ROPE_DICT:
return _ROPE_DICT[key]
-
if rope_scaling is None:
rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
is_neox_style)
else:
scaling_type = rope_scaling["type"]
- scaling_factor = rope_scaling["factor"]
+ if scaling_type != "su":
+ scaling_factor = rope_scaling["factor"]
if scaling_type == "linear":
rotary_emb = LinearScalingRotaryEmbedding(head_size, rotary_dim,
max_position, base,
@@ -382,6 +500,19 @@ def get_rope(
base, is_neox_style,
scaling_factor,
**extra_kwargs)
+ elif scaling_type == "su":
+ short_factor = rope_scaling["short_factor"]
+ long_factor = rope_scaling["long_factor"]
+ original_max_position = rope_scaling[
+ "original_max_position_embeddings"]
+ extra_kwargs = {
+ k: v
+ for k, v in rope_scaling.items()
+ if k in ("short_mscale", "long_mscale")
+ }
+ rotary_emb = Phi3SuScaledRotaryEmbedding(
+ head_size, rotary_dim, max_position, original_max_position,
+ base, is_neox_style, short_factor, long_factor, **extra_kwargs)
else:
raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
_ROPE_DICT[key] = rotary_emb
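The caching change above exists because "su"-style rope_scaling configs carry long lists (short_factor/long_factor), and lists cannot be part of the `_ROPE_DICT` key. A small sketch of the key construction with a made-up config (the factor values and the head-size/base numbers are placeholders, not real Phi-3 values):

    # Hypothetical "su" rope_scaling entry; only the shape of the data matters.
    rope_scaling = {
        "type": "su",
        "short_factor": [1.0, 1.0, 1.5, 2.0],
        "long_factor": [1.0, 2.0, 4.0, 8.0],
        "original_max_position_embeddings": 4096,
    }

    # Lists are unhashable, so every list value is converted to a tuple before
    # the items are folded into the cache key.
    rope_scaling_args = tuple({
        k: tuple(v) if isinstance(v, list) else v
        for k, v in rope_scaling.items()
    }.items())

    # head_size, rotary_dim, max_position, base, is_neox_style placeholders.
    key = (96, 96, 131072, 10000, True, rope_scaling_args)
    cache = {}
    cache[key] = "RotaryEmbedding instance"   # hashing now succeeds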
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py
index 03bf38caebe0e..c4b11cb33a677 100644
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@@ -35,6 +35,14 @@ class Sampler(nn.Module):
in logits for each token in the input prompt.
"""
+ def __init__(self):
+ super().__init__()
+
+ # Whether or not the SamplerOutput should have on-device tensors
+ # containing the sampled token ids and probabilities. This is used by
+ # speculative decoding.
+ self.include_gpu_probs_tensor = False
+
def forward(
self,
logits: torch.Tensor,
@@ -79,13 +87,45 @@ def forward(
logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
# Sample the next tokens.
- sample_results = _sample(probs, logprobs, sampling_metadata,
- sampling_tensors)
+ sample_results, maybe_sampled_tokens_tensor = _sample(
+ probs,
+ logprobs,
+ sampling_metadata,
+ sampling_tensors,
+ include_gpu_probs_tensor=self.include_gpu_probs_tensor,
+ modify_greedy_probs=self._should_modify_greedy_probs_inplace,
+ )
+
+ if self.include_gpu_probs_tensor:
+ assert maybe_sampled_tokens_tensor is not None
+ sampled_tokens_tensor = maybe_sampled_tokens_tensor
+ on_device_tensors = (probs, sampled_tokens_tensor)
+ else:
+ on_device_tensors = None
+
# Get the logprobs query results.
prompt_logprobs, sample_logprobs = _get_logprobs(
logprobs, sampling_metadata, sample_results)
- return _build_sampler_output(sample_results, sampling_metadata,
- prompt_logprobs, sample_logprobs)
+ return _build_sampler_output(sample_results,
+ sampling_metadata,
+ prompt_logprobs,
+ sample_logprobs,
+ on_device_tensors=on_device_tensors)
+
+ @property
+ def _should_modify_greedy_probs_inplace(self) -> bool:
+ """Whether or not the sampler should modify the probability distribution
+ of greedily-sampled tokens such that multinomial sampling would sample
+ the greedily-sampled token.
+
+ In other words, if True then we set the probability of the greedily-
+ sampled token to 1.
+
+ This is used by speculative decoding, which requires that the sampling
+ method be encoded into the probability distribution.
+ """
+ # Modify greedy probs if include_gpu_probs_tensor is set.
+ return self.include_gpu_probs_tensor
def _get_bin_counts_and_mask(
@@ -359,7 +399,9 @@ def _sample_with_torch(
probs: torch.Tensor,
logprobs: torch.Tensor,
sampling_metadata: SamplingMetadata,
-) -> List[Tuple[List[int], List[int]]]:
+ include_gpu_probs_tensor: bool,
+ modify_greedy_probs: bool,
+) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
categorized_seq_group_ids = {t: [] for t in SamplingType}
categorized_sample_indices = sampling_metadata.categorized_sample_indices
for i, seq_group in enumerate(sampling_metadata.seq_groups):
@@ -371,6 +413,15 @@ def _sample_with_torch(
sample_metadata = {}
multinomial_samples = {}
+ # Create output tensor for sampled token ids.
+ if include_gpu_probs_tensor:
+ sampled_token_ids_tensor = torch.empty(logprobs.shape[0],
+ 1,
+ dtype=torch.long,
+ device=logprobs.device)
+ else:
+ sampled_token_ids_tensor = None
+
+ # Counterintuitively, having two loops here is actually faster.
# The first loop can run without waiting on GPU<->CPU sync.
for sampling_type in SamplingType:
@@ -383,9 +434,25 @@ def _sample_with_torch(
is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
sample_metadata[sampling_type] = (seq_group_ids, seq_groups,
is_prompts, sample_indices)
+ long_sample_indices = sample_indices.long()
+
if sampling_type == SamplingType.GREEDY:
- greedy_samples = torch.argmax(logprobs[sample_indices.long()],
+ greedy_samples = torch.argmax(logprobs[long_sample_indices],
dim=-1)
+
+ if include_gpu_probs_tensor:
+ # Store sampled tokens in output tensor.
+ sampled_token_ids_tensor[
+ long_sample_indices] = greedy_samples.unsqueeze(-1)
+
+ if modify_greedy_probs:
+ # If required, modify the probabilities such that sampling from
+ # the modified distribution would always sample the argmax
+ # token id.
+ _modify_greedy_probs_inplace(logprobs, probs,
+ long_sample_indices,
+ greedy_samples)
+
elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED):
max_best_of_in_batch = 1
for seq_group, is_prompt in zip(seq_groups, is_prompts):
@@ -397,15 +464,23 @@ def _sample_with_torch(
"seq_groups": seq_groups,
"generators": sampling_metadata.generators,
}
+
multinomial_samples[sampling_type] = _multinomial(
- probs[sample_indices.long()], max_best_of_in_batch,
+ probs[long_sample_indices], max_best_of_in_batch,
**seeded_args)
+
+ if include_gpu_probs_tensor:
+ # Store sampled tokens in output tensor.
+ sampled_token_ids_tensor[
+ long_sample_indices] = multinomial_samples[sampling_type]
+
elif sampling_type == SamplingType.BEAM:
beam_search_logprobs = logprobs[sample_indices]
else:
raise ValueError(f"Unsupported sampling type: {sampling_type}")
# GPU<->CPU sync happens in the loop below.
+ # This also converts the sample output to Python objects.
for sampling_type in SamplingType:
if sampling_type not in sample_metadata:
@@ -427,7 +502,7 @@ def _sample_with_torch(
sample_results_dict[i]
for i in range(len(sampling_metadata.seq_groups))
]
- return sample_results
+ return sample_results, sampled_token_ids_tensor
def _sample_with_triton_kernel(
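The new `sampled_token_ids_tensor` is filled in place, one sampling type at a time, using the same index tensors that select rows of `logprobs`/`probs`. A standalone toy version of that scatter pattern (illustrative only, not code from the patch):

    import torch

    logprobs = torch.randn(4, 10)                    # 4 tokens, vocab of 10
    sampled_token_ids_tensor = torch.empty(4, 1, dtype=torch.long)

    greedy_indices = torch.tensor([0, 2])            # rows sampled greedily
    random_indices = torch.tensor([1, 3])            # rows sampled multinomially

    greedy_samples = torch.argmax(logprobs[greedy_indices], dim=-1)
    sampled_token_ids_tensor[greedy_indices] = greedy_samples.unsqueeze(-1)

    probs = torch.softmax(logprobs[random_indices], dim=-1)
    sampled_token_ids_tensor[random_indices] = torch.multinomial(
        probs, num_samples=1)

    print(sampled_token_ids_tensor.squeeze(-1))      # one token id per row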
@@ -511,12 +586,17 @@ def _sample_with_triton_kernel(
def _sample(
- probs: torch.Tensor,
- logprobs: torch.Tensor,
- sampling_metadata: SamplingMetadata,
- sampling_tensors: SamplingTensors,
-) -> List[Tuple[List[int], List[int]]]:
- return _sample_with_torch(probs, logprobs, sampling_metadata)
+ probs: torch.Tensor, logprobs: torch.Tensor,
+ sampling_metadata: SamplingMetadata, sampling_tensors: SamplingTensors,
+ include_gpu_probs_tensor: bool, modify_greedy_probs: bool
+) -> Tuple[List[Tuple[List[int], List[int]]], Optional[torch.Tensor]]:
+ return _sample_with_torch(
+ probs,
+ logprobs,
+ sampling_metadata,
+ include_gpu_probs_tensor=include_gpu_probs_tensor,
+ modify_greedy_probs=modify_greedy_probs,
+ )
# TODO: Enable once Triton kernel & associated code is faster.
# return _sample_with_triton_kernel(probs, logprobs, sampling_metadata,
@@ -680,12 +760,73 @@ def _get_logprobs(
return result_prompt_logprobs, result_sample_logprobs
+def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor,
+ sample_indices: torch.Tensor,
+ greedy_samples: torch.Tensor) -> None:
+ """Modify the probability distributions of the greedily-sampled tokens such
+ that each sampled token has a "probability" of 1.0. This is required by
+ speculative decoding, which depends on the sampling method being encoded
+ within the probability distribution for correctness.
+
+ # Why do we only need to do this for greedy sampling?
+
+ vLLM's sampler performs the following steps for greedy or multinomial
+ (random) sampling:
+ 1. Get logits from model.
+ 2. Modify logits according to per-sequence sampling parameters.
+ - Multiply by temperature, top-k and top-p masking, penalize tokens
+ according to their frequency, etc.
+ 3. Sample a token.
+ - Random sampling simply samples from the modified probability
+ distribution.
+ - Greedy sampling performs `argmax` to obtain the token with the
+ highest likelihood.
+
+ Ignoring greedy sampling for a moment, the computed probability distribution
+ has the following property: if we sample from it independently, each token
+ appears with the same frequency with which the Sampler itself would return
+ it. In other words, for tokens sampled with vLLM's random SamplingType, the
+ computed probability distribution completely encodes the sampling
+ methodology.
+
+ Greedy sampling does not normally have this property. vLLM modifies logits
+ according to sampling params, then performs `argmax`, then returns the
+ sampled token and the computed probability distribution. If we sample from
+ the distribution, we'll find the likelihood of the greedily-sampled token
+ is not always 1.0.
+
+ Since lossless speculative decoding requires that the sampling methodology
+ be encoded within the probability distribution, we are motivated to modify
+ the probability distribution such that the sampled token has probability 1
+ when speculative decoding is used.
+
+ NOTE: Alternatively, we could use an extremely low temperature to achieve
+ greedy sampling using multinomial computation and unite the codepaths. This
+ has implications for the overall design of the sampler, e.g. how to record
+ accurate logprobs for the user, so this improvement is deferred to later.
+ """
+ logprobs[sample_indices, :] = -float('inf')
+ logprobs[sample_indices, greedy_samples] = 0.0
+ probs[sample_indices, :] = 0
+ probs[sample_indices, greedy_samples] = 1.0
+
+
def _build_sampler_output(
sample_results: List[Tuple[List[int], List[int]]],
sampling_metadata: SamplingMetadata,
prompt_logprobs: List[Optional[PromptLogprobs]],
sample_logprobs: List[SampleLogprobs],
+ on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor]],
) -> SamplerOutput:
+ """Construct Python objects with the output of sampling.
+
+ Args:
+ on_device_tensors: Tuple containing on-device tensors with the
+ probabilities used in sampling and the sampled token ids. This
+ allows post-processing without copies to CPU/serialization, e.g. in
+ speculative decoding rejection sampling.
+ """
+
sampler_output = []
for (seq_group, sample_result, group_prompt_logprobs,
group_sample_logprobs) in zip(sampling_metadata.seq_groups,
@@ -701,4 +842,15 @@ def _build_sampler_output(
SequenceOutput(seq_ids[parent_id], next_token_id, logprobs))
sampler_output.append(
SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
- return SamplerOutput(outputs=sampler_output)
+
+ # If not specified, store None values in SamplerOutput.
+ if on_device_tensors is not None:
+ sampled_token_probs, sampled_token_ids = on_device_tensors
+ else:
+ sampled_token_probs, sampled_token_ids = (None, None)
+
+ return SamplerOutput(
+ outputs=sampler_output,
+ sampled_token_probs=sampled_token_probs,
+ sampled_token_ids=sampled_token_ids,
+ )
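The long docstring in `_modify_greedy_probs_inplace` argues that, after the rewrite, multinomial sampling from the returned distribution always reproduces the greedy choice. A self-contained check of that claim with toy numbers (illustrative only):

    import torch

    probs = torch.tensor([[0.1, 0.6, 0.3],     # row 0: randomly sampled
                          [0.5, 0.2, 0.3]])    # row 1: greedily sampled
    logprobs = probs.log()
    sample_indices = torch.tensor([1])
    greedy_samples = torch.argmax(logprobs[sample_indices], dim=-1)  # token 0

    # Same in-place rewrite as _modify_greedy_probs_inplace.
    logprobs[sample_indices, :] = -float('inf')
    logprobs[sample_indices, greedy_samples] = 0.0
    probs[sample_indices, :] = 0
    probs[sample_indices, greedy_samples] = 1.0

    # Sampling from the rewritten row can now only return the argmax token, so
    # the distribution fully encodes the (greedy) sampling method.
    assert torch.multinomial(probs[1], num_samples=1).item() == 0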
diff --git a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
index 77244fa399f47..291dd5e9faa52 100644
--- a/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
+++ b/vllm/model_executor/layers/sparsity/sparse_w16a16_linear_method.py
@@ -1,4 +1,4 @@
-from typing import Optional, Type
+from typing import List, Optional, Type
import torch
import torch.nn.functional as F
@@ -31,13 +31,14 @@ def create_weights(
self,
layer: torch.nn.Module,
input_size_per_partition: int,
- output_size_per_partition: int,
+ output_partition_sizes: List[int],
input_size: int,
output_size: int,
params_dtype: torch.dtype,
**extra_weight_attrs,
):
del input_size, output_size # Unused.
+ output_size_per_partition = sum(output_partition_sizes)
supports_linear = (self.storage_format_cls !=
SparseBEGemmStorageFormat)
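The signature change above mirrors the rest of this PR: fused layers (for example a merged QKV projection) now report the width of each logical output shard, and linear methods that only need the old total recover it by summing. A two-line sketch with made-up widths:

    output_partition_sizes = [1024, 256, 256]                # hypothetical Q, K, V widths
    output_size_per_partition = sum(output_partition_sizes)  # 1536, the old single value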
diff --git a/vllm/model_executor/model_loader.py b/vllm/model_executor/model_loader.py
deleted file mode 100644
index b95a37a6c1c63..0000000000000
--- a/vllm/model_executor/model_loader.py
+++ /dev/null
@@ -1,122 +0,0 @@
-"""Utilities for selecting and loading models."""
-import contextlib
-from typing import Tuple, Type
-
-import torch
-import torch.nn as nn
-
-from vllm.config import DeviceConfig, ModelConfig
-from vllm.model_executor.models import ModelRegistry
-from vllm.model_executor.models.llava import LlavaForConditionalGeneration
-from vllm.model_executor.weight_utils import (get_quant_config,
- get_sparse_config,
- initialize_dummy_weights)
-
-_VISION_MODEL_CLASSES = [
- LlavaForConditionalGeneration,
-]
-
-
-@contextlib.contextmanager
-def _set_default_torch_dtype(dtype: torch.dtype):
- """Sets the default torch dtype to the given dtype."""
- old_dtype = torch.get_default_dtype()
- torch.set_default_dtype(dtype)
- yield
- torch.set_default_dtype(old_dtype)
-
-
-def _get_model_architecture(
- model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
- architectures = getattr(model_config.hf_config, "architectures", [])
- # Special handling for quantized Mixtral.
- # FIXME(woosuk): This is a temporary hack.
- if (model_config.quantization is not None
- and "MixtralForCausalLM" in architectures):
- architectures = ["QuantMixtralForCausalLM"]
-
- for arch in architectures:
- model_cls = ModelRegistry.load_model_cls(arch)
- if model_cls is not None:
- return (model_cls, arch)
- raise ValueError(
- f"Model architectures {architectures} are not supported for now. "
- f"Supported architectures: {ModelRegistry.get_supported_archs()}")
-
-
-def get_architecture_class_name(model_config: ModelConfig) -> str:
- return _get_model_architecture(model_config)[1]
-
-
-def get_model(model_config: ModelConfig, device_config: DeviceConfig,
- **kwargs) -> nn.Module:
- lora_config = kwargs.get("lora_config", None)
- vision_language_config = kwargs.get("vision_language_config", None)
- model_class = _get_model_architecture(model_config)[0]
-
- # Get the (maybe sparse or quantized) linear method.
- linear_method = None
- if model_config.quantization is not None:
- quant_config = get_quant_config(model_config)
- capability = torch.cuda.get_device_capability()
- capability = capability[0] * 10 + capability[1]
- if capability < quant_config.get_min_capability():
- raise ValueError(
- f"The quantization method {model_config.quantization} is not "
- "supported for the current GPU. "
- f"Minimum capability: {quant_config.get_min_capability()}. "
- f"Current capability: {capability}.")
- supported_dtypes = quant_config.get_supported_act_dtypes()
- if model_config.dtype not in supported_dtypes:
- raise ValueError(
- f"{model_config.dtype} is not supported for quantization "
- f"method {model_config.quantization}. Supported dtypes: "
- f"{supported_dtypes}")
- linear_method = quant_config.get_linear_method()
- # UPSTREAM SYNC: needed to support sparsity
- if model_config.sparsity is not None:
- sparse_config = get_sparse_config(model_config)
- capability = torch.cuda.get_device_capability()
- capability = capability[0] * 10 + capability[1]
- if capability < sparse_config.get_min_capability():
- raise ValueError(
- f"The sparsity method {model_config.sparsity} is not "
- "supported for the current GPU. "
- f"Minimum capability: {sparse_config.get_min_capability()}. "
- f"Current capability: {capability}.")
- supported_dtypes = sparse_config.get_supported_act_dtypes()
- if model_config.dtype not in supported_dtypes:
- raise ValueError(
- f"{model_config.dtype} is not supported for sparsity "
- f"method {model_config.sparsity}. Supported dtypes: "
- f"{supported_dtypes}")
- linear_method = sparse_config.get_linear_method()
-
- with _set_default_torch_dtype(model_config.dtype):
- # Create a model instance.
- # The weights will be initialized as empty tensors.
- with torch.device(device_config.device):
- if hasattr(model_class, "supported_lora_modules"):
- model = model_class(model_config.hf_config, linear_method,
- lora_config)
- elif lora_config:
- raise ValueError(
- f"Model {model_class.__name__} does not support LoRA, "
- "but LoRA is enabled. Support for this model may "
- "be added in the future. If this is important to you, "
- "please open an issue on github.")
- else:
- if model_class not in _VISION_MODEL_CLASSES:
- model = model_class(model_config.hf_config, linear_method)
- else:
- model = model_class(model_config.hf_config,
- vision_language_config, linear_method)
- if model_config.load_format == "dummy":
- # NOTE(woosuk): For accurate performance evaluation, we assign
- # random values to the weights.
- initialize_dummy_weights(model)
- else:
- # Load the weights from the cached or downloaded files.
- model.load_weights(model_config.model, model_config.download_dir,
- model_config.load_format, model_config.revision)
- return model.eval()
diff --git a/vllm/model_executor/model_loader/__init__.py b/vllm/model_executor/model_loader/__init__.py
new file mode 100644
index 0000000000000..6f90e49994fb2
--- /dev/null
+++ b/vllm/model_executor/model_loader/__init__.py
@@ -0,0 +1,30 @@
+from typing import Optional
+
+from torch import nn
+
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+ ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.model_executor.model_loader.loader import (BaseModelLoader,
+ get_model_loader)
+from vllm.model_executor.model_loader.utils import (
+ get_architecture_class_name, get_model_architecture)
+
+
+def get_model(
+ *, model_config: ModelConfig, load_config: LoadConfig,
+ device_config: DeviceConfig, parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig, lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
+ loader = get_model_loader(load_config)
+ return loader.load_model(model_config=model_config,
+ device_config=device_config,
+ lora_config=lora_config,
+ vision_language_config=vision_language_config,
+ parallel_config=parallel_config,
+ scheduler_config=scheduler_config)
+
+
+__all__ = [
+ "get_model", "get_model_loader", "BaseModelLoader",
+ "get_architecture_class_name", "get_model_architecture"
+]
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
new file mode 100644
index 0000000000000..ea0067b462dfd
--- /dev/null
+++ b/vllm/model_executor/model_loader/loader.py
@@ -0,0 +1,380 @@
+# ruff: noqa: SIM117
+import copy
+import glob
+import os
+from abc import ABC, abstractmethod
+from typing import (TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple,
+ Type)
+
+import torch
+from torch import nn
+
+from vllm.config import (VLLM_USE_MODELSCOPE, DeviceConfig, LoadConfig,
+ LoadFormat, LoRAConfig, ModelConfig, ParallelConfig,
+ SchedulerConfig, VisionLanguageConfig)
+from vllm.logger import init_logger
+from vllm.model_executor.model_loader.tensorizer import (
+ TensorizerConfig, is_vllm_serialized_tensorizer, load_with_tensorizer,
+ tensorizer_weights_iterator)
+from vllm.model_executor.model_loader.utils import (get_model_architecture,
+ set_default_torch_dtype)
+# UPSTREAM SYNC: needed for sparsity
+from vllm.model_executor.model_loader.weight_utils import (
+ download_weights_from_hf, filter_files_not_needed_for_inference,
+ get_quant_config, get_sparse_config, initialize_dummy_weights,
+ np_cache_weights_iterator, pt_weights_iterator,
+ safetensors_weights_iterator)
+from vllm.model_executor.models.llava import LlavaForConditionalGeneration
+
+if TYPE_CHECKING:
+ from vllm.model_executor.layers.linear import LinearMethodBase
+
+_VISION_MODEL_CLASSES = [
+ LlavaForConditionalGeneration,
+]
+
+logger = init_logger(__name__)
+
+
+def _get_linear_method(
+ model_config: ModelConfig,
+ load_config: LoadConfig) -> Optional["LinearMethodBase"]:
+ """Get the (maybe quantized) linear method."""
+ linear_method = None
+ if model_config.quantization is not None:
+ quant_config = get_quant_config(model_config, load_config)
+ capability = torch.cuda.get_device_capability()
+ capability = capability[0] * 10 + capability[1]
+ if capability < quant_config.get_min_capability():
+ raise ValueError(
+ f"The quantization method {model_config.quantization} is not "
+ "supported for the current GPU. "
+ f"Minimum capability: {quant_config.get_min_capability()}. "
+ f"Current capability: {capability}.")
+ supported_dtypes = quant_config.get_supported_act_dtypes()
+ if model_config.dtype not in supported_dtypes:
+ raise ValueError(
+ f"{model_config.dtype} is not supported for quantization "
+ f"method {model_config.quantization}. Supported dtypes: "
+ f"{supported_dtypes}")
+
+ linear_method = quant_config.get_linear_method()
+
+ if model_config.sparsity is not None:
+ sparse_config = get_sparse_config(model_config)
+ capability = torch.cuda.get_device_capability()
+ capability = capability[0] * 10 + capability[1]
+ if capability < sparse_config.get_min_capability():
+ raise ValueError(
+ f"The sparsity method {model_config.sparsity} is not "
+ "supported for the current GPU. "
+ f"Minimum capability: {sparse_config.get_min_capability()}. "
+ f"Current capability: {capability}.")
+ supported_dtypes = sparse_config.get_supported_act_dtypes()
+ if model_config.dtype not in supported_dtypes:
+ raise ValueError(
+ f"{model_config.dtype} is not supported for sparsity "
+ f"method {model_config.sparsity}. Supported dtypes: "
+ f"{supported_dtypes}")
+ linear_method = sparse_config.get_linear_method()
+ return linear_method
+
+
+def _get_model_initialization_kwargs(
+ model_class: Type[nn.Module], lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig]
+) -> Dict[str, Any]:
+ """Get extra kwargs for model initialization."""
+ extra_kwargs = {}
+ if hasattr(model_class, "supported_lora_modules"):
+ extra_kwargs["lora_config"] = lora_config
+ elif lora_config:
+ raise ValueError(
+ f"Model {model_class.__name__} does not support LoRA, "
+ "but LoRA is enabled. Support for this model may "
+ "be added in the future. If this is important to you, "
+ "please open an issue on github.")
+ elif model_class in _VISION_MODEL_CLASSES:
+ extra_kwargs["vision_language_config"] = vision_language_config
+ return extra_kwargs
+
+
+def _initialize_model(
+ model_config: ModelConfig, load_config: LoadConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
+ """Initialize a model with the given configurations."""
+ model_class = get_model_architecture(model_config)[0]
+ linear_method = _get_linear_method(model_config, load_config)
+
+ return model_class(config=model_config.hf_config,
+ linear_method=linear_method,
+ **_get_model_initialization_kwargs(
+ model_class, lora_config, vision_language_config))
+
+
+class BaseModelLoader(ABC):
+ """Base class for model loaders."""
+
+ def __init__(self, load_config: LoadConfig):
+ self.load_config = load_config
+
+ @abstractmethod
+ def load_model(self, *, model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig) -> nn.Module:
+ """Load a model with the given configurations."""
+ ...
+
+
+class DefaultModelLoader(BaseModelLoader):
+ """Model loader that can load different file types from disk."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def _maybe_download_from_modelscope(
+ self, model: str, revision: Optional[str]) -> Optional[str]:
+ """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
+
+ Returns the path to the downloaded model, or None if the model is not
+ downloaded from ModelScope."""
+ if VLLM_USE_MODELSCOPE:
+ # download model from ModelScope hub,
+ # lazy import so that modelscope is not required for normal use.
+ # pylint: disable=C.
+ from modelscope.hub.snapshot_download import snapshot_download
+
+ if not os.path.exists(model):
+ model_path = snapshot_download(
+ model_id=model,
+ cache_dir=self.load_config.download_dir,
+ revision=revision)
+ else:
+ model_path = model
+ return model_path
+ return None
+
+ def _prepare_weights(self, model_name_or_path: str,
+ revision: Optional[str],
+ fall_back_to_pt: bool) -> Tuple[str, List[str], bool]:
+ """Prepare weights for the model.
+
+ If the model is not local, it will be downloaded."""
+ model_name_or_path = self._maybe_download_from_modelscope(
+ model_name_or_path, revision) or model_name_or_path
+
+ is_local = os.path.isdir(model_name_or_path)
+ load_format = self.load_config.load_format
+ use_safetensors = False
+ # Some quantized models use .pt files for storing the weights.
+ if load_format == LoadFormat.AUTO:
+ allow_patterns = ["*.safetensors", "*.bin"]
+ elif load_format == LoadFormat.SAFETENSORS:
+ use_safetensors = True
+ allow_patterns = ["*.safetensors"]
+ elif load_format == LoadFormat.PT:
+ allow_patterns = ["*.pt"]
+ elif load_format == LoadFormat.NPCACHE:
+ allow_patterns = ["*.bin"]
+ else:
+ raise ValueError(f"Unknown load_format: {load_format}")
+
+ if fall_back_to_pt:
+ allow_patterns += ["*.pt"]
+
+ if not is_local:
+ hf_folder = download_weights_from_hf(model_name_or_path,
+ self.load_config.download_dir,
+ allow_patterns, revision)
+ else:
+ hf_folder = model_name_or_path
+
+ hf_weights_files: List[str] = []
+ for pattern in allow_patterns:
+ hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
+ if len(hf_weights_files) > 0:
+ if pattern == "*.safetensors":
+ use_safetensors = True
+ break
+
+ if not use_safetensors:
+ hf_weights_files = filter_files_not_needed_for_inference(
+ hf_weights_files)
+
+ if len(hf_weights_files) == 0:
+ raise RuntimeError(
+ f"Cannot find any model weights with `{model_name_or_path}`")
+
+ return hf_folder, hf_weights_files, use_safetensors
+
+ def _get_weights_iterator(
+ self, model_name_or_path: str, revision: Optional[str],
+ fall_back_to_pt: bool
+ ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ """Get an iterator for the model weights based on the load format."""
+ hf_folder, hf_weights_files, use_safetensors = self._prepare_weights(
+ model_name_or_path, revision, fall_back_to_pt)
+ if self.load_config.load_format == LoadFormat.NPCACHE:
+ # Currently np_cache only supports *.bin checkpoints
+ assert use_safetensors is False
+ return np_cache_weights_iterator(model_name_or_path,
+ self.load_config.download_dir,
+ hf_folder, hf_weights_files)
+ if use_safetensors:
+ return safetensors_weights_iterator(hf_weights_files)
+ return pt_weights_iterator(hf_weights_files)
+
+ def load_model(self, *, model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config,
+ lora_config, vision_language_config)
+ model.load_weights(
+ self._get_weights_iterator(model_config.model,
+ model_config.revision,
+ fall_back_to_pt=getattr(
+ model,
+ "fall_back_to_pt_during_load",
+ True)), )
+ for _, module in model.named_modules():
+ linear_method = getattr(module, "linear_method", None)
+ if linear_method is not None:
+ linear_method.process_weights_after_loading(module)
+ if hasattr(module, "process_weights_after_loading"):
+ module.process_weights_after_loading()
+ return model.eval()
+
+
+class DummyModelLoader(BaseModelLoader):
+ """Model loader that will set model weights to random values."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if load_config.model_loader_extra_config:
+ raise ValueError(f"Model loader extra config is not supported for "
+ f"load format {load_config.load_format}")
+
+ def load_model(self, *, model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig) -> nn.Module:
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config,
+ lora_config, vision_language_config)
+ # NOTE(woosuk): For accurate performance evaluation, we assign
+ # random values to the weights.
+ initialize_dummy_weights(model)
+ return model.eval()
+
+
+class TensorizerLoader(BaseModelLoader):
+ """Model loader using CoreWeave's tensorizer library."""
+
+ def __init__(self, load_config: LoadConfig):
+ super().__init__(load_config)
+ if isinstance(load_config.model_loader_extra_config, TensorizerConfig):
+ self.tensorizer_config = load_config.model_loader_extra_config
+ else:
+ self.tensorizer_config = TensorizerConfig(
+ **load_config.model_loader_extra_config)
+
+ def _verify_config(self, model_config: ModelConfig,
+ parallel_config: ParallelConfig):
+ self.tensorizer_config.verify_with_model_config(model_config)
+ self.tensorizer_config.verify_with_parallel_config(parallel_config)
+
+ def _get_weights_iterator(
+ self) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ tensorizer_args = self.tensorizer_config._construct_tensorizer_args()
+ return tensorizer_weights_iterator(tensorizer_args)
+
+ def _load_model_unserialized(
+ self, model_config: ModelConfig, device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig]
+ ) -> nn.Module:
+ """Load an unserialized model with tensorizer.
+
+ Unserialized here means "not serialized with tensorizer". This
+ should still be faster than default HuggingFace loading, but will
+ be slower than loading a tensorizer-serialized model.
+ """
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model = _initialize_model(model_config, self.load_config,
+ lora_config, vision_language_config)
+
+ model.load_weights(self._get_weights_iterator())
+ return model.eval()
+
+ def _load_model_serialized(
+ self, model_config: ModelConfig, device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig]
+ ) -> nn.Module:
+ """Load a serialized model with tensorizer.
+
+ See the examples/tensorize_vllm_model.py example
+ script for serializing vLLM models."""
+ with set_default_torch_dtype(model_config.dtype):
+ with torch.device(device_config.device):
+ model_class = get_model_architecture(model_config)[0]
+ linear_method = _get_linear_method(model_config,
+ self.load_config)
+ extra_kwargs = _get_model_initialization_kwargs(
+ model_class, lora_config, vision_language_config)
+ extra_kwargs["linear_method"] = linear_method
+
+ tensorizer_config = copy.copy(self.tensorizer_config)
+ tensorizer_config.model_class = model_class
+ tensorizer_config.hf_config = model_config.hf_config
+ tensorizer_config.dtype = model_config.dtype
+
+ model = load_with_tensorizer(tensorizer_config, **extra_kwargs)
+ return model.eval()
+
+ def load_model(self, *, model_config: ModelConfig,
+ device_config: DeviceConfig,
+ lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
+ parallel_config: ParallelConfig,
+ scheduler_config: SchedulerConfig) -> nn.Module:
+ self._verify_config(model_config, parallel_config)
+
+ if is_vllm_serialized_tensorizer(self.tensorizer_config):
+ return self._load_model_serialized(model_config, device_config,
+ lora_config,
+ vision_language_config)
+ return self._load_model_unserialized(model_config, device_config,
+ lora_config,
+ vision_language_config)
+
+
+def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
+ """Get a model loader based on the load format."""
+
+ if isinstance(load_config.load_format, type):
+ return load_config.load_format(load_config)
+
+ if load_config.load_format == LoadFormat.DUMMY:
+ return DummyModelLoader(load_config)
+
+ if load_config.load_format == LoadFormat.TENSORIZER:
+ return TensorizerLoader(load_config)
+
+ return DefaultModelLoader(load_config)
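`DefaultModelLoader._prepare_weights` narrows the download to the first glob pattern that matches anything on the Hub, and models may opt in to a `.pt` fallback via `fall_back_to_pt_during_load`. A plain restatement of that pattern table (illustrative, not code from the patch):

    PATTERNS = {
        "auto": ["*.safetensors", "*.bin"],
        "safetensors": ["*.safetensors"],
        "pt": ["*.pt"],
        "npcache": ["*.bin"],
    }

    def patterns_for(load_format: str, fall_back_to_pt: bool) -> list:
        patterns = list(PATTERNS[load_format])
        if fall_back_to_pt:
            # Some quantized checkpoints only ship .pt files.
            patterns.append("*.pt")
        return patterns

    print(patterns_for("auto", fall_back_to_pt=True))
    # ['*.safetensors', '*.bin', '*.pt'] -> the first pattern with matches wins.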
diff --git a/vllm/model_executor/neuron_model_loader.py b/vllm/model_executor/model_loader/neuron.py
similarity index 92%
rename from vllm/model_executor/neuron_model_loader.py
rename to vllm/model_executor/model_loader/neuron.py
index 43d17ad373b87..07e23aca6cc5f 100644
--- a/vllm/model_executor/neuron_model_loader.py
+++ b/vllm/model_executor/model_loader/neuron.py
@@ -1,7 +1,7 @@
"""Utilities for selecting and loading neuron models."""
import importlib
import os
-from typing import Optional, Type
+from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
@@ -27,7 +27,7 @@
}
# Models supported by Neuron.
-_NEURON_SUPPORTED_MODELS = {
+_NEURON_SUPPORTED_MODELS: Dict[str, Tuple[str, str, str]] = {
"LlamaForCausalLM": ("transformers_neuronx.llama.model",
"LlamaForSampling", "LlamaForCausalLM"),
"MistralForCausalLM": ("transformers_neuronx.mistral.model",
@@ -43,11 +43,13 @@ def __init__(
) -> None:
super().__init__()
self.config = config
- self.model = None
self.logits_processor = LogitsProcessor(config.vocab_size,
logits_as_input=True)
self.sampler = Sampler()
+ # Lazy initialized
+ self.model: nn.Module
+
def forward(
self,
input_ids: torch.Tensor,
@@ -74,17 +76,17 @@ def sample(
def load_weights(self, model_name_or_path: str, **kwargs):
arch = _get_model_architecture(self.config)
- neuronx_module_path, neuronx_model_cls, hf_model_cls = (
+ neuronx_module_path, neuronx_model_cls_name, hf_model_cls_name = (
_NEURON_SUPPORTED_MODELS[arch])
neuronx_module = importlib.import_module(neuronx_module_path)
- neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls)
+ neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name)
split_model_dir = f"{model_name_or_path}-split"
if os.path.isdir(os.path.join(model_name_or_path,
"pytorch_model.bin")):
split_model_dir = model_name_or_path
elif not os.path.exists(f"{model_name_or_path}-split"):
- hf_model_cls = getattr(transformers, hf_model_cls)
+ hf_model_cls = getattr(transformers, hf_model_cls_name)
from transformers_neuronx.module import save_pretrained_split
hf_model = hf_model_cls.from_pretrained(model_name_or_path,
@@ -96,7 +98,7 @@ def load_weights(self, model_name_or_path: str, **kwargs):
self.model.to_neuron()
-def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
+def _get_model_architecture(config: PretrainedConfig) -> str:
architectures = getattr(config, "architectures", [])
for arch in architectures:
if arch in _NEURON_SUPPORTED_MODELS:
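The neuron loader keeps model classes as strings and resolves them lazily, which is why the rename above separates the class names from the resolved classes. A small sketch of that resolution for the Llama entry (running the transformers-neuronx half requires that package to be installed):

    import importlib

    import transformers

    # Registry entry copied from the diff: (neuronx module path, neuronx class
    # name, HF class name).
    entry = ("transformers_neuronx.llama.model", "LlamaForSampling",
             "LlamaForCausalLM")
    module_path, neuron_cls_name, hf_cls_name = entry

    hf_model_cls = getattr(transformers, hf_cls_name)
    print(hf_model_cls.__name__)                   # LlamaForCausalLM

    try:
        neuron_model_cls = getattr(importlib.import_module(module_path),
                                   neuron_cls_name)
    except ImportError:
        neuron_model_cls = None                    # transformers-neuronx missing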
diff --git a/vllm/model_executor/model_loader/tensorizer.py b/vllm/model_executor/model_loader/tensorizer.py
new file mode 100644
index 0000000000000..16be0ecf9ce07
--- /dev/null
+++ b/vllm/model_executor/model_loader/tensorizer.py
@@ -0,0 +1,364 @@
+import argparse
+import dataclasses
+import io
+import os
+import time
+import typing
+from dataclasses import dataclass
+from typing import Generator, Optional, Tuple, Type, Union
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from vllm.config import ModelConfig, ParallelConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+ VocabParallelEmbedding)
+
+tensorizer_load_fail = None
+
+try:
+ from tensorizer import (DecryptionParams, EncryptionParams,
+ TensorDeserializer, TensorSerializer)
+ from tensorizer.stream_io import open_stream
+ from tensorizer.utils import (convert_bytes, get_mem_usage,
+ no_init_or_tensor)
+except ImportError as e:
+ tensorizer_load_fail = e
+
+__all__ = [
+ 'EncryptionParams', 'DecryptionParams', 'TensorDeserializer',
+ 'TensorSerializer', 'open_stream', 'convert_bytes', 'get_mem_usage',
+ 'no_init_or_tensor', 'TensorizerConfig'
+]
+
+logger = init_logger(__name__)
+
+
+@dataclass
+class TensorizerConfig:
+ tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
+ str, bytes, os.PathLike, int]
+ vllm_tensorized: bool
+ verify_hash: Optional[bool] = False
+ num_readers: Optional[int] = 1
+ encryption_keyfile: Optional[str] = None
+ s3_access_key_id: Optional[str] = None
+ s3_secret_access_key: Optional[str] = None
+ s3_endpoint: Optional[str] = None
+ model_class: Optional[Type[torch.nn.Module]] = None
+ hf_config: Optional[PretrainedConfig] = None
+ dtype: Optional[Union[str, torch.dtype]] = None
+
+ def _construct_tensorizer_args(self) -> "TensorizerArgs":
+ tensorizer_args = {
+ "tensorizer_uri": self.tensorizer_uri,
+ "vllm_tensorized": self.vllm_tensorized,
+ "verify_hash": self.verify_hash,
+ "num_readers": self.num_readers,
+ "encryption_keyfile": self.encryption_keyfile,
+ "s3_access_key_id": self.s3_access_key_id,
+ "s3_secret_access_key": self.s3_secret_access_key,
+ "s3_endpoint": self.s3_endpoint,
+ }
+ return TensorizerArgs(**tensorizer_args)
+
+ def verify_with_parallel_config(
+ self,
+ parallel_config: "ParallelConfig",
+ ) -> None:
+ if (parallel_config.tensor_parallel_size > 1
+ and self.tensorizer_uri is not None):
+ raise ValueError(
+ "Loading to multiple GPUs is not currently supported with "
+ "vLLM-serialized models. Please set tensor_parallel_size=1."
+ " or use a non-vLLM-serialized model, such as a "
+ "serialized Hugging Face `PretrainedModel`.")
+
+ def verify_with_model_config(self, model_config: "ModelConfig") -> None:
+ if (model_config.quantization is not None
+ and self.tensorizer_uri is not None):
+ logger.warning(
+ "Loading a model using Tensorizer with quantization on vLLM"
+ " is unstable and may lead to errors.")
+
+
+def load_with_tensorizer(tensorizer_config: TensorizerConfig,
+ **extra_kwargs) -> nn.Module:
+ tensorizer = TensorizerAgent(tensorizer_config, **extra_kwargs)
+ return tensorizer.deserialize()
+
+
+def is_vllm_serialized_tensorizer(tensorizer_config: TensorizerConfig) -> bool:
+ if tensorizer_config is None:
+ return False
+ return tensorizer_config.vllm_tensorized
+
+
+@dataclass
+class TensorizerArgs:
+ tensorizer_uri: Union[io.BufferedIOBase, io.RawIOBase, typing.BinaryIO,
+ str, bytes, os.PathLike, int]
+ vllm_tensorized: bool
+ verify_hash: Optional[bool] = False
+ num_readers: Optional[int] = 1
+ encryption_keyfile: Optional[str] = None
+ s3_access_key_id: Optional[str] = None
+ s3_secret_access_key: Optional[str] = None
+ s3_endpoint: Optional[str] = None
+ """
+ Args for the TensorizerAgent class. These are used to configure the behavior
+ of the TensorDeserializer when loading tensors from a serialized model.
+
+ Args:
+ tensorizer_uri: Path to serialized model tensors. Can be a local file
+ path or a S3 URI.
+ vllm_tensorized: If True, indicates that the serialized model is a
+ vLLM model. This is used to determine the behavior of the
+ TensorDeserializer when loading tensors from a serialized model.
+ It is far faster to deserialize a vLLM model as it utilizes
+ tensorizer's optimized GPU loading.
+ verify_hash: If True, the hashes of each tensor will be verified against
+ the hashes stored in the metadata. A `HashMismatchError` will be
+ raised if any of the hashes do not match.
+ num_readers: Controls how many threads are allowed to read concurrently
+ from the source file. Default is 1. Increasing this can greatly
+ improve performance.
+ encryption_keyfile: File path to a binary file containing a
+ binary key to use for decryption. `None` (the default) means
+ no decryption. See the example script in
+ examples/tensorize_vllm_model.py.
+ s3_access_key_id: The access key for the S3 bucket. Can also be set via
+ the S3_ACCESS_KEY_ID environment variable.
+ s3_secret_access_key: The secret access key for the S3 bucket. Can also
+ be set via the S3_SECRET_ACCESS_KEY environment variable.
+ s3_endpoint: The endpoint for the S3 bucket. Can also be set via the
+ S3_ENDPOINT_URL environment variable.
+ """
+
+ def __post_init__(self):
+ self.file_obj = self.tensorizer_uri
+ self.s3_access_key_id = (self.s3_access_key_id
+ or os.environ.get("S3_ACCESS_KEY_ID")) or None
+ self.s3_secret_access_key = (
+ self.s3_secret_access_key
+ or os.environ.get("S3_SECRET_ACCESS_KEY")) or None
+ self.s3_endpoint = (self.s3_endpoint
+ or os.environ.get("S3_ENDPOINT_URL")) or None
+ self.stream_params = {
+ "s3_access_key_id": self.s3_access_key_id,
+ "s3_secret_access_key": self.s3_secret_access_key,
+ "s3_endpoint": self.s3_endpoint,
+ }
+
+ self.deserializer_params = {
+ "verify_hash": self.verify_hash,
+ "encryption": self.encryption_keyfile,
+ "num_readers": self.num_readers
+ }
+ if self.encryption_keyfile:
+ with open_stream(
+ self.encryption_keyfile,
+ **self.stream_params,
+ ) as stream:
+ key = stream.read()
+ decryption_params = DecryptionParams.from_key(key)
+ self.deserializer_params['encryption'] = decryption_params
+
+ @staticmethod
+ def add_cli_args(
+ parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+ """Tensorizer CLI arguments"""
+
+ # Tensorizer options arg group
+ group = parser.add_argument_group(
+ 'tensorizer options',
+ description=('Options for configuring the behavior of the'
+ ' tensorizer deserializer when '
+ '--load-format=tensorizer'))
+
+ group.add_argument(
+ "--tensorizer-uri",
+ help="Path to serialized model tensors. Can be a local file path,"
+ " or an HTTP(S) or S3 URI.",
+ )
+ group.add_argument(
+ "--verify-hash",
+ action="store_true",
+ help="If enabled, the hashes of each tensor will be verified"
+ " against the hashes stored in the file metadata. An exception"
+ " will be raised if any of the hashes do not match.",
+ )
+ group.add_argument(
+ "--encryption-keyfile",
+ default=None,
+ help="The file path to a binary file containing a binary key to "
+ "use for decryption. Can be a file path or S3 network URI.")
+ group.add_argument(
+ "--num-readers",
+ default=1,
+ type=int,
+ help="Controls how many threads are allowed to read concurrently "
+ "from the source file.")
+ group.add_argument(
+ "--s3-access-key-id",
+ default=None,
+ help="The access key for the S3 bucket. Can also be set via the "
+ "S3_ACCESS_KEY_ID environment variable.",
+ )
+ group.add_argument(
+ "--s3-secret-access-key",
+ default=None,
+ help="The secret access key for the S3 bucket. Can also be set via "
+ "the S3_SECRET_ACCESS_KEY environment variable.",
+ )
+ group.add_argument(
+ "--s3-endpoint",
+ default=None,
+ help="The endpoint for the S3 bucket. Can also be set via the "
+ "S3_ENDPOINT_URL environment variable.",
+ )
+ group.add_argument(
+ "--vllm-tensorized",
+ action="store_true",
+ help="If enabled, indicates that the serialized model is a vLLM "
+ "model. This is used to determine the behavior of the "
+ "TensorDeserializer when loading tensors from a "
+ "serialized model.")
+
+ return parser
+
+ @classmethod
+ def from_cli_args(cls, args: argparse.Namespace) -> "TensorizerArgs":
+ attrs = [attr.name for attr in dataclasses.fields(cls)]
+ tensorizer_args = cls(**{
+ attr: getattr(args, attr)
+ for attr in attrs if hasattr(args, attr)
+ })
+ return tensorizer_args
+
+
+class TensorizerAgent:
+ """
+ A class for performing tensorizer deserializations specifically for
+ vLLM models using plaid_mode. Uses TensorizerArgs to configure the
+ behavior of the TensorDeserializer when loading tensors from a serialized
+ model. For deserializations of HuggingFace models, TensorDeserializer is
+ instead used as an iterator directly in the func hf_model_weights_iterator
+ in vllm/model_executor/model_loader/weight_utils.py
+ """
+
+ def __init__(self, tensorizer_config: TensorizerConfig,
+ linear_method: LinearMethodBase, **extra_kwargs):
+ if tensorizer_load_fail is not None:
+ raise ImportError(
+ "Tensorizer is not installed. Please install tensorizer "
+ "to use this feature with `pip install vllm[tensorizer]`."
+ ) from tensorizer_load_fail
+
+ self.tensorizer_config = tensorizer_config
+ self.tensorizer_args = (
+ self.tensorizer_config._construct_tensorizer_args())
+ self.extra_kwargs = extra_kwargs
+ if extra_kwargs.get("linear_method", None) is not None:
+ self.linear_method = extra_kwargs["linear_method"]
+ else:
+ self.linear_method = linear_method
+ self.model = self._init_model()
+
+ def _init_model(self):
+ model_args = self.tensorizer_config.hf_config
+ model_args.torch_dtype = self.tensorizer_config.dtype
+ with no_init_or_tensor():
+ return self.tensorizer_config.model_class(
+ config=model_args,
+ linear_method=self.linear_method,
+ **self.extra_kwargs)
+
+ def _resize_lora_embeddings(self):
+ """Modify LoRA embedding layers to use bigger tensors
+ to allow for adapter added tokens."""
+ for child in self.model.modules():
+ if (isinstance(child, VocabParallelEmbedding)
+ and child.weight.shape[0] <
+ child.num_embeddings_per_partition):
+ new_weight = torch.empty(child.num_embeddings_per_partition,
+ child.embedding_dim,
+ dtype=child.weight.dtype,
+ device=child.weight.device)
+ new_weight[:child.weight.shape[0]].copy_(child.weight.data)
+ new_weight[child.weight.shape[0]:].fill_(0)
+ child.weight.data = new_weight
+
+ def _check_tensors_on_meta_device(self):
+ for tensor in self.model.state_dict().values():
+ if tensor.device.type == 'meta':
+ raise ValueError(
+ "The serialized model contains tensors on the meta device,"
+ " indicating that some tensors were not loaded properly."
+ " Please check that the parameters of the model being"
+ " specified match that of the serialized model, such as"
+ " its quantization.")
+
+ def deserialize(self):
+ """
+ Deserialize the model using the TensorDeserializer. This method is
+ specifically for vLLM models using tensorizer's plaid_mode.
+
+ The deserializer makes use of tensorizer_args.stream_params
+ to configure the behavior of the stream when loading tensors from a
+ serialized model. The deserializer_params are used to configure the
+ behavior of the TensorDeserializer when loading tensors themselves.
+ Documentation on these params can be found in TensorizerArgs
+
+ Returns:
+ nn.Module: The deserialized model.
+ """
+ before_mem = get_mem_usage()
+ start = time.perf_counter()
+ with open_stream(
+ self.tensorizer_args.tensorizer_uri,
+ mode="rb",
+ **self.tensorizer_args.stream_params,
+ ) as stream, TensorDeserializer(
+ stream,
+ dtype=self.tensorizer_config.dtype,
+ **self.tensorizer_args.deserializer_params) as deserializer:
+ deserializer.load_into_module(self.model)
+ end = time.perf_counter()
+
+ total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
+ duration = end - start
+ per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
+ after_mem = get_mem_usage()
+ deserializer.close()
+ logger.info(f"Deserialized {total_bytes_str} in "
+ f"{end - start:0.2f}s, {per_second}/s")
+ logger.info(f"Memory usage before: {before_mem}")
+ logger.info(f"Memory usage after: {after_mem}")
+
+ self._check_tensors_on_meta_device()
+ self._resize_lora_embeddings()
+ return self.model.eval()
+
+
+def tensorizer_weights_iterator(
+ tensorizer_args: "TensorizerArgs"
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ logger.warning(
+ "Deserializing HuggingFace models is not optimized for "
+ "loading on vLLM, as tensorizer is forced to load to CPU. "
+ "Consider deserializing a vLLM model instead for faster "
+ "load times. See the examples/tensorize_vllm_model.py example "
+ "script for serializing vLLM models.")
+
+ deserializer_args = tensorizer_args.deserializer_params
+ stream_params = tensorizer_args.stream_params
+ stream = open_stream(tensorizer_args.tensorizer_uri, **stream_params)
+ with TensorDeserializer(stream, **deserializer_args,
+ device="cpu") as state:
+ for name, param in state.items():
+ yield name, param
+ del state
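Putting the pieces together, a tensorizer-serialized vLLM model is described by a `TensorizerConfig` that travels through `LoadConfig.model_loader_extra_config` into `TensorizerLoader`. A minimal sketch, assuming `LoadConfig` accepts `load_format` and `model_loader_extra_config` as keyword arguments (as the loader code reads them) and using a made-up S3 URI:

    from vllm.config import LoadConfig, LoadFormat
    from vllm.model_executor.model_loader.tensorizer import TensorizerConfig

    tensorizer_config = TensorizerConfig(
        tensorizer_uri="s3://my-bucket/llama-7b.tensors",  # hypothetical path
        vllm_tensorized=True,   # serialized with the vLLM serialization script
        num_readers=1,
    )
    load_config = LoadConfig(load_format=LoadFormat.TENSORIZER,
                             model_loader_extra_config=tensorizer_config)
    # get_model_loader(load_config) now returns a TensorizerLoader, which
    # deserializes the weights via TensorizerAgent.deserialize().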
diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py
new file mode 100644
index 0000000000000..f7e0f56c1a46e
--- /dev/null
+++ b/vllm/model_executor/model_loader/utils.py
@@ -0,0 +1,41 @@
+"""Utilities for selecting and loading models."""
+import contextlib
+from typing import Tuple, Type
+
+import torch
+from torch import nn
+
+from vllm.config import ModelConfig
+from vllm.model_executor.models import ModelRegistry
+
+
+@contextlib.contextmanager
+def set_default_torch_dtype(dtype: torch.dtype):
+ """Sets the default torch dtype to the given dtype."""
+ old_dtype = torch.get_default_dtype()
+ torch.set_default_dtype(dtype)
+ yield
+ torch.set_default_dtype(old_dtype)
+
+
+def get_model_architecture(
+ model_config: ModelConfig) -> Tuple[Type[nn.Module], str]:
+ architectures = getattr(model_config.hf_config, "architectures", [])
+ # Special handling for quantized Mixtral.
+ # FIXME(woosuk): This is a temporary hack.
+ if (model_config.quantization is not None
+ and model_config.quantization != "fp8"
+ and "MixtralForCausalLM" in architectures):
+ architectures = ["QuantMixtralForCausalLM"]
+
+ for arch in architectures:
+ model_cls = ModelRegistry.load_model_cls(arch)
+ if model_cls is not None:
+ return (model_cls, arch)
+ raise ValueError(
+ f"Model architectures {architectures} are not supported for now. "
+ f"Supported architectures: {ModelRegistry.get_supported_archs()}")
+
+
+def get_architecture_class_name(model_config: ModelConfig) -> str:
+ return get_model_architecture(model_config)[1]
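The only behavioural change in this helper is the new `fp8` exception: a quantized Mixtral checkpoint is still redirected to the quantization-specific model class, except when the method is fp8, which keeps the standard implementation. A pure-Python restatement of that branch (illustrative only):

    def resolve_architectures(architectures, quantization):
        if (quantization is not None and quantization != "fp8"
                and "MixtralForCausalLM" in architectures):
            return ["QuantMixtralForCausalLM"]
        return architectures

    print(resolve_architectures(["MixtralForCausalLM"], "awq"))   # quantized path
    print(resolve_architectures(["MixtralForCausalLM"], "fp8"))   # standard path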
diff --git a/vllm/model_executor/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
similarity index 60%
rename from vllm/model_executor/weight_utils.py
rename to vllm/model_executor/model_loader/weight_utils.py
index c77ed29446276..aec58033fa1d3 100644
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -4,8 +4,9 @@
import hashlib
import json
import os
+import tempfile
from collections import defaultdict
-from typing import Any, Iterable, Iterator, List, Optional, Tuple
+from typing import Any, Generator, Iterable, List, Optional, Tuple
import filelock
import huggingface_hub.constants
@@ -15,7 +16,7 @@
from safetensors.torch import load_file, safe_open, save_file
from tqdm.auto import tqdm
-from vllm.config import ModelConfig
+from vllm.config import LoadConfig, ModelConfig
from vllm.logger import init_logger
# UPSTREAM SYNC: needed for sparsity
from vllm.model_executor.layers.parameters import LazyCompressedParameter
@@ -29,8 +30,7 @@
# can share the same lock without error.
# lock files in the temp directory will be automatically deleted when the
# system reboots, so users will not complain about annoying lock files
-temp_dir = os.environ.get('TMPDIR') or os.environ.get(
- 'TEMP') or os.environ.get('TMP') or "/tmp/"
+temp_dir = tempfile.gettempdir()
def enable_hf_transfer():
@@ -48,7 +48,7 @@ def enable_hf_transfer():
enable_hf_transfer()
-class Disabledtqdm(tqdm):
+class DisabledTqdm(tqdm):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs, disable=True)
@@ -117,7 +117,7 @@ def convert_bin_to_safetensor_file(
# UPSTREAM SYNC: needed for sparsity
# TODO: (MLE) load compressed models from here
-def get_sparse_config(model_config: ModelConfig):
+def get_sparse_config(model_config: ModelConfig) -> QuantizationConfig:
from vllm.model_executor.layers.sparsity import get_sparsity_config
sparsity_cls = get_sparsity_config(model_config.sparsity)
hf_sparsity_config = getattr(model_config.hf_config, "sparsity_config",
@@ -129,7 +129,8 @@ def get_sparse_config(model_config: ModelConfig):
# TODO(woosuk): Move this to other place.
-def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
+def get_quant_config(model_config: ModelConfig,
+ load_config: LoadConfig) -> QuantizationConfig:
quant_cls = get_quantization_config(model_config.quantization)
# Read the quantization config from the HF model config, if available.
hf_quant_config = getattr(model_config.hf_config, "quantization_config",
@@ -140,19 +141,26 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
is_local = os.path.isdir(model_name_or_path)
if not is_local:
# Download the config files.
- with get_lock(model_name_or_path, model_config.download_dir):
+ with get_lock(model_name_or_path, load_config.download_dir):
hf_folder = snapshot_download(model_name_or_path,
revision=model_config.revision,
allow_patterns="*.json",
- cache_dir=model_config.download_dir,
- tqdm_class=Disabledtqdm)
+ cache_dir=load_config.download_dir,
+ tqdm_class=DisabledTqdm)
else:
hf_folder = model_name_or_path
+
+ possible_config_filenames = quant_cls.get_config_filenames()
+
+ # If the quantization method does not expect a config file, use the default config.
+ if not possible_config_filenames:
+ return quant_cls()
+
config_files = glob.glob(os.path.join(hf_folder, "*.json"))
quant_config_files = [
f for f in config_files if any(
- f.endswith(x) for x in quant_cls.get_config_filenames())
+ f.endswith(x) for x in possible_config_filenames)
]
if len(quant_config_files) == 0:
raise ValueError(
@@ -168,143 +176,127 @@ def get_quant_config(model_config: ModelConfig) -> QuantizationConfig:
return quant_cls.from_config(config)
-def prepare_hf_model_weights(
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- fall_back_to_pt: bool = True,
- revision: Optional[str] = None,
-) -> Tuple[str, List[str], bool]:
- # Download model weights from huggingface.
- is_local = os.path.isdir(model_name_or_path)
- use_safetensors = False
- # Some quantized models use .pt files for storing the weights.
- if load_format == "auto":
- allow_patterns = ["*.safetensors", "*.bin"]
- elif load_format == "safetensors":
- use_safetensors = True
- allow_patterns = ["*.safetensors"]
- elif load_format == "pt":
- allow_patterns = ["*.pt"]
- elif load_format == "npcache":
- allow_patterns = ["*.bin"]
- else:
- raise ValueError(f"Unknown load_format: {load_format}")
-
- if fall_back_to_pt:
- allow_patterns += ["*.pt"]
+def download_weights_from_hf(model_name_or_path: str,
+ cache_dir: Optional[str],
+ allow_patterns: List[str],
+ revision: Optional[str] = None) -> str:
+ """Download model weights from Hugging Face Hub.
+
+ Args:
+ model_name_or_path (str): The model name or path.
+ cache_dir (Optional[str]): The cache directory to store the model
+ weights. If None, will use HF defaults.
+ allow_patterns (List[str]): The allowed patterns for the
+ weight files. Files matched by any of the patterns will be
+ downloaded.
+ revision (Optional[str]): The revision of the model.
+
+ Returns:
+ str: The path to the downloaded model weights.
+ """
+ # Before we download, we check what is available:
+ fs = HfFileSystem()
+ file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
- if not is_local:
- # Before we download we look at that is available:
- fs = HfFileSystem()
- file_list = fs.ls(model_name_or_path, detail=False, revision=revision)
-
- # depending on what is available we download different things
- for pattern in allow_patterns:
- matching = fnmatch.filter(file_list, pattern)
- if len(matching) > 0:
- allow_patterns = [pattern]
- break
-
- logger.info(f"Using model weights format {allow_patterns}")
- # Use file lock to prevent multiple processes from
- # downloading the same model weights at the same time.
- with get_lock(model_name_or_path, cache_dir):
- hf_folder = snapshot_download(model_name_or_path,
- allow_patterns=allow_patterns,
- cache_dir=cache_dir,
- tqdm_class=Disabledtqdm,
- revision=revision)
- else:
- hf_folder = model_name_or_path
- hf_weights_files: List[str] = []
+ # depending on what is available we download different things
for pattern in allow_patterns:
- hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
- if len(hf_weights_files) > 0:
- if pattern == "*.safetensors":
- use_safetensors = True
+ matching = fnmatch.filter(file_list, pattern)
+ if len(matching) > 0:
+ allow_patterns = [pattern]
break
- if not use_safetensors:
- # Exclude files that are not needed for inference.
- # https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
- blacklist = [
- "training_args.bin",
- "optimizer.bin",
- "optimizer.pt",
- "scheduler.pt",
- "scaler.pt",
- ]
- hf_weights_files = [
- f for f in hf_weights_files
- if not any(f.endswith(x) for x in blacklist)
- ]
-
- if len(hf_weights_files) == 0:
- raise RuntimeError(
- f"Cannot find any model weights with `{model_name_or_path}`")
-
- return hf_folder, hf_weights_files, use_safetensors
-
-
-def hf_model_weights_iterator(
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None,
- fall_back_to_pt: Optional[bool] = True,
-) -> Iterator[Tuple[str, torch.Tensor]]:
- hf_folder, hf_weights_files, use_safetensors = prepare_hf_model_weights(
- model_name_or_path,
- cache_dir=cache_dir,
- load_format=load_format,
- fall_back_to_pt=fall_back_to_pt,
- revision=revision)
-
- if load_format == "npcache":
- # Currently np_cache only support *.bin checkpoints
- assert use_safetensors is False
-
- # Convert the model weights from torch tensors to numpy arrays for
- # faster loading.
- np_folder = os.path.join(hf_folder, "np")
- os.makedirs(np_folder, exist_ok=True)
- weight_names_file = os.path.join(np_folder, "weight_names.json")
- # Use file lock to prevent multiple processes from
- # dumping the same model weights to numpy at the same time.
- with get_lock(model_name_or_path, cache_dir):
- if not os.path.exists(weight_names_file):
- weight_names = []
- for bin_file in hf_weights_files:
- state = torch.load(bin_file, map_location="cpu")
- for name, param in state.items():
- param_path = os.path.join(np_folder, name)
- with open(param_path, "wb") as f:
- np.save(f, param.cpu().detach().numpy())
- weight_names.append(name)
- with open(weight_names_file, "w") as f:
- json.dump(weight_names, f)
-
- with open(weight_names_file, "r") as f:
- weight_names = json.load(f)
-
- for name in weight_names:
- param_path = os.path.join(np_folder, name)
- with open(param_path, "rb") as f:
- param = np.load(f)
- yield name, torch.from_numpy(param)
- elif use_safetensors:
- for st_file in hf_weights_files:
- with safe_open(st_file, framework="pt") as f:
- for name in f.keys(): # noqa: SIM118
- param = f.get_tensor(name)
- yield name, param
- else:
- for bin_file in hf_weights_files:
- state = torch.load(bin_file, map_location="cpu")
- for name, param in state.items():
+
+ logger.info(f"Using model weights format {allow_patterns}")
+ # Use file lock to prevent multiple processes from
+ # downloading the same model weights at the same time.
+ with get_lock(model_name_or_path, cache_dir):
+ hf_folder = snapshot_download(model_name_or_path,
+ allow_patterns=allow_patterns,
+ cache_dir=cache_dir,
+ tqdm_class=DisabledTqdm,
+ revision=revision)
+ return hf_folder
+
+
+def filter_files_not_needed_for_inference(
+ hf_weights_files: List[str]) -> List[str]:
+ """
+ Exclude files that are not needed for inference.
+
+ See https://github.com/huggingface/transformers/blob/v4.34.0/src/transformers/trainer.py#L227-L233
+ """
+ blacklist = [
+ "training_args.bin",
+ "optimizer.bin",
+ "optimizer.pt",
+ "scheduler.pt",
+ "scaler.pt",
+ ]
+ hf_weights_files = [
+ f for f in hf_weights_files
+ if not any(f.endswith(x) for x in blacklist)
+ ]
+ return hf_weights_files
+
+
+def np_cache_weights_iterator(
+ model_name_or_path: str, cache_dir: Optional[str], hf_folder: str,
+ hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ """Iterate over the weights in the model np files.
+
+ Will dump the model weights to numpy files if they are not already dumped.
+ """
+ # Convert the model weights from torch tensors to numpy arrays for
+ # faster loading.
+ np_folder = os.path.join(hf_folder, "np")
+ os.makedirs(np_folder, exist_ok=True)
+ weight_names_file = os.path.join(np_folder, "weight_names.json")
+ # Use file lock to prevent multiple processes from
+ # dumping the same model weights to numpy at the same time.
+ with get_lock(model_name_or_path, cache_dir):
+ if not os.path.exists(weight_names_file):
+ weight_names = []
+ for bin_file in hf_weights_files:
+ state = torch.load(bin_file, map_location="cpu")
+ for name, param in state.items():
+ param_path = os.path.join(np_folder, name)
+ with open(param_path, "wb") as f:
+ np.save(f, param.cpu().detach().numpy())
+ weight_names.append(name)
+ with open(weight_names_file, "w") as f:
+ json.dump(weight_names, f)
+
+ with open(weight_names_file, "r") as f:
+ weight_names = json.load(f)
+
+ for name in weight_names:
+ param_path = os.path.join(np_folder, name)
+ with open(param_path, "rb") as f:
+ param = np.load(f)
+ yield name, torch.from_numpy(param)
+
+
+def safetensors_weights_iterator(
+ hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ """Iterate over the weights in the model safetensor files."""
+ for st_file in hf_weights_files:
+ with safe_open(st_file, framework="pt") as f:
+ for name in f.keys(): # noqa: SIM118
+ param = f.get_tensor(name)
yield name, param
- del state
- torch.cuda.empty_cache()
+
+
+def pt_weights_iterator(
+ hf_weights_files: List[str]
+) -> Generator[Tuple[str, torch.Tensor], None, None]:
+ """Iterate over the weights in the model bin/pt files."""
+ for bin_file in hf_weights_files:
+ state = torch.load(bin_file, map_location="cpu")
+ for name, param in state.items():
+ yield name, param
+ del state
+ torch.cuda.empty_cache()
def kv_cache_scales_loader(
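The hunk above replaces the monolithic prepare_hf_model_weights/hf_model_weights_iterator pair with small, composable helpers. A minimal sketch of how a caller could wire them together follows; the iterate_hf_weights driver below is hypothetical, only the helpers named in the hunk are taken from this PR, and the real call sites live in the new model_loader code.

    import glob
    import os
    from typing import Generator, List, Optional, Tuple

    import torch

    from vllm.model_executor.model_loader.weight_utils import (
        download_weights_from_hf, filter_files_not_needed_for_inference,
        pt_weights_iterator, safetensors_weights_iterator)


    def iterate_hf_weights(
        model_name_or_path: str,
        cache_dir: Optional[str] = None,
        revision: Optional[str] = None,
    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
        # Prefer safetensors; fall back to .bin/.pt checkpoints.
        allow_patterns = ["*.safetensors", "*.bin", "*.pt"]
        if os.path.isdir(model_name_or_path):
            hf_folder = model_name_or_path
        else:
            hf_folder = download_weights_from_hf(model_name_or_path, cache_dir,
                                                 allow_patterns, revision)
        hf_weights_files: List[str] = []
        use_safetensors = False
        for pattern in allow_patterns:
            hf_weights_files += glob.glob(os.path.join(hf_folder, pattern))
            if hf_weights_files:
                use_safetensors = pattern == "*.safetensors"
                break
        if use_safetensors:
            yield from safetensors_weights_iterator(hf_weights_files)
        else:
            yield from pt_weights_iterator(
                filter_files_not_needed_for_inference(hf_weights_files))

Splitting download, filtering, and the three iteration formats apart means each model's load_weights() only ever sees (name, tensor) pairs, regardless of where the files came from.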
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 17fc970568042..6afb2f31c1334 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -42,10 +42,11 @@
"MptForCausalLM": ("mpt", "MPTForCausalLM"),
"MPTForCausalLM": ("mpt", "MPTForCausalLM"),
"MiniCPMForCausalLM": ("minicpm", "MiniCPMForCausalLM"),
- "OLMoForCausalLM": ("olmo", "OLMoForCausalLM"),
+ "OlmoForCausalLM": ("olmo", "OlmoForCausalLM"),
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),
"PhiForCausalLM": ("phi", "PhiForCausalLM"),
+ "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
"QWenLMHeadModel": ("qwen", "QWenLMHeadModel"),
"Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
"Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"),
diff --git a/vllm/model_executor/models/baichuan.py b/vllm/model_executor/models/baichuan.py
index 30588aecdebe9..69162b0a92d65 100644
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@@ -19,7 +19,7 @@
# limitations under the License.
"""Inference-only BaiChuan model compatible with HuggingFace weights."""
import math
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -40,9 +40,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -340,19 +339,14 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("gate_up_proj", "gate_proj", 0),
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if name == "lm_head.weight":
diff --git a/vllm/model_executor/models/bloom.py b/vllm/model_executor/models/bloom.py
index 40966ab33631a..14f325e624f41 100644
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@@ -17,7 +17,7 @@
# limitations under the License.
"""Inference-only BLOOM model compatible with HuggingFace weights."""
import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -35,9 +35,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -298,14 +297,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if name == "lm_head.weight":
continue
if not name.startswith("transformer."):
diff --git a/vllm/model_executor/models/chatglm.py b/vllm/model_executor/models/chatglm.py
index 7b46ba306619a..3cdb7a7bca1c1 100644
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@@ -2,7 +2,7 @@
# Adapted from
# https://github.com/THUDM/ChatGLM2-6B
"""Inference-only ChatGLM model compatible with THUDM weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -22,9 +22,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs import ChatGLMConfig
@@ -370,14 +369,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_pos_emb.inv_freq" in name:
continue
if "word_embeddings" in name:
diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py
index aa9b28b676e0b..d80969773e163 100644
--- a/vllm/model_executor/models/commandr.py
+++ b/vllm/model_executor/models/commandr.py
@@ -20,7 +20,7 @@
# This file is based on the LLama model definition file in transformers
"""PyTorch Cohere model."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
import torch.utils.checkpoint
@@ -41,10 +41,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -335,13 +334,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(
- self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None,
- ):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -352,8 +345,7 @@ def load_weights(
]
params_dict = dict(self.named_parameters())
loaded_params = set()
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
for param_name, shard_name, shard_id in stacked_params_mapping:
if shard_name not in name:
continue
diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 49eb7f1b2c185..179094b8fd7aa 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -1,5 +1,5 @@
# coding=utf-8
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
import torch.nn as nn
@@ -18,10 +18,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.dbrx import DbrxConfig
@@ -391,20 +390,13 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(
- self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None,
- ):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
expert_params_mapping = [(
"ws" if weight_name in ["w1", "v1"] else "w2s",
f"experts.mlp.{weight_name}",
) for weight_name in ["w1", "v1", "w2"]]
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
for param_name, weight_name in expert_params_mapping:
if weight_name not in name:
continue
diff --git a/vllm/model_executor/models/decilm.py b/vllm/model_executor/models/decilm.py
index abf4a462871b0..d476630ee6f11 100644
--- a/vllm/model_executor/models/decilm.py
+++ b/vllm/model_executor/models/decilm.py
@@ -23,16 +23,15 @@
# limitations under the License.
"""Inference-only DeciLM model compatible with HuggingFace weights."""
-from typing import Optional
+from typing import Iterable, Optional, Tuple
import torch
from transformers import PretrainedConfig
from vllm.config import LoRAConfig
from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama import LlamaForCausalLM
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
class DeciLMForCausalLM(LlamaForCausalLM):
@@ -65,11 +64,7 @@ def __init__(
linear_method=linear_method,
lora_config=lora_config)
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -79,8 +74,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
diff --git a/vllm/model_executor/models/deepseek.py b/vllm/model_executor/models/deepseek.py
index c7dd11d07e6da..46101a152ec0d 100644
--- a/vllm/model_executor/models/deepseek.py
+++ b/vllm/model_executor/models/deepseek.py
@@ -21,7 +21,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Deepseek model."""
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -44,9 +44,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -316,6 +315,8 @@ def forward(
class DeepseekModel(nn.Module):
+ fall_back_to_pt_during_load = False
+
def __init__(
self,
config: PretrainedConfig,
@@ -395,11 +396,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -410,12 +407,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path,
- cache_dir,
- load_format,
- revision,
- fall_back_to_pt=False):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
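Instead of passing fall_back_to_pt=False into the old iterator, Deepseek (and Mixtral below) now advertises the preference as a class attribute, fall_back_to_pt_during_load = False, which the loader can read before deciding which checkpoint patterns to accept. A hedged sketch of that consultation; allowed_weight_patterns() is a hypothetical helper, not the PR's actual loader code:

    from typing import List

    from torch import nn


    def allowed_weight_patterns(model: nn.Module) -> List[str]:
        # Prefer safetensors/bin; add *.pt only if the model does not opt out.
        patterns = ["*.safetensors", "*.bin"]
        if getattr(model, "fall_back_to_pt_during_load", True):
            patterns.append("*.pt")
        return patterns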
diff --git a/vllm/model_executor/models/falcon.py b/vllm/model_executor/models/falcon.py
index 4f1ebcd5fb43c..25ce239d14662 100644
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@@ -19,7 +19,7 @@
"""PyTorch Falcon model."""
import math
-from typing import List, Optional, Union
+from typing import Iterable, List, Optional, Tuple, Union
import torch
from torch import nn
@@ -40,9 +40,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs import RWConfig
@@ -399,11 +398,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
total_num_heads = self.config.num_attention_heads
if self.config.new_decoder_architecture:
total_num_kv_heads = self.config.num_kv_heads
@@ -413,8 +408,7 @@ def load_weights(self,
total_num_kv_heads = total_num_heads
num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if name == "lm_head.weight":
# Falcon uses tied embeddings.
continue
diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index fc1fc35570368..6d01537c5c344 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -15,7 +15,7 @@
# limitations under the License.
"""Inference-only Gemma model compatible with HuggingFace weights."""
from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -36,9 +36,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
logger = init_logger(__name__)
@@ -346,11 +345,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -361,8 +356,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
loaded_params = set()
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
for (param_name, shard_name, shard_id) in stacked_params_mapping:
if shard_name not in name:
continue
diff --git a/vllm/model_executor/models/gpt2.py b/vllm/model_executor/models/gpt2.py
index 43f0d47fcb122..850050c7232d0 100644
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@@ -17,7 +17,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-2 model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -34,9 +34,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -239,14 +238,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "lm_head.weight" in name:
# GPT-2 ties the weights of the embedding layer and the final
# linear layer.
diff --git a/vllm/model_executor/models/gpt_bigcode.py b/vllm/model_executor/models/gpt_bigcode.py
index cec2d771adfa8..8278ba02514d5 100644
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@@ -18,7 +18,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -35,9 +35,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -260,14 +259,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "lm_head.weight" in name:
continue
if ".attn.bias" in name:
diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py
index 5660097652748..7a830d7f9c965 100644
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-J model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -34,9 +34,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -248,11 +247,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -262,8 +257,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "attn.bias" in name or "attn.masked_bias" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py
index 2f9e2171cf114..b946aed92ed35 100644
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@@ -16,7 +16,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only GPT-NeoX model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -34,9 +34,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -262,14 +261,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if ("attention.bias" in name or "attention.masked_bias" in name
or "rotary_emb.inv_freq" in name):
continue
diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py
index 6e9cbd3f9f43f..db1da8bdc4fb9 100644
--- a/vllm/model_executor/models/internlm2.py
+++ b/vllm/model_executor/models/internlm2.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -18,9 +18,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -274,19 +273,14 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("gate_up_proj", "w1", 0),
("gate_up_proj", "w3", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/jais.py b/vllm/model_executor/models/jais.py
index a041b0c9a0452..e7ee749e824e4 100644
--- a/vllm/model_executor/models/jais.py
+++ b/vllm/model_executor/models/jais.py
@@ -20,7 +20,7 @@
"""Inference-only Jais model compatible with HuggingFace weights."""
import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -36,9 +36,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs import JAISConfig
@@ -303,16 +302,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(
- self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None,
- ):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "lm_head.weight" in name:
# GPT-2 ties the weights of the embedding layer and the final
# linear layer.
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index c86e292e7df1a..c102b40045c92 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -21,7 +21,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only LLaMA model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -42,10 +42,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import (
+ default_weight_loader, kv_cache_scales_loader)
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator,
- kv_cache_scales_loader)
from vllm.sequence import SamplerOutput
from vllm.utils import is_hip
@@ -181,6 +180,10 @@ def __init__(
self.hidden_size = config.hidden_size
rope_theta = getattr(config, "rope_theta", 10000)
rope_scaling = getattr(config, "rope_scaling", None)
+ if rope_scaling is not None and getattr(
+ config, "original_max_position_embeddings", None):
+ rope_scaling["original_max_position_embeddings"] = (
+ config.original_max_position_embeddings)
max_position_embeddings = getattr(config, "max_position_embeddings",
8192)
sliding_window = getattr(config, "sliding_window", None)
@@ -376,22 +379,17 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
- ("qkv_proj", "q_proj", "q"),
- ("qkv_proj", "k_proj", "k"),
- ("qkv_proj", "v_proj", "v"),
- ("gate_up_proj", "gate_proj", 0),
- ("gate_up_proj", "up_proj", 1),
+ (".qkv_proj", ".q_proj", "q"),
+ (".qkv_proj", ".k_proj", "k"),
+ (".qkv_proj", ".v_proj", "v"),
+ (".gate_up_proj", ".gate_proj", 0),
+ (".gate_up_proj", ".up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index c2571d0893c8d..314a2792bf167 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -13,10 +13,9 @@
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.models.llama import LlamaModel
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
_KEYS_TO_MODIFY_MAPPING = {
@@ -198,11 +197,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
# only doing this for language model part for now.
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
@@ -213,8 +208,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py
index 49eda9c9a8112..f0d72fafcaf70 100644
--- a/vllm/model_executor/models/minicpm.py
+++ b/vllm/model_executor/models/minicpm.py
@@ -22,7 +22,7 @@
# limitations under the License.
"""Inference-only MiniCPM model compatible with HuggingFace weights."""
import math
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -45,10 +45,9 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -472,11 +471,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -493,8 +488,7 @@ def load_weights(self,
for weight_name in ["w1", "w2", "w3"]
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index ff552a9d86536..a33b795d7088e 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -21,7 +21,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Mixtral model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -39,15 +39,17 @@
ReplicatedLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.fp8 import (Fp8LinearMethod,
+ per_tensor_quantize)
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.model_executor.utils import set_weight_attrs
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
+from vllm.utils import print_warning_once
class MixtralMoE(nn.Module):
@@ -67,6 +69,7 @@ def __init__(
intermediate_size: int,
params_dtype: Optional[torch.dtype] = None,
tp_size: Optional[int] = None,
+ linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.tp_size = tp_size or get_tensor_model_parallel_world_size()
@@ -74,6 +77,9 @@ def __init__(
self.top_k = top_k
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size // self.tp_size
+ # FIXME(pcmoritz): Make this more general to support different
+ # quantization schemes
+ self.use_fp8 = isinstance(linear_method, Fp8LinearMethod)
if params_dtype is None:
params_dtype = torch.get_default_dtype()
@@ -98,6 +104,16 @@ def __init__(
device="cuda",
dtype=self.params_dtype))
+ # Scaling factors for FP8 weights
+ self.ws_scale = nn.Parameter(
+ torch.ones(
+ self.num_total_experts, device="cuda", dtype=torch.float32),
+ requires_grad=False) if self.use_fp8 else None
+ self.w2s_scale = nn.Parameter(
+ torch.ones(
+ self.num_total_experts, device="cuda", dtype=torch.float32),
+ requires_grad=False) if self.use_fp8 else None
+
set_weight_attrs(self.ws, {
"weight_loader": self.weight_loader,
})
@@ -119,6 +135,18 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
if weight_name.endswith("w2.weight"):
param_data[expert_id, :, :] = loaded_weight[:, shard]
+ def process_weights_after_loading(self):
+ if self.use_fp8:
+ ws = torch.empty_like(self.ws.data, dtype=torch.float8_e4m3fn)
+ w2s = torch.empty_like(self.w2s.data, dtype=torch.float8_e4m3fn)
+ for expert in range(self.num_total_experts):
+ ws[expert, :, :], self.ws_scale[expert] = per_tensor_quantize(
+ self.ws.data[expert, :, :])
+ w2s[expert, :, :], self.w2s_scale[
+ expert] = per_tensor_quantize(self.w2s.data[expert, :, :])
+ self.ws = nn.Parameter(ws, requires_grad=False)
+ self.w2s = nn.Parameter(w2s, requires_grad=False)
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
num_tokens, hidden_size = hidden_states.shape
hidden_states = hidden_states.view(-1, self.hidden_size)
@@ -130,7 +158,10 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
router_logits,
self.top_k,
renormalize=True,
- inplace=True)
+ inplace=True,
+ use_fp8=self.use_fp8,
+ w1_scale=self.ws_scale,
+ w2_scale=self.w2s_scale)
if self.tp_size > 1:
final_hidden_states = tensor_model_parallel_all_reduce(
@@ -172,6 +203,13 @@ def __init__(self,
self.rope_theta = rope_theta
self.sliding_window = sliding_window
+ if isinstance(linear_method, Fp8LinearMethod):
+ print_warning_once(
+ "For Mixtral FP8 quantization, we currently do not quantize "
+ "the attention layers until their FP8 performance is improved."
+ )
+ linear_method = None
+
self.qkv_proj = QKVParallelLinear(
hidden_size,
self.head_dim,
@@ -239,7 +277,8 @@ def __init__(
num_experts=config.num_local_experts,
top_k=config.num_experts_per_tok,
hidden_size=config.hidden_size,
- intermediate_size=config.intermediate_size)
+ intermediate_size=config.intermediate_size,
+ linear_method=linear_method)
self.input_layernorm = RMSNorm(config.hidden_size,
eps=config.rms_norm_eps)
self.post_attention_layernorm = RMSNorm(config.hidden_size,
@@ -319,6 +358,8 @@ def forward(
class MixtralForCausalLM(nn.Module):
+ fall_back_to_pt_during_load = False
+
packed_modules_mapping = {
"qkv_proj": [
"q_proj",
@@ -393,11 +434,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -414,12 +451,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path,
- cache_dir,
- load_format,
- revision,
- fall_back_to_pt=False):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
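The Mixtral changes quantize the fused expert weights to FP8 after loading (process_weights_after_loading), keep one scale per expert, and pass those scales into the fused MoE kernel; attention stays unquantized for now, per the warning added above. A minimal sketch of per-tensor FP8 (e4m3) quantization under the usual max-abs scaling assumption; the actual per_tensor_quantize in vllm.model_executor.layers.quantization.fp8 may differ in details such as clamping or minimum-scale handling:

    from typing import Tuple

    import torch


    def per_tensor_fp8_quantize_sketch(
            weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        finfo = torch.finfo(torch.float8_e4m3fn)
        # Scale so the largest magnitude maps to the largest representable value.
        amax = weight.abs().max().clamp(min=1e-12)
        scale = amax / finfo.max
        qweight = (weight / scale).clamp(min=finfo.min, max=finfo.max)
        # Dequantization is approximately qweight.float() * scale.
        return qweight.to(torch.float8_e4m3fn), scale.float()

Quantizing after loading keeps the checkpoint format untouched: weights are read in their original dtype and converted in place once all shards for an expert have arrived.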
diff --git a/vllm/model_executor/models/mixtral_quant.py b/vllm/model_executor/models/mixtral_quant.py
index 1f0c0e912beea..acd13cc27f159 100644
--- a/vllm/model_executor/models/mixtral_quant.py
+++ b/vllm/model_executor/models/mixtral_quant.py
@@ -21,7 +21,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Mixtral model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import numpy as np
import torch
@@ -43,9 +43,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -327,6 +326,7 @@ def forward(
class MixtralForCausalLM(nn.Module):
+ fall_back_to_pt_during_load = False
def __init__(
self,
@@ -366,11 +366,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -379,12 +375,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path,
- cache_dir,
- load_format,
- revision,
- fall_back_to_pt=False):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/mpt.py b/vllm/model_executor/models/mpt.py
index af4cdce29d085..340f63286739b 100644
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@@ -1,7 +1,7 @@
# coding=utf-8
# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
import math
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
import torch.nn as nn
@@ -18,9 +18,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
from vllm.transformers_utils.configs.mpt import MPTConfig
@@ -284,14 +283,9 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
# Skip loading extra bias for GPTQ models.
if name.endswith(".bias") and name not in params_dict:
continue
diff --git a/vllm/model_executor/models/olmo.py b/vllm/model_executor/models/olmo.py
index 3513c72879102..15527569b9e20 100644
--- a/vllm/model_executor/models/olmo.py
+++ b/vllm/model_executor/models/olmo.py
@@ -1,53 +1,36 @@
# coding=utf-8
# Adapted from
-# https://github.com/allenai/OLMo/blob/v0.2.4/olmo/model.py and
-# https://github.com/allenai/OLMo/blob/v0.2.4/hf_olmo/modeling_olmo.py
-# Copyright 2023 The vLLM team.
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
+# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/models/olmo/modeling_olmo.py
+# Copyright 2024 The vLLM team.
+# Copyright 2024 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
-# BSD 3-Clause License
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
-# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
-# All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
+# http://www.apache.org/licenses/LICENSE-2.0
#
-# * Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
"""Inference-only OLMo model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
-# this model must need this dependency
-from hf_olmo import OLMoConfig
from torch import nn
+from transformers import OlmoConfig
from vllm.attention import Attention, AttentionMetadata
from vllm.distributed import get_tensor_model_parallel_world_size
from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.linear import (ColumnParallelLinear,
- LinearMethodBase,
+from vllm.model_executor.layers.linear import (LinearMethodBase,
MergedColumnParallelLinear,
QKVParallelLinear,
RowParallelLinear)
@@ -55,10 +38,9 @@
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
- VocabParallelEmbedding)
+ ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -71,55 +53,52 @@ class OlmoAttention(nn.Module):
def __init__(
self,
- config: OLMoConfig,
+ config: OlmoConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
- self.hidden_size = config.d_model
- assert config.d_model % config.n_heads == 0
+ self.hidden_size = config.hidden_size
tensor_model_parallel_world_size = (
get_tensor_model_parallel_world_size())
- self.total_num_heads = self.config.n_heads
+ self.total_num_heads = config.num_attention_heads
+
+ assert self.hidden_size % self.total_num_heads == 0
assert self.total_num_heads % tensor_model_parallel_world_size == 0
+
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)
self.head_dim = self.hidden_size // self.total_num_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.rope_theta = config.rope_theta
+ self.clip_qkv = config.clip_qkv
- # Layer norms.
- self.attn_norm = nn.LayerNorm(config.d_model,
- elementwise_affine=False,
- bias=False)
# Attention input projection. Projects x -> (q, k, v)
- self.att_proj = QKVParallelLinear(
- config.d_model,
+ self.qkv_proj = QKVParallelLinear(
+ self.hidden_size,
self.head_dim,
self.total_num_heads,
- bias=config.include_bias,
+ bias=config.attention_bias,
linear_method=linear_method,
)
# Rotary embeddings.
- if self.config.rope:
- rope_theta = getattr(config, "rope_theta", 10000)
- max_position_embeddings = getattr(config,
- "max_position_embeddings", 8192)
- self.rotary_emb = get_rope(
- self.head_dim,
- rotary_dim=self.head_dim,
- max_position=max_position_embeddings,
- base=rope_theta,
- )
+ self.rotary_emb = get_rope(
+ self.head_dim,
+ rotary_dim=self.head_dim,
+ max_position=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
self.scaling = self.head_dim**-0.5
self.attn = Attention(self.num_heads,
self.head_dim,
scale=self.scaling)
# Attention output projection.
- self.attn_out = RowParallelLinear(
- config.d_model,
- config.d_model,
- bias=config.include_bias,
+ self.o_proj = RowParallelLinear(
+ self.hidden_size,
+ self.hidden_size,
+ bias=config.attention_bias,
linear_method=linear_method,
)
@@ -130,13 +109,13 @@ def forward(
kv_cache: torch.Tensor,
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
- hidden_states = self.attn_norm(hidden_states)
- qkv, _ = self.att_proj(hidden_states)
+ qkv, _ = self.qkv_proj(hidden_states)
+ if self.clip_qkv is not None:
+ qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
q, k, v = qkv.chunk(chunks=3, dim=-1)
- if self.config.rope:
- q, k = self.rotary_emb(positions, q, k)
+ q, k = self.rotary_emb(positions, q, k)
attn_output = self.attn(q, k, v, kv_cache, attn_metadata)
- output, _ = self.attn_out(attn_output)
+ output, _ = self.o_proj(attn_output)
return output
@@ -149,37 +128,30 @@ class OlmoMLP(nn.Module):
def __init__(
self,
- config: OLMoConfig,
+ config: OlmoConfig,
linear_method: Optional[LinearMethodBase] = None,
):
super().__init__()
self.config = config
- self.hidden_size = (config.mlp_hidden_size if config.mlp_hidden_size
- is not None else config.mlp_ratio * config.d_model)
-
- # Layer norms.
- self.ff_norm = nn.LayerNorm(config.d_model,
- elementwise_affine=False,
- bias=False)
+ self.hidden_size = config.hidden_size
+ self.intermediate_size = config.intermediate_size
# Feed-forward input projection.
- self.ff_proj = MergedColumnParallelLinear(
- config.d_model,
- [self.hidden_size // 2] * 2,
- bias=config.include_bias,
+ self.gate_up_proj = MergedColumnParallelLinear(
+ self.hidden_size,
+ [self.intermediate_size] * 2,
+ bias=False,
linear_method=linear_method,
)
# Activation function.
- self.act = SiluAndMul()
- self.act.output_multiplier = 0.5
- assert (self.act.output_multiplier * self.hidden_size) % 1 == 0
+ self.act_fn = SiluAndMul()
# Feed-forward output projection.
- self.ff_out = RowParallelLinear(
- int(self.act.output_multiplier * self.hidden_size),
- config.d_model,
- bias=config.include_bias,
+ self.down_proj = RowParallelLinear(
+ self.intermediate_size,
+ self.hidden_size,
+ bias=False,
linear_method=linear_method,
)
@@ -187,19 +159,13 @@ def forward(
self,
x: torch.Tensor,
) -> torch.Tensor:
- # Add feed-forward projection.
- # shape: (batch_size, seq_len, d_model)
- og_x = x
- x = self.ff_norm(x)
- x, _ = self.ff_proj(x)
- x = self.act(x)
- x, _ = self.ff_out(x)
- x = og_x + x
-
+ gate_up, _ = self.gate_up_proj(x)
+ x = self.act_fn(gate_up)
+ x, _ = self.down_proj(x)
return x
-class OlmoBlock(nn.Module):
+class OlmoDecoderLayer(nn.Module):
"""
This is a typical transformer block where the output is
computed as ``MLP(LN(x + Attention(LN(x))))``
@@ -207,15 +173,23 @@ class OlmoBlock(nn.Module):
"""
def __init__(self,
- config: OLMoConfig,
+ config: OlmoConfig,
linear_method: Optional[LinearMethodBase] = None):
super().__init__()
# Attention block.
- self.attn = OlmoAttention(config, linear_method)
+ self.self_attn = OlmoAttention(config, linear_method)
# MLP block.
self.mlp = OlmoMLP(config, linear_method)
+ # LayerNorm
+ self.input_layernorm = nn.LayerNorm(config.hidden_size,
+ elementwise_affine=False,
+ bias=False)
+ self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+ elementwise_affine=False,
+ bias=False)
+
def forward(
self,
positions: torch.Tensor,
@@ -224,52 +198,37 @@ def forward(
attn_metadata: AttentionMetadata,
) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, torch.Tensor]]]:
# Attention block.
- og_x = hidden_states
- x = self.attn(positions, hidden_states, kv_cache, attn_metadata)
- x = x + og_x
+ residual = hidden_states
+ hidden_states = self.input_layernorm(hidden_states)
+ hidden_states = self.self_attn(positions, hidden_states, kv_cache,
+ attn_metadata)
+ hidden_states = hidden_states + residual
# MLP block.
- hidden_states = self.mlp(x)
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
return hidden_states
class OlmoModel(nn.Module):
def __init__(self,
- config: OLMoConfig,
+ config: OlmoConfig,
linear_method: Optional[LinearMethodBase] = None):
super().__init__()
self.config = config
- self.transformer = nn.ModuleDict(
- dict(
- wte=VocabParallelEmbedding(
- config.embedding_size or config.vocab_size,
- config.d_model,
- ),
- ln_f=nn.LayerNorm(config.d_model,
- elementwise_affine=False,
- bias=False),
- ))
-
- blocks = [
- OlmoBlock(config, linear_method) for i in range(config.n_layers)
- ]
- if self.config.block_group_size > 1:
- raise NotImplementedError("Block group size > 1 not supported yet")
- else:
- self.transformer.update({"blocks": nn.ModuleList(blocks)})
-
- if not config.weight_tying:
- self.transformer.update({
- "ff_out":
- ColumnParallelLinear(
- config.d_model,
- config.embedding_size or config.vocab_size,
- bias=config.include_bias,
- linear_method=linear_method,
- )
- })
+ self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
+ config.hidden_size)
+ self.layers = nn.ModuleList([
+ OlmoDecoderLayer(config, linear_method)
+ for layer_idx in range(config.num_hidden_layers)
+ ])
+ self.norm = nn.LayerNorm(config.hidden_size,
+ elementwise_affine=False,
+ bias=False)
def forward(
self,
@@ -283,39 +242,49 @@ def forward(
"""
# Get embeddings of input.
# shape: (batch_size, seq_len, d_model)
- x = self.transformer.wte(input_ids) # type: ignore
+ inputs_embeds = self.embed_tokens(input_ids)
+
+ # embed positions
+ hidden_states = inputs_embeds
# Apply blocks one-by-one.
- for block_idx, block in enumerate(self.transformer.blocks):
+ for layer_idx, decoder_layer in enumerate(self.layers):
# shape: (batch_size, seq_len, d_model)
- x = block(
+ hidden_states = decoder_layer(
positions,
- x,
- kv_caches[block_idx],
+ hidden_states,
+ kv_caches[layer_idx],
attn_metadata,
)
# Apply final layer norm.
# shape: (batch_size, seq_len or 1, d_model)
- x = self.transformer.ln_f(x) # type: ignore
- return x
+ hidden_states = self.norm(hidden_states)
+ return hidden_states
-class OLMoForCausalLM(nn.Module):
+class OlmoForCausalLM(nn.Module):
"""
Extremely barebones HF model wrapper.
"""
def __init__(self,
- config: OLMoConfig,
+ config: OlmoConfig,
linear_method: Optional[LinearMethodBase] = None):
super().__init__()
self.config = config
self.linear_method = linear_method
self.model = OlmoModel(config, linear_method)
- self.lm_head_weight = (self.model.transformer.wte.weight
- if config.weight_tying else
- self.model.transformer.ff_out.weight)
+ if config.tie_word_embeddings:
+ self.lm_head_weight = self.model.embed_tokens.weight
+ else:
+ self.unpadded_vocab_size = config.vocab_size
+ self.lm_head = ParallelLMHead(
+ self.unpadded_vocab_size,
+ config.hidden_size,
+ org_num_embeddings=config.vocab_size,
+ )
+ self.lm_head_weight = self.lm_head.weight
self.logits_processor = LogitsProcessor(config.vocab_size)
self.sampler = Sampler()
@@ -348,28 +317,40 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(
- self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None,
- ):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ stacked_params_mapping = [
+ # (param_name, shard_name, shard_id)
+ ("qkv_proj", "q_proj", "q"),
+ ("qkv_proj", "k_proj", "k"),
+ ("qkv_proj", "v_proj", "v"),
+ ("gate_up_proj", "gate_proj", 0),
+ ("gate_up_proj", "up_proj", 1),
+ ]
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
- # attention
- if ".att" in name:
- name = name.replace(".att", ".attn.att")
- # mlp
- if ".ff_proj" in name:
- name = name.replace(".ff_proj", ".mlp.ff_proj")
- # Reverse the weight for the MergeColumnParallelLinear
- loaded_weight = torch.concat(loaded_weight.chunk(2)[::-1])
- if ".ff_out" in name and "transformer.ff_out" not in name:
- name = name.replace(".ff_out", ".mlp.ff_out")
- # there is no bias in olmo
- param = params_dict[name]
- weight_loader = getattr(param, "weight_loader",
- default_weight_loader)
- weight_loader(param, loaded_weight)
+ for name, loaded_weight in weights:
+ if "rotary_emb.inv_freq" in name:
+ continue
+ if ("rotary_emb.cos_cached" in name
+ or "rotary_emb.sin_cached" in name):
+ # Models trained using ColossalAI may include these tensors in
+ # the checkpoint. Skip them.
+ continue
+ for (param_name, weight_name, shard_id) in stacked_params_mapping:
+ if weight_name not in name:
+ continue
+ name = name.replace(weight_name, param_name)
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = param.weight_loader
+ weight_loader(param, loaded_weight, shard_id)
+ break
+ else:
+ # Skip loading extra bias for GPTQ models.
+ if name.endswith(".bias") and name not in params_dict:
+ continue
+ param = params_dict[name]
+ weight_loader = getattr(param, "weight_loader",
+ default_weight_loader)
+ weight_loader(param, loaded_weight)
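
As a rough sketch (not vLLM's actual loader API), the stacked_params_mapping above simply renames per-shard checkpoint tensors to their fused parameter and tags them with a shard id:

stacked_params_mapping = [
    ("qkv_proj", "q_proj", "q"),
    ("qkv_proj", "k_proj", "k"),
    ("qkv_proj", "v_proj", "v"),
    ("gate_up_proj", "gate_proj", 0),
    ("gate_up_proj", "up_proj", 1),
]

def route(name: str):
    """Return (fused_param_name, shard_id), or (name, None) if not fused."""
    for fused, shard, shard_id in stacked_params_mapping:
        if shard in name:
            return name.replace(shard, fused), shard_id
    return name, None

# route("model.layers.0.self_attn.k_proj.weight")
# -> ("model.layers.0.self_attn.qkv_proj.weight", "k")
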
diff --git a/vllm/model_executor/models/opt.py b/vllm/model_executor/models/opt.py
index 3a640850662c0..89263166bca81 100644
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@@ -17,7 +17,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only OPT model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -35,9 +35,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -315,11 +314,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -327,8 +322,7 @@ def load_weights(self,
("qkv_proj", "v_proj", "v"),
]
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "lm_head.weight" in name:
continue
if name.startswith("decoder."):
diff --git a/vllm/model_executor/models/orion.py b/vllm/model_executor/models/orion.py
index c606ac027e9d9..bbb9fa5347cc8 100644
--- a/vllm/model_executor/models/orion.py
+++ b/vllm/model_executor/models/orion.py
@@ -4,7 +4,7 @@
# Copyright (c) OrionStar Inc.
# LICENSE: https://huggingface.co/OrionStarAI/Orion-14B-Base/blob/main/LICENSE
"""Inference-only Orion-14B model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -22,9 +22,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -280,11 +279,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -294,8 +289,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index e91624da90955..f974b78a0fbda 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -35,7 +35,7 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""Inference-only Phi-1.5 model compatible with HuggingFace weights."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -53,9 +53,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -265,11 +264,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -278,8 +273,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py
index 6213a2ded65ab..a77da7cb15984 100644
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -4,7 +4,7 @@
# Copyright (c) Alibaba Cloud.
# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
"""Inference-only QWen model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -23,9 +23,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -253,19 +252,14 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("gate_up_proj", "w2", 0),
("gate_up_proj", "w1", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py
index 796e30e633e85..71b906e20ac19 100644
--- a/vllm/model_executor/models/qwen2.py
+++ b/vllm/model_executor/models/qwen2.py
@@ -22,7 +22,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2 model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -42,9 +42,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -331,11 +330,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -345,8 +340,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if self.config.tie_word_embeddings and "lm_head.weight" in name:
diff --git a/vllm/model_executor/models/qwen2_moe.py b/vllm/model_executor/models/qwen2_moe.py
index f920b4f5a40c7..59908bc9ef26a 100644
--- a/vllm/model_executor/models/qwen2_moe.py
+++ b/vllm/model_executor/models/qwen2_moe.py
@@ -22,7 +22,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Qwen2MoE model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
import torch.nn.functional as F
@@ -46,9 +46,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -366,6 +365,8 @@ def forward(
class Qwen2MoeForCausalLM(nn.Module):
+ fall_back_to_pt_during_load = False
+
def __init__(
self,
config: PretrainedConfig,
@@ -404,11 +405,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -419,12 +416,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path,
- cache_dir,
- load_format,
- revision,
- fall_back_to_pt=False):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
for (param_name, weight_name, shard_id) in stacked_params_mapping:
diff --git a/vllm/model_executor/models/stablelm.py b/vllm/model_executor/models/stablelm.py
index 651598b770f13..3e6c2db6f3c65 100644
--- a/vllm/model_executor/models/stablelm.py
+++ b/vllm/model_executor/models/stablelm.py
@@ -19,7 +19,7 @@
# https://huggingface.co/stabilityai/stablelm-3b-4e1t/blob/main/config.json
"""Inference-only StabeLM (https://github.com/Stability-AI/StableLM)
model compatible with HuggingFace weights."""
-from typing import List, Optional, Tuple
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -37,9 +37,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -262,11 +261,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -276,8 +271,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
if ("rotary_emb.cos_cached" in name
diff --git a/vllm/model_executor/models/starcoder2.py b/vllm/model_executor/models/starcoder2.py
index 76e8e48673413..b90f3da141c2e 100644
--- a/vllm/model_executor/models/starcoder2.py
+++ b/vllm/model_executor/models/starcoder2.py
@@ -18,7 +18,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Starcoder2 model."""
-from typing import List, Optional
+from typing import Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -36,9 +36,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -274,11 +273,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
# (param_name, shard_name, shard_id)
("qkv_proj", "q_proj", "q"),
@@ -287,8 +282,7 @@ def load_weights(self,
]
params_dict = dict(self.named_parameters(remove_duplicate=False))
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if "rotary_emb.inv_freq" in name:
continue
diff --git a/vllm/model_executor/models/xverse.py b/vllm/model_executor/models/xverse.py
index d7b9901931b85..adec5b856edd5 100644
--- a/vllm/model_executor/models/xverse.py
+++ b/vllm/model_executor/models/xverse.py
@@ -20,7 +20,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Inference-only Xverse model compatible with HuggingFace weights."""
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Tuple
import torch
from torch import nn
@@ -40,9 +40,8 @@
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import (
ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.model_executor.weight_utils import (default_weight_loader,
- hf_model_weights_iterator)
from vllm.sequence import SamplerOutput
@@ -331,11 +330,7 @@ def sample(
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens
- def load_weights(self,
- model_name_or_path: str,
- cache_dir: Optional[str] = None,
- load_format: str = "auto",
- revision: Optional[str] = None):
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
stacked_params_mapping = [
("qkv_proj", "q_proj", "q"),
("qkv_proj", "k_proj", "k"),
@@ -344,8 +339,7 @@ def load_weights(self,
("gate_up_proj", "up_proj", 1),
]
params_dict = dict(self.named_parameters())
- for name, loaded_weight in hf_model_weights_iterator(
- model_name_or_path, cache_dir, load_format, revision):
+ for name, loaded_weight in weights:
if ("rotary_emb.inv_freq" in name
or "rotary_emb.cos_cached" in name
or "rotary_emb.sin_cached" in name):
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 534cb75c2fd2f..31032c4cead20 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -113,6 +113,8 @@ def from_sampling_metadata(
get_num_triton_sampler_splits(vocab_size))
sample_indices_start_idx = 0
+ assert sampling_metadata.seq_groups is not None
+ assert sampling_metadata.seq_data is not None
for i, seq_group in enumerate(sampling_metadata.seq_groups):
seq_ids, sampling_params = seq_group
temperature = sampling_params.temperature
@@ -147,6 +149,7 @@ def from_sampling_metadata(
and sampling_params.prompt_logprobs is not None):
# For tokens in the prompt that we only need to get
# their logprobs
+ assert sampling_metadata.prompt_lens is not None
prompt_len = sampling_metadata.prompt_lens[i]
temperatures += [temperature] * (prompt_len - 1)
top_ps += [top_p] * (prompt_len - 1)
@@ -172,6 +175,7 @@ def from_sampling_metadata(
is_prompt = i < sampling_metadata.num_prompts
if is_prompt:
prompt_best_of.append(sampling_params.best_of)
+ assert sampling_metadata.prompt_lens is not None
prompt_len = sampling_metadata.prompt_lens[i]
if sampling_params.prompt_logprobs is not None:
diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py
index 0b9787608798c..dc0e60344d858 100644
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@@ -2,10 +2,11 @@
import copy
from enum import IntEnum
from functools import cached_property
-from typing import Callable, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union
import torch
-from pydantic import conint
+from pydantic import Field
+from typing_extensions import Annotated
_SAMPLING_EPS = 1e-5
@@ -127,7 +128,7 @@ def __init__(
skip_special_tokens: bool = True,
spaces_between_special_tokens: bool = True,
logits_processors: Optional[List[LogitsProcessor]] = None,
- truncate_prompt_tokens: Optional[conint(ge=1)] = None,
+ truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None,
) -> None:
self.n = n
self.best_of = best_of if best_of is not None else n
@@ -270,6 +271,18 @@ def _verify_greedy_sampling(self) -> None:
raise ValueError("best_of must be 1 when using greedy sampling."
f"Got {self.best_of}.")
+ def update_from_generation_config(
+ self, generation_config: Dict[str, Any]) -> None:
+ """Update if there are non-default values from generation_config"""
+ # Update eos_token_id for generation
+ if eos_ids := generation_config.get("eos_token_id"):
+ # it can be either int or list of int
+ if isinstance(eos_ids, int):
+ eos_ids = [eos_ids]
+ original_stop_token_ids = set(self.stop_token_ids)
+ original_stop_token_ids.update(eos_ids)
+ self.stop_token_ids = list(original_stop_token_ids)
+
@cached_property
def sampling_type(self) -> SamplingType:
if self.use_beam_search:
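
A small usage sketch of update_from_generation_config (the token ids are made-up values):

from vllm import SamplingParams

params = SamplingParams(stop_token_ids=[7])
params.update_from_generation_config({"eos_token_id": [2, 7]})
# stop_token_ids is now the de-duplicated union {2, 7} (order not guaranteed).
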
diff --git a/vllm/sequence.py b/vllm/sequence.py
index cdb6cce6f0255..b296b37a84f15 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -160,7 +160,7 @@ def reset_state_for_recompute(self) -> None:
self._stage = SequenceStage.PREFILL
def get_num_uncomputed_tokens(self) -> int:
- """Return the number of prefil tokens that are not computed."""
+ """Return the number of prefill tokens that are not computed."""
# we use `get_len()` which includes prompt_len + output_len instead
# of prompt_len here. This is because during recompute we need to
# prefill for both prompt and output.
@@ -171,10 +171,10 @@ def get_last_token_id(self) -> int:
return self.prompt_token_ids[-1]
return self.output_token_ids[-1]
- def get_prompt_token_ids(self) -> int:
+ def get_prompt_token_ids(self) -> List[int]:
return self.prompt_token_ids
- def get_output_token_ids(self) -> int:
+ def get_output_token_ids(self) -> List[int]:
return self.output_token_ids
@property
@@ -345,12 +345,9 @@ def fork(self, new_seq_id: int) -> "Sequence":
def get_num_new_tokens(self) -> int:
"""Get the number of new tokens to be computed.
- Args:
- remainig_token_budget: The remaining token budgets.
Returns:
- The new number of tokens to be computed. I.e., 1 for decode, prompt
- size for prefill. If there's not enough remainig_token_budget, it
- can return the chunked number of new tokens.
+ The new number of tokens to be computed. I.e., 1 for decode, or
+ the remaining prompt size for prefill.
"""
if self.data.stage == SequenceStage.DECODE:
return 1
@@ -370,7 +367,7 @@ class SequenceGroupState:
"""Mutable state tied to a specific sequence group"""
# torch.Generator used in seeded sampling
- generator: Optional = None
+ generator: Optional = None # type: ignore
class MultiModalData:
@@ -511,6 +508,11 @@ def get_num_uncomputed_tokens(self) -> int:
return num_uncomputed_tokens
def num_seqs(self, status: Optional[SequenceStatus] = None) -> int:
+ # Optimization. We don't need to call get_seqs if we don't need to
+ # filter by states.
+ if status is None:
+ return len(self.seqs_dict)
+
return len(self.get_seqs(status))
def num_unfinished_seqs(self) -> int:
@@ -599,7 +601,7 @@ def lora_int_id(self) -> int:
return self.lora_request.lora_int_id if self.lora_request else 0
@property
- def token_chunk_size(self) -> int:
+ def token_chunk_size(self) -> Optional[int]:
"""Return the number of tokens to be processed (chunk size)."""
return self._token_chunk_size
@@ -693,3 +695,16 @@ def __len__(self):
def __eq__(self, other: object):
return isinstance(other,
self.__class__) and self.outputs == other.outputs
+
+ def __repr__(self) -> str:
+ """Show the shape of a tensor instead of its values to reduce noise.
+ """
+ sampled_token_probs_repr = ("None" if self.sampled_token_probs is None
+ else self.sampled_token_probs.shape)
+ sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else
+ self.sampled_token_ids.shape)
+ return (
+ f"SamplerOutput(outputs={self.outputs}, "
+ f"sampled_token_probs={sampled_token_probs_repr}, "
+ f"sampled_token_ids={sampled_token_ids_repr}, "
+ f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})")
diff --git a/vllm/spec_decode/batch_expansion.py b/vllm/spec_decode/batch_expansion.py
index e0b75837e8a39..c29b838f854c0 100644
--- a/vllm/spec_decode/batch_expansion.py
+++ b/vllm/spec_decode/batch_expansion.py
@@ -9,7 +9,7 @@
from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
sampler_output_to_torch,
split_batch_by_proposal_len)
-from vllm.worker.worker import Worker
+from vllm.worker.worker_base import WorkerBase
SeqId = int
TargetSeqId = int
@@ -31,7 +31,8 @@ class BatchExpansionTop1Scorer(SpeculativeScorer):
of topk/tree.
"""
- def __init__(self, scorer_worker: Worker, device: str, vocab_size: int):
+ def __init__(self, scorer_worker: WorkerBase, device: str,
+ vocab_size: int):
self._scorer_worker = scorer_worker
self._device = device
self._vocab_size = vocab_size
@@ -71,10 +72,16 @@ def score_proposals(
proposal_lens_list = proposals.proposal_lens.tolist()
proposal_token_ids_list = proposals.proposal_token_ids.tolist()
+ # Filter the list to ignore -1 proposals.
+ proposal_token_ids_list_without_skips = [
+ proposals for proposals in proposal_token_ids_list
+ if -1 not in proposals
+ ]
+
(spec_indices, non_spec_indices, target_seq_group_metadata_list,
num_scoring_tokens) = self._expand_batch(
seq_group_metadata_list=seq_group_metadata_list,
- proposal_token_ids_list=proposal_token_ids_list,
+ proposal_token_ids_list=proposal_token_ids_list_without_skips,
proposal_lens_list=proposal_lens_list,
)
@@ -83,10 +90,12 @@ def score_proposals(
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
- return_python_output=False)
+ )
+ assert len(target_sampler_output) == 1, "expected single-step output"
+ target_sampler_output = target_sampler_output[0]
all_tokens, all_probs = self._contract_batch(
- original_bs=len(seq_group_metadata_list),
+ contracted_bs=len(seq_group_metadata_list),
target_sampler_output=target_sampler_output,
proposals=proposals,
num_scoring_tokens=num_scoring_tokens,
@@ -103,7 +112,7 @@ def score_proposals(
def _expand_batch(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
- proposal_token_ids_list: List[TokenId],
+ proposal_token_ids_list: List[List[TokenId]],
proposal_lens_list: List[int],
) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
"""Given the input sequences and potentially multiple corresponding
@@ -125,14 +134,21 @@ def _expand_batch(
select_proposal_len_zero=True)
target_seq_group_metadata_list = self._create_scoring_model_input(
- spec_seqs, proposal_token_ids_list)
+ seq_group_metadata_list=spec_seqs,
+ proposal_token_ids=proposal_token_ids_list,
+ # NOTE: We determine the seq ids in the expanded batch using the
+ # full seq_group_metadata_list, instead of only spec_seqs.
+ target_seq_ids_iter=self._create_target_seq_id_iterator(
+ seq_ids=get_all_seq_ids(seq_group_metadata_list)),
+ )
+
num_scoring_tokens = len(target_seq_group_metadata_list)
target_seq_group_metadata_list.extend(non_spec_seqs)
return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
num_scoring_tokens)
- def _contract_batch(self, original_bs: int,
+ def _contract_batch(self, contracted_bs: int,
target_sampler_output: List[SamplerOutput],
proposals: SpeculativeProposals,
num_scoring_tokens: int, non_spec_indices: List[int],
@@ -141,6 +157,9 @@ def _contract_batch(self, original_bs: int,
"""Contract the expanded batch back into its original size.
This maps the scores of speculative tokens back to their original
sequences.
+
+ contracted_bs is the original batch size, i.e. the batch size that
+ target_sampler_output will be contracted to.
"""
(target_token_ids, target_probs, non_spec_target_token_ids,
non_spec_target_probs) = self._split_scoring_output(
@@ -148,25 +167,31 @@ def _contract_batch(self, original_bs: int,
# Map distinct sequences used to score each token
# of shape [batch_size * k + 1] back to [batch_size, k + 1].
- batch_size, k = proposals.proposal_token_ids.shape
+ expanded_batch_size, k = proposals.proposal_token_ids.shape
+
+ # The number of tokens in the expanded batch used for speculation is
+ # equal to the total expanded batch size minus the number of samples for
+ # non-speculative sequences.
+ non_spec_expanded_bs, _ = non_spec_target_token_ids.shape
+ spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs
target_token_ids = target_token_ids.squeeze().reshape(
- batch_size, k + 1)
- target_probs = target_probs.squeeze().reshape(batch_size, k + 1,
+ spec_expanded_bs, k + 1)
+ target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1,
self._vocab_size)
- all_tokens = torch.full(size=(original_bs, k + 1),
+ all_tokens = torch.full(size=(contracted_bs, k + 1),
fill_value=-1,
device=self._device,
dtype=torch.long)
- all_probs = torch.zeros(original_bs,
+ all_probs = torch.zeros(contracted_bs,
k + 1,
self._vocab_size,
device=self._device,
dtype=torch.float32)
if non_spec_indices:
- all_tokens[non_spec_indices, 0] = non_spec_target_token_ids
+ all_tokens[non_spec_indices, :1] = non_spec_target_token_ids
all_probs[non_spec_indices, :1, :] = non_spec_target_probs
if spec_indices:
@@ -176,20 +201,22 @@ def _contract_batch(self, original_bs: int,
return all_tokens, all_probs
def _create_scoring_model_input(
- self,
- seq_group_metadata_list: List[SequenceGroupMetadata],
- proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k]
+ self,
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k]
+ target_seq_ids_iter: Iterator[TargetSeqId],
) -> List[SequenceGroupMetadata]:
"""Given the original input sequences and proposed tokens from the draft
model, create a list of target sequences that can be used for scoring.
+
+ target_seq_ids_iter provides sequence ids for the expanded batch,
+ fulfilling the requirement that no seq id in the expanded batch equals
+ any seq id in the original batch.
"""
if not seq_group_metadata_list:
return []
- target_seq_ids_iter = self._create_target_seq_id_iterator(
- get_all_seq_ids(seq_group_metadata_list))
-
target_seq_group_metadata = list(
chain.from_iterable(
self._create_target_seq_group_metadata(
@@ -205,7 +232,7 @@ def _create_scoring_model_input(
def _create_target_seq_group_metadata(
self,
input_seq_group_metadata: SequenceGroupMetadata,
- proposal_token_ids: List[TokenId], # shape: [batch_size, k]
+ proposal_token_ids: List[List[TokenId]], # shape: [batch_size, k]
batch_index: int,
target_seq_ids_iter: Iterator[TargetSeqId],
) -> List[SequenceGroupMetadata]:
@@ -347,7 +374,7 @@ def _get_token_ids_to_score(
[0, 1, 2]
[0, 1, 2, 3]
"""
- empty_token_ids = []
+ empty_token_ids: List[TokenId] = []
token_ids_to_score = [empty_token_ids]
token_ids_to_score.extend([
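
Toy shape arithmetic behind the expand/contract step above (all numbers are assumptions):

k = 3                                     # speculation length
num_spec_seqs, num_non_spec_seqs = 4, 1
contracted_bs = num_spec_seqs + num_non_spec_seqs               # original batch: 5
# Each speculative sequence expands into k + 1 scoring rows; non-speculative
# sequences keep one row each.
num_scoring_rows = num_spec_seqs * (k + 1) + num_non_spec_seqs  # 17
# After scoring, the speculative rows are reshaped to [num_spec_seqs, k + 1]
# and written back into all_tokens / all_probs of leading size contracted_bs.
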
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index 2a72974d01bdc..dd040779922e9 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -1,6 +1,6 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
-from typing import Dict, List, Optional, Tuple
+from typing import Dict, List, Optional
import torch
@@ -24,9 +24,9 @@ class SpeculativeProposals:
def __repr__(self):
return (f"SpeculativeProposals("
- f"proposal_token_ids={self.proposal_token_ids.shape}, "
+ f"proposal_token_ids={self.proposal_token_ids}, "
f"proposal_probs={self.proposal_probs.shape}, "
- f"proposal_lens={self.proposal_lens.shape})")
+ f"proposal_lens={self.proposal_lens})")
@dataclass
@@ -73,5 +73,5 @@ def score_proposals(
blocks_to_copy: Optional[Dict[int, List[int]]],
k: int,
proposals: SpeculativeProposals,
- ) -> Tuple[torch.Tensor, torch.Tensor]:
+ ) -> SpeculativeScores:
raise NotImplementedError
diff --git a/vllm/spec_decode/metrics.py b/vllm/spec_decode/metrics.py
index 5df8fc4316d48..ab1d96c558de7 100644
--- a/vllm/spec_decode/metrics.py
+++ b/vllm/spec_decode/metrics.py
@@ -112,6 +112,7 @@ def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
Returns a CUDA event recording when the copy is complete.
"""
+ assert self._copy_stream is not None
self._copy_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(self._copy_stream):
@@ -146,15 +147,16 @@ def _collect_rejsample_metrics(
emitted_tokens = self._aggregate_num_emitted_tokens.item()
draft_tokens = self._aggregate_num_draft_tokens
- num_possible_tokens = self.get_max_num_accepted_tokens(draft_tokens, k)
+ max_num_emitted_tokens = self.get_max_num_emitted_tokens(
+ draft_tokens, k)
if draft_tokens > 0:
draft_acceptance_rate = accepted_tokens / draft_tokens
else:
draft_acceptance_rate = float("nan")
- if num_possible_tokens > 0:
- system_efficiency = emitted_tokens / num_possible_tokens
+ if max_num_emitted_tokens > 0:
+ system_efficiency = emitted_tokens / max_num_emitted_tokens
else:
system_efficiency = float("nan")
@@ -168,8 +170,22 @@ def _collect_rejsample_metrics(
)
@staticmethod
- def get_max_num_accepted_tokens(draft_tokens: int, k: int) -> int:
- # Divide by k since batch size can be variable.
- total_num_spec_seqs = draft_tokens / k
- num_accepted_per_seq_if_all_accepted = k + 1
- return int(total_num_spec_seqs / num_accepted_per_seq_if_all_accepted)
+ def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int:
+ """Calculate the number of emitted tokens, assuming all tokens are
+ accepted.
+
+ This is equal to the number of sequences that have been speculated on,
+ times (speculation len + 1). The +1 comes from the bonus token.
+ """
+ # Determine the number of sequences that have been speculated on. Since
+ # the batch size can be variable, we divide by k.
+ assert draft_tokens % k == 0
+ total_num_spec_seqs = draft_tokens // k
+
+ # A single sequence may emit k accepted tokens and one bonus token in
+ # the best case.
+ num_emitted_per_seq_if_all_accepted = k + 1
+
+ # The max num of emitted tokens is the number of speculated sequences
+ # times the max emitted per seq.
+ return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted
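
A worked example of the new metric (toy numbers):

draft_tokens, k = 12, 3
total_num_spec_seqs = draft_tokens // k            # 4 sequences were speculated on
max_num_emitted = total_num_spec_seqs * (k + 1)    # 4 * 4 = 16 (k accepted + 1 bonus each)
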
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index 73b6e201c67a9..7cf338bbae5f0 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -25,7 +25,8 @@ class MultiStepWorker(Worker):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- self._proposer: Optional[DraftModelTop1Proposer] = None
+ # Lazy initialization.
+ self._proposer: DraftModelTop1Proposer
def init_device(self):
super().init_device()
@@ -69,6 +70,9 @@ def execute_model_multi_step(
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
)
+ assert (len(model_output) == 1
+ ), "composing multistep workers not supported"
+ model_output = model_output[0]
self._append_new_tokens(model_output,
copied_seq_group_metadata_list)
@@ -324,23 +328,25 @@ def _merge_outputs(
"""
if maybe_sampler_output is None:
# If no speculative tokens, the sampler output will be None.
- # In this case we return empty tensors.
- proposal_tokens = torch.zeros(0,
- max_proposal_len,
- dtype=torch.long,
- device=self._device)
- proposal_probs = torch.zeros(0,
+ # In this case we return empty proposals.
+ proposal_tokens = torch.full(size=(
+ batch_size,
+ max_proposal_len,
+ ),
+ fill_value=-1,
+ dtype=torch.long,
+ device=self._device)
+ proposal_probs = torch.zeros(batch_size,
max_proposal_len,
self._vocab_size,
dtype=torch.float32,
device=self._device)
- proposal_lens = torch.zeros(len(proposal_lens),
- dtype=torch.long,
- device=self._device)
- return proposal_tokens, proposal_probs, proposal_lens
+ proposal_lens_tensor = torch.zeros(len(proposal_lens),
+ dtype=torch.long,
+ device=self._device)
+ return proposal_tokens, proposal_probs, proposal_lens_tensor
sampler_output = maybe_sampler_output
-
proposal_tokens, proposal_probs = sampler_output_to_torch(
sampler_output)
@@ -362,9 +368,9 @@ def _merge_outputs(
proposal_tokens, proposal_probs = (entire_proposal_tokens,
entire_proposal_probs)
- proposal_lens = torch.zeros(batch_size,
- dtype=torch.long,
- device=self._device)
- proposal_lens[nonzero_proposal_len_indices] = max_proposal_len
+ proposal_lens_tensor = torch.zeros(batch_size,
+ dtype=torch.long,
+ device=self._device)
+ proposal_lens_tensor[nonzero_proposal_len_indices] = max_proposal_len
- return proposal_tokens, proposal_probs, proposal_lens
+ return proposal_tokens, proposal_probs, proposal_lens_tensor
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 885bf537568e3..2c6642f5a3c81 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -3,8 +3,9 @@
import torch
+from vllm.logger import init_logger
from vllm.model_executor.layers.rejection_sampler import RejectionSampler
-from vllm.sequence import (SamplerOutput, SequenceGroupMetadata,
+from vllm.sequence import (Logprob, SamplerOutput, SequenceGroupMetadata,
SequenceGroupOutput, SequenceOutput)
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.interfaces import (SpeculativeProposals,
@@ -13,8 +14,9 @@
from vllm.spec_decode.multi_step_worker import MultiStepWorker
from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
split_batch_by_proposal_len)
-from vllm.worker.worker import Worker
-from vllm.worker.worker_base import LoraNotSupportedWorkerBase
+from vllm.worker.worker_base import LoraNotSupportedWorkerBase, WorkerBase
+
+logger = init_logger(__name__)
class SpecDecodeWorker(LoraNotSupportedWorkerBase):
@@ -45,10 +47,20 @@ class SpecDecodeWorker(LoraNotSupportedWorkerBase):
More info here https://docs.google.com/document/d/1T-JaS2T1NRfdP51qzqpyakoCXxSXTtORppiwaj5asxA/edit.
"""
+ @classmethod
+ def from_workers(cls, proposer_worker: MultiStepWorker,
+ scorer_worker: WorkerBase) -> "SpecDecodeWorker":
+ return SpecDecodeWorker(
+ proposer_worker,
+ scorer_worker,
+ # TODO(cade) disable strict mode for speedup.
+ rejection_sampler=RejectionSampler(strict_mode=True),
+ )
+
def __init__(
self,
proposer_worker: MultiStepWorker,
- scorer_worker: Worker,
+ scorer_worker: WorkerBase,
rejection_sampler: RejectionSampler,
metrics_collector: Optional[AsyncMetricsCollector] = None,
):
@@ -77,7 +89,8 @@ def __init__(
self.probs_dtype = self.rejection_sampler.probs_dtype
self.token_id_dtype = self.rejection_sampler.token_id_dtype
- self.scorer: SpeculativeScorer = None
+ # Lazy initialization.
+ self.scorer: SpeculativeScorer
def init_device(self) -> None:
"""Initialize both scorer and proposer models.
@@ -87,6 +100,10 @@ def init_device(self) -> None:
self.scorer_worker.init_device()
self.proposer_worker.init_device()
+ # NOTE(cade): load_model is not part of the WorkerBase interface.
+ self.scorer_worker.load_model()
+ self.proposer_worker.load_model()
+
self._metrics.init_gpu_tensors(self.rank)
self.rejection_sampler.init_gpu_tensors(self.rank)
self.scorer = BatchExpansionTop1Scorer(
@@ -94,6 +111,32 @@ def init_device(self) -> None:
device=self.device,
vocab_size=self._vocab_size)
+ self._configure_model_sampler_for_spec_decode()
+
+ def _configure_model_sampler_for_spec_decode(self):
+ """Configure model sampler to emit GPU tensors. This allows spec decode
+ to keep data on device without transferring to CPU and serializing,
+ which significantly reduces overhead of rejection sampling.
+
+ NOTE(cade): This breaks abstraction boundaries pretty badly. The better
+ design is to have the "move to CPU and serialize" sampling decision be
+ done outside of the model/sampler; this way the "last-mile" worker
+ object which interfaces with the scheduler can serialize and incur the
+ performance hit as necessary. This allows us to run the worker several
+ iterations in a row without incurring the "move to CPU and serialize"
+ performance penalty.
+
+ Since this requires a large change to vLLM, we defer it to later and
+ temporarily accept this broken abstraction boundary.
+
+ NOTE(cade): This will require a special check if the proposer worker
+ does not have a sampler (e.g. ngram speculation).
+ """
+ (self.scorer_worker.model_runner.model.sampler.include_gpu_probs_tensor
+ ) = True
+ (self.proposer_worker.model_runner.model.sampler.
+ include_gpu_probs_tensor) = True
+
def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of cache blocks to use.
@@ -131,7 +174,7 @@ def execute_model(
blocks_to_swap_in: Optional[Dict[int, int]],
blocks_to_swap_out: Optional[Dict[int, int]],
blocks_to_copy: Optional[Dict[int, List[int]]],
- num_spec_tokens: int,
+ num_lookahead_slots: int,
) -> List[SamplerOutput]:
"""Perform speculative decoding on the input batch.
"""
@@ -140,9 +183,11 @@ def execute_model(
"speculative decoding "
"requires non-None seq_group_metadata_list")
+ logger.info(f"spec_decode_worker.execute_model {num_lookahead_slots=}")
+
# If no spec tokens, call the proposer and scorer workers normally.
# Used for prefill.
- if num_spec_tokens == 0 or len(seq_group_metadata_list) == 0:
+ if num_lookahead_slots == 0 or len(seq_group_metadata_list) == 0:
return self._run_no_spec(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=blocks_to_swap_in,
@@ -155,7 +200,7 @@ def execute_model(
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
- k=num_spec_tokens,
+ k=num_lookahead_slots,
)
@nvtx_range("spec_decode_worker._run_no_spec")
@@ -170,20 +215,24 @@ def _run_no_spec(
proposer and scorer model so that the KV cache is consistent between the
two.
"""
+ logger.info("run proposer worker no spec")
self.proposer_worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
- return_python_output=False)
+ )
+ logger.info("run target worker no spec")
sampler_output = self.scorer_worker.execute_model(
seq_group_metadata_list=seq_group_metadata_list,
blocks_to_swap_in=blocks_to_swap_in,
blocks_to_swap_out=blocks_to_swap_out,
blocks_to_copy=blocks_to_copy,
)
+ assert len(sampler_output) == 1
+ sampler_output = sampler_output[0]
# Clear device tensors from sampler output. This reduces communication
# overhead when the engine runs in a different process than the workers.
@@ -209,11 +258,16 @@ def _run_speculative_decoding_step(
sequence.
"""
+ logger.info("get spec proposals")
# Generate proposals using draft worker.
+ assert blocks_to_swap_in is not None
+ assert blocks_to_swap_out is not None
+ assert blocks_to_copy is not None
proposals = self.proposer_worker.get_spec_proposals(
seq_group_metadata_list, blocks_to_swap_in, blocks_to_swap_out,
blocks_to_copy, k)
+ logger.info("score proposals")
proposal_scores = self.scorer.score_proposals(
seq_group_metadata_list,
blocks_to_swap_in,
@@ -223,9 +277,11 @@ def _run_speculative_decoding_step(
proposals,
)
+ logger.info("verify proposals")
accepted_token_ids = self._verify_tokens(seq_group_metadata_list,
proposal_scores, proposals, k)
+ logger.info("create output list")
return self._create_output_sampler_list(seq_group_metadata_list,
accepted_token_ids, k)
@@ -256,15 +312,26 @@ def _verify_tokens(
select_proposal_len_zero=True)
original_indices = spec_indices + non_spec_indices
- proposal_probs = proposal_scores.probs[spec_indices, :-1]
- bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:]
+ # Get probabilities of target model, excluding bonus token.
+ proposal_verifier_probs = proposal_scores.probs[spec_indices, :-1]
+
+ # Get non-speculative sampled tokens from target model.
non_spec_token_ids = proposal_scores.token_ids[non_spec_indices]
+ # Get bonus tokens from target model.
+ bonus_token_ids = proposal_scores.token_ids[spec_indices, -1:]
+
+ # Get probabilities according to proposal method.
+ proposal_probs = proposals.proposal_probs[spec_indices]
+
+ # Get proposed tokens.
+ proposal_token_ids = proposals.proposal_token_ids[spec_indices]
+
accepted_token_ids = self.rejection_sampler(
- proposal_probs,
- bonus_token_ids,
- proposals.proposal_probs,
- proposals.proposal_token_ids,
+ target_probs=proposal_verifier_probs,
+ bonus_token_ids=bonus_token_ids,
+ draft_probs=proposal_probs,
+ draft_token_ids=proposal_token_ids,
)
# Append output tokens from non-speculative sequences to
@@ -311,7 +378,7 @@ def _create_output_sampler_list(
parent_seq_id=seq_id,
output_token=token_id,
# TODO Add verifier logprobs.
- logprobs={token_id: 0.0},
+ logprobs={token_id: Logprob(0.0)},
)
],
prompt_logprobs=None,
diff --git a/vllm/spec_decode/util.py b/vllm/spec_decode/util.py
index 406568a4bc08c..eb6d4ca1da8e6 100644
--- a/vllm/spec_decode/util.py
+++ b/vllm/spec_decode/util.py
@@ -82,6 +82,32 @@ def sampler_output_to_torch(
return sampled_token_ids, sampled_token_probs
+def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int,
+ vocab_size: int, device: str) -> None:
+ """Helper method which mocks out the GPU tensors in SamplerOutput with dummy
+ values. This will be removed in PR 7/9.
+ https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
+ """
+ values = [
+ sampler_output.sampled_token_probs, sampler_output.sampled_token_ids
+ ]
+ assert all(v is None for v in values) or not any(v is None for v in values)
+ if not any(v is None for v in values):
+ # Do nothing if the tensors are already created (usually in unit tests).
+ return
+
+ # Softmax to ensure valid probs.
+ sampler_output.sampled_token_probs = torch.nn.functional.softmax(
+ torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device),
+ dim=-1)
+
+ sampler_output.sampled_token_ids = torch.randint(low=10,
+ high=100,
+ size=(batch_size, ),
+ dtype=torch.long,
+ device=device)
+
+
@contextmanager
def nvtx_range(msg, *args, **kwargs):
"""
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index ce7a30dce72fa..1756c91a612f0 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -2,7 +2,8 @@
from transformers import AutoConfig, PretrainedConfig
-from vllm.transformers_utils.configs import *
+from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
+ JAISConfig, MPTConfig, RWConfig)
_CONFIG_REGISTRY: Dict[str, PretrainedConfig] = {
"chatglm": ChatGLMConfig,
diff --git a/vllm/transformers_utils/configs/jais.py b/vllm/transformers_utils/configs/jais.py
index 94f438716f8bf..b06a946f34a47 100644
--- a/vllm/transformers_utils/configs/jais.py
+++ b/vllm/transformers_utils/configs/jais.py
@@ -222,13 +222,15 @@ def _alibi_scaling_validation(self):
f"got {alibi_scaling_type}")
if (alibi_scaling_factor is not None
and not isinstance(alibi_scaling_factor, float)
- or alibi_scaling_factor <= 1.0):
+ or (alibi_scaling_factor is not None
+ and alibi_scaling_factor <= 1.0)):
raise ValueError(
f"`alibi_scaling`'s factor field must be a float > 1.0,"
f"got {alibi_scaling_factor}")
if (alibi_dynamic_scaling is not None
and not isinstance(alibi_dynamic_scaling, int)
- or alibi_dynamic_scaling <= 1):
+ or (alibi_dynamic_scaling is not None
+ and alibi_dynamic_scaling <= 1)):
raise ValueError(
f"`alibi_scaling`'s `train_seq_len` field must be an"
f"integer > 1, got {alibi_dynamic_scaling}")
diff --git a/vllm/transformers_utils/detokenizer.py b/vllm/transformers_utils/detokenizer.py
index 005932f1e3df4..f064c26c3f40c 100644
--- a/vllm/transformers_utils/detokenizer.py
+++ b/vllm/transformers_utils/detokenizer.py
@@ -168,8 +168,8 @@ def _convert_tokens_to_string_with_added_encoders(
# NOTE(woosuk): The following code is slow because it runs a for loop over
# the output_tokens. In Python, running a for loop over a list can be slow
# even when the loop body is very simple.
- sub_texts = []
- current_sub_text = []
+ sub_texts: List[str] = []
+ current_sub_text: List[str] = []
all_special_tokens = set(tokenizer.all_special_tokens)
for token in output_tokens:
if skip_special_tokens and token in all_special_tokens:
@@ -263,6 +263,7 @@ def detokenize_incrementally(
tokenizer,
all_input_ids[:-1],
skip_special_tokens=skip_special_tokens)
+ assert prev_tokens is not None
# If the new token id is out of bounds, return an empty string.
if new_token_id >= len(tokenizer):
@@ -271,6 +272,8 @@ def detokenize_incrementally(
# Put new_token_id in a list so skip_special_tokens is respected
new_tokens = tokenizer.convert_ids_to_tokens(
[new_token_id], skip_special_tokens=skip_special_tokens)
+ if isinstance(new_tokens, str):
+ new_tokens = [new_tokens]
output_tokens = prev_tokens + new_tokens
# If this is the first iteration, return all tokens.
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index e216a99af91f9..c98a673bfed4b 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -1,11 +1,13 @@
+import os
from typing import Optional, Union
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
+from vllm.config import VLLM_USE_MODELSCOPE
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizers import *
+from vllm.transformers_utils.tokenizers import BaichuanTokenizer
from vllm.utils import make_async
logger = init_logger(__name__)
@@ -28,7 +30,7 @@ def get_cached_tokenizer(
tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
tokenizer_len = len(tokenizer)
- class CachedTokenizer(tokenizer.__class__):
+ class CachedTokenizer(tokenizer.__class__): # type: ignore
@property
def all_special_ids(self):
@@ -57,9 +59,26 @@ def get_tokenizer(
tokenizer_mode: str = "auto",
trust_remote_code: bool = False,
tokenizer_revision: Optional[str] = None,
+ download_dir: Optional[str] = None,
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
- """Gets a tokenizer for the given model name via Huggingface."""
+ """Gets a tokenizer for the given model name via Huggingface/modelscope."""
+ if VLLM_USE_MODELSCOPE:
+ # download model from ModelScope hub,
+ # lazy import so that modelscope is not required for normal use.
+ # pylint: disable=C.
+ from modelscope.hub.snapshot_download import snapshot_download
+
+ # Only set the tokenizer here; the model will be downloaded on the workers.
+ if not os.path.exists(tokenizer_name):
+ tokenizer_path = snapshot_download(
+ model_id=tokenizer_name,
+ cache_dir=download_dir,
+ revision=tokenizer_revision,
+ # Ignore weights - we only need the tokenizer.
+ ignore_file_pattern=["*.pt", "*.safetensors", "*.bin"])
+ tokenizer_name = tokenizer_path
+
if tokenizer_mode == "slow":
if kwargs.get("use_fast", False):
raise ValueError(
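
A rough usage sketch of the ModelScope path (the model id and cache directory below are example values, not part of the change):

# Run with the environment variable that VLLM_USE_MODELSCOPE is derived from:
#   VLLM_USE_MODELSCOPE=True python example.py
from vllm.transformers_utils.tokenizer import get_tokenizer

tokenizer = get_tokenizer(
    "qwen/Qwen-7B-Chat",                   # ModelScope model id (example)
    trust_remote_code=True,
    download_dir="/tmp/modelscope-cache",  # forwarded as snapshot_download's cache_dir
)
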
diff --git a/vllm/transformers_utils/tokenizer_group/__init__.py b/vllm/transformers_utils/tokenizer_group/__init__.py
index a3b979e8fbc13..0195c40c27f60 100644
--- a/vllm/transformers_utils/tokenizer_group/__init__.py
+++ b/vllm/transformers_utils/tokenizer_group/__init__.py
@@ -1,7 +1,7 @@
from typing import Optional
from vllm.config import TokenizerPoolConfig
-from vllm.engine.ray_utils import ray
+from vllm.executor.ray_utils import ray
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
@@ -11,7 +11,7 @@
from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
RayTokenizerGroupPool)
else:
- RayTokenizerGroupPool = None
+ RayTokenizerGroupPool = None # type: ignore
def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
diff --git a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
index c00b02fdbbbc0..7c605416854b8 100644
--- a/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
+++ b/vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py
@@ -6,7 +6,7 @@
from transformers import PreTrainedTokenizer
from vllm.config import TokenizerPoolConfig
-from vllm.engine.ray_utils import ray
+from vllm.executor.ray_utils import ray
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
BaseTokenizerGroup)
@@ -89,6 +89,7 @@ def encode(self,
This is blocking.
"""
self._ensure_queue_initialized()
+ assert self._idle_actors is not None
if self._idle_actors.empty():
raise RuntimeError("No idle actors available.")
@@ -120,6 +121,7 @@ async def encode_async(
This is non-blocking.
"""
self._ensure_queue_initialized()
+ assert self._idle_actors is not None
actor = await self._idle_actors.get()
try:
diff --git a/vllm/transformers_utils/tokenizers/baichuan.py b/vllm/transformers_utils/tokenizers/baichuan.py
index 79894035cb1f1..76daabc41e0a2 100644
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@@ -114,9 +114,9 @@ def _convert_id_to_token(self, index):
token = self.sp_model.IdToPiece(index)
return token
- def convert_tokens_to_string(self, tokens):
+ def convert_tokens_to_string(self, tokens: List[str]):
"""Converts a sequence of tokens (string) in a single string."""
- current_sub_tokens = []
+ current_sub_tokens: List[str] = []
out_string = ""
prev_is_special = False
for i, token in enumerate(tokens):
diff --git a/vllm/usage/usage_lib.py b/vllm/usage/usage_lib.py
index 658fe5c98f5ee..b2672f7f1da61 100644
--- a/vllm/usage/usage_lib.py
+++ b/vllm/usage/usage_lib.py
@@ -7,7 +7,7 @@
from enum import Enum
from pathlib import Path
from threading import Thread
-from typing import Dict, Optional
+from typing import Any, Dict, Optional
from uuid import uuid4
import cpuinfo
@@ -124,7 +124,7 @@ def __init__(self) -> None:
def report_usage(self,
model_architecture: str,
usage_context: UsageContext,
- extra_kvs: Dict[str, any] = None) -> None:
+ extra_kvs: Optional[Dict[str, Any]] = None) -> None:
t = Thread(target=self._report_usage_worker,
args=(model_architecture, usage_context, extra_kvs or {}),
daemon=True)
@@ -132,13 +132,13 @@ def report_usage(self,
def _report_usage_worker(self, model_architecture: str,
usage_context: UsageContext,
- extra_kvs: Dict[str, any]) -> None:
+ extra_kvs: Dict[str, Any]) -> None:
self._report_usage_once(model_architecture, usage_context, extra_kvs)
self._report_continous_usage()
def _report_usage_once(self, model_architecture: str,
usage_context: UsageContext,
- extra_kvs: Dict[str, any]) -> None:
+ extra_kvs: Dict[str, Any]) -> None:
# Platform information
if torch.cuda.is_available():
device_property = torch.cuda.get_device_properties(0)
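
For context on the typing fix above: lowercase `any` is the Python builtin function, not a type, so `Dict[str, any]` is rejected by type checkers; `typing.Any` is the intended annotation. A one-line illustration (the dictionary contents are made up):

from typing import Any, Dict, Optional

# With lowercase `any`, mypy reports that builtins.any is not valid as a type.
extra_kvs: Optional[Dict[str, Any]] = {"context": "demo", "gpu_count": 4}
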
diff --git a/vllm/utils.py b/vllm/utils.py
index e67d267aed408..eda690e72829f 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1,16 +1,18 @@
import asyncio
import enum
import gc
+import glob
import os
import socket
import subprocess
import uuid
import warnings
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
from functools import lru_cache, partial
from platform import uname
from typing import (Any, AsyncIterator, Awaitable, Callable, Dict, Generic,
- Hashable, List, Optional, Tuple, TypeVar, Union)
+ Hashable, List, Optional, OrderedDict, Tuple, TypeVar,
+ Union)
import psutil
import torch
@@ -51,7 +53,7 @@ def reset(self) -> None:
class LRUCache(Generic[T]):
def __init__(self, capacity: int):
- self.cache = OrderedDict[Hashable, T]()
+ self.cache: OrderedDict[Hashable, T] = OrderedDict()
self.capacity = capacity
def __contains__(self, key: Hashable) -> bool:
@@ -60,7 +62,7 @@ def __contains__(self, key: Hashable) -> bool:
def __len__(self) -> int:
return len(self.cache)
- def __getitem__(self, key: Hashable) -> T:
+ def __getitem__(self, key: Hashable) -> Optional[T]:
return self.get(key)
def __setitem__(self, key: Hashable, value: T) -> None:
@@ -76,7 +78,7 @@ def get(self,
key: Hashable,
default_value: Optional[T] = None) -> Optional[T]:
if key in self.cache:
- value = self.cache[key]
+ value: Optional[T] = self.cache[key]
self.cache.move_to_end(key)
else:
value = default_value
@@ -87,7 +89,7 @@ def put(self, key: Hashable, value: T) -> None:
self.cache.move_to_end(key)
self._remove_old_if_needed()
- def _on_remove(self, key: Hashable, value: T):
+ def _on_remove(self, key: Hashable, value: Optional[T]):
pass
def remove_oldest(self):
@@ -100,9 +102,11 @@ def _remove_old_if_needed(self) -> None:
while len(self.cache) > self.capacity:
self.remove_oldest()
- def pop(self, key: Hashable, default_value: Optional[Any] = None) -> T:
+ def pop(self,
+ key: Hashable,
+ default_value: Optional[T] = None) -> Optional[T]:
run_on_remove = key in self.cache
- value = self.cache.pop(key, default_value)
+ value: Optional[T] = self.cache.pop(key, default_value)
if run_on_remove:
self._on_remove(key, value)
return value
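
The LRUCache changes above only tighten the typing: `get` and `pop` may return the `default_value`, so both now return `Optional[T]`. A minimal usage sketch, assuming this version of vllm is installed:

from vllm.utils import LRUCache

cache: LRUCache[str] = LRUCache(capacity=2)
cache["a"] = "alpha"
cache["b"] = "beta"
cache["c"] = "gamma"           # evicts "a", the least recently used entry

print(cache.get("a"))          # None, hence the Optional[T] return type
print(cache.get("b", "miss"))  # "beta"
print(cache.pop("z", "miss"))  # "miss"; _on_remove is not called for misses
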
@@ -159,6 +163,17 @@ def random_uuid() -> str:
return str(uuid.uuid4().hex)
+@lru_cache(maxsize=None)
+def get_vllm_instance_id():
+ """
+ If the environment variable VLLM_INSTANCE_ID is set, return it.
+ Otherwise, return a random UUID.
+ The instance id represents one vLLM instance. All processes in the same
+ instance should share the same instance id.
+ """
+ return os.environ.get("VLLM_INSTANCE_ID", f"vllm-instance-{random_uuid()}")
+
+
@lru_cache(maxsize=None)
def in_wsl() -> bool:
# Reference: https://github.com/microsoft/WSL/issues/4071
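
A quick illustration of `get_vllm_instance_id` (a sketch, assuming this version of vllm is installed; the id value below is made up). Because the result is cached by `lru_cache`, setting `VLLM_INSTANCE_ID` before the first call makes this process, and any child process that inherits the environment, report the same id:

import os
from vllm.utils import get_vllm_instance_id

os.environ["VLLM_INSTANCE_ID"] = "vllm-instance-demo"  # set before first use
print(get_vllm_instance_id())  # "vllm-instance-demo"

# Without the variable, each process would generate and cache its own
# "vllm-instance-<random uuid>" value on first call.
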
@@ -268,8 +283,12 @@ def get_open_port() -> int:
return s.getsockname()[1]
-def set_cuda_visible_devices(device_ids: List[int]) -> None:
- os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, device_ids))
+def update_environment_variables(envs: Dict[str, str]):
+ for k, v in envs.items():
+ if k in os.environ and os.environ[k] != v:
+ logger.warning(f"Overwriting environment variable {k} "
+ f"from '{os.environ[k]}' to '{v}'")
+ os.environ[k] = v
def chunk_list(lst, chunk_size):
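
`update_environment_variables` replaces the CUDA-specific `set_cuda_visible_devices` with a generic helper that warns when it overwrites an existing value. A minimal sketch, assuming this version of vllm is installed (`VLLM_DEMO_FLAG` is a made-up variable name):

import os
from vllm.utils import update_environment_variables

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
update_environment_variables({
    "CUDA_VISIBLE_DEVICES": "1",  # logs a warning: differs from the existing "0,1"
    "VLLM_DEMO_FLAG": "1",        # not set yet, so applied silently
})
assert os.environ["CUDA_VISIBLE_DEVICES"] == "1"
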
@@ -502,3 +521,89 @@ def merge_dicts(dict1: Dict[Any, List[Any]],
merged_dict[key].extend(value)
return dict(merged_dict)
+
+
+def init_cached_hf_modules():
+ """
+ Lazy initialization of the Hugging Face modules.
+ """
+ from transformers.dynamic_module_utils import init_hf_modules
+ init_hf_modules()
+
+
+def nccl_integrity_check(filepath):
+ """
+ When the library is corrupted, we cannot catch
+ the exception in Python; it will crash the process.
+ Instead, we use the exit code of `ldd` to check
+ whether the library is corrupted. If it is not,
+ we return the version of the library.
+ """
+ exit_code = os.system(f"ldd {filepath} 2>&1 > /dev/null")
+ if exit_code != 0:
+ raise RuntimeError(f"Failed to load NCCL library from {filepath} .")
+ import ctypes
+
+ nccl = ctypes.CDLL(filepath)
+ version = ctypes.c_int()
+ nccl.ncclGetVersion.restype = ctypes.c_int
+ nccl.ncclGetVersion.argtypes = [ctypes.POINTER(ctypes.c_int)]
+ result = nccl.ncclGetVersion(ctypes.byref(version))
+ assert result == 0
+ return version.value
+
+
+@lru_cache(maxsize=None)
+def find_library(lib_name: str) -> str:
+ """
+ Find the library file in the system.
+ `lib_name` is the full filename, with both prefix and suffix.
+ This function resolves `lib_name` to the full path of the library.
+ """
+ # Adapted from https://github.com/openai/triton/blob/main/third_party/nvidia/backend/driver.py#L19 # noqa
+ # According to https://en.wikipedia.org/wiki/Filesystem_Hierarchy_Standard
+ # `/sbin/ldconfig` should exist in all Linux systems.
+ # `/sbin/ldconfig` searches for the library system-wide.
+ libs = subprocess.check_output(["/sbin/ldconfig", "-p"]).decode()
+ # each line looks like the following:
+ # libcuda.so.1 (libc6,x86-64) => /lib/x86_64-linux-gnu/libcuda.so.1
+ locs = [line.split()[-1] for line in libs.splitlines() if lib_name in line]
+ # `LD_LIBRARY_PATH` lists user-defined paths in which to search for the library.
+ env_ld_library_path = os.getenv("LD_LIBRARY_PATH")
+ if not locs and env_ld_library_path:
+ locs = [
+ os.path.join(dir, lib_name)
+ for dir in env_ld_library_path.split(":")
+ if os.path.exists(os.path.join(dir, lib_name))
+ ]
+ if not locs:
+ raise ValueError(f"Cannot find {lib_name} in the system.")
+ return locs[0]
+
+
+def find_nccl_library():
+ so_file = os.environ.get("VLLM_NCCL_SO_PATH", "")
+
+ # check if we have vllm-managed nccl
+ vllm_nccl_path = None
+ if torch.version.cuda is not None:
+ cuda_major = torch.version.cuda.split(".")[0]
+ path = os.path.expanduser(
+ f"~/.config/vllm/nccl/cu{cuda_major}/libnccl.so.*")
+ files = glob.glob(path)
+ vllm_nccl_path = files[0] if files else None
+
+ # manually load the nccl library
+ if so_file:
+ logger.info(
+ f"Found nccl from environment variable VLLM_NCCL_SO_PATH={so_file}"
+ )
+ else:
+ if torch.version.cuda is not None:
+ so_file = vllm_nccl_path or find_library("libnccl.so.2")
+ elif torch.version.hip is not None:
+ so_file = find_library("librccl.so.1")
+ else:
+ raise ValueError("NCCL only supports CUDA and ROCm backends.")
+ logger.info(f"Found nccl from library {so_file}")
+ return so_file
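
In short, `find_nccl_library` resolves the communication library in this order: the `VLLM_NCCL_SO_PATH` override, a vLLM-managed copy under `~/.config/vllm/nccl/cu<major>/`, then the system `libnccl.so.2` (or `librccl.so.1` on ROCm) located via `ldconfig` or `LD_LIBRARY_PATH`. A hedged sketch of how the helpers combine, assuming this version of vllm and a CUDA or ROCm build of PyTorch (the path and version in the comments are only examples):

from vllm.utils import find_nccl_library, nccl_integrity_check

so_file = find_nccl_library()            # e.g. "/usr/lib/x86_64-linux-gnu/libnccl.so.2"
version = nccl_integrity_check(so_file)  # raises RuntimeError if `ldd` fails on the file
print(f"Using {so_file}, NCCL version code {version}")  # e.g. 21903 for NCCL 2.19.3
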
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 49e1ad5709f5d..bf0a6c84e6f07 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,10 +1,11 @@
from typing import Dict, List, Optional, Tuple
import torch
+from torch import nn
from vllm.attention import AttentionMetadata, get_attn_backend
-from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
- SchedulerConfig)
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+ ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.distributed import broadcast_tensor_dict
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
@@ -26,7 +27,9 @@ def __init__(
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
+ load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
+ vision_language_config: Optional[VisionLanguageConfig],
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
*args,
@@ -36,6 +39,8 @@ def __init__(
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.lora_config = lora_config
+ self.vision_language_config = vision_language_config
+ self.load_config = load_config
self.is_driver_worker = is_driver_worker
# model_config can be None in tests/samplers/test_sampler.py.
@@ -46,30 +51,36 @@ def __init__(
if device_config is not None else DeviceConfig())
self.device = self.device_config.device
- self.model = None
- self.block_size = None # Set after initial profiling.
-
self.kv_cache_dtype = kv_cache_dtype
self.attn_backend = get_attn_backend(
self.model_config.dtype if model_config is not None else None)
+ # Lazy initialization.
+ self.model: nn.Module # Set after load_model.
+ self.block_size: int # Set after initial profiling.
+
def load_model(self) -> None:
- self.model = get_model(self.model_config,
- self.device_config,
- lora_config=self.lora_config,
- parallel_config=self.parallel_config,
- scheduler_config=self.scheduler_config)
+ self.model = get_model(
+ model_config=self.model_config,
+ load_config=self.load_config,
+ device_config=self.device_config,
+ vision_language_config=self.vision_language_config,
+ lora_config=self.lora_config,
+ parallel_config=self.parallel_config,
+ scheduler_config=self.scheduler_config)
def _prepare_prompt(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
- ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int]]:
+ ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+ Optional[torch.Tensor]]:
assert len(seq_group_metadata_list) > 0
input_tokens: List[int] = []
input_positions: List[int] = []
slot_mapping: List[int] = []
prompt_lens: List[int] = []
+ multi_modal_input_list: List[torch.Tensor] = []
for seq_group_metadata in seq_group_metadata_list:
assert seq_group_metadata.is_prompt
@@ -90,6 +101,10 @@ def _prepare_prompt(
# is always the first token in the sequence.
input_positions.extend(list(range(computed_len, prompt_len)))
+ if seq_group_metadata.multi_modal_data:
+ multi_modal_input_list.append(
+ seq_group_metadata.multi_modal_data.data)
+
# Compute the slot mapping.
block_table = seq_group_metadata.block_tables[seq_id]
# Mask the [0, start_idx) tokens of the prompt with _PAD_SLOT_ID,
@@ -112,6 +127,15 @@ def _prepare_prompt(
slot = block_number * self.block_size + block_offset
slot_mapping.append(slot)
+ if multi_modal_input_list:
+ assert self.vision_language_config, (
+ "Multi-modal inputs are only supported by "
+ "vision language models.")
+ multi_modal_input = torch.cat(multi_modal_input_list,
+ dim=0).to(self.device)
+ else:
+ multi_modal_input = None
+
num_prompt_tokens = len(input_tokens)
input_tokens = torch.tensor(input_tokens,
@@ -138,12 +162,8 @@ def _prepare_prompt(
slot_mapping=slot_mapping,
kv_cache_dtype=self.kv_cache_dtype,
)
- return (
- input_tokens,
- input_positions,
- attn_metadata,
- prompt_lens,
- )
+ return (input_tokens, input_positions, attn_metadata, prompt_lens,
+ multi_modal_input)
def _prepare_decode(
self,
@@ -241,7 +261,11 @@ def _prepare_sample(
selected_token_indices: List[int] = []
generators: List[torch.Generator] = []
selected_token_start_idx = 0
- categorized_sample_indices = {t: [] for t in SamplingType}
+ categorized_sample_indices: Dict[SamplingType,
+ List[Tuple[int, int]]] = {
+ t: []
+ for t in SamplingType
+ }
categorized_sample_indices_start_idx = 0
categorized_sampled_token_indices_start_idx = 0
@@ -258,10 +282,9 @@ def _prepare_sample(
categorized_sample_indices_start_idx += subquery_len - 1
categorized_sample_indices[
- sampling_params.sampling_type].append([
- categorized_sample_indices_start_idx,
- categorized_sampled_token_indices_start_idx
- ])
+ sampling_params.sampling_type].append(
+ (categorized_sample_indices_start_idx,
+ categorized_sampled_token_indices_start_idx))
categorized_sample_indices_start_idx += 1
categorized_sampled_token_indices_start_idx += 1
@@ -324,17 +347,19 @@ def _prepare_sample(
def prepare_input_tensors(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
- ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata,
- SamplingMetadata]:
+ seq_group_metadata_list: List[SequenceGroupMetadata],
+ ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
+ Optional[torch.Tensor]]:
+ multi_modal_input = None
if self.is_driver_worker:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
is_prompt = seq_group_metadata_list[0].is_prompt
# Prepare input tensors.
if is_prompt:
- (input_tokens, input_positions, attn_metadata,
- prompt_lens) = self._prepare_prompt(seq_group_metadata_list)
+ (input_tokens, input_positions, attn_metadata, prompt_lens,
+ multi_modal_input
+ ) = self._prepare_prompt(seq_group_metadata_list)
else:
(input_tokens, input_positions,
attn_metadata) = self._prepare_decode(seq_group_metadata_list)
@@ -367,20 +392,17 @@ def prepare_input_tensors(
perform_sampling=False,
)
- return (
- input_tokens,
- input_positions,
- attn_metadata,
- sampling_metadata,
- )
+ return (input_tokens, input_positions, attn_metadata,
+ sampling_metadata, multi_modal_input)
@torch.inference_mode()
def execute_model(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ seq_group_metadata_list: List[SequenceGroupMetadata],
kv_caches: List[torch.Tensor],
) -> Optional[SamplerOutput]:
- (input_tokens, input_positions, attn_metadata, sampling_metadata
+ (input_tokens, input_positions, attn_metadata, sampling_metadata,
+ multi_modal_input
) = self.prepare_input_tensors(seq_group_metadata_list)
model_executable = self.model
@@ -390,6 +412,8 @@ def execute_model(
"kv_caches": kv_caches,
"attn_metadata": attn_metadata,
}
+ if self.vision_language_config:
+ execute_model_kwargs.update({"image_input": multi_modal_input})
hidden_states = model_executable(**execute_model_kwargs)
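
For context on the multi-modal path added above: each prompt's `multi_modal_data.data` tensor is collected and the batch is formed with `torch.cat` along dim 0 before being passed to the model as `image_input`. A small illustration of that batching step (the tensor shapes are made up for illustration):

import torch

# Per-sequence image tensors, each already carrying a leading batch dimension of 1.
per_sequence = [torch.randn(1, 3, 336, 336) for _ in range(2)]

multi_modal_input = torch.cat(per_sequence, dim=0)
print(multi_modal_input.shape)  # torch.Size([2, 3, 336, 336])
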
diff --git a/vllm/worker/cpu_worker.py b/vllm/worker/cpu_worker.py
index 3989207e8dd83..83ededd742533 100644
--- a/vllm/worker/cpu_worker.py
+++ b/vllm/worker/cpu_worker.py
@@ -1,12 +1,13 @@
"""A CPU worker class."""
-from typing import Dict, List, Optional
+from typing import Any, Dict, List, Optional, Tuple
import torch
import torch.distributed
from vllm.attention import get_attn_backend
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+ ModelConfig, ParallelConfig, SchedulerConfig,
+ VisionLanguageConfig)
from vllm.distributed import (broadcast_tensor_dict,
ensure_model_parallel_initialized,
init_distributed_environment)
@@ -117,10 +118,12 @@ def __init__(
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
cache_config: CacheConfig,
+ load_config: LoadConfig,
local_rank: int,
rank: int,
distributed_init_method: str,
lora_config: Optional[LoRAConfig] = None,
+ vision_language_config: Optional[VisionLanguageConfig] = None,
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
) -> None:
@@ -129,25 +132,34 @@ def __init__(
self.scheduler_config = scheduler_config
self.device_config = device_config
self.cache_config = cache_config
+ self.load_config = load_config
self.local_rank = local_rank
self.rank = rank
self.distributed_init_method = distributed_init_method
self.lora_config = lora_config
+ self.vision_language_config = vision_language_config
self.is_driver_worker = is_driver_worker
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."
- self.model_runner = CPUModelRunner(model_config,
- parallel_config,
- scheduler_config,
- device_config,
- lora_config=self.lora_config,
- kv_cache_dtype=kv_cache_dtype,
- is_driver_worker=is_driver_worker)
+ if self.model_config.trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
+ self.model_runner = CPUModelRunner(
+ model_config,
+ parallel_config,
+ scheduler_config,
+ device_config,
+ load_config=self.load_config,
+ lora_config=self.lora_config,
+ vision_language_config=self.vision_language_config,
+ kv_cache_dtype=kv_cache_dtype,
+ is_driver_worker=is_driver_worker)
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
- self.cache_engine = None
- self.cpu_cache = None
+ self.cache_engine: CPUCacheEngine
+ self.cpu_cache: List[torch.Tensor]
def init_device(self) -> None:
self.init_distributed_environment()
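
The `self.cache_engine: CPUCacheEngine` style used above is a bare annotated statement: it declares the attribute's type for the type checker without assigning a placeholder such as `None`, so touching the attribute before `initialize_cache` fails loudly with `AttributeError` instead of silently propagating `None`. A tiny self-contained sketch of the pattern (the `Holder` class is illustrative only):

from typing import List

class Holder:
    def __init__(self) -> None:
        # Declared for the type checker; no value is assigned yet.
        self.items: List[int]

    def initialize(self) -> None:
        self.items = [1, 2, 3]

h = Holder()
h.initialize()
print(h.items)  # [1, 2, 3]; accessing h.items before initialize() raises AttributeError
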
@@ -157,7 +169,7 @@ def init_device(self) -> None:
def load_model(self):
self.model_runner.load_model()
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of blocks available for the KV cache.
This determines how many KV blocks can fit into the configured CPU
@@ -248,16 +260,16 @@ def execute_model(
blocks_to_swap_in: Optional[Dict[int, int]] = None,
blocks_to_swap_out: Optional[Dict[int, int]] = None,
blocks_to_copy: Optional[Dict[int, List[int]]] = None,
- ) -> Optional[SamplerOutput]:
+ ) -> List[SamplerOutput]:
if self.is_driver_worker:
assert seq_group_metadata_list is not None
- num_seq_groups = len(seq_group_metadata_list)
+ num_seq_groups: int = len(seq_group_metadata_list)
assert blocks_to_swap_in is not None
assert blocks_to_swap_out is not None
assert blocks_to_copy is not None
assert len(blocks_to_swap_in) == 0
assert len(blocks_to_swap_out) == 0
- data = {
+ data: Dict[str, Any] = {
"num_seq_groups": num_seq_groups,
"blocks_to_copy": blocks_to_copy,
}
@@ -267,15 +279,18 @@ def execute_model(
num_seq_groups = data["num_seq_groups"]
blocks_to_copy = data["blocks_to_copy"]
+ assert blocks_to_copy is not None
self.cache_copy(blocks_to_copy)
# If there is no input, we don't need to execute the model.
if num_seq_groups == 0:
- return {}
+ return []
output = self.model_runner.execute_model(seq_group_metadata_list,
self.cpu_cache)
- return output
+
+ # CPU worker only supports single-step execution.
+ return [output]
def init_distributed_environment(self) -> None:
"""Initialize the distributed environment."""
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 0051d6a3fb309..65996f1710a8a 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -11,8 +11,8 @@
from vllm.attention import (AttentionMetadata, AttentionMetadataPerStage,
get_attn_backend)
-from vllm.config import (DeviceConfig, LoRAConfig, ModelConfig, ParallelConfig,
- SchedulerConfig, VisionLanguageConfig)
+from vllm.config import (DeviceConfig, LoadConfig, LoRAConfig, ModelConfig,
+ ParallelConfig, SchedulerConfig, VisionLanguageConfig)
from vllm.distributed import broadcast_tensor_dict, with_pynccl_for_all_reduce
from vllm.distributed.device_communicators import (custom_all_reduce,
pynccl_utils)
@@ -109,6 +109,7 @@ def __init__(
parallel_config: ParallelConfig,
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
+ load_config: LoadConfig,
lora_config: Optional[LoRAConfig],
kv_cache_dtype: Optional[str] = "auto",
is_driver_worker: bool = False,
@@ -118,6 +119,7 @@ def __init__(
self.parallel_config = parallel_config
self.scheduler_config = scheduler_config
self.lora_config = lora_config
+ self.load_config = load_config
self.is_driver_worker = is_driver_worker
# model_config can be None in tests/samplers/test_sampler.py.
@@ -128,23 +130,17 @@ def __init__(
if device_config is not None else DeviceConfig())
self.device = self.device_config.device
- self.model = None
- self.block_size = None # Set after initial profiling.
- self.lora_manager = None
+ # Set after load_model.
+ self.lora_manager: LRUCacheWorkerLoRAManager = None
self.graph_runners: Dict[int, CUDAGraphRunner] = {}
- self.graph_memory_pool = None # Set during graph capture.
+ self.graph_memory_pool: Optional[Tuple[
+ int, int]] = None # Set during graph capture.
self.max_context_len_to_capture = (
self.model_config.max_context_len_to_capture
if self.model_config is not None else 0)
- # When using CUDA graph, the input block tables must be padded to
- # max_context_len_to_capture. However, creating the block table in
- # Python can be expensive. To optimize this, we cache the block table
- # in numpy and only copy the actual input content at every iteration.
- # The shape of the cached block table will be
- # (max batch size to capture, max context len to capture / block size).
- self.graph_block_tables = None # Set after initial profiling.
+
self.pin_memory = is_pin_memory_available()
self.kv_cache_dtype = kv_cache_dtype
self.vision_language_config = vision_language_config
@@ -152,15 +148,28 @@ def __init__(
self.attn_backend = get_attn_backend(
self.model_config.dtype if model_config is not None else None)
+ # Lazy initialization
+ self.model: torch.nn.Module # Set after load_model
+ self.block_size: int # Set after initial profiling.
+ # When using CUDA graph, the input block tables must be padded to
+ # max_context_len_to_capture. However, creating the block table in
+ # Python can be expensive. To optimize this, we cache the block table
+ # in numpy and only copy the actual input content at every iteration.
+ # The shape of the cached block table will be
+ # (max batch size to capture, max context len to capture / block size).
+ self.graph_block_tables: torch.Tensor # Set after initial profiling.
+
def load_model(self) -> None:
with CudaMemoryProfiler() as m:
self.model = get_model(
- self.model_config,
- self.device_config,
+ model_config=self.model_config,
+ device_config=self.device_config,
+ load_config=self.load_config,
lora_config=self.lora_config,
vision_language_config=self.vision_language_config,
parallel_config=self.parallel_config,
- scheduler_config=self.scheduler_config)
+ scheduler_config=self.scheduler_config,
+ )
self.model_memory_usage = m.consumed_memory
logger.info(f"Loading model weights took "
@@ -487,16 +496,16 @@ def _prepare_decode(
lora_index_mapping.append(0)
batch_size = graph_batch_size
- context_lens = torch.tensor(context_lens,
- dtype=torch.int,
- device=self.device)
+ context_lens_tensor = torch.tensor(context_lens,
+ dtype=torch.int,
+ device=self.device)
if use_captured_graph:
# When using cuda-graph all these tensors should be
# padded.
- assert context_lens.shape[0] == len(input_tokens)
- assert context_lens.shape[0] == len(input_positions)
- assert context_lens.shape[0] == len(slot_mapping)
+ assert context_lens_tensor.shape[0] == len(input_tokens)
+ assert context_lens_tensor.shape[0] == len(input_positions)
+ assert context_lens_tensor.shape[0] == len(slot_mapping)
# The shape of graph_block_tables is
# [max batch size, max context len // block size].
@@ -525,7 +534,7 @@ def _prepare_decode(
max_prompt_len=None,
subquery_start_loc=None,
seq_start_loc=None,
- context_lens=context_lens,
+ context_lens=context_lens_tensor,
block_tables=block_tables,
use_cuda_graph=use_captured_graph,
)
@@ -549,7 +558,11 @@ def _prepare_sample(
selected_token_indices: List[int] = []
generators: List[torch.Generator] = []
selected_token_start_idx = 0
- categorized_sample_indices = {t: [] for t in SamplingType}
+ categorized_sample_indices: Dict[SamplingType,
+ List[Tuple[int, int]]] = {
+ t: []
+ for t in SamplingType
+ }
categorized_sample_indices_start_idx = 0
categorized_sampled_token_indices_start_idx = 0
@@ -567,10 +580,9 @@ def _prepare_sample(
categorized_sample_indices_start_idx += subquery_len - 1
categorized_sample_indices[
- sampling_params.sampling_type].append([
- categorized_sample_indices_start_idx,
- categorized_sampled_token_indices_start_idx
- ])
+ sampling_params.sampling_type].append(
+ (categorized_sample_indices_start_idx,
+ categorized_sampled_token_indices_start_idx))
categorized_sample_indices_start_idx += 1
categorized_sampled_token_indices_start_idx += 1
@@ -594,15 +606,16 @@ def _prepare_sample(
categorized_sample_indices[
sampling_params.sampling_type].extend(
- zip(
- range(
- categorized_sample_indices_start_idx,
- categorized_sample_indices_start_idx +
- num_seqs),
- range(
- categorized_sampled_token_indices_start_idx,
- categorized_sampled_token_indices_start_idx +
- num_seqs)))
+ list(
+ zip(
+ range(
+ categorized_sample_indices_start_idx,
+ categorized_sample_indices_start_idx +
+ num_seqs),
+ range(
+ categorized_sampled_token_indices_start_idx,
+ categorized_sampled_token_indices_start_idx
+ + num_seqs))))
categorized_sample_indices_start_idx += num_seqs
categorized_sampled_token_indices_start_idx += num_seqs
@@ -639,9 +652,9 @@ def _prepare_sample(
def prepare_input_tensors(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
- Set[int], LoRAMapping, torch.Tensor]:
+ Set[LoRARequest], LoRAMapping, torch.Tensor]:
if self.is_driver_worker:
prefill_reqs = []
decode_reqs = []
@@ -739,6 +752,7 @@ def prepare_input_tensors(
if prefill_attn_metadata is not None:
metadata_dict.update(prefill_attn_metadata.asdict_zerocopy())
else:
+ assert decode_attn_metadata is not None
metadata_dict.update(decode_attn_metadata.asdict_zerocopy())
broadcast_tensor_dict(metadata_dict, src=0)
@@ -807,7 +821,7 @@ def prepare_input_tensors(
@torch.inference_mode()
def execute_model(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ seq_group_metadata_list: List[SequenceGroupMetadata],
kv_caches: List[torch.Tensor],
) -> Optional[SamplerOutput]:
(input_tokens, input_positions, attn_metadata, sampling_metadata,
@@ -916,12 +930,12 @@ def profile_run(self) -> None:
torch.cuda.synchronize()
return
- def remove_all_loras(self) -> bool:
+ def remove_all_loras(self):
if not self.lora_manager:
raise RuntimeError("LoRA is not enabled.")
- return self.lora_manager.remove_all_loras()
+ self.lora_manager.remove_all_loras()
- def set_active_loras(self, lora_requests: List[LoRARequest],
+ def set_active_loras(self, lora_requests: Set[LoRARequest],
lora_mapping: LoRAMapping) -> None:
if not self.lora_manager:
raise RuntimeError("LoRA is not enabled.")
@@ -1065,10 +1079,16 @@ def __init__(self, model: nn.Module):
super().__init__()
self.model = model
- self.graph = None
self.input_buffers: Dict[str, torch.Tensor] = {}
self.output_buffers: Dict[str, torch.Tensor] = {}
+ self._graph: Optional[torch.cuda.CUDAGraph] = None
+
+ @property
+ def graph(self):
+ assert self._graph is not None
+ return self._graph
+
def capture(
self,
input_ids: torch.Tensor,
@@ -1078,7 +1098,7 @@ def capture(
memory_pool,
**kwargs,
) -> None:
- assert self.graph is None
+ assert self._graph is None
# Run the model once without capturing the graph.
# This is to make sure that the captured graph does not include the
# kernel launches for initial benchmarking (e.g., Triton autotune).
@@ -1095,8 +1115,8 @@ def capture(
# Capture the graph.
# NOTE(woosuk): Python 3.8 does not support multi-line with statements.
# https://stackoverflow.com/questions/31039022/python-multi-line-with-statement
- self.graph = torch.cuda.CUDAGraph()
- with torch.cuda.graph(self.graph, pool=memory_pool): # noqa: SIM117
+ self._graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(self._graph, pool=memory_pool): # noqa: SIM117
with _maybe_pynccl():
hidden_states = self.model(
input_ids,
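
The `_graph` refactor above keeps the same capture flow and only makes the attribute's lifecycle explicit. For reference, the underlying PyTorch pattern looks roughly like the sketch below; this is a minimal standalone example rather than vLLM's runner, it requires a CUDA device, and it follows the warm-up-on-a-side-stream recipe from the PyTorch documentation:

import torch

assert torch.cuda.is_available()
static_x = torch.ones(4, device="cuda")

# Warm up on a side stream so one-off kernel launches are not captured.
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    static_y = static_x * 2
torch.cuda.current_stream().wait_stream(s)

graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):  # a shared memory pool can be passed via pool=...
    static_y = static_x * 2    # only these kernels are recorded

static_x.fill_(3.0)  # update the input buffer in place...
graph.replay()       # ...and replay the recorded kernels
print(static_y)      # tensor([6., 6., 6., 6.], device='cuda:0')
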
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index fff721a80c204..487df334d73e3 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,12 +1,13 @@
from typing import Dict, List, Optional, Tuple
import torch
+from torch import nn
from vllm.config import (DeviceConfig, ModelConfig, ParallelConfig,
SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor import SamplingMetadata
-from vllm.model_executor.neuron_model_loader import get_neuron_model
+from vllm.model_executor.model_loader.neuron import get_neuron_model
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.utils import (async_tensor_h2d, is_pin_memory_available,
@@ -34,9 +35,11 @@ def __init__(
self.device_config = (device_config
if device_config is not None else DeviceConfig())
self.device = self.device_config.device
- self.model = None
self.pin_memory = is_pin_memory_available()
+ # Lazy initialization.
+ self.model: nn.Module # Set after load_model.
+
def load_model(self) -> None:
self.model = get_neuron_model(self.model_config,
parallel_config=self.parallel_config,
@@ -147,7 +150,11 @@ def _prepare_sample(
selected_token_indices: List[int] = []
generators: List[torch.Generator] = []
selected_token_start_idx = 0
- categorized_sample_indices = {t: [] for t in SamplingType}
+ categorized_sample_indices: Dict[SamplingType,
+ List[Tuple[int, int]]] = {
+ t: []
+ for t in SamplingType
+ }
categorized_sample_indices_start_idx = 0
categorized_sampled_token_indices_start_idx = 0
@@ -165,10 +172,9 @@ def _prepare_sample(
categorized_sample_indices_start_idx += prompt_len - 1
categorized_sample_indices[
- sampling_params.sampling_type].append([
- categorized_sample_indices_start_idx,
- categorized_sampled_token_indices_start_idx
- ])
+ sampling_params.sampling_type].append(
+ (categorized_sample_indices_start_idx,
+ categorized_sampled_token_indices_start_idx))
categorized_sample_indices_start_idx += 1
categorized_sampled_token_indices_start_idx += 1
@@ -237,7 +243,7 @@ def _prepare_sample(
def prepare_input_tensors(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, SamplingMetadata]:
# NOTE: We assume that all sequences in the group are all prompts or
# all decodes.
@@ -259,7 +265,7 @@ def prepare_input_tensors(
@torch.inference_mode()
def execute_model(
self,
- seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
+ seq_group_metadata_list: List[SequenceGroupMetadata],
) -> Optional[SamplerOutput]:
(input_tokens, input_positions, input_block_ids, sampling_metadata
) = self.prepare_input_tensors(seq_group_metadata_list)
diff --git a/vllm/worker/neuron_worker.py b/vllm/worker/neuron_worker.py
index 6136d50d0c068..d0e6aaed180e6 100644
--- a/vllm/worker/neuron_worker.py
+++ b/vllm/worker/neuron_worker.py
@@ -1,5 +1,5 @@
"""A Neuron worker class."""
-from typing import List, Optional
+from typing import List, Tuple
import torch
import torch.distributed
@@ -29,6 +29,10 @@ def __init__(
self.scheduler_config = scheduler_config
self.device_config = device_config
self.cache_config = cache_config
+ if self.model_config.trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
self.model_runner = NeuronModelRunner(model_config, parallel_config,
scheduler_config, device_config)
@@ -40,7 +44,7 @@ def init_device(self) -> None:
def load_model(self):
self.model_runner.load_model()
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available KV blocks.
Swapping is not yet supported, so always return num_cpu_blocks=0.
@@ -73,15 +77,18 @@ def initialize_cache(self, num_gpu_blocks: int,
def execute_model(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
- ) -> Optional[SamplerOutput]:
+ ) -> List[SamplerOutput]:
num_seq_groups = len(seq_group_metadata_list)
# If there is no input, we don't need to execute the model.
if num_seq_groups == 0:
- return {}
+ return []
output = self.model_runner.execute_model(seq_group_metadata_list)
- return output
+
+ # Neuron worker only supports single-step output. Wrap the output in a
+ # list to conform to the interface.
+ return [output]
def get_cache_block_size_bytes(self) -> int:
"""Determine the size in bytes of a cache block.
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 3f0b2fd83f3e5..39ad428f16fe3 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -1,13 +1,14 @@
"""A GPU worker class."""
import gc
import os
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple
import torch
import torch.distributed
-from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ModelConfig,
- ParallelConfig, SchedulerConfig, VisionLanguageConfig)
+from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoRAConfig,
+ ModelConfig, ParallelConfig, SchedulerConfig,
+ VisionLanguageConfig)
from vllm.distributed import (broadcast_tensor_dict,
ensure_model_parallel_initialized,
init_distributed_environment)
@@ -37,6 +38,7 @@ def __init__(
scheduler_config: SchedulerConfig,
device_config: DeviceConfig,
cache_config: CacheConfig,
+ load_config: LoadConfig,
local_rank: int,
rank: int,
distributed_init_method: str,
@@ -53,10 +55,15 @@ def __init__(
self.rank = rank
self.distributed_init_method = distributed_init_method
self.lora_config = lora_config
+ self.load_config = load_config
self.is_driver_worker = is_driver_worker
if self.is_driver_worker:
assert self.rank == 0, "The driver worker must have rank 0."
+ if self.model_config.trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
self.vision_language_config = vision_language_config
if self.vision_language_config:
assert not self.lora_config, (
@@ -67,14 +74,16 @@ def __init__(
parallel_config,
scheduler_config,
device_config,
+ load_config=load_config,
lora_config=self.lora_config,
kv_cache_dtype=self.cache_config.cache_dtype,
is_driver_worker=is_driver_worker,
- vision_language_config=vision_language_config)
+ vision_language_config=vision_language_config,
+ )
# Uninitialized cache engine. Will be initialized by
# initialize_cache.
- self.cache_engine = None
- self.gpu_cache = None
+ self.cache_engine: CacheEngine
+ self.gpu_cache: List[torch.Tensor]
def init_device(self) -> None:
if self.device_config.device.type == "cuda":
@@ -205,14 +214,16 @@ def execute_model(
blocks_to_swap_in: Optional[Dict[int, int]] = None,
blocks_to_swap_out: Optional[Dict[int, int]] = None,
blocks_to_copy: Optional[Dict[int, List[int]]] = None,
- ) -> Optional[SamplerOutput]:
+ num_lookahead_slots: int = 0,
+ ) -> List[SamplerOutput]:
+
if self.is_driver_worker:
assert seq_group_metadata_list is not None
num_seq_groups = len(seq_group_metadata_list)
assert blocks_to_swap_in is not None
assert blocks_to_swap_out is not None
assert blocks_to_copy is not None
- data = {
+ data: Dict[str, Any] = {
"num_seq_groups": num_seq_groups,
"blocks_to_swap_in": blocks_to_swap_in,
"blocks_to_swap_out": blocks_to_swap_out,
@@ -226,15 +237,21 @@ def execute_model(
blocks_to_swap_out = data["blocks_to_swap_out"]
blocks_to_copy = data["blocks_to_copy"]
+ assert blocks_to_swap_in is not None
+ assert blocks_to_swap_out is not None
+ assert blocks_to_copy is not None
self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)
# If there is no input, we don't need to execute the model.
if num_seq_groups == 0:
- return {}
+ return []
output = self.model_runner.execute_model(seq_group_metadata_list,
self.gpu_cache)
- return output
+
+ # Worker only supports single-step execution. Wrap the output in a list
+ # to conform to the interface.
+ return [output]
def add_lora(self, lora_request: LoRARequest) -> bool:
return self.model_runner.add_lora(lora_request)
@@ -281,12 +298,9 @@ def init_worker_distributed_environment(
elif parallel_config.world_size > 1:
# NOTE(woosuk): We don't initialize pynccl process group when world size
# is 1.
- pynccl_utils.init_process_group(
- world_size=parallel_config.world_size,
- local_rank=local_rank,
- rank=rank,
- init_method=distributed_init_method,
- )
+ # NOTE(kaichao): By default, pynccl will use information inside
+ # `parallel_state` for initialization.
+ pynccl_utils.init_process_group()
ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
parallel_config.pipeline_parallel_size)
diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py
index e3027c406ffeb..b5dade0a770a0 100644
--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
@@ -1,8 +1,17 @@
+import datetime
+import importlib
+import os
+import tempfile
+import threading
from abc import ABC, abstractmethod
-from typing import Dict, List
+from typing import Dict, List, Set, Tuple
+from vllm.logger import enable_trace_function_call, init_logger
from vllm.lora.request import LoRARequest
from vllm.sequence import SamplerOutput, SequenceGroupMetadata
+from vllm.utils import get_vllm_instance_id, update_environment_variables
+
+logger = init_logger(__name__)
class WorkerBase(ABC):
@@ -18,14 +27,14 @@ def init_device(self) -> None:
raise NotImplementedError
@abstractmethod
- def determine_num_available_blocks(self) -> tuple[int, int]:
+ def determine_num_available_blocks(self) -> Tuple[int, int]:
"""Determine the number of available blocks for the GPU KV cache and
swappable CPU KV cache.
The implementation may run profiling or other heuristics to determine
the size of caches.
- Returns a tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
+ Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
are blocks that are "active" on the device and can be appended to.
num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
appended to.
@@ -40,16 +49,17 @@ def initialize_cache(self, num_gpu_blocks: int,
raise NotImplementedError
@abstractmethod
- def execute_model(self,
- seq_group_metadata_list: List[SequenceGroupMetadata],
- blocks_to_swap_in: Dict[int, int],
- blocks_to_swap_out: Dict[int, int],
- blocks_to_copy: Dict[int, List[int]]) -> SamplerOutput:
- """Executes one model step on the given sequences."""
+ def execute_model(
+ self, seq_group_metadata_list: List[SequenceGroupMetadata],
+ blocks_to_swap_in: Dict[int, int], blocks_to_swap_out: Dict[int,
+ int],
+ blocks_to_copy: Dict[int, List[int]]) -> List[SamplerOutput]:
+ """Executes at least one model step on the given sequences, unless no
+ sequences are provided."""
raise NotImplementedError
@abstractmethod
- def get_cache_block_size_bytes() -> int:
+ def get_cache_block_size_bytes(self) -> int:
"""Return the size of a single cache block, in bytes. Used in
speculative decoding.
"""
@@ -64,7 +74,7 @@ def remove_lora(self, lora_id: int) -> bool:
raise NotImplementedError
@abstractmethod
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
raise NotImplementedError
@@ -79,5 +89,70 @@ def add_lora(self, lora_request: LoRARequest) -> bool:
def remove_lora(self, lora_id: int) -> bool:
raise ValueError(f"{type(self)} does not support LoRA")
- def list_loras(self) -> List[int]:
+ def list_loras(self) -> Set[int]:
raise ValueError(f"{type(self)} does not support LoRA")
+
+
+class WorkerWrapperBase:
+ """
+ The whole point of this class is to lazily initialize the worker.
+ We first instantiate the WorkerWrapperBase, which remembers the worker
+ module and class name. Then we call `update_environment_variables`, and
+ the real initialization happens in `init_worker`.
+ """
+
+ def __init__(self,
+ worker_module_name=None,
+ worker_class_name=None,
+ trust_remote_code: bool = False) -> None:
+ self.worker_module_name = worker_module_name
+ self.worker_class_name = worker_class_name
+ self.worker = None
+ if trust_remote_code:
+ # note: lazy import to avoid importing torch before initializing
+ from vllm.utils import init_cached_hf_modules
+ init_cached_hf_modules()
+
+ @staticmethod
+ def update_environment_variables(envs: Dict[str, str]) -> None:
+ key = 'CUDA_VISIBLE_DEVICES'
+ if key in envs and key in os.environ:
+ # Overwriting CUDA_VISIBLE_DEVICES is the desired behavior here, so
+ # suppress the warning in `update_environment_variables`.
+ del os.environ[key]
+ update_environment_variables(envs)
+
+ def init_worker(self, *args, **kwargs):
+ """
+ Actual initialization of the worker class; also sets up
+ function tracing if required.
+ Arguments are passed to the worker class constructor.
+ """
+ if int(os.getenv("VLLM_TRACE_FUNCTION", "0")):
+ tmp_dir = tempfile.gettempdir()
+ filename = (f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
+ f"_thread_{threading.get_ident()}_"
+ f"at_{datetime.datetime.now()}.log").replace(" ", "_")
+ log_path = os.path.join(tmp_dir, "vllm", get_vllm_instance_id(),
+ filename)
+ os.makedirs(os.path.dirname(log_path), exist_ok=True)
+ enable_trace_function_call(log_path)
+
+ mod = importlib.import_module(self.worker_module_name)
+ worker_class = getattr(mod, self.worker_class_name)
+ self.worker = worker_class(*args, **kwargs)
+
+ def execute_method(self, method, *args, **kwargs):
+ try:
+ target = self if self.worker is None else self.worker
+ executor = getattr(target, method)
+ return executor(*args, **kwargs)
+ except Exception as e:
+ # If the driver worker also executes methods,
+ # exceptions in the remaining workers may cause a deadlock in
+ # RPC frameworks such as Ray.
+ # See https://github.com/vllm-project/vllm/issues/3455
+ # Log the error and inform the user so they can resolve it.
+ msg = (f"Error executing method {method}. "
+ "This might cause deadlock in distributed execution.")
+ logger.exception(msg)
+ raise e
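

To make the lazy-initialization flow concrete, here is a hedged usage sketch of `WorkerWrapperBase`. The module and class names and the lack of constructor arguments are placeholders; in vLLM the executor passes a real worker module such as `vllm.worker.worker` plus the full set of config arguments:

from vllm.worker.worker_base import WorkerWrapperBase

# Step 1: remember which worker to create, without importing or building it yet.
wrapper = WorkerWrapperBase(worker_module_name="my_package.my_worker",  # placeholder
                            worker_class_name="MyWorker")               # placeholder

# Step 2: adjust the environment before the worker (and its CUDA context) exists.
wrapper.update_environment_variables({"CUDA_VISIBLE_DEVICES": "0"})

# Step 3: import the module, instantiate the class, and enable function tracing
# when VLLM_TRACE_FUNCTION=1 is set.
wrapper.init_worker()  # real workers take their config kwargs here

# Step 4: forward calls; failures are logged to help diagnose RPC deadlocks.
wrapper.execute_method("init_device")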