From 5b59532760c82a9d91f65a3e227524da2af7d4ef Mon Sep 17 00:00:00 2001
From: litianjian <45817262+litianjian@users.noreply.github.com>
Date: Mon, 23 Sep 2024 01:51:44 +0800
Subject: [PATCH] [Model][VLM] Add LLaVA-Onevision model support (#8486)

Co-authored-by: litianjian
Co-authored-by: Cyrus Leung
Co-authored-by: Roger Wang
Co-authored-by: DarkLight1337
---
 docs/source/models/supported_models.rst       |   7 +-
 examples/offline_inference_vision_language.py |  60 +-
 .../vision_language/test_llava_next_video.py  |   3 -
 .../vision_language/test_llava_onevision.py   | 356 +++++++
 tests/models/test_registry.py                 |   3 +-
 vllm/assets/video.py                          |   2 +-
 vllm/model_executor/models/__init__.py        |   6 +-
 vllm/model_executor/models/clip.py            |  19 +
 vllm/model_executor/models/llava_onevision.py | 876 ++++++++++++++++++
 vllm/model_executor/models/siglip.py          |  19 +
 10 files changed, 1330 insertions(+), 21 deletions(-)
 create mode 100644 tests/models/decoder_only/vision_language/test_llava_onevision.py
 create mode 100644 vllm/model_executor/models/llava_onevision.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 9e0303e1dab6c..d86d0860f7f29 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -244,6 +244,11 @@ Multimodal Language Models
       - Video
       - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
       -
+    * - :code:`LlavaOnevisionForConditionalGeneration`
+      - LLaVA-Onevision
+      - Image\ :sup:`+` / Video
+      - :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. (see note)
+      -
     * - :code:`MiniCPMV`
       - MiniCPM-V
       - Image\ :sup:`+`
@@ -288,7 +293,7 @@ Multimodal Language Models
     For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

 .. note::
-  For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+  For :code:`LLaVA-NeXT-Video`, :code:`LLaVA-Onevision` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
   This can be installed by running the following command:

   .. code-block:: bash

diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py
index 464eaf334e3de..c1129316a6e30 100644
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
@@ -14,7 +14,8 @@


 # LLaVA-1.5
-def run_llava(question):
+def run_llava(question, modality):
+    assert modality == "image"

     prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


 # LLaVA-1.6/LLaVA-NeXT
-def run_llava_next(question):
+def run_llava_next(question, modality):
+    assert modality == "image"

     prompt = f"[INST] <image>\n{question} [/INST]"
     llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)

@@ -34,15 +36,35 @@ def run_llava_next(question):

 # LlaVA-NeXT-Video
 # Currently only support for video input
-def run_llava_next_video(question):
+def run_llava_next_video(question, modality):
+    assert modality == "video"
+
     prompt = f"USER: <video>\n{question} ASSISTANT:"
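
For context, below is a minimal sketch of how the newly registered LLaVA-Onevision model might be exercised through vLLM's offline-inference API, following the same `run_*` helper pattern the patch applies in examples/offline_inference_vision_language.py. The chat-template string, `max_model_len` value, image path, and sampling settings are illustrative assumptions and are not taken from this patch; check the Hugging Face model card for the exact prompt format expected by the llava-onevision checkpoints.

# Minimal sketch, not part of the patch: runs LLaVA-Onevision via vLLM's
# offline API. Prompt template, max_model_len, image path, and sampling
# settings below are assumptions for illustration only.
from PIL import Image

from vllm import LLM, SamplingParams


def run_llava_onevision(question: str, modality: str):
    # Mirrors the `run_*` helper pattern from the example script in this
    # patch; image-only here, though the docs table also lists video input.
    assert modality == "image"

    # Assumed Qwen2-style chat template with an <image> placeholder.
    prompt = (f"<|im_start|>user <image>\n{question}<|im_end|>"
              "<|im_start|>assistant\n")

    llm = LLM(model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
              max_model_len=16384)  # assumed context length
    return llm, prompt


if __name__ == "__main__":
    llm, prompt = run_llava_onevision("What is shown in this image?", "image")
    image = Image.open("example.jpg")  # hypothetical local image

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        SamplingParams(temperature=0.2, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)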