From 56dac8b1899b20bead3f6ebdc41114a999f2ccd7 Mon Sep 17 00:00:00 2001
From: Cyrus Leung
Date: Thu, 9 Jan 2025 01:14:14 +0800
Subject: [PATCH] [Doc] Expand Multimodal API Reference (#11852)

Signed-off-by: DarkLight1337
---
 docs/source/api/multimodal/index.md      | 61 ++++--------------------
 docs/source/api/multimodal/inputs.md     | 49 +++++++++++++++++++
 docs/source/api/multimodal/parse.md      |  9 ++++
 docs/source/api/multimodal/processing.md |  9 ++++
 docs/source/api/multimodal/profiling.md  |  9 ++++
 docs/source/api/multimodal/registry.md   |  9 ++++
 vllm/multimodal/parse.py                 | 31 ++++++++----
 vllm/multimodal/processing.py            | 26 +++++++---
 vllm/multimodal/profiling.py             |  7 ++-
 9 files changed, 139 insertions(+), 71 deletions(-)
 create mode 100644 docs/source/api/multimodal/inputs.md
 create mode 100644 docs/source/api/multimodal/parse.md
 create mode 100644 docs/source/api/multimodal/processing.md
 create mode 100644 docs/source/api/multimodal/profiling.md
 create mode 100644 docs/source/api/multimodal/registry.md

diff --git a/docs/source/api/multimodal/index.md b/docs/source/api/multimodal/index.md
index 0046b73ea825e..51e24795a34cf 100644
--- a/docs/source/api/multimodal/index.md
+++ b/docs/source/api/multimodal/index.md
@@ -2,10 +2,6 @@
 
 # Multi-Modality
 
-```{eval-rst}
-.. currentmodule:: vllm.multimodal
-```
-
 vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
 
 Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
@@ -13,61 +9,20 @@ via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
 
 Looking to add your own multi-modal model? Please follow the instructions listed [here](#enabling-multimodal-inputs).
-
 ## Module Contents
 
-```{eval-rst}
-.. automodule:: vllm.multimodal
-```
-
-### Registry
-
 ```{eval-rst}
 .. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
 ```
 
-```{eval-rst}
-.. autoclass:: vllm.multimodal.MultiModalRegistry
-    :members:
-    :show-inheritance:
-```
-
-### Base Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.base
-    :members:
-    :show-inheritance:
-```
+## Submodules
 
-### Input Classes
+```{toctree}
+:maxdepth: 1
 
-```{eval-rst}
-.. automodule:: vllm.multimodal.inputs
-    :members:
-    :show-inheritance:
-```
-
-### Audio Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.audio
-    :members:
-    :show-inheritance:
-```
-
-### Image Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.image
-    :members:
-    :show-inheritance:
-```
-
-### Video Classes
-
-```{eval-rst}
-.. automodule:: vllm.multimodal.video
-    :members:
-    :show-inheritance:
+inputs
+parse
+processing
+profiling
+registry
 ```
diff --git a/docs/source/api/multimodal/inputs.md b/docs/source/api/multimodal/inputs.md
new file mode 100644
index 0000000000000..3d89666113229
--- /dev/null
+++ b/docs/source/api/multimodal/inputs.md
@@ -0,0 +1,49 @@
+# Input Definitions
+
+## User-facing inputs
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.MultiModalDataDict
+```
+
+## Internal data structures
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autodata:: vllm.multimodal.inputs.NestedTensors
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
+    :members:
+    :show-inheritance:
+```
+
+```{eval-rst}
+.. autoclass:: vllm.multimodal.inputs.MultiModalInputsV2
+    :members:
+    :show-inheritance:
+```
diff --git a/docs/source/api/multimodal/parse.md b/docs/source/api/multimodal/parse.md
new file mode 100644
index 0000000000000..4676139efe626
--- /dev/null
+++ b/docs/source/api/multimodal/parse.md
@@ -0,0 +1,9 @@
+# Data Parsing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.parse
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/processing.md b/docs/source/api/multimodal/processing.md
new file mode 100644
index 0000000000000..0d81c8d3966ee
--- /dev/null
+++ b/docs/source/api/multimodal/processing.md
@@ -0,0 +1,9 @@
+# Data Processing
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.processing
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/profiling.md b/docs/source/api/multimodal/profiling.md
new file mode 100644
index 0000000000000..b455145212202
--- /dev/null
+++ b/docs/source/api/multimodal/profiling.md
@@ -0,0 +1,9 @@
+# Memory Profiling
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.profiling
+    :members:
+    :member-order: bysource
+```
diff --git a/docs/source/api/multimodal/registry.md b/docs/source/api/multimodal/registry.md
new file mode 100644
index 0000000000000..0737a4385cf32
--- /dev/null
+++ b/docs/source/api/multimodal/registry.md
@@ -0,0 +1,9 @@
+# Registry
+
+## Module Contents
+
+```{eval-rst}
+.. automodule:: vllm.multimodal.registry
+    :members:
+    :member-order: bysource
+```
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index 6be046ba77ca7..ccff0e857eec4 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -13,14 +13,16 @@
 
 from .audio import resample_audio
 from .inputs import (AudioItem, HfAudioItem, HfImageItem, HfVideoItem,
-                     ImageItem, ModalityData, MultiModalDataDict,
-                     NestedTensors, VideoItem)
+                     ImageItem, ModalityData, MultiModalDataDict, VideoItem)
 
 _T = TypeVar("_T")
 _I = TypeVar("_I")
 
 
 class ModalityDataItems(ABC, Generic[_T, _I]):
+    """
+    Represents data items for a modality in :class:`MultiModalDataItems`.
+    """
 
     def __init__(self, data: _T, modality: str) -> None:
         super().__init__()
@@ -69,6 +71,7 @@ def get_passthrough_data(self) -> Mapping[str, object]:
 
 
 class ProcessorBatchItems(ModalityDataItems[Sequence[_T], _T]):
+    """Base class for data items that are arranged in a list."""
 
     def get_count(self) -> int:
         return len(self.data)
@@ -83,7 +86,12 @@ def get_passthrough_data(self) -> Mapping[str, object]:
         return {}
 
 
-class EmbeddingItems(ModalityDataItems[NestedTensors, torch.Tensor]):
+class EmbeddingItems(ModalityDataItems[Union[torch.Tensor, list[torch.Tensor]],
+                                       torch.Tensor]):
+    """
+    Base class for data items that are expressed as a batched embedding tensor,
+    or a list of embedding tensors (one per item).
+    """
 
     def get_count(self) -> int:
         return len(self.data)
@@ -109,7 +117,7 @@ def __init__(self, data: Sequence[HfAudioItem]) -> None:
 
 class AudioEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "audio")
 
@@ -137,7 +145,7 @@ def get_image_size(self, item_idx: int) -> ImageSize:
 
 class ImageEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "image")
 
@@ -163,7 +171,7 @@ def get_frame_size(self, item_idx: int) -> ImageSize:
 
 class VideoEmbeddingItems(EmbeddingItems):
 
-    def __init__(self, data: NestedTensors) -> None:
+    def __init__(self, data: Union[torch.Tensor, list[torch.Tensor]]) -> None:
         super().__init__(data, "video")
 
@@ -172,8 +180,8 @@ def __init__(self, data: NestedTensors) -> None:
 
 class MultiModalDataItems(UserDict[str, ModalityDataItems[Any, Any]]):
     """
-    As :class:`MultiModalDataDict`, but normalized such that each entry
-    corresponds to a list.
+    As :data:`~vllm.multimodal.inputs.MultiModalDataDict`, but normalized
+    such that each entry corresponds to a list.
     """
 
     def get_count(self, modality: str, *, strict: bool = True) -> int:
@@ -226,7 +234,8 @@ def get_items(
 
 class MultiModalDataParser:
     """
-    Parses :class:`MultiModalDataDict` into :class:`MultiModalDataItems`.
+    Parses :data:`~vllm.multimodal.inputs.MultiModalDataDict` into
+    :class:`MultiModalDataItems`.
 
     Args:
         target_sr (float, optional): Enables automatic resampling of audio
@@ -238,7 +247,9 @@ def __init__(self, *, target_sr: Optional[float] = None) -> None:
 
         self.target_sr = target_sr
 
-    def _is_embeddings(self, data: object) -> TypeGuard[NestedTensors]:
+    def _is_embeddings(
+        self, data: object
+    ) -> TypeGuard[Union[torch.Tensor, list[torch.Tensor]]]:
         if isinstance(data, torch.Tensor):
             return data.ndim == 3
         if is_list_of(data, torch.Tensor):
diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py
index c6a30cacebdd1..07d883d5d7295 100644
--- a/vllm/multimodal/processing.py
+++ b/vllm/multimodal/processing.py
@@ -33,20 +33,24 @@
 
 @dataclass
 class PromptReplacement:
+    """
+    Defines how to replace portions of an input prompt with placeholder tokens.
+    """
+
     modality: str
     """The modality for which the replacement is made."""
 
     target: _PromptSeq
-    """The text or token sequence to find and replace."""
+    """The token sequence (or text) to find and replace."""
 
     replacement: Union[Callable[[int], _PromptSeq],
                        _PromptSeq] = field(repr=False)
     """
-    Given the index of the processed item within :attr:`modality`, output the
-    replacement text or token sequence.
+    Given the index of the processed item within :attr:`modality`,
+    output the replacement token sequence (or text).
 
-    For convenience, you can pass in the replacement instead of a function
-    if it does not depend on the input.
+    For convenience, you can directly pass in the replacement token sequence
+    (or text) instead of a function if it does not depend on the input.
     """
 
     def bind(self, tokenizer: AnyTokenizer) -> "BoundPromptReplacement":
@@ -132,6 +136,11 @@ def token_ids(self) -> list[int]:
 
 @dataclass
 class BoundPromptReplacement:
+    """
+    A :class:`PromptReplacement` bound to a tokenizer to automatically
+    convert :attr:`target` and the result of :meth:`get_replacement` between
+    token sequence and text representations.
+ """ tokenizer: AnyTokenizer = field(repr=False) modality: str @@ -144,6 +153,7 @@ def __post_init__(self) -> None: @property def target(self) -> _BoundPromptSequence: + """The token sequence (or text) to find and replace.""" target = self._target return _BoundPromptSequence( @@ -153,6 +163,10 @@ def target(self) -> _BoundPromptSequence: ) def get_replacement(self, item_idx: int) -> _BoundPromptSequence: + """ + Given the index of the processed item within :attr:`modality`, + output the replacement token sequence (or text). + """ replacement = self._replacement if callable(replacement): cache_key = item_idx @@ -528,7 +542,7 @@ def put( class BaseProcessingInfo: - """Base class containing information to perform processing.""" + """Base class to provide the information necessary for data processing.""" def __init__(self, ctx: InputProcessingContext) -> None: super().__init__() diff --git a/vllm/multimodal/profiling.py b/vllm/multimodal/profiling.py index 2ac3a6bcf3ddd..6f7da1509990f 100644 --- a/vllm/multimodal/profiling.py +++ b/vllm/multimodal/profiling.py @@ -19,7 +19,10 @@ @dataclass class ProcessorInputs: - """Keyword arguments to :meth:`BaseMultiModalProcessor`.""" + """ + Represents the keyword arguments to + :meth:`vllm.multimodal.processing.BaseMultiModalProcessor.apply`. + """ prompt_text: str mm_data: MultiModalDataDict hf_processor_mm_kwargs: Mapping[str, object] = field(default_factory=dict) @@ -47,7 +50,7 @@ def get_dummy_processor_inputs( ) -> ProcessorInputs: """ Build the input which, after processing, results in - `self.info.get_mm_max_tokens_per_item()` placeholder tokens. + :code:`self.info.get_mm_max_tokens_per_item()` placeholder tokens. """ raise NotImplementedError
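
Reviewer note (not part of the patch): the `multi_modal_data` field documented in `index.md` above is the user-facing entry point for everything this PR describes. A minimal sketch of that flow, assuming an image-capable model; the model name, prompt template, and image path below are illustrative placeholders, not values taken from this diff:

```python
# Illustrative only: model name, chat template, and file path are assumptions.
from PIL import Image

from vllm import LLM

llm = LLM(model="llava-hf/llava-1.5-7b-hf")
image = Image.open("example.jpg").convert("RGB")

# Multi-modal data rides alongside the text prompt, as described in index.md.
outputs = llm.generate({
    "prompt": "USER: <image>\nWhat is shown in this image? ASSISTANT:",
    "multi_modal_data": {"image": image},
})
print(outputs[0].outputs[0].text)
```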
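
Likewise, the docstrings added to `PromptReplacement` describe two forms of `replacement`: a constant sequence, or a callable that receives the item index within the modality. A hypothetical sketch of both forms; the modality, target token, and feature sizes here are invented for illustration and are not defined by this patch:

```python
# Hypothetical values for illustration; real processors derive these from the
# model's configuration.
from vllm.multimodal.processing import PromptReplacement

IMAGE_TOKEN = "<image>"
FEATURE_SIZE = 576  # assumed number of placeholder tokens per image

# Constant form: the replacement does not depend on which item is processed.
constant_repl = PromptReplacement(
    modality="image",
    target=IMAGE_TOKEN,
    replacement=IMAGE_TOKEN * FEATURE_SIZE,
)

# Callable form: given the index of the processed item within the modality,
# return its replacement sequence (e.g. when images have different sizes).
per_item_sizes = [576, 1024]
dynamic_repl = PromptReplacement(
    modality="image",
    target=IMAGE_TOKEN,
    replacement=lambda item_idx: IMAGE_TOKEN * per_item_sizes[item_idx],
)
```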