From 26a9f1b5c177eea869a82b7260ddaf8daa1b3fef Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 19 Feb 2025 16:43:11 +0100 Subject: [PATCH 1/5] tmp --- vllm/engine/llm_engine.py | 2 + vllm/entrypoints/llm.py | 5 ++ vllm/executor/uniproc_executor.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 2 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/transformers.py | 60 +++++++++++++++------- vllm/multimodal/base.py | 1 + vllm/multimodal/processing.py | 24 +++++---- vllm/multimodal/registry.py | 2 + vllm/v1/engine/core.py | 6 ++- vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/mm_input_cache.py | 3 ++ vllm/v1/engine/processor.py | 7 ++- vllm/v1/worker/gpu_model_runner.py | 4 ++ vllm/worker/model_runner.py | 18 ++++++- vllm/worker/worker_base.py | 2 + 18 files changed, 109 insertions(+), 32 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2e5bc75c6db38..816baab8c0bd0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,6 +724,7 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None + print("CALL add_request", prompt) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -754,6 +755,7 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) + print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 40b7a529ebfb5..e71ac91e5fcc6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,6 +691,11 @@ def chat( ] tokenizer = self.get_tokenizer() + model_config = self.llm_engine.processor.input_preprocessor.model_config + mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) + processor = mm_processor.info.ctx.get_hf_processor() + chat_template = processor.chat_template + model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( chat_template, diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 94db232240d55..bf693cab5cc7c 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,6 +53,7 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} + # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index bc5856990da6f..b2e4866c6045c 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,6 +317,7 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) + print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 87b7a7631e42e..d8e16e9456986 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,6 +176,7 @@ def call_hf_processor( allow_var_kwargs=True, ) + # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -461,6 
+462,7 @@ def process_input(self, model_config: "ModelConfig", processor, ) + print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ff52dd789125..a5fd00b670de2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,6 +375,7 @@ def forward( "residual": residual }) + # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6a4277adb6bf4..a60e0b83a8fee 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,6 +530,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) + print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 9b456b2489525..f7dfdfa619c88 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -38,7 +38,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsQuant +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData +from vll.Sequence import SequenceData +from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix logger = init_logger(__name__) @@ -119,7 +122,20 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -class TransformersModel(nn.Module, SupportsQuant): +def map_auto_class(config): + AutoModel + + +def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): + assert mm_counts["image"] == 1 + return DummyData( + SequenceData.from_prompt_token_counts((0, 596)), + {"image": np.zeros((3, 336, 336))}, + ) + +@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) +class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" ] # TODO transformers will have a util to get it @@ -132,12 +148,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.vocab_size - self.unpadded_vocab_size = config.vocab_size + self.vocab_size = config.get_text_config().vocab_size + self.unpadded_vocab_size = config.get_text_config().vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, - attn_implementation="vllm", + attn_implementation={"text_config": "vllm", "vision_config": "eager"}, torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) @@ -150,39 +166,42 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances = [ Attention( - num_heads=divide(config.num_attention_heads, tp_size), - head_size=config.head_dim, + 
num_heads=divide(config.get_text_config().num_attention_heads, tp_size), + head_size=config.get_text_config().head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.head_dim**-0.5, - num_kv_heads=divide(config.num_key_value_heads, tp_size), + scale=config.get_text_config().head_dim**-0.5, + num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.num_hidden_layers) + prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, + self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, + config.get_text_config().hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.tie_word_embeddings: + if config.get_text_config().tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, logit_scale) + config.get_text_config().vocab_size, logit_scale) self.sampler = get_sampler() + MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) + InputRegistry()._dummy_factories_by_model_type[model_cls] = factory + def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.base_model_tp_plan is None + if (self.config.get_text_config().base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -190,7 +209,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.base_model_tp_plan.items(): + for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -204,8 +223,8 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.hidden_size, - org_num_embeddings=self.config.vocab_size, + self.config.get_text_config().hidden_size, + org_num_embeddings=self.vocab_size, quant_config=None, ) log_replacement("input embedding", self.model.get_input_embeddings(), @@ -252,7 +271,10 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: - name = f"{self.model.base_model_prefix}.{name}" + if "lm_head" in name: + name = name.replace("language_model.", "") + else: + name = f"{self.model.base_model_prefix}.{name}" if name in params_dict: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index c48d07ba365ba..28abc8b5fe65d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,6 +219,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not 
supports_multimodal(model_cls): return 0 + print("_max_mm_tokens", self._max_mm_tokens, model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fcd02fbd5203c..268e2dc0b19a2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,6 +867,7 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) + print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -995,6 +996,7 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: + print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1039,6 +1041,7 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } + print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1232,14 +1235,16 @@ def apply( else: mm_hashes = None - ( - prompt_ids, - mm_kwargs, - is_repl_applied, - ) = self._cached_apply_hf_processor( - prompt, - mm_items, - hf_processor_mm_kwargs, + # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + # prompt, + # mm_items, + # hf_processor_mm_kwargs, + # ) + prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1279,12 +1284,13 @@ def apply( for modality, placeholders in mm_placeholders.items() } + print("DONE HERE?") return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=mm_hashes, + mm_hashes=None, #mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 613d1db416720..cc91e9d0279df 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,6 +266,7 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) + print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -285,6 +286,7 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) + print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 66e252b7ccb0f..5ccbbd32bd469 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,9 +129,11 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) + # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) + # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -150,7 +152,9 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() + scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests + # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set + # print("self.model_executor", self.model_executor.execute_model) output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c9a4c5369dfd8..c5d03ace73215 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,6 +131,7 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. + print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index a1d802bf818a2..0aaf264ad31d7 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,6 +100,7 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 + # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -118,7 +119,9 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server + print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") + # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b7eee5a39972b..1232579b1b460 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,6 +111,9 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. + # Process inputs. + + # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -155,6 +158,7 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) + print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -187,6 +191,7 @@ def process_inputs( mm_positions, mm_hashes, ) + print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. @@ -214,7 +219,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) + ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 31fe095a91bc0..b86099e780e51 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,6 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) + print("curr_group_outputs", curr_group_outputs.shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -881,6 +882,7 @@ def execute_model( # Prepare the decoder inputs. attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -893,6 +895,7 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens + print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -932,6 +935,7 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) + print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 67d175c373d82..ea05c444efb45 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,6 +542,7 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums + print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -651,10 +652,12 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: + print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: + print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -670,6 +673,7 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) + print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -683,9 +687,11 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return + # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: + print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -693,6 +699,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps + print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -752,12 +759,14 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) + # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) + per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) + per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE + # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) def _use_captured_graph(self, batch_size: int, @@ -977,6 +986,7 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1718,6 +1728,10 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() + if "pixel_values" in multi_modal_kwargs: + print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) + else: + print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 190429074d56c..8984ee83636cb 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,6 +343,7 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) + # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -417,6 +418,7 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() + # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 
a502988faf74c4faddd39e4c351017e4463d4da0 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 21 Feb 2025 17:12:08 +0100 Subject: [PATCH 2/5] dump --- vllm/inputs/registry.py | 1 + vllm/model_executor/models/transformers.py | 266 ++++++++++++++++++--- vllm/multimodal/inputs.py | 9 + vllm/multimodal/processing.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 5 files changed, 247 insertions(+), 34 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d8e16e9456986..2536924a51d1a 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -388,6 +388,7 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") + print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f7dfdfa619c88..02442eea845f8 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -19,7 +19,7 @@ import torch from torch import nn -from transformers import AutoModel, PreTrainedModel +from transformers import AutoModel, PreTrainedModel, LlavaConfig from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention, AttentionMetadata @@ -37,10 +37,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_get_processor +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry, MultiModalKwargs +from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalInputs, PlaceholderRange -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData -from vll.Sequence import SequenceData from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix @@ -122,23 +124,181 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -def map_auto_class(config): - AutoModel +class MultiModalProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + # NOTE: this means we don't check if return config type is same as requested + # VLLM on contrary always checks. In whcih cases we can have different config types tho? 
+ return self.ctx.model_config.hf_config + def get_supported_mm_limits(self): + return {"image": None, "video": None} -def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): - assert mm_counts["image"] == 1 - return DummyData( - SequenceData.from_prompt_token_counts((0, 596)), - {"image": np.zeros((3, 336, 336))}, - ) + def get_mm_max_tokens_per_item(self, seq_len, mm_counts): + return {"image": self.get_max_image_tokens(), "video": 100} + + def get_max_image_tokens(self) -> int: + # Is already an attribute in some VLMs and now reason to make it a required attribute + # TODO: @raushan add it for all VLM configs + return self.get_hf_config().image_seq_length + + def get_hf_processor(self): + processor = cached_get_processor(self.ctx.model_config.model) + return processor + + +class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): + def get_dummy_processor_inputs( + self, + seq_len, + mm_counts, + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + num_frames = 8 + + processor = self.info.get_hf_processor() + image_token = getattr(processor, "image_token", None) + video_token = getattr(processor, "video_token", None) + + # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer + # max features for model in HF side. IMO should be all done on processor side, not on model config + vision_config = self.info.get_hf_config().vision_config + target_width = target_height = vision_config.image_size + + # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, + # HF processor will take the modality needed for model and ignore all others + mm_data = { + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images + ), + "video": self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=num_frames, + num_videos=num_videos, + ) + } + + prompt_text = video_token*num_videos if video_token is not None else image_token*num_images + return ProcessorInputs( + prompt_text=prompt_text, + mm_data=mm_data, + ) + + +class MultiModalProcessor(BaseMultiModalProcessor): + def _get_prompt_replacements( + self, + mm_items, + hf_processor_mm_kwargs, + out_mm_kwargs: MultiModalKwargs, + ): + return + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs, + ): + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + mm_token_type_ids=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + image_embeds=MultiModalFieldConfig.batched("image"), + video_embeds=MultiModalFieldConfig.batched("video"), + ) + + def _apply_hf_processor_text_mm( + self, + prompt_text, + mm_items, + hf_processor_mm_kwargs, + ): + """ + Apply the HF processor on the prompt text and multi-modal data + together. + + In addition, return whether prompt replacements have been applied. 
+ """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + processor_data["return_mm_token_type_ids"] = True + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + ) + print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) + processed_data.update(passthrough_data) + + prompt_ids, = processed_data.pop("input_ids").tolist() + mm_token_type_ids = processed_data.pop("mm_token_type_ids") -@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + ) + + return prompt_ids, mm_kwargs, mm_token_type_ids + + def apply( + self, + prompt, + mm_data, + hf_processor_mm_kwargs, + ) -> MultiModalInputs: + """ + Process multi-modal inputs to be used in vLLM. + + Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + """ + mm_items = self._to_mm_items(mm_data) + prompt_ids, mm_kwargs, mm_token_type_ids = self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + # HF processor will return `mm_token_type_ids` from which + # we can infer mm_placeholders. Until then hardcode to make code run + # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 + mm_positions = torch.where(mm_token_type_ids == 1)[1] + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + mm_tokens_per_modality = hf_processor._get_num_mm_tokens( + image_inputs=mm_kwargs.get_hf_inputs("image"), + video_inputs=mm_kwargs.get_hf_inputs("video"), + ) + + mm_placeholders = {} + for modality in mm_tokens_per_modality: + split_sizes = mm_tokens_per_modality[modality] + if split_sizes != 0: + chunked_mm_positions = torch.split(mm_positions, split_sizes) + ranges = [ + PlaceholderRange(offset=positions[0].item(), length=positions.shape[0]) + for positions in chunked_mm_positions + ] + mm_placeholders = {modality: ranges} + + return MultiModalInputs( + type="multimodal", + prompt=prompt, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_hashes=None, + mm_placeholders=mm_placeholders, + ) + + +@MULTIMODAL_REGISTRY.register_processor(MultiModalProcessor, + info=MultiModalProcessingInfo, + dummy_inputs=MultiModalDummyInputsBuilder) class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] - embedding_modules = ["embed_tokens" - ] # TODO transformers will have a util to get it + embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -148,8 +308,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.get_text_config().vocab_size - self.unpadded_vocab_size = config.get_text_config().vocab_size + self.text_config = config.get_text_config() + self.vocab_size = self.text_config.vocab_size + self.unpadded_vocab_size = self.text_config.vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, @@ -166,31 +327,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances 
= [ Attention( - num_heads=divide(config.get_text_config().num_attention_heads, tp_size), - head_size=config.get_text_config().head_dim, + num_heads=divide(self.text_config.num_attention_heads, tp_size), + head_size=self.text_config.head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.get_text_config().head_dim**-0.5, - num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), + scale=self.text_config.head_dim**-0.5, + num_kv_heads=divide(self.text_config.num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) + prefix=f"{i}.attn") for i in range(self.text_config.num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, - config.get_text_config().hidden_size, + self.lm_head = ParallelLMHead(self.text_config.vocab_size, + self.text_config.hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.get_text_config().tie_word_embeddings: + if self.text_config.tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.get_text_config().vocab_size, logit_scale) + self.vocab_size, logit_scale) self.sampler = get_sampler() MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) @@ -201,7 +362,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.get_text_config().base_model_tp_plan is None + if (self.text_config.base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -209,7 +370,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): + for pattern, style in self.text_config.base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -223,7 +384,7 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.get_text_config().hidden_size, + self.text_config.hidden_size, org_num_embeddings=self.vocab_size, quant_config=None, ) @@ -241,7 +402,8 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model( - input_ids[None, ...], + input_ids[None, ...] if input_ids is not None else None, + inputs_embeds=inputs_embeds[None, ...] 
if inputs_embeds is not None else None, use_cache=False, position_ids=positions[None, ...], attn_metadata=attn_metadata, @@ -271,6 +433,8 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: + # In MLLM the head is usually part of the LM so we might want to strip it + # Very bad workaround, needs smth better if "lm_head" in name: name = name.replace("language_model.", "") else: @@ -282,3 +446,43 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + def get_multimodal_embeddings(self, **kwargs): + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + vision_embeddings = self.model.get_image_features( + # Thing about pixels being batched again, adding extra dim + # TODO: find out do we really need that extra dim + pixel_values.flatten(0, 1), + vision_feature_layer=self.config.vision_feature_layer, + vision_feature_select_strategy=self.config.vision_feature_select_strategy, + ) + return vision_embeddings + + if image_embeds is not None: + return image_embeds + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings()(input_ids) + if multimodal_embeddings is not None: + # most supported VLMs merge like this, otherwise we can add a special + # `merge_multimodal_embeddings` method on HF side + mask = (input_ids == self.config.image_token_index) + mask = mask.unsqueeze(-1).expand_as(inputs_embeds) + multimodal_embeddings = torch.cat(multimodal_embeddings) + + # FIXME: The returned multimodal_embeddings must be either a 3D torch.Tensor of shape + # (num_items, feature_size, hidden_size), or a list / tuple of 2D torch.Tensor’s of shape + # (feature_size, hidden_size), so that multimodal_embeddings[i] retrieves the embeddings generated + # from the i-th multimodal data item (e.g, image) of the request. 
+ inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) + return inputs_embeds \ No newline at end of file diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index e93fa24a6e4dc..3d12f01fb6f15 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -702,6 +702,15 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_hf_inputs(self, modality: str) -> dict[str, NestedTensors]: + modality_items = self._items_by_modality.get(modality, None) + hf_inputs = defaultdict[str, list[NestedTensors]](list) + if modality_items is not None: + for mm_kwargs_item in modality_items: + for key, value in mm_kwargs_item.items(): + hf_inputs[key].append(value.data) + hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()} + return hf_inputs MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 268e2dc0b19a2..c0b2b597f5292 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,7 +867,6 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -1284,7 +1283,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?") + print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b86099e780e51..4a07ef32a4b85 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. 
curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs.shape) + print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) From e0b534beb59405f76282e3910dca93d39930c450 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:02:45 +0100 Subject: [PATCH 3/5] clean up --- vllm/inputs/preprocess.py | 1 - vllm/inputs/registry.py | 3 --- vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/llava.py | 1 - vllm/model_executor/models/transformers.py | 1 - vllm/multimodal/base.py | 1 - vllm/multimodal/processing.py | 19 +++++-------------- vllm/multimodal/registry.py | 2 -- vllm/v1/engine/core.py | 6 +----- vllm/v1/engine/llm_engine.py | 1 - vllm/v1/engine/mm_input_cache.py | 3 --- vllm/v1/engine/processor.py | 4 ---- vllm/v1/worker/gpu_model_runner.py | 3 --- vllm/worker/model_runner.py | 18 ++---------------- 14 files changed, 8 insertions(+), 56 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2e4866c6045c..bc5856990da6f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,7 +317,6 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) - print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2536924a51d1a..87b7a7631e42e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,7 +176,6 @@ def call_hf_processor( allow_var_kwargs=True, ) - # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -388,7 +387,6 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") - print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( @@ -463,7 +461,6 @@ def process_input(self, model_config: "ModelConfig", processor, ) - print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a5fd00b670de2..2ff52dd789125 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,7 +375,6 @@ def forward( "residual": residual }) - # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a60e0b83a8fee..6a4277adb6bf4 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,7 +530,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) - print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 02442eea845f8..59a2a1e69b455 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -230,7 +230,6 @@ def _apply_hf_processor_text_mm( 
mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 28abc8b5fe65d..c48d07ba365ba 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,7 +219,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not supports_multimodal(model_cls): return 0 - print("_max_mm_tokens", self._max_mm_tokens, model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index c0b2b597f5292..3415beece53c0 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -995,7 +995,6 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1040,7 +1039,6 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } - print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1234,16 +1232,10 @@ def apply( else: mm_hashes = None - # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( - # prompt, - # mm_items, - # hf_processor_mm_kwargs, - # ) - prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( - prompt=prompt, - mm_items=mm_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_replacement=True, + prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1283,13 +1275,12 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=None, #mm_hashes, + mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index cc91e9d0279df..613d1db416720 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,7 +266,6 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) - print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -286,7 +285,6 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) - print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5ccbbd32bd469..66e252b7ccb0f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,11 +129,9 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) - # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) - # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -152,9 +150,7 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests - # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set - # print("self.model_executor", self.model_executor.execute_model) + scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c5d03ace73215..c9a4c5369dfd8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,7 +131,6 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 0aaf264ad31d7..a1d802bf818a2 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,7 +100,6 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 - # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -119,9 +118,7 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server - print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") - # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 1232579b1b460..908204adf7236 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -112,8 +112,6 @@ def process_inputs( # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. # Process inputs. - - # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -158,7 +156,6 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) - print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -191,7 +188,6 @@ def process_inputs( mm_positions, mm_hashes, ) - print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4a07ef32a4b85..16ec44eefacb1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,6 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -895,7 +894,6 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens - print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -935,7 +933,6 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) - print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ea05c444efb45..67d175c373d82 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,7 +542,6 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums - print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -652,12 +651,10 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: - print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: - print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -673,7 +670,6 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) - print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -687,11 +683,9 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: - print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -699,7 +693,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps - print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -759,14 +752,12 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) - # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? + per_seq_fn(inter_data, seq_idx, seq_group_metadata) for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE - # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) + per_seq_group_fn(inter_data, seq_group_metadata) def _use_captured_graph(self, batch_size: int, @@ -986,7 +977,6 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1728,10 +1718,6 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - if "pixel_values" in multi_modal_kwargs: - print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) - else: - print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): From 7e8f0d8a0ed0d17696b9a5628915b7dbe3041814 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:06:16 +0100 Subject: [PATCH 4/5] clean up 2 --- vllm/engine/llm_engine.py | 2 -- vllm/entrypoints/llm.py | 4 ---- vllm/executor/uniproc_executor.py | 1 - vllm/multimodal/processing.py | 6 +++++- vllm/v1/engine/processor.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 1 - vllm/worker/worker_base.py | 2 -- 7 files changed, 6 insertions(+), 13 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 816baab8c0bd0..2e5bc75c6db38 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,7 +724,6 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None - print("CALL add_request", prompt) if lora_request is not None and not 
self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -755,7 +754,6 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e71ac91e5fcc6..075ef3e59d885 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,10 +691,6 @@ def chat( ] tokenizer = self.get_tokenizer() - model_config = self.llm_engine.processor.input_preprocessor.model_config - mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) - processor = mm_processor.info.ctx.get_hf_processor() - chat_template = processor.chat_template model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index bf693cab5cc7c..94db232240d55 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,7 +53,6 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} - # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3415beece53c0..fcd02fbd5203c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1232,7 +1232,11 @@ def apply( else: mm_hashes = None - prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + ( + prompt_ids, + mm_kwargs, + is_repl_applied, + ) = self._cached_apply_hf_processor( prompt, mm_items, hf_processor_mm_kwargs, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 908204adf7236..b7eee5a39972b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,7 +111,6 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. - # Process inputs. preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -215,7 +214,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient + ) else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 16ec44eefacb1..31fe095a91bc0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -881,7 +881,6 @@ def execute_model( # Prepare the decoder inputs. 
attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) - num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 8984ee83636cb..190429074d56c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,7 +343,6 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) - # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -418,7 +417,6 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 57c2d85cfae0dcd32c9de46806846e4552cf38d6 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:28:24 +0100 Subject: [PATCH 5/5] use arbitrary high resolution in dummy inputs --- vllm/model_executor/models/transformers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 59a2a1e69b455..b1a6fa0d95a59 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -161,9 +161,9 @@ def get_dummy_processor_inputs( video_token = getattr(processor, "video_token", None) # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer - # max features for model in HF side. IMO should be all done on processor side, not on model config - vision_config = self.info.get_hf_config().vision_config - target_width = target_height = vision_config.image_size + # max features for model in HF side. But imo we can just set a veru high resolution + # and the processor will return us pixels with correct max shape. Resolution 3kx3k is high enough + target_width = target_height = 3000 # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, # HF processor will take the modality needed for model and ignore all others @@ -353,9 +353,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.vocab_size, logit_scale) self.sampler = get_sampler() - MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) - InputRegistry()._dummy_factories_by_model_type[model_cls] = factory - def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module.
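
The regrouping performed by the MultiModalKwargs.get_hf_inputs helper added in PATCH 2/5 can be illustrated with a minimal standalone sketch; the per-item dicts and tensor shapes below are made up for illustration and only plain torch is used:

    # Sketch of the regrouping in get_hf_inputs: per-item kwargs are collected
    # per key and stacked back into HF-style batched tensors.
    from collections import defaultdict
    import torch

    items = [  # hypothetical per-image kwargs produced by the HF processor
        {"pixel_values": torch.randn(3, 336, 336)},
        {"pixel_values": torch.randn(3, 336, 336)},
    ]
    hf_inputs = defaultdict(list)
    for item in items:
        for key, value in item.items():
            hf_inputs[key].append(value)
    hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()}
    print(hf_inputs["pixel_values"].shape)  # torch.Size([2, 3, 336, 336])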
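
The placeholder computation in MultiModalProcessor.apply (PATCH 2/5) turns the positions flagged by mm_token_type_ids into one (offset, length) range per multimodal item. A minimal sketch of that step, assuming a hypothetical 12-token prompt containing two images with 3 and 4 placeholder tokens (batch size is always 1 in the patch):

    # Sketch: recover placeholder ranges from mm_token_type_ids.
    import torch

    mm_token_type_ids = torch.tensor([[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0]])
    split_sizes = [3, 4]  # placeholder tokens per image, as the processor would report

    mm_positions = torch.where(mm_token_type_ids == 1)[1]
    chunked = torch.split(mm_positions, split_sizes)
    ranges = [
        {"offset": pos[0].item(), "length": pos.shape[0]}
        for pos in chunked
    ]
    print(ranges)  # [{'offset': 1, 'length': 3}, {'offset': 6, 'length': 4}]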
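
The embedding merge in get_input_embeddings (PATCH 2/5) overwrites image-token positions of the text embeddings with the flattened vision embeddings via masked_scatter. A minimal sketch with small random tensors, assuming a hypothetical image token id of 32000:

    # Sketch: merge vision embeddings into text embeddings at placeholder positions.
    import torch

    hidden_size = 8
    image_token_index = 32000  # hypothetical placeholder token id
    input_ids = torch.tensor([1, 32000, 32000, 5, 6])
    inputs_embeds = torch.randn(input_ids.shape[0], hidden_size)
    image_embeds = torch.randn(2, hidden_size)  # one embedding per placeholder token

    mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
    merged = inputs_embeds.masked_scatter(mask, image_embeds)
    assert torch.equal(merged[1], image_embeds[0])
    assert torch.equal(merged[2], image_embeds[1])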