From 26a9f1b5c177eea869a82b7260ddaf8daa1b3fef Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 19 Feb 2025 16:43:11 +0100 Subject: [PATCH 1/5] tmp --- vllm/engine/llm_engine.py | 2 + vllm/entrypoints/llm.py | 5 ++ vllm/executor/uniproc_executor.py | 1 + vllm/inputs/preprocess.py | 1 + vllm/inputs/registry.py | 2 + vllm/model_executor/models/llama.py | 1 + vllm/model_executor/models/llava.py | 1 + vllm/model_executor/models/transformers.py | 60 +++++++++++++++------- vllm/multimodal/base.py | 1 + vllm/multimodal/processing.py | 24 +++++---- vllm/multimodal/registry.py | 2 + vllm/v1/engine/core.py | 6 ++- vllm/v1/engine/llm_engine.py | 1 + vllm/v1/engine/mm_input_cache.py | 3 ++ vllm/v1/engine/processor.py | 7 ++- vllm/v1/worker/gpu_model_runner.py | 4 ++ vllm/worker/model_runner.py | 18 ++++++- vllm/worker/worker_base.py | 2 + 18 files changed, 109 insertions(+), 32 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 2e5bc75c6db38..816baab8c0bd0 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,6 +724,7 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None + print("CALL add_request", prompt) if lora_request is not None and not self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -754,6 +755,7 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) + print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 40b7a529ebfb5..e71ac91e5fcc6 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,6 +691,11 @@ def chat( ] tokenizer = self.get_tokenizer() + model_config = self.llm_engine.processor.input_preprocessor.model_config + mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) + processor = mm_processor.info.ctx.get_hf_processor() + chat_template = processor.chat_template + model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( chat_template, diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index 94db232240d55..bf693cab5cc7c 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,6 +53,7 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} + # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index bc5856990da6f..b2e4866c6045c 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,6 +317,7 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) + print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 87b7a7631e42e..d8e16e9456986 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,6 +176,7 @@ def call_hf_processor( allow_var_kwargs=True, ) + # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -461,6 
+462,7 @@ def process_input(self, model_config: "ModelConfig", processor, ) + print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 2ff52dd789125..a5fd00b670de2 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,6 +375,7 @@ def forward( "residual": residual }) + # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 6a4277adb6bf4..a60e0b83a8fee 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,6 +530,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) + print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 9b456b2489525..f7dfdfa619c88 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -38,7 +38,10 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors -from .interfaces import SupportsQuant +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry +from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData +from vll.Sequence import SequenceData +from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix logger = init_logger(__name__) @@ -119,7 +122,20 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -class TransformersModel(nn.Module, SupportsQuant): +def map_auto_class(config): + AutoModel + + +def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): + assert mm_counts["image"] == 1 + return DummyData( + SequenceData.from_prompt_token_counts((0, 596)), + {"image": np.zeros((3, 336, 336))}, + ) + +@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) +@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) +class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] embedding_modules = ["embed_tokens" ] # TODO transformers will have a util to get it @@ -132,12 +148,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.vocab_size - self.unpadded_vocab_size = config.vocab_size + self.vocab_size = config.get_text_config().vocab_size + self.unpadded_vocab_size = config.get_text_config().vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, - attn_implementation="vllm", + attn_implementation={"text_config": "vllm", "vision_config": "eager"}, torch_dtype=vllm_config.model_config.dtype, trust_remote_code=vllm_config.model_config.trust_remote_code, ) @@ -150,39 +166,42 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances = [ Attention( - num_heads=divide(config.num_attention_heads, tp_size), - head_size=config.head_dim, + 
num_heads=divide(config.get_text_config().num_attention_heads, tp_size), + head_size=config.get_text_config().head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.head_dim**-0.5, - num_kv_heads=divide(config.num_key_value_heads, tp_size), + scale=config.get_text_config().head_dim**-0.5, + num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.num_hidden_layers) + prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.vocab_size, - config.hidden_size, + self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, + config.get_text_config().hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.tie_word_embeddings: + if config.get_text_config().tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.vocab_size, logit_scale) + config.get_text_config().vocab_size, logit_scale) self.sampler = get_sampler() + MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) + InputRegistry()._dummy_factories_by_model_type[model_cls] = factory + def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.base_model_tp_plan is None + if (self.config.get_text_config().base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -190,7 +209,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.base_model_tp_plan.items(): + for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -204,8 +223,8 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.hidden_size, - org_num_embeddings=self.config.vocab_size, + self.config.get_text_config().hidden_size, + org_num_embeddings=self.vocab_size, quant_config=None, ) log_replacement("input embedding", self.model.get_input_embeddings(), @@ -252,7 +271,10 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: - name = f"{self.model.base_model_prefix}.{name}" + if "lm_head" in name: + name = name.replace("language_model.", "") + else: + name = f"{self.model.base_model_prefix}.{name}" if name in params_dict: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index c48d07ba365ba..28abc8b5fe65d 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,6 +219,7 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not 
supports_multimodal(model_cls): return 0 + print("_max_mm_tokens", self._max_mm_tokens, model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index fcd02fbd5203c..268e2dc0b19a2 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,6 +867,7 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) + print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -995,6 +996,7 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: + print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1039,6 +1041,7 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } + print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1232,14 +1235,16 @@ def apply( else: mm_hashes = None - ( - prompt_ids, - mm_kwargs, - is_repl_applied, - ) = self._cached_apply_hf_processor( - prompt, - mm_items, - hf_processor_mm_kwargs, + # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + # prompt, + # mm_items, + # hf_processor_mm_kwargs, + # ) + prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( + prompt=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + enable_hf_prompt_replacement=True, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1279,12 +1284,13 @@ def apply( for modality, placeholders in mm_placeholders.items() } + print("DONE HERE?") return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=mm_hashes, + mm_hashes=None, #mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 613d1db416720..cc91e9d0279df 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,6 +266,7 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) + print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -285,6 +286,7 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) + print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 66e252b7ccb0f..5ccbbd32bd469 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,9 +129,11 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) + # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) + # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -150,7 +152,9 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() + scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests + # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set + # print("self.model_executor", self.model_executor.execute_model) output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c9a4c5369dfd8..c5d03ace73215 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,6 +131,7 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. + print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index a1d802bf818a2..0aaf264ad31d7 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,6 +100,7 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 + # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -118,7 +119,9 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server + print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") + # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index b7eee5a39972b..1232579b1b460 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,6 +111,9 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. + # Process inputs. + + # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -155,6 +158,7 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) + print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -187,6 +191,7 @@ def process_inputs( mm_positions, mm_hashes, ) + print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. @@ -214,7 +219,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) + ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 31fe095a91bc0..b86099e780e51 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,6 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) + print("curr_group_outputs", curr_group_outputs.shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -881,6 +882,7 @@ def execute_model( # Prepare the decoder inputs. attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) + num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): @@ -893,6 +895,7 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens + print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -932,6 +935,7 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) + print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 67d175c373d82..ea05c444efb45 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,6 +542,7 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums + print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -651,10 +652,12 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: + print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: + print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -670,6 +673,7 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) + print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -683,9 +687,11 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return + # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: + print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -693,6 +699,7 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps + print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -752,12 +759,14 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) + # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) + per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) + per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE + # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) def _use_captured_graph(self, batch_size: int, @@ -977,6 +986,7 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1718,6 +1728,10 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() + if "pixel_values" in multi_modal_kwargs: + print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) + else: + print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 190429074d56c..8984ee83636cb 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,6 +343,7 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) + # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -417,6 +418,7 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() + # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 
a502988faf74c4faddd39e4c351017e4463d4da0 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 21 Feb 2025 17:12:08 +0100 Subject: [PATCH 2/5] dump --- vllm/inputs/registry.py | 1 + vllm/model_executor/models/transformers.py | 266 ++++++++++++++++++--- vllm/multimodal/inputs.py | 9 + vllm/multimodal/processing.py | 3 +- vllm/v1/worker/gpu_model_runner.py | 2 +- 5 files changed, 247 insertions(+), 34 deletions(-) diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index d8e16e9456986..2536924a51d1a 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -388,6 +388,7 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") + print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index f7dfdfa619c88..02442eea845f8 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -19,7 +19,7 @@ import torch from torch import nn -from transformers import AutoModel, PreTrainedModel +from transformers import AutoModel, PreTrainedModel, LlavaConfig from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from vllm.attention import Attention, AttentionMetadata @@ -37,10 +37,12 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.processor import cached_get_processor +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry, MultiModalKwargs +from vllm.multimodal.processing import BaseMultiModalProcessor, BaseProcessingInfo +from vllm.multimodal.profiling import BaseDummyInputsBuilder, ProcessorInputs +from vllm.multimodal.inputs import MultiModalFieldConfig, MultiModalInputs, PlaceholderRange -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry -from vllm.inputs import InputRegistry, INPUT_REGISTRY, DummyData -from vll.Sequence import SequenceData from .interfaces import SupportsQuant, SupportsMultiModal from .utils import maybe_prefix @@ -122,23 +124,181 @@ def forward(self, input: torch.Tensor) -> torch.Tensor: ) -def map_auto_class(config): - AutoModel +class MultiModalProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + # NOTE: this means we don't check if return config type is same as requested + # VLLM on contrary always checks. In whcih cases we can have different config types tho? 
+ return self.ctx.model_config.hf_config + def get_supported_mm_limits(self): + return {"image": None, "video": None} -def dummy_encoder_data_for_whisper(ctx, seq_len: int, mm_counts): - assert mm_counts["image"] == 1 - return DummyData( - SequenceData.from_prompt_token_counts((0, 596)), - {"image": np.zeros((3, 336, 336))}, - ) + def get_mm_max_tokens_per_item(self, seq_len, mm_counts): + return {"image": self.get_max_image_tokens(), "video": 100} + + def get_max_image_tokens(self) -> int: + # Is already an attribute in some VLMs and now reason to make it a required attribute + # TODO: @raushan add it for all VLM configs + return self.get_hf_config().image_seq_length + + def get_hf_processor(self): + processor = cached_get_processor(self.ctx.model_config.model) + return processor + + +class MultiModalDummyInputsBuilder(BaseDummyInputsBuilder): + def get_dummy_processor_inputs( + self, + seq_len, + mm_counts, + ) -> ProcessorInputs: + num_images = mm_counts.get("image", 0) + num_videos = mm_counts.get("video", 0) + num_frames = 8 + + processor = self.info.get_hf_processor() + image_token = getattr(processor, "image_token", None) + video_token = getattr(processor, "video_token", None) + + # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer + # max features for model in HF side. IMO should be all done on processor side, not on model config + vision_config = self.info.get_hf_config().vision_config + target_width = target_height = vision_config.image_size + + # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, + # HF processor will take the modality needed for model and ignore all others + mm_data = { + "image": self._get_dummy_images( + width=target_width, + height=target_height, + num_images=num_images + ), + "video": self._get_dummy_videos( + width=target_width, + height=target_height, + num_frames=num_frames, + num_videos=num_videos, + ) + } + + prompt_text = video_token*num_videos if video_token is not None else image_token*num_images + return ProcessorInputs( + prompt_text=prompt_text, + mm_data=mm_data, + ) + + +class MultiModalProcessor(BaseMultiModalProcessor): + def _get_prompt_replacements( + self, + mm_items, + hf_processor_mm_kwargs, + out_mm_kwargs: MultiModalKwargs, + ): + return + + def _get_mm_fields_config( + self, + hf_inputs, + hf_processor_mm_kwargs, + ): + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + mm_token_type_ids=MultiModalFieldConfig.batched("image"), + pixel_values_videos=MultiModalFieldConfig.batched("video"), + image_embeds=MultiModalFieldConfig.batched("image"), + video_embeds=MultiModalFieldConfig.batched("video"), + ) + + def _apply_hf_processor_text_mm( + self, + prompt_text, + mm_items, + hf_processor_mm_kwargs, + ): + """ + Apply the HF processor on the prompt text and multi-modal data + together. + + In addition, return whether prompt replacements have been applied. 
+ """ + processor_data, passthrough_data = self._get_hf_mm_data(mm_items) + processor_data["return_mm_token_type_ids"] = True + + processed_data = self._call_hf_processor( + prompt=prompt_text, + mm_data=processor_data, + mm_kwargs=hf_processor_mm_kwargs, + ) + print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) + processed_data.update(passthrough_data) + + prompt_ids, = processed_data.pop("input_ids").tolist() + mm_token_type_ids = processed_data.pop("mm_token_type_ids") -@INPUT_REGISTRY.register_dummy_encoder_data(dummy_encoder_data_for_whisper) -@MULTIMODAL_REGISTRY.register_max_multimodal_tokens("image", 576) + mm_kwargs = MultiModalKwargs.from_hf_inputs( + processed_data, + self._get_mm_fields_config(processed_data, hf_processor_mm_kwargs), + ) + + return prompt_ids, mm_kwargs, mm_token_type_ids + + def apply( + self, + prompt, + mm_data, + hf_processor_mm_kwargs, + ) -> MultiModalInputs: + """ + Process multi-modal inputs to be used in vLLM. + + Apply HF Processor on prompt text and multi-modal data together, + outputting token IDs and processed tensors. + """ + mm_items = self._to_mm_items(mm_data) + prompt_ids, mm_kwargs, mm_token_type_ids = self._apply_hf_processor_text_mm( + prompt_text=prompt, + mm_items=mm_items, + hf_processor_mm_kwargs=hf_processor_mm_kwargs, + ) + + # HF processor will return `mm_token_type_ids` from which + # we can infer mm_placeholders. Until then hardcode to make code run + # Below tested on Llava. Prompts and `mm_token_type_ids` are always bs=1 + mm_positions = torch.where(mm_token_type_ids == 1)[1] + hf_processor = self.info.get_hf_processor(**hf_processor_mm_kwargs) + mm_tokens_per_modality = hf_processor._get_num_mm_tokens( + image_inputs=mm_kwargs.get_hf_inputs("image"), + video_inputs=mm_kwargs.get_hf_inputs("video"), + ) + + mm_placeholders = {} + for modality in mm_tokens_per_modality: + split_sizes = mm_tokens_per_modality[modality] + if split_sizes != 0: + chunked_mm_positions = torch.split(mm_positions, split_sizes) + ranges = [ + PlaceholderRange(offset=positions[0].item(), length=positions.shape[0]) + for positions in chunked_mm_positions + ] + mm_placeholders = {modality: ranges} + + return MultiModalInputs( + type="multimodal", + prompt=prompt, + prompt_token_ids=prompt_ids, + mm_kwargs=mm_kwargs, + mm_hashes=None, + mm_placeholders=mm_placeholders, + ) + + +@MULTIMODAL_REGISTRY.register_processor(MultiModalProcessor, + info=MultiModalProcessingInfo, + dummy_inputs=MultiModalDummyInputsBuilder) class TransformersModel(nn.Module, SupportsQuant, SupportsMultiModal): embedding_padding_modules = ["lm_head"] - embedding_modules = ["embed_tokens" - ] # TODO transformers will have a util to get it + embedding_modules = ["embed_tokens"] # TODO transformers will have a util to get it def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super().__init__() @@ -148,8 +308,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: cache_config = vllm_config.cache_config self.config = config - self.vocab_size = config.get_text_config().vocab_size - self.unpadded_vocab_size = config.get_text_config().vocab_size + self.text_config = config.get_text_config() + self.vocab_size = self.text_config.vocab_size + self.unpadded_vocab_size = self.text_config.vocab_size self.model: PreTrainedModel = AutoModel.from_config( self.config, @@ -166,31 +327,31 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: tp_size = get_tensor_model_parallel_world_size() self.attention_instances 
= [ Attention( - num_heads=divide(config.get_text_config().num_attention_heads, tp_size), - head_size=config.get_text_config().head_dim, + num_heads=divide(self.text_config.num_attention_heads, tp_size), + head_size=self.text_config.head_dim, # NOTE: We use Llama scale as default, if it's set by # Transformers, it's updated in vllm_flash_attention_forward - scale=config.get_text_config().head_dim**-0.5, - num_kv_heads=divide(config.get_text_config().num_key_value_heads, tp_size), + scale=self.text_config.head_dim**-0.5, + num_kv_heads=divide(self.text_config.num_key_value_heads, tp_size), cache_config=cache_config, quant_config=self.quant_config, - prefix=f"{i}.attn") for i in range(config.get_text_config().num_hidden_layers) + prefix=f"{i}.attn") for i in range(self.text_config.num_hidden_layers) ] # Model modifications self.replace_vocab_embed_class(self.model) # ForCausalLM modifications - self.lm_head = ParallelLMHead(config.get_text_config().vocab_size, - config.get_text_config().hidden_size, + self.lm_head = ParallelLMHead(self.text_config.vocab_size, + self.text_config.hidden_size, quant_config=self.quant_config, prefix=maybe_prefix(prefix, "lm_head")) - if config.get_text_config().tie_word_embeddings: + if self.text_config.tie_word_embeddings: self.lm_head.weight = self.model.get_input_embeddings().weight logit_scale = getattr(config, "logit_scale", 1.0) self.logits_processor = LogitsProcessor(self.unpadded_vocab_size, - config.get_text_config().vocab_size, logit_scale) + self.vocab_size, logit_scale) self.sampler = get_sampler() MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) @@ -201,7 +362,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): Apply the base model tensor parallelization plan to a module. Currently only supports linear layers. """ - if (self.config.get_text_config().base_model_tp_plan is None + if (self.text_config.base_model_tp_plan is None and get_tensor_model_parallel_world_size() > 1): raise ValueError( "Trying to run tensor parallelization but the model does not " @@ -209,7 +370,7 @@ def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): for child_name, child_module in module.named_children(): qual_name = maybe_prefix(prefix, child_name) - for pattern, style in self.config.get_text_config().base_model_tp_plan.items(): + for pattern, style in self.text_config.base_model_tp_plan.items(): if re.match(pattern, qual_name) and isinstance( child_module, nn.Linear): new_module = replace_linear_class(child_module, style, @@ -223,7 +384,7 @@ def replace_vocab_embed_class(self, module: nn.Module): # Use native set input embeddings new_module = VocabParallelEmbedding( self.vocab_size, - self.config.get_text_config().hidden_size, + self.text_config.hidden_size, org_num_embeddings=self.vocab_size, quant_config=None, ) @@ -241,7 +402,8 @@ def forward( inputs_embeds: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, IntermediateTensors]: model_output = self.model( - input_ids[None, ...], + input_ids[None, ...] if input_ids is not None else None, + inputs_embeds=inputs_embeds[None, ...] 
if inputs_embeds is not None else None, use_cache=False, position_ids=positions[None, ...], attn_metadata=attn_metadata, @@ -271,6 +433,8 @@ def load_weights(self, weights: Iterable[tuple[str, loaded_params = set[str]() for name, loaded_weight in weights: if name not in params_dict: + # In MLLM the head is usually part of the LM so we might want to strip it + # Very bad workaround, needs smth better if "lm_head" in name: name = name.replace("language_model.", "") else: @@ -282,3 +446,43 @@ def load_weights(self, weights: Iterable[tuple[str, weight_loader(param, loaded_weight) loaded_params.add(name) return loaded_params + + def get_multimodal_embeddings(self, **kwargs): + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None: + vision_embeddings = self.model.get_image_features( + # Thing about pixels being batched again, adding extra dim + # TODO: find out do we really need that extra dim + pixel_values.flatten(0, 1), + vision_feature_layer=self.config.vision_feature_layer, + vision_feature_select_strategy=self.config.vision_feature_select_strategy, + ) + return vision_embeddings + + if image_embeds is not None: + return image_embeds + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings = None, + ) -> torch.Tensor: + inputs_embeds = self.model.get_input_embeddings()(input_ids) + if multimodal_embeddings is not None: + # most supported VLMs merge like this, otherwise we can add a special + # `merge_multimodal_embeddings` method on HF side + mask = (input_ids == self.config.image_token_index) + mask = mask.unsqueeze(-1).expand_as(inputs_embeds) + multimodal_embeddings = torch.cat(multimodal_embeddings) + + # FIXME: The returned multimodal_embeddings must be either a 3D torch.Tensor of shape + # (num_items, feature_size, hidden_size), or a list / tuple of 2D torch.Tensor’s of shape + # (feature_size, hidden_size), so that multimodal_embeddings[i] retrieves the embeddings generated + # from the i-th multimodal data item (e.g, image) of the request. 
+ inputs_embeds = inputs_embeds.masked_scatter(mask, multimodal_embeddings) + return inputs_embeds \ No newline at end of file diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py index e93fa24a6e4dc..3d12f01fb6f15 100644 --- a/vllm/multimodal/inputs.py +++ b/vllm/multimodal/inputs.py @@ -702,6 +702,15 @@ def get_items(self, modality: str) -> Sequence[MultiModalKwargsItem]: self._validate_modality("get_items", modality) return self._items_by_modality[modality] + def get_hf_inputs(self, modality: str) -> dict[str, NestedTensors]: + modality_items = self._items_by_modality.get(modality, None) + hf_inputs = defaultdict[str, list[NestedTensors]](list) + if modality_items is not None: + for mm_kwargs_item in modality_items: + for key, value in mm_kwargs_item.items(): + hf_inputs[key].append(value.data) + hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()} + return hf_inputs MultiModalPlaceholderDict = Mapping[str, Sequence[PlaceholderRange]] """ diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 268e2dc0b19a2..c0b2b597f5292 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -867,7 +867,6 @@ def _apply_hf_processor_text_mm( mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data.keys()) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() @@ -1284,7 +1283,7 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?") + print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index b86099e780e51..4a07ef32a4b85 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,7 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. 
curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs.shape) + print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) From e0b534beb59405f76282e3910dca93d39930c450 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:02:45 +0100 Subject: [PATCH 3/5] clean up --- vllm/inputs/preprocess.py | 1 - vllm/inputs/registry.py | 3 --- vllm/model_executor/models/llama.py | 1 - vllm/model_executor/models/llava.py | 1 - vllm/model_executor/models/transformers.py | 1 - vllm/multimodal/base.py | 1 - vllm/multimodal/processing.py | 19 +++++-------------- vllm/multimodal/registry.py | 2 -- vllm/v1/engine/core.py | 6 +----- vllm/v1/engine/llm_engine.py | 1 - vllm/v1/engine/mm_input_cache.py | 3 --- vllm/v1/engine/processor.py | 4 ---- vllm/v1/worker/gpu_model_runner.py | 3 --- vllm/worker/model_runner.py | 18 ++---------------- 14 files changed, 8 insertions(+), 56 deletions(-) diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index b2e4866c6045c..bc5856990da6f 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -317,7 +317,6 @@ def _prompt_to_llm_inputs( * :class:`SingletonInputs` instance """ parsed = parse_singleton_prompt(prompt) - print("CALLED PROCESSOR", parsed["type"]) if parsed["type"] == "str": prompt_text = parsed["content"] diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 2536924a51d1a..87b7a7631e42e 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -176,7 +176,6 @@ def call_hf_processor( allow_var_kwargs=True, ) - # print("CTX", data.keys(), merged_kwargs.keys()) try: return hf_processor(**data, **merged_kwargs, return_tensors="pt") except Exception as exc: @@ -388,7 +387,6 @@ def dummy_data_for_profiling( f"Expected at least {num_expected} dummy '{k}' instances " f"for profiling, but found {num_items} instances instead.") - print(dummy_data.multi_modal_data["pixel_values"].shape) return dummy_data def _default_input_processor( @@ -463,7 +461,6 @@ def process_input(self, model_config: "ModelConfig", processor, ) - print("process_input", processor, mm_processor_kwargs.keys()) processed_inputs = processor( InputContext(model_config), inputs, diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index a5fd00b670de2..2ff52dd789125 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -375,7 +375,6 @@ def forward( "residual": residual }) - # print(hidden_states.shape) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index a60e0b83a8fee..6a4277adb6bf4 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -530,7 +530,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: hf_config=config.text_config, prefix=maybe_prefix(prefix, "language_model"), ) - print("self.language_model", self.language_model.__class__, self.vision_tower.__class__) self.make_empty_intermediate_tensors = ( self.language_model.make_empty_intermediate_tensors) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 02442eea845f8..59a2a1e69b455 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -230,7 +230,6 @@ def _apply_hf_processor_text_mm( 
mm_data=processor_data, mm_kwargs=hf_processor_mm_kwargs, ) - print("prompt_text", prompt_text, processed_data["pixel_values"][0].shape) processed_data.update(passthrough_data) prompt_ids, = processed_data.pop("input_ids").tolist() diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 28abc8b5fe65d..c48d07ba365ba 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -219,7 +219,6 @@ def get_max_multimodal_tokens(self, model_config: "ModelConfig") -> int: if not supports_multimodal(model_cls): return 0 - print("_max_mm_tokens", self._max_mm_tokens, model_cls, self.__class__) max_mm_tokens = self._max_mm_tokens.get(model_cls) if max_mm_tokens is None: return 0 diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index c0b2b597f5292..3415beece53c0 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -995,7 +995,6 @@ def _cached_apply_hf_processor( _, passthrough_data = self._get_hf_mm_data(mm_data_items) if cache is None or passthrough_data: - print("NO CACHE!") return self._apply_hf_processor_main( prompt=prompt, mm_items=mm_data_items, @@ -1040,7 +1039,6 @@ def _cached_apply_hf_processor( modality: 0 for modality in mm_missing_data_items } - print("CACHED!", mm_missing_idxs) merged_kw_items = list[MultiModalKwargsItem]() for modality, kw_items in mm_maybe_cached_kw_items.items(): @@ -1234,16 +1232,10 @@ def apply( else: mm_hashes = None - # prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( - # prompt, - # mm_items, - # hf_processor_mm_kwargs, - # ) - prompt_ids, mm_kwargs, is_repl_applied = self._apply_hf_processor_main( - prompt=prompt, - mm_items=mm_items, - hf_processor_mm_kwargs=hf_processor_mm_kwargs, - enable_hf_prompt_replacement=True, + prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + prompt, + mm_items, + hf_processor_mm_kwargs, ) unbound_prompt_repls = self._get_prompt_replacements( @@ -1283,13 +1275,12 @@ def apply( for modality, placeholders in mm_placeholders.items() } - print("DONE HERE?", mm_placeholder_ranges) return MultiModalInputs( type="multimodal", prompt=prompt, prompt_token_ids=prompt_ids, mm_kwargs=mm_kwargs, - mm_hashes=None, #mm_hashes, + mm_hashes=mm_hashes, mm_placeholders=mm_placeholder_ranges, ) diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index cc91e9d0279df..613d1db416720 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -266,7 +266,6 @@ def get_max_tokens_per_item_by_modality( return processor.info.get_mm_max_tokens_per_item( seq_len, mm_limits) - print(self._plugins['image'].get_max_multimodal_tokens(model_config), self._plugins['image'].get_max_multimodal_tokens) return { key: plugin.get_max_multimodal_tokens(model_config) for key, plugin in self._plugins.items() @@ -286,7 +285,6 @@ def get_max_tokens_per_item_by_nonzero_modality( usage of a model. 
""" mm_limits = self.get_mm_limits_per_prompt(model_config) - print("mm_limits", mm_limits) return { key: max_tokens_per_mm_item diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 5ccbbd32bd469..66e252b7ccb0f 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -129,11 +129,9 @@ def add_request(self, request: EngineCoreRequest): assert request.mm_inputs is not None request.mm_inputs = self.mm_input_cache_server.get_and_update( request.mm_inputs, request.mm_hashes) - # print("request.mm_hashes is None", request.mm_inputs[0] is None) # V1 ADDED HERE IF CACHED from MMInputMapperServer req = Request.from_engine_core_request(request) - # print("self.scheduler.add_request", self.scheduler.add_request) # vllm.v1.core.scheduler.Scheduler.add_request self.scheduler.add_request(req) def abort_requests(self, request_ids: List[str]): @@ -152,9 +150,7 @@ def step(self) -> EngineCoreOutputs: return EngineCoreOutputs( outputs=[], scheduler_stats=self.scheduler.make_stats()) - scheduler_output = self.scheduler.schedule() # kinda allocated new kv cache and updates many internal stats for the requests - # print("scheduler_output", scheduler_output) # DEFI HAS pixel values when V1 is set - # print("self.model_executor", self.model_executor.execute_model) + scheduler_output = self.scheduler.schedule() output = self.model_executor.execute_model(scheduler_output) engine_core_outputs = self.scheduler.update_from_output( scheduler_output, output) # type: ignore diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index c5d03ace73215..c9a4c5369dfd8 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -131,7 +131,6 @@ def add_request( ) -> None: # 1) Process raw inputs into the request. - print("CALL add_request", prompt, self.processor) request = self.processor.process_inputs(request_id, prompt, params, arrival_time, lora_request, trace_headers, diff --git a/vllm/v1/engine/mm_input_cache.py b/vllm/v1/engine/mm_input_cache.py index 0aaf264ad31d7..a1d802bf818a2 100644 --- a/vllm/v1/engine/mm_input_cache.py +++ b/vllm/v1/engine/mm_input_cache.py @@ -100,7 +100,6 @@ def process_inputs( mm_input = self.mm_cache.get(mm_hash) self.mm_cache_total += 1 - # print("mm_input is None", mm_input is None) if mm_input is None: if precomputed_mm_inputs is not None: # Reuse precomputed input (for merged preprocessor) @@ -119,9 +118,7 @@ def process_inputs( else: self.mm_cache_hits += 1 mm_input = None # Avoids sending mm_input to Server - print(" Avoids sending mm_input to Server, use cache somewhow, I dunno how yet") - # print("mm_input is None", mm_input is None) ret_inputs.append(mm_input) return ret_inputs diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 1232579b1b460..908204adf7236 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -112,8 +112,6 @@ def process_inputs( # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. # Process inputs. - - # CALL input_preprocessor (preprocess.py) where print(add request) preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -158,7 +156,6 @@ def process_inputs( # Fallback to using MultiModalHasher directly. 
else: mm_hashes = MultiModalHasher.hash_prompt_mm_data(prompt) - print('HASHING', bool(decoder_inputs.multi_modal_hashes), mm_hashes) # For merged preprocessor, mm_data is already mm_inputs precomputed_mm_inputs: Optional[list[MultiModalKwargs]] = None @@ -191,7 +188,6 @@ def process_inputs( mm_positions, mm_hashes, ) - print("mm_positions", mm_positions) # NOTE: Sort multimodal inputs/kwargs ONLY IF there are multiple # modalities involved AND the model supports merged input processor. diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4a07ef32a4b85..16ec44eefacb1 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -813,7 +813,6 @@ def _execute_encoder(self, scheduler_output: "SchedulerOutput"): # depending on the input multimodal items. curr_group_outputs = self.model.get_multimodal_embeddings( **batched_mm_inputs) - print("curr_group_outputs", curr_group_outputs[0].shape) for output in curr_group_outputs: encoder_outputs.append(output) @@ -895,7 +894,6 @@ def execute_model( num_input_tokens = num_scheduled_tokens attn_metadata.num_input_tokens = num_input_tokens - print("self.is_multimodal_model", num_scheduled_tokens, num_input_tokens) if self.is_multimodal_model: # NOTE(woosuk): To unify token ids and soft tokens (vision # embeddings), we always use embeddings (rather than token ids) @@ -935,7 +933,6 @@ def execute_model( for k, v in self.intermediate_tensors.items() }) - print("inputs_embeds", inputs_embeds.shape) # THIS IN THE ENTRYPOINT IN V1 # Run the decoder. # Use persistent buffers for CUDA graphs. with set_forward_context(attn_metadata, self.vllm_config): diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index ea05c444efb45..67d175c373d82 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -542,7 +542,6 @@ def _compute_for_prefix_cache_hit( remaining blocks. """ computed_block_nums = inter_data.computed_block_nums - print("cache hit", computed_block_nums is not None, inter_data.is_prompt) # Note that prefix caching does not support sliding window. prefix_cache_hit = (computed_block_nums is not None @@ -652,12 +651,10 @@ def _compute_prompt_adapter_input( # Note that when is_prompt=True, we expect only one sequence # in the group. if not self.enable_prompt_adapter: - print("no enable_prompt_adapter") return prompt_adapter_id = seq_group_metadata.prompt_adapter_id if prompt_adapter_id <= 0 or not inter_data.is_prompt: - print("no prompt_adapter_id", prompt_adapter_id) return # We expect only one sequence in the group when is_prompt=True. 
@@ -673,7 +670,6 @@ def _compute_prompt_adapter_input( inter_data.prompt_adapter_prompt_mapping = [prompt_adapter_id] * ( query_len if seq_group_metadata.sampling_params and seq_group_metadata.sampling_params.prompt_logprobs else 1) - print("prompt adapters", inter_data.prompt_adapter_index_mapping, inter_data.prompt_adapter_prompt_mapping) def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, seq_group_metadata: SequenceGroupMetadata): @@ -687,11 +683,9 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, if not mm_data: return - # print("seq_group_metadata", seq_group_metadata, positions) if self.runner.mm_registry.has_processor(self.runner.model_config): mm_kwargs = mm_data else: - print("RUN INPUT MAPPER AGAIN BUT WHY") mm_kwargs = self.multi_modal_input_mapper( mm_data, seq_group_metadata.mm_processor_kwargs, @@ -699,7 +693,6 @@ def _compute_multi_modal_input(self, inter_data: InterDataForSeqGroup, inter_data.multi_modal_kwargs = mm_kwargs inter_data.multi_modal_placeholder_maps = placeholder_maps - print("placeholder_maps", placeholder_maps["image"].src_ranges, placeholder_maps["image"].dest_ranges) # special processing for mrope position deltas. if self.runner.model_config.uses_mrope: @@ -759,14 +752,12 @@ def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): encoder_seq_len=encoder_seq_len) self.inter_data_list.append(inter_data) - # print("input_tokens", inter_data.input_tokens) for seq_idx in range(n_seqs): for per_seq_fn in self.per_seq_compute_fns: - per_seq_fn(inter_data, seq_idx, seq_group_metadata) # ADDS PLACEHOLDER HERE I GUESS? + per_seq_fn(inter_data, seq_idx, seq_group_metadata) for per_seq_group_fn in self.per_seq_group_compute_fns: - per_seq_group_fn(inter_data, seq_group_metadata) # ADDS MM KWARGS HERE - # print("inter_data should have mm here!!!", inter_data.multi_modal_kwargs is not None) + per_seq_group_fn(inter_data, seq_group_metadata) def _use_captured_graph(self, batch_size: int, @@ -986,7 +977,6 @@ def build(self) -> ModelInputForGPU: if data.multi_modal_kwargs is not None ] multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) - # print("building", multi_modal_kwargs.keys()) return self.model_input_cls( input_tokens=input_tokens_tensor, @@ -1728,10 +1718,6 @@ def execute_model( model_forward_end = torch.cuda.Event(enable_timing=True) model_forward_start.record() - if "pixel_values" in multi_modal_kwargs: - print('FINALLY FORWARD', model_input.input_tokens.shape, multi_modal_kwargs["pixel_values"].shape) - else: - print('DECODE', model_input.input_tokens.shape) if not bypass_model_exec: with set_forward_context(model_input.attn_metadata, self.vllm_config, virtual_engine): From 7e8f0d8a0ed0d17696b9a5628915b7dbe3041814 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:06:16 +0100 Subject: [PATCH 4/5] clean up 2 --- vllm/engine/llm_engine.py | 2 -- vllm/entrypoints/llm.py | 4 ---- vllm/executor/uniproc_executor.py | 1 - vllm/multimodal/processing.py | 6 +++++- vllm/v1/engine/processor.py | 3 +-- vllm/v1/worker/gpu_model_runner.py | 1 - vllm/worker/worker_base.py | 2 -- 7 files changed, 6 insertions(+), 13 deletions(-) diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 816baab8c0bd0..2e5bc75c6db38 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -724,7 +724,6 @@ def add_request( if inputs is not None: prompt = inputs assert prompt is not None and params is not None - print("CALL add_request", prompt) if lora_request is not None and not 
self.lora_config: raise ValueError(f"Got lora_request {lora_request} but LoRA is " @@ -755,7 +754,6 @@ def add_request( lora_request=lora_request, prompt_adapter_request=prompt_adapter_request, ) - print("mm_hashes", preprocessed_inputs.get('mm_hashes')) processed_inputs = self.input_processor(preprocessed_inputs) self._add_processed_request( diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index e71ac91e5fcc6..075ef3e59d885 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -691,10 +691,6 @@ def chat( ] tokenizer = self.get_tokenizer() - model_config = self.llm_engine.processor.input_preprocessor.model_config - mm_processor = self.llm_engine.processor.input_preprocessor.mm_registry.create_processor(model_config, tokenizer) - processor = mm_processor.info.ctx.get_hf_processor() - chat_template = processor.chat_template model_config = self.llm_engine.get_model_config() resolved_content_format = resolve_chat_template_content_format( diff --git a/vllm/executor/uniproc_executor.py b/vllm/executor/uniproc_executor.py index bf693cab5cc7c..94db232240d55 100644 --- a/vllm/executor/uniproc_executor.py +++ b/vllm/executor/uniproc_executor.py @@ -53,7 +53,6 @@ def collective_rpc(self, kwargs: Optional[Dict] = None) -> List[Any]: if kwargs is None: kwargs = {} - # print("self.driver_worker", self.driver_worker, method, getattr(self.driver_worker, method)) answer = run_method(self.driver_worker, method, args, kwargs) return [answer] diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3415beece53c0..fcd02fbd5203c 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1232,7 +1232,11 @@ def apply( else: mm_hashes = None - prompt_ids, mm_kwargs, is_repl_applied = self._cached_apply_hf_processor( + ( + prompt_ids, + mm_kwargs, + is_repl_applied, + ) = self._cached_apply_hf_processor( prompt, mm_items, hf_processor_mm_kwargs, diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index 908204adf7236..b7eee5a39972b 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -111,7 +111,6 @@ def process_inputs( # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. # 3. Apply prompt adapter to prompt token ids if one exists. - # Process inputs. preprocessed_inputs = self.input_preprocessor.preprocess( prompt, request_id=request_id, @@ -215,7 +214,7 @@ def process_inputs( mm_hashes=sorted_mm_hashes, mm_processor_kwargs=decoder_inputs.mm_processor_kwargs, precomputed_mm_inputs=precomputed_mm_inputs, - ) # THIS ONE REMOVES INPUT IMAGES IF CACHED with MMInputMapperClient + ) else: sorted_mm_inputs = None sorted_mm_hashes = None diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 16ec44eefacb1..31fe095a91bc0 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -881,7 +881,6 @@ def execute_model( # Prepare the decoder inputs. 
attn_metadata, logits_indices = self._prepare_inputs(scheduler_output) - num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens if (self.use_cuda_graph and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]): diff --git a/vllm/worker/worker_base.py b/vllm/worker/worker_base.py index 8984ee83636cb..190429074d56c 100644 --- a/vllm/worker/worker_base.py +++ b/vllm/worker/worker_base.py @@ -343,7 +343,6 @@ def _get_driver_input_and_broadcast( execute_model_req.virtual_engine, execute_model_req.finished_requests_ids)) - # print("self.model_runner.prepare_model_input", self.model_runner.prepare_model_input) kwargs = extract_previous_hidden_states(execute_model_req) if self.do_metadata_broadcast: @@ -418,7 +417,6 @@ def execute_model( orig_model_execute_time = intermediate_tensors.tensors.get( "model_execute_time", torch.tensor(0)).item() - # print("self.model_runner.execute_model", self.model_runner.execute_model) output = self.model_runner.execute_model( model_input=model_input, kv_caches=self.kv_cache[worker_input.virtual_engine] From 57c2d85cfae0dcd32c9de46806846e4552cf38d6 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 24 Feb 2025 10:28:24 +0100 Subject: [PATCH 5/5] use arbitrary high resolution in dummy inputs --- vllm/model_executor/models/transformers.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/vllm/model_executor/models/transformers.py b/vllm/model_executor/models/transformers.py index 59a2a1e69b455..b1a6fa0d95a59 100644 --- a/vllm/model_executor/models/transformers.py +++ b/vllm/model_executor/models/transformers.py @@ -161,9 +161,9 @@ def get_dummy_processor_inputs( video_token = getattr(processor, "video_token", None) # TODO: raushan, we can have processor attr for `processor.max_output_size` which will infer - # max features for model in HF side. IMO should be all done on processor side, not on model config - vision_config = self.info.get_hf_config().vision_config - target_width = target_height = vision_config.image_size + # max features for model in HF side. But imo we can just set a veru high resolution + # and the processor will return us pixels with correct max shape. Resolution 3kx3k is high enough + target_width = target_height = 3000 # NOTE: we can pass videos/images/audio to any processor With the new API used in MLLMs, # HF processor will take the modality needed for model and ignore all others @@ -353,9 +353,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: self.vocab_size, logit_scale) self.sampler = get_sampler() - MultiModalRegistry()._get_plugin("image").register_max_multimodal_tokens(576) - InputRegistry()._dummy_factories_by_model_type[model_cls] = factory - def apply_base_model_tp_plan(self, module: nn.Module, prefix: str = ""): """ Apply the base model tensor parallelization plan to a module.
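
The regrouping performed by the MultiModalKwargs.get_hf_inputs helper added in PATCH 2/5 can be illustrated with a minimal standalone sketch; the per-item dicts and tensor shapes below are made up for illustration and only plain torch is used:

    # Sketch of the regrouping in get_hf_inputs: per-item kwargs are collected
    # per key and stacked back into HF-style batched tensors.
    from collections import defaultdict
    import torch

    items = [  # hypothetical per-image kwargs produced by the HF processor
        {"pixel_values": torch.randn(3, 336, 336)},
        {"pixel_values": torch.randn(3, 336, 336)},
    ]
    hf_inputs = defaultdict(list)
    for item in items:
        for key, value in item.items():
            hf_inputs[key].append(value)
    hf_inputs = {key: torch.stack(value) for key, value in hf_inputs.items()}
    print(hf_inputs["pixel_values"].shape)  # torch.Size([2, 3, 336, 336])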
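
The placeholder computation in MultiModalProcessor.apply (PATCH 2/5) turns the positions flagged by mm_token_type_ids into one (offset, length) range per multimodal item. A minimal sketch of that step, assuming a hypothetical 12-token prompt containing two images with 3 and 4 placeholder tokens (batch size is always 1 in the patch):

    # Sketch: recover placeholder ranges from mm_token_type_ids.
    import torch

    mm_token_type_ids = torch.tensor([[0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0]])
    split_sizes = [3, 4]  # placeholder tokens per image, as the processor would report

    mm_positions = torch.where(mm_token_type_ids == 1)[1]
    chunked = torch.split(mm_positions, split_sizes)
    ranges = [
        {"offset": pos[0].item(), "length": pos.shape[0]}
        for pos in chunked
    ]
    print(ranges)  # [{'offset': 1, 'length': 3}, {'offset': 6, 'length': 4}]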
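
The embedding merge in get_input_embeddings (PATCH 2/5) overwrites image-token positions of the text embeddings with the flattened vision embeddings via masked_scatter. A minimal sketch with small random tensors, assuming a hypothetical image token id of 32000:

    # Sketch: merge vision embeddings into text embeddings at placeholder positions.
    import torch

    hidden_size = 8
    image_token_index = 32000  # hypothetical placeholder token id
    input_ids = torch.tensor([1, 32000, 32000, 5, 6])
    inputs_embeds = torch.randn(input_ids.shape[0], hidden_size)
    image_embeds = torch.randn(2, hidden_size)  # one embedding per placeholder token

    mask = (input_ids == image_token_index).unsqueeze(-1).expand_as(inputs_embeds)
    merged = inputs_embeds.masked_scatter(mask, image_embeds)
    assert torch.equal(merged[1], image_embeds[0])
    assert torch.equal(merged[2], image_embeds[1])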