From 1191485f675ebdb03c71a6720eebed5bd2f271c4 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 20 Jan 2025 17:05:55 +0000 Subject: [PATCH 1/2] fix vocab_parallel_embedding sharding Signed-off-by: NickLucche --- .../decoder_only/language/test_models.py | 42 +++++++++++++++++++ vllm/config.py | 2 +- .../layers/vocab_parallel_embedding.py | 4 +- vllm/transformers_utils/config.py | 4 +- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index c7efa4edbbc0a..1ab9a82275d2e 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -87,3 +87,45 @@ def print_model(model): name_0="hf", name_1="vllm", ) + +@pytest.mark.parametrize( + "model", + [ + pytest.param("cognitivecomputations/TinyDolphin-2.8-1.1b"), # testing VocabParallelEmbedding crash + ]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("tp", [2]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_tp_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + tp: int, + max_tokens: int, + num_logprobs: int, +) -> None: + + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, dtype=dtype, tensor_parallel_size=tp) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/vllm/config.py b/vllm/config.py index 4698a05020332..ac1cee66d73ed 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -277,7 +277,7 @@ def __init__(self, self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - + # breakpoint() hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 65920aa61ba15..3eb5c39ccf580 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f57dfded0a62f..88ba098d73927 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -206,7 +206,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) - + # config_dict["model_type"] = "granite" # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: @@ -228,6 +228,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) + # config.model_type = 'granite' except ValueError as e: if (not trust_remote_code and "requires you to execute the configuration file" @@ -252,6 +253,7 @@ def get_config( if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError( f"Can't get gguf config for {config.model_type}.") + # model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES['granite'] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) From 5389ab6e011a0ecdb9a3dedbfe0b4a19943e2f06 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 20 Jan 2025 19:32:53 +0000 Subject: [PATCH 2/2] add tests Signed-off-by: NickLucche --- .../models/decoder_only/language/test_gguf.py | 10 +++++ .../decoder_only/language/test_models.py | 42 ------------------- vllm/config.py | 2 +- vllm/transformers_utils/config.py | 4 +- 4 files changed, 12 insertions(+), 46 deletions(-) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc0..38cea2462b440 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,20 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. + original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -107,6 +115,7 @@ def test_models( # Run unquantized model. with vllm_runner(model_name=model.original_model, + enforce_eager=True, # faster tests dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as original_model: @@ -115,6 +124,7 @@ def test_models( # Run gguf model. 
with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 1ab9a82275d2e..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -87,45 +87,3 @@ def print_model(model): name_0="hf", name_1="vllm", ) - -@pytest.mark.parametrize( - "model", - [ - pytest.param("cognitivecomputations/TinyDolphin-2.8-1.1b"), # testing VocabParallelEmbedding crash - ]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp", [2]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_tp_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp: int, - max_tokens: int, - num_logprobs: int, -) -> None: - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, dtype=dtype, tensor_parallel_size=tp) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/vllm/config.py b/vllm/config.py index ac1cee66d73ed..4698a05020332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -277,7 +277,7 @@ def __init__(self, self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - # breakpoint() + hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 88ba098d73927..f57dfded0a62f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -206,7 +206,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) - # config_dict["model_type"] = "granite" + # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: @@ -228,7 +228,6 @@ def get_config( token=HF_TOKEN, **kwargs, ) - # config.model_type = 'granite' except ValueError as e: if (not trust_remote_code and "requires you to execute the configuration file" @@ -253,7 +252,6 @@ def get_config( if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError( f"Can't get gguf config for {config.model_type}.") - # model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES['granite'] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]})
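
Background on the sharding fix in PATCH 1/2: `VocabParallelEmbedding.weight_loader` previously materialized an `UninitializedParameter` with `shape[output_dim] // self.tp_size`, i.e. the unpadded checkpoint vocabulary divided by the tensor-parallel size, whereas the per-rank buffer has to hold `num_embeddings_per_partition` rows, which is derived from the *padded* vocabulary. A minimal standalone sketch of how the two values can diverge is below; the padding multiple of 64 and the concrete vocab size are illustrative assumptions, not values taken from the patch.

    # Standalone sketch -- pad_vocab_size and the concrete numbers below are
    # illustrative assumptions, not code or values from the patch.
    def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
        # Round the vocabulary up so it splits evenly across tensor-parallel ranks.
        return ((vocab_size + pad_to - 1) // pad_to) * pad_to

    org_vocab_size = 32002          # hypothetical checkpoint vocab (base vocab + a few added chat tokens)
    tp_size = 2

    padded = pad_vocab_size(org_vocab_size)             # 32064
    num_embeddings_per_partition = padded // tp_size    # 16032 rows each rank must hold

    naive = org_vocab_size // tp_size                   # 16001 -- what the old code materialized
    assert naive != num_embeddings_per_partition        # too small for the shard copied in afterwards

With the fix, each rank materializes `num_embeddings_per_partition` rows, so the shard selected by `narrow()` and copied by `weight_loader` fits. The `DOLPHIN_CONFIG` entry added to `test_gguf.py` in PATCH 2/2 is the regression test for this path; its inline comment marks it as covering the VocabParallelEmbedding sharding issue.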