From 1191485f675ebdb03c71a6720eebed5bd2f271c4 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 20 Jan 2025 17:05:55 +0000 Subject: [PATCH 1/2] fix vocab_parallel_embedding sharding Signed-off-by: NickLucche --- .../decoder_only/language/test_models.py | 42 +++++++++++++++++++ vllm/config.py | 2 +- .../layers/vocab_parallel_embedding.py | 4 +- vllm/transformers_utils/config.py | 4 +- 4 files changed, 48 insertions(+), 4 deletions(-) diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index c7efa4edbbc0a..1ab9a82275d2e 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -87,3 +87,45 @@ def print_model(model): name_0="hf", name_1="vllm", ) + +@pytest.mark.parametrize( + "model", + [ + pytest.param("cognitivecomputations/TinyDolphin-2.8-1.1b"), # testing VocabParallelEmbedding crash + ]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("tp", [2]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_tp_models( + hf_runner, + vllm_runner, + example_prompts, + model: str, + dtype: str, + tp: int, + max_tokens: int, + num_logprobs: int, +) -> None: + + with hf_runner(model, dtype=dtype) as hf_model: + hf_outputs = hf_model.generate_greedy_logprobs_limit( + example_prompts, max_tokens, num_logprobs) + + with vllm_runner(model, dtype=dtype, tensor_parallel_size=tp) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, num_logprobs) + + # This test is for verifying whether the model's extra_repr + # can be printed correctly. + def print_model(model): + print(model) + + vllm_model.apply_model(print_model) + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) diff --git a/vllm/config.py b/vllm/config.py index 4698a05020332..ac1cee66d73ed 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -277,7 +277,7 @@ def __init__(self, self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - + # breakpoint() hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 65920aa61ba15..3eb5c39ccf580 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -355,7 +355,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): elif isinstance(param, UninitializedParameter): shape = list(loaded_weight.shape) if output_dim is not None: - shape[output_dim] = shape[output_dim] // self.tp_size + shape[output_dim] = self.num_embeddings_per_partition param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should @@ -381,7 +381,7 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): else: assert loaded_weight.shape[output_dim] == self.org_vocab_size - # Copy the data. + # Copy the data. Select chunk corresponding to current shard. 
loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) if current_platform.is_hpu(): diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index f57dfded0a62f..88ba098d73927 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -206,7 +206,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) - + # config_dict["model_type"] = "granite" # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: @@ -228,6 +228,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) + # config.model_type = 'granite' except ValueError as e: if (not trust_remote_code and "requires you to execute the configuration file" @@ -252,6 +253,7 @@ def get_config( if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError( f"Can't get gguf config for {config.model_type}.") + # model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES['granite'] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]}) From 5389ab6e011a0ecdb9a3dedbfe0b4a19943e2f06 Mon Sep 17 00:00:00 2001 From: NickLucche Date: Mon, 20 Jan 2025 19:32:53 +0000 Subject: [PATCH 2/2] add tests Signed-off-by: NickLucche --- .../models/decoder_only/language/test_gguf.py | 10 +++++ .../decoder_only/language/test_models.py | 42 ------------------- vllm/config.py | 2 +- vllm/transformers_utils/config.py | 4 +- 4 files changed, 12 insertions(+), 46 deletions(-) diff --git a/tests/models/decoder_only/language/test_gguf.py b/tests/models/decoder_only/language/test_gguf.py index 81b93ebdf0fc0..38cea2462b440 100644 --- a/tests/models/decoder_only/language/test_gguf.py +++ b/tests/models/decoder_only/language/test_gguf.py @@ -66,12 +66,20 @@ def gguf_model(self): gguf_filename="starcoder2-3b.Q6_K.gguf", ) +DOLPHIN_CONFIG = GGUFTestConfig( + # Test VocabParallelEmbedding sharding issue. + original_model="cognitivecomputations/TinyDolphin-2.8-1.1b", + gguf_repo="tsunemoto/TinyDolphin-2.8-1.1b-GGUF", + gguf_filename="tinydolphin-2.8-1.1b.Q6_K.gguf", +) + MODELS = [ LLAMA_CONFIG, QWEN2_CONFIG, PHI3_CONFIG, GPT2_CONFIG, STABLELM_CONFIG, + DOLPHIN_CONFIG # STARCODER_CONFIG, # broken ] @@ -107,6 +115,7 @@ def test_models( # Run unquantized model. with vllm_runner(model_name=model.original_model, + enforce_eager=True, # faster tests dtype=dtype, max_model_len=MAX_MODEL_LEN, tensor_parallel_size=tp_size) as original_model: @@ -115,6 +124,7 @@ def test_models( # Run gguf model. 
with vllm_runner(model_name=model.gguf_model, + enforce_eager=True, tokenizer_name=model.original_model, dtype=dtype, max_model_len=MAX_MODEL_LEN, diff --git a/tests/models/decoder_only/language/test_models.py b/tests/models/decoder_only/language/test_models.py index 1ab9a82275d2e..c7efa4edbbc0a 100644 --- a/tests/models/decoder_only/language/test_models.py +++ b/tests/models/decoder_only/language/test_models.py @@ -87,45 +87,3 @@ def print_model(model): name_0="hf", name_1="vllm", ) - -@pytest.mark.parametrize( - "model", - [ - pytest.param("cognitivecomputations/TinyDolphin-2.8-1.1b"), # testing VocabParallelEmbedding crash - ]) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("tp", [2]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_tp_models( - hf_runner, - vllm_runner, - example_prompts, - model: str, - dtype: str, - tp: int, - max_tokens: int, - num_logprobs: int, -) -> None: - - with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, max_tokens, num_logprobs) - - with vllm_runner(model, dtype=dtype, tensor_parallel_size=tp) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( - example_prompts, max_tokens, num_logprobs) - - # This test is for verifying whether the model's extra_repr - # can be printed correctly. - def print_model(model): - print(model) - - vllm_model.apply_model(print_model) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/vllm/config.py b/vllm/config.py index ac1cee66d73ed..4698a05020332 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -277,7 +277,7 @@ def __init__(self, self.max_logprobs = max_logprobs self.disable_sliding_window = disable_sliding_window self.skip_tokenizer_init = skip_tokenizer_init - # breakpoint() + hf_config = get_config(self.model, trust_remote_code, revision, code_revision, config_format) diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 88ba098d73927..f57dfded0a62f 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -206,7 +206,7 @@ def get_config( token=HF_TOKEN, **kwargs, ) - # config_dict["model_type"] = "granite" + # Use custom model class if it's in our registry model_type = config_dict.get("model_type") if model_type in _CONFIG_REGISTRY: @@ -228,7 +228,6 @@ def get_config( token=HF_TOKEN, **kwargs, ) - # config.model_type = 'granite' except ValueError as e: if (not trust_remote_code and "requires you to execute the configuration file" @@ -253,7 +252,6 @@ def get_config( if config.model_type not in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES: raise RuntimeError( f"Can't get gguf config for {config.model_type}.") - # model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES['granite'] model_type = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[config.model_type] config.update({"architectures": [model_type]})
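
Background on the sharding fix in PATCH 1/2: `VocabParallelEmbedding.weight_loader` previously materialized an `UninitializedParameter` with `shape[output_dim] // self.tp_size`, i.e. the unpadded checkpoint vocabulary divided by the tensor-parallel size, whereas the per-rank buffer has to hold `num_embeddings_per_partition` rows, which is derived from the *padded* vocabulary. A minimal standalone sketch of how the two values can diverge is below; the padding multiple of 64 and the concrete vocab size are illustrative assumptions, not values taken from the patch.

    # Standalone sketch -- pad_vocab_size and the concrete numbers below are
    # illustrative assumptions, not code or values from the patch.
    def pad_vocab_size(vocab_size: int, pad_to: int = 64) -> int:
        # Round the vocabulary up so it splits evenly across tensor-parallel ranks.
        return ((vocab_size + pad_to - 1) // pad_to) * pad_to

    org_vocab_size = 32002          # hypothetical checkpoint vocab (base vocab + a few added chat tokens)
    tp_size = 2

    padded = pad_vocab_size(org_vocab_size)             # 32064
    num_embeddings_per_partition = padded // tp_size    # 16032 rows each rank must hold

    naive = org_vocab_size // tp_size                   # 16001 -- what the old code materialized
    assert naive != num_embeddings_per_partition        # too small for the shard copied in afterwards

With the fix, each rank materializes `num_embeddings_per_partition` rows, so the shard selected by `narrow()` and copied by `weight_loader` fits. The `DOLPHIN_CONFIG` entry added to `test_gguf.py` in PATCH 2/2 is the regression test for this path; its inline comment marks it as covering the VocabParallelEmbedding sharding issue.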