vllm-project · mgoin · Nov 5, 2024 · Nov 1, 2024 · Nov 4, 2024 · Nov 4, 2024
@@ -21,19 +21,19 @@
     "kv_cache_dtype,base_model,test_model,scale_path",
     [
         # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors.
-        ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None),
+        ("fp8_e4m3", "meta-llama/Llama-3.2-1B-Instruct",
+         "nm-testing/Llama-3.2-1B-Instruct-FP8-KV", None),
         # Test FP16 checkpoint w. fp8_e5m2 kv-cache.
-        ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct",
-         "meta-llama/Meta-Llama-3-8B-Instruct", None),
+        ("fp8_e5m2", "meta-llama/Llama-3.2-1B-Instruct",
+         "meta-llama/Llama-3.2-1B-Instruct", None),
         # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json.
         ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf",
          "meta-llama/Llama-2-7b-chat-hf",
          "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json")
     ])
 # Due to low-precision numerical divergence, we only test logprob of 4 tokens
 @pytest.mark.parametrize("max_tokens", [4])
-@pytest.mark.parametrize("enforce_eager", [False, True])
+@pytest.mark.parametrize("enforce_eager", [True])
 @pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"])
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.

@@ -22,24 +22,11 @@
 MAX_MODEL_LEN = 1024
 
 MODELS = [
-    # act_order==False, group_size=channelwise
-    ("robertgshaw2/zephyr-7b-beta-channelwise-gptq", "main"),
-    # act_order==False, group_size=128
-    ("TheBloke/Llama-2-7B-GPTQ", "main"),
-
     # act_order==True, group_size=128
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "main"),
-    # act_order==True, group_size=64
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-64g-actorder_True"),
-    # act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-4bit-32g-actorder_True"),
 
     # 8-bit, act_order==True, group_size=channelwise
     ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit--1g-actorder_True"),
-    # 8-bit, act_order==True, group_size=128
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-128g-actorder_True"),
-    # 8-bit, act_order==True, group_size=32
-    ("TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ", "gptq-8bit-32g-actorder_True"),
 
     # 4-bit, act_order==True, group_size=128
     ("TechxGenus/gemma-1.1-2b-it-GPTQ", "main")

@@ -25,16 +25,16 @@ class ModelPair:
     # 4-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-4bit-g128"),
-    # 4-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
+    # Disabled: 4-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-4bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-4bit-channelwise"),
 
     # 8-bit, group_size == 128
     ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-g128",
               model_gptq="alexm-nm/tinyllama-24-gptq-8bit-g128"),
-    # 8-bit, group_size == channelwise
-    ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
-              model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
+    # Disabled: 8-bit, group_size == channelwise
+    # ModelPair(model_marlin="alexm-nm/tinyllama-24-marlin24-8bit-channelwise",
+    #           model_gptq="alexm-nm/tinyllama-24-gptq-8bit-channelwise"),
 ]
 
 

@@ -4,7 +4,7 @@
 """
 import pytest
 
-from vllm import LLM, SamplingParams
+from vllm import SamplingParams
 
 from ...utils import check_logprobs_close
 
@@ -15,6 +15,10 @@
     # "mistralai/Mistral-Nemo-Instruct-2407"
 ]
 
+MISTRAL_FORMAT_MODELS = [
+    "mistralai/Mistral-7B-Instruct-v0.3",
+]
+
 SAMPLING_PARAMS = SamplingParams(max_tokens=512, temperature=0.0, logprobs=5)
 SYMBOLIC_LANG_PROMPTS = [
     "勇敢な船乗りについての詩を書く",  # japanese
@@ -95,7 +99,7 @@ def test_models(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
 @pytest.mark.parametrize("max_tokens", [64])
 @pytest.mark.parametrize("num_logprobs", [5])
@@ -135,28 +139,29 @@ def test_mistral_format(
     )
 
 
-@pytest.mark.parametrize("model", MODELS[1:])
+@pytest.mark.parametrize("model", MISTRAL_FORMAT_MODELS)
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("prompt", SYMBOLIC_LANG_PROMPTS)
 def test_mistral_symbolic_languages(
+    vllm_runner,
     model: str,
     dtype: str,
-    prompt: str,
 ) -> None:
-    prompt = "hi"
-    msg = {"role": "user", "content": prompt}
-    llm = LLM(model=model,
-              dtype=dtype,
-              max_model_len=8192,
-              tokenizer_mode="mistral",
-              config_format="mistral",
-              load_format="mistral")
-    outputs = llm.chat([msg], sampling_params=SAMPLING_PARAMS)
-    assert "�" not in outputs[0].outputs[0].text.strip()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     max_model_len=8192,
+                     tokenizer_mode="mistral",
+                     config_format="mistral",
+                     load_format="mistral") as vllm_model:
+        for prompt in SYMBOLIC_LANG_PROMPTS:
+            msg = {"role": "user", "content": prompt}
+            outputs = vllm_model.model.chat([msg],
+                                            sampling_params=SAMPLING_PARAMS)
+            assert "�" not in outputs[0].outputs[0].text.strip()
 
 
 @pytest.mark.parametrize("dtype", ["bfloat16"])
-@pytest.mark.parametrize("model", MODELS[1:])  # v1 can't do func calling
+@pytest.mark.parametrize("model",
+                         MISTRAL_FORMAT_MODELS)  # v1 can't do func calling
 def test_mistral_function_calling(
     vllm_runner,
     model: str,