vllm-project · llmpros · Jul 1, 2024 · Jul 6, 2024 · Jul 6, 2024 · Jul 6, 2024
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
@@ -58,6 +58,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_distributed_gptq_marlin.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py
@@ -0,0 +1,45 @@
+"""Compares the outputs of gptq vs gptq_marlin when tp > 1
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+
+Run `pytest tests/distributed/test_distributed_gptq_marlin.py`.
+"""
+import os
+
+import pytest
+
+from tests.models.test_gptq_marlin import MODELS, run_test
+from tests.quantization.utils import is_quant_method_supported
+from vllm.utils import cuda_device_count_stateless
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(vllm_runner, example_prompts, model, dtype: str,
+                max_tokens: int, num_logprobs: int,
+                tensor_parallel_size: int) -> None:
+    """Compare gptq and gptq_marlin outputs under tensor parallelism.
+
+    Skips when fewer than ``tensor_parallel_size`` GPUs are available or
+    when gptq_marlin is unsupported. The executor backend is read from
+    the ``DISTRIBUTED_EXECUTOR_BACKEND`` environment variable.
+    """
+    gpu_count = cuda_device_count_stateless()
+    if gpu_count < tensor_parallel_size:
+        pytest.skip(
+            f"Need at least {tensor_parallel_size} GPUs to run the test.")
+    if not is_quant_method_supported("gptq_marlin"):
+        pytest.skip("gptq_marlin is not supported on this GPU type.")
+
+    backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+    run_test(vllm_runner, example_prompts, model, dtype, max_tokens,
+             num_logprobs, tensor_parallel_size=tensor_parallel_size,
+             distributed_executor_backend=backend)

diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
@@ -9,6 +9,7 @@
 Run `pytest tests/models/test_gptq_marlin.py`.
 """
 import os
+from typing import Optional
 
 import pytest
 
@@ -18,7 +19,6 @@
 from .utils import check_logprobs_close
 
 os.environ["TOKENIZERS_PARALLELISM"] = "true"
-
 MAX_MODEL_LEN = 1024
 
 MODELS = [
@@ -46,30 +46,28 @@
 ]
 
 
-@pytest.mark.flaky(reruns=3)
-@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
-                    reason="gptq_marlin is not supported on this GPU type.")
-@pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
-@pytest.mark.parametrize("max_tokens", [32])
-@pytest.mark.parametrize("num_logprobs", [5])
-def test_models(
+def run_test(
     vllm_runner,
     example_prompts,
     model,
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
 ) -> None:
+    """Compare gptq_marlin and gptq logprobs with the given TP size/backend."""
     model_name, revision = model
 
     # Run marlin.
     with vllm_runner(model_name=model_name,
                      revision=revision,
                      dtype=dtype,
                      quantization="marlin",
                      max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_marlin_model:
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as gptq_marlin_model:
 
         gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
@@ -84,7 +82,9 @@ def test_models(
                      dtype="half",
                      quantization="gptq",
                      max_model_len=MAX_MODEL_LEN,
-                     tensor_parallel_size=1) as gptq_model:
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as gptq_model:
         gptq_outputs = gptq_model.generate_greedy_logprobs(
             example_prompts[:-1], max_tokens, num_logprobs)
 
@@ -94,3 +94,29 @@ def test_models(
         name_0="gptq",
         name_1="gptq_marlin",
     )
+
+
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(
+    vllm_runner,
+    example_prompts,
+    model,
+    dtype: str,
+    max_tokens: int,
+    num_logprobs: int,
+) -> None:
+    """Compare gptq and gptq_marlin outputs with tensor_parallel_size=1.
+
+    The support check runs in the test body, so an unsupported GPU is
+    reported as a skip when the test executes.
+    """
+    marlin_supported = is_quant_method_supported("gptq_marlin")
+    if not marlin_supported:
+        pytest.skip("gptq_marlin is not supported on this GPU type.")
+
+    run_test(vllm_runner, example_prompts, model, dtype, max_tokens,
+             num_logprobs, tensor_parallel_size=1)