From e011cdea144e85fba05b3ab7cb88abb9d480c645 Mon Sep 17 00:00:00 2001
From: nzhang1220
Date: Sun, 30 Jun 2024 20:51:16 -0700
Subject: [PATCH 1/5] [ci][distributed] add distributed test gptq_marlin with tp = 2

---
 .buildkite/test-pipeline.yaml | 1 +
 .../test_distributed_gptq_marlin.py | 39 +++++++++++++++
 tests/models/test_gptq_marlin.py | 48 ++++++++++++++-----
 3 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 tests/distributed/test_distributed_gptq_marlin.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8013fbb642bb8..de7da35e0b8c1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -58,6 +58,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_distributed_gptq_marlin.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py
new file mode 100644
index 0000000000000..6581cb2c4b4aa
--- /dev/null
+++ b/tests/distributed/test_distributed_gptq_marlin.py
@@ -0,0 +1,39 @@
+"""Compares the outputs of gptq vs gptq_marlin when tp > 1
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+
+Run `pytest tests/distributed/test_distributed_gptq_marlin.py`.
+"""
+import os
+
+import pytest
+
+from tests.models.test_gptq_marlin import MODELS, run_test
+from tests.quantization.utils import is_quant_method_supported
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(vllm_runner, example_prompts, model, dtype: str,
+                max_tokens: int, num_logprobs: int,
+                tensor_parallel_size: int) -> None:
+
+    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+    run_test(vllm_runner,
+             example_prompts,
+             model,
+             dtype,
+             max_tokens,
+             num_logprobs,
+             tensor_parallel_size=tensor_parallel_size,
+             distributed_executor_backend=distributed_executor_backend)
\ No newline at end of file
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index 4abbc41c9c287..23898be2b4164 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -9,6 +9,7 @@
 Run `pytest tests/models/test_gptq_marlin.py`.
""" import os +from typing import Optional import pytest @@ -18,7 +19,6 @@ from .utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" - MAX_MODEL_LEN = 1024 MODELS = [ @@ -46,22 +46,18 @@ ] -@pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( +def run_test( vllm_runner, example_prompts, model, dtype: str, max_tokens: int, num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: model_name, revision = model + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") # Run marlin. with vllm_runner(model_name=model_name, @@ -69,7 +65,9 @@ def test_models( dtype=dtype, quantization="marlin", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_marlin_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_marlin_model: gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -84,7 +82,9 @@ def test_models( dtype="half", quantization="gptq", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_model: gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -94,3 +94,29 @@ def test_models( name_0="gptq", name_1="gptq_marlin", ) + + +@pytest.mark.flaky(reruns=3) +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), + reason="gptq_marlin is not supported on this GPU type.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner, + example_prompts, + model, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + run_test( + vllm_runner, + example_prompts, + model, + dtype, + max_tokens, + num_logprobs, + tensor_parallel_size=1, + ) \ No newline at end of file From d9de3ead3c1f5ad8dd926971717eeee40bc91c1e Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:25:20 -0700 Subject: [PATCH 2/5] running cuda related func inside test function instead calling at the top level --- tests/distributed/test_distributed_gptq_marlin.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 6581cb2c4b4aa..434baca04e493 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -13,13 +13,12 @@ import pytest from tests.models.test_gptq_marlin import MODELS, run_test -from tests.quantization.utils import is_quant_method_supported +from tests.quantization.utils import (cuda_device_count_stateless, + is_quant_method_supported) @pytest.mark.parametrize("tensor_parallel_size", [2]) @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", 
["half", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -28,6 +27,13 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, max_tokens: int, num_logprobs: int, tensor_parallel_size: int) -> None: + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip("gptq_marlin is not supported on this GPU type.") + + if not is_quant_method_supported("gptq_marlin"): + pytest.skip( + f"Need at least {tensor_parallel_size} GPUs to run the test.") + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, From ab7fa8c57f284958b8769a27b1f7cd84d28f767d Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:30:46 -0700 Subject: [PATCH 3/5] fix the import location of cuda_device_count_stateless --- tests/distributed/test_distributed_gptq_marlin.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 434baca04e493..76e68b3efebc8 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -13,8 +13,8 @@ import pytest from tests.models.test_gptq_marlin import MODELS, run_test -from tests.quantization.utils import (cuda_device_count_stateless, - is_quant_method_supported) +from tests.quantization.utils import is_quant_method_supported +from vllm.utils import cuda_device_count_stateless @pytest.mark.parametrize("tensor_parallel_size", [2]) @@ -28,12 +28,13 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, tensor_parallel_size: int) -> None: if cuda_device_count_stateless() < tensor_parallel_size: - pytest.skip("gptq_marlin is not supported on this GPU type.") - + pytest.skip( + "gptq_marlin is not supported on this GPU type.") + if not is_quant_method_supported("gptq_marlin"): pytest.skip( f"Need at least {tensor_parallel_size} GPUs to run the test.") - + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, From e79ffa8c27e1d2bc954bc3f469d195d03e5da84c Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:33:28 -0700 Subject: [PATCH 4/5] all grtq tests may call CUDA related funcs inside --- tests/distributed/test_distributed_gptq_marlin.py | 5 +++-- tests/models/test_gptq_marlin.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 76e68b3efebc8..35e8e5d918711 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -29,11 +29,12 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip( - "gptq_marlin is not supported on this GPU type.") + f"Need at least {tensor_parallel_size} GPUs to run the test.") + if not is_quant_method_supported("gptq_marlin"): pytest.skip( - f"Need at least {tensor_parallel_size} GPUs to run the test.") + "gptq_marlin is not supported on this GPU type.") distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 23898be2b4164..3d3ecaf866860 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -97,8 +97,6 @@ def run_test( @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not 
is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -111,6 +109,9 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: + if not is_quant_method_supported("gptq_marlin"): + pytest.skip( + "gptq_marlin is not supported on this GPU type.") run_test( vllm_runner, example_prompts, From e9783604f82188b0188e43cce22205421a975599 Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:37:25 -0700 Subject: [PATCH 5/5] formatting --- tests/distributed/test_distributed_gptq_marlin.py | 6 ++---- tests/models/test_gptq_marlin.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 35e8e5d918711..65744fbc33b99 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -31,11 +31,9 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, pytest.skip( f"Need at least {tensor_parallel_size} GPUs to run the test.") - if not is_quant_method_supported("gptq_marlin"): - pytest.skip( - "gptq_marlin is not supported on this GPU type.") - + pytest.skip("gptq_marlin is not supported on this GPU type.") + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 3d3ecaf866860..d83f8827a177d 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -110,8 +110,7 @@ def test_models( num_logprobs: int, ) -> None: if not is_quant_method_supported("gptq_marlin"): - pytest.skip( - "gptq_marlin is not supported on this GPU type.") + pytest.skip("gptq_marlin is not supported on this GPU type.") run_test( vllm_runner, example_prompts,
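The comparison strategy that both the single-GPU and the new tp = 2 tests rely on is the one described in the new test's docstring: GPTQ and Marlin are not bitwise identical, so instead of requiring equal outputs, each model's greedy token only has to appear in the other model's top-5 logprob candidates at every step. In the tests this is delegated to check_logprobs_close from tests/models/utils; the snippet below is only a minimal standalone sketch of that idea, with hypothetical function and variable names and made-up token ids, not the actual vLLM helper.

# Minimal sketch of the "each model's top-1 token must be inside the other
# model's top-k candidates" check. Names, shapes, and sample values are
# illustrative assumptions; the real tests use check_logprobs_close.
from typing import Dict, List, Tuple

# One generation step: (greedy token id, {candidate token id: logprob}).
StepLogprobs = Tuple[int, Dict[int, float]]


def assert_top_tokens_overlap(steps_a: List[StepLogprobs],
                              steps_b: List[StepLogprobs]) -> None:
    """Fail if either model's greedy token falls outside the other's top-k."""
    for i, (step_a, step_b) in enumerate(zip(steps_a, steps_b)):
        tok_a, top_a = step_a
        tok_b, top_b = step_b
        assert tok_a in top_b, (
            f"step {i}: token {tok_a} from model A not in model B's top-k")
        assert tok_b in top_a, (
            f"step {i}: token {tok_b} from model B not in model A's top-k")


# Two fake two-step generations: the greedy picks differ at step 0, but each
# stays inside the other model's candidate set, so the check passes.
steps_gptq = [(10, {10: -0.1, 11: -1.2, 12: -2.0}),
              (20, {20: -0.3, 21: -0.9})]
steps_marlin = [(11, {11: -0.2, 10: -0.8, 13: -1.5}),
                (20, {20: -0.4, 22: -1.1})]
assert_top_tokens_overlap(steps_gptq, steps_marlin)

This is roughly the shape of information that generate_greedy_logprobs produces for the gptq and gptq_marlin runs, which is why a looser top-k overlap check (plus the flaky reruns that absorb Marlin's slight nondeterminism) is sufficient for the comparison to be meaningful.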