From e011cdea144e85fba05b3ab7cb88abb9d480c645 Mon Sep 17 00:00:00 2001
From: nzhang1220
Date: Sun, 30 Jun 2024 20:51:16 -0700
Subject: [PATCH 1/5] [ci][distributed] add distributed test gptq_marlin with tp = 2

---
 .buildkite/test-pipeline.yaml | 1 +
 .../test_distributed_gptq_marlin.py | 39 +++++++++++++++
 tests/models/test_gptq_marlin.py | 48 ++++++++++++++-----
 3 files changed, 77 insertions(+), 11 deletions(-)
 create mode 100644 tests/distributed/test_distributed_gptq_marlin.py

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8013fbb642bb8..de7da35e0b8c1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -58,6 +58,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_distributed_gptq_marlin.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py
new file mode 100644
index 0000000000000..6581cb2c4b4aa
--- /dev/null
+++ b/tests/distributed/test_distributed_gptq_marlin.py
@@ -0,0 +1,39 @@
+"""Compares the outputs of gptq vs gptq_marlin when tp > 1
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+
+Run `pytest tests/distributed/test_distributed_gptq_marlin.py`.
+"""
+import os
+
+import pytest
+
+from tests.models.test_gptq_marlin import MODELS, run_test
+from tests.quantization.utils import is_quant_method_supported
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"),
+                    reason="gptq_marlin is not supported on this GPU type.")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(vllm_runner, example_prompts, model, dtype: str,
+                max_tokens: int, num_logprobs: int,
+                tensor_parallel_size: int) -> None:
+
+    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+    run_test(vllm_runner,
+             example_prompts,
+             model,
+             dtype,
+             max_tokens,
+             num_logprobs,
+             tensor_parallel_size=tensor_parallel_size,
+             distributed_executor_backend=distributed_executor_backend)
\ No newline at end of file
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index 4abbc41c9c287..23898be2b4164 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -9,6 +9,7 @@
 Run `pytest tests/models/test_gptq_marlin.py`.
""" import os +from typing import Optional import pytest @@ -18,7 +19,6 @@ from .utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" - MAX_MODEL_LEN = 1024 MODELS = [ @@ -46,22 +46,18 @@ ] -@pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( +def run_test( vllm_runner, example_prompts, model, dtype: str, max_tokens: int, num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: model_name, revision = model + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") # Run marlin. with vllm_runner(model_name=model_name, @@ -69,7 +65,9 @@ def test_models( dtype=dtype, quantization="marlin", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_marlin_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_marlin_model: gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -84,7 +82,9 @@ def test_models( dtype="half", quantization="gptq", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_model: gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -94,3 +94,29 @@ def test_models( name_0="gptq", name_1="gptq_marlin", ) + + +@pytest.mark.flaky(reruns=3) +@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), + reason="gptq_marlin is not supported on this GPU type.") +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner, + example_prompts, + model, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + run_test( + vllm_runner, + example_prompts, + model, + dtype, + max_tokens, + num_logprobs, + tensor_parallel_size=1, + ) \ No newline at end of file From d9de3ead3c1f5ad8dd926971717eeee40bc91c1e Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:25:20 -0700 Subject: [PATCH 2/5] running cuda related func inside test function instead calling at the top level --- tests/distributed/test_distributed_gptq_marlin.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 6581cb2c4b4aa..434baca04e493 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -13,13 +13,12 @@ import pytest from tests.models.test_gptq_marlin import MODELS, run_test -from tests.quantization.utils import is_quant_method_supported +from tests.quantization.utils import (cuda_device_count_stateless, + is_quant_method_supported) @pytest.mark.parametrize("tensor_parallel_size", [2]) @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", 
["half", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -28,6 +27,13 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, max_tokens: int, num_logprobs: int, tensor_parallel_size: int) -> None: + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip("gptq_marlin is not supported on this GPU type.") + + if not is_quant_method_supported("gptq_marlin"): + pytest.skip( + f"Need at least {tensor_parallel_size} GPUs to run the test.") + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, From ab7fa8c57f284958b8769a27b1f7cd84d28f767d Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:30:46 -0700 Subject: [PATCH 3/5] fix the import location of cuda_device_count_stateless --- tests/distributed/test_distributed_gptq_marlin.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 434baca04e493..76e68b3efebc8 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -13,8 +13,8 @@ import pytest from tests.models.test_gptq_marlin import MODELS, run_test -from tests.quantization.utils import (cuda_device_count_stateless, - is_quant_method_supported) +from tests.quantization.utils import is_quant_method_supported +from vllm.utils import cuda_device_count_stateless @pytest.mark.parametrize("tensor_parallel_size", [2]) @@ -28,12 +28,13 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, tensor_parallel_size: int) -> None: if cuda_device_count_stateless() < tensor_parallel_size: - pytest.skip("gptq_marlin is not supported on this GPU type.") - + pytest.skip( + "gptq_marlin is not supported on this GPU type.") + if not is_quant_method_supported("gptq_marlin"): pytest.skip( f"Need at least {tensor_parallel_size} GPUs to run the test.") - + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, From e79ffa8c27e1d2bc954bc3f469d195d03e5da84c Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:33:28 -0700 Subject: [PATCH 4/5] all grtq tests may call CUDA related funcs inside --- tests/distributed/test_distributed_gptq_marlin.py | 5 +++-- tests/models/test_gptq_marlin.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 76e68b3efebc8..35e8e5d918711 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -29,11 +29,12 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, if cuda_device_count_stateless() < tensor_parallel_size: pytest.skip( - "gptq_marlin is not supported on this GPU type.") + f"Need at least {tensor_parallel_size} GPUs to run the test.") + if not is_quant_method_supported("gptq_marlin"): pytest.skip( - f"Need at least {tensor_parallel_size} GPUs to run the test.") + "gptq_marlin is not supported on this GPU type.") distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 23898be2b4164..3d3ecaf866860 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -97,8 +97,6 @@ def run_test( @pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not 
is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") @pytest.mark.parametrize("model", MODELS) @pytest.mark.parametrize("dtype", ["half", "bfloat16"]) @pytest.mark.parametrize("max_tokens", [32]) @@ -111,6 +109,9 @@ def test_models( max_tokens: int, num_logprobs: int, ) -> None: + if not is_quant_method_supported("gptq_marlin"): + pytest.skip( + "gptq_marlin is not supported on this GPU type.") run_test( vllm_runner, example_prompts, From e9783604f82188b0188e43cce22205421a975599 Mon Sep 17 00:00:00 2001 From: nzhang1220 Date: Fri, 5 Jul 2024 21:37:25 -0700 Subject: [PATCH 5/5] formatting --- tests/distributed/test_distributed_gptq_marlin.py | 6 ++---- tests/models/test_gptq_marlin.py | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py index 35e8e5d918711..65744fbc33b99 100644 --- a/tests/distributed/test_distributed_gptq_marlin.py +++ b/tests/distributed/test_distributed_gptq_marlin.py @@ -31,11 +31,9 @@ def test_models(vllm_runner, example_prompts, model, dtype: str, pytest.skip( f"Need at least {tensor_parallel_size} GPUs to run the test.") - if not is_quant_method_supported("gptq_marlin"): - pytest.skip( - "gptq_marlin is not supported on this GPU type.") - + pytest.skip("gptq_marlin is not supported on this GPU type.") + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") run_test(vllm_runner, example_prompts, diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 3d3ecaf866860..d83f8827a177d 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -110,8 +110,7 @@ def test_models( num_logprobs: int, ) -> None: if not is_quant_method_supported("gptq_marlin"): - pytest.skip( - "gptq_marlin is not supported on this GPU type.") + pytest.skip("gptq_marlin is not supported on this GPU type.") run_test( vllm_runner, example_prompts,
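The comparison strategy that both the single-GPU and the new tp = 2 tests rely on is the one described in the new test's docstring: GPTQ and Marlin are not bitwise identical, so instead of requiring equal outputs, each model's greedy token only has to appear in the other model's top-5 logprob candidates at every step. In the tests this is delegated to check_logprobs_close from tests/models/utils; the snippet below is only a minimal standalone sketch of that idea, with hypothetical function and variable names and made-up token ids, not the actual vLLM helper.

# Minimal sketch of the "each model's top-1 token must be inside the other
# model's top-k candidates" check. Names, shapes, and sample values are
# illustrative assumptions; the real tests use check_logprobs_close.
from typing import Dict, List, Tuple

# One generation step: (greedy token id, {candidate token id: logprob}).
StepLogprobs = Tuple[int, Dict[int, float]]


def assert_top_tokens_overlap(steps_a: List[StepLogprobs],
                              steps_b: List[StepLogprobs]) -> None:
    """Fail if either model's greedy token falls outside the other's top-k."""
    for i, (step_a, step_b) in enumerate(zip(steps_a, steps_b)):
        tok_a, top_a = step_a
        tok_b, top_b = step_b
        assert tok_a in top_b, (
            f"step {i}: token {tok_a} from model A not in model B's top-k")
        assert tok_b in top_a, (
            f"step {i}: token {tok_b} from model B not in model A's top-k")


# Two fake two-step generations: the greedy picks differ at step 0, but each
# stays inside the other model's candidate set, so the check passes.
steps_gptq = [(10, {10: -0.1, 11: -1.2, 12: -2.0}),
              (20, {20: -0.3, 21: -0.9})]
steps_marlin = [(11, {11: -0.2, 10: -0.8, 13: -1.5}),
                (20, {20: -0.4, 22: -1.1})]
assert_top_tokens_overlap(steps_gptq, steps_marlin)

This is roughly the shape of information that generate_greedy_logprobs produces for the gptq and gptq_marlin runs, which is why a looser top-k overlap check (plus the flaky reruns that absorb Marlin's slight nondeterminism) is sufficient for the comparison to be meaningful.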