diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 8013fbb642bb8..de7da35e0b8c1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -58,6 +58,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_distributed_gptq_marlin.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
diff --git a/tests/distributed/test_distributed_gptq_marlin.py b/tests/distributed/test_distributed_gptq_marlin.py
new file mode 100644
index 0000000000000..65744fbc33b99
--- /dev/null
+++ b/tests/distributed/test_distributed_gptq_marlin.py
@@ -0,0 +1,45 @@
+"""Compares the outputs of gptq vs gptq_marlin when tp > 1
+Note: GPTQ and Marlin do not have bitwise correctness.
+As a result, in this test, we just confirm that the top selected tokens of the
+Marlin/GPTQ models are in the top 5 selections of each other.
+Note: Marlin internally uses locks to synchronize the threads. This can
+result in very slight nondeterminism for Marlin. As a result, we re-run the test
+up to 3 times to see if we pass.
+
+Run `pytest tests/distributed/test_distributed_gptq_marlin.py`.
+"""
+import os
+
+import pytest
+
+from tests.models.test_gptq_marlin import MODELS, run_test
+from tests.quantization.utils import is_quant_method_supported
+from vllm.utils import cuda_device_count_stateless
+
+
+@pytest.mark.parametrize("tensor_parallel_size", [2])
+@pytest.mark.flaky(reruns=3)
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "bfloat16"])
+@pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("num_logprobs", [5])
+def test_models(vllm_runner, example_prompts, model, dtype: str,
+                max_tokens: int, num_logprobs: int,
+                tensor_parallel_size: int) -> None:
+
+    if cuda_device_count_stateless() < tensor_parallel_size:
+        pytest.skip(
+            f"Need at least {tensor_parallel_size} GPUs to run the test.")
+
+    if not is_quant_method_supported("gptq_marlin"):
+        pytest.skip("gptq_marlin is not supported on this GPU type.")
+
+    distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND")
+    run_test(vllm_runner,
+             example_prompts,
+             model,
+             dtype,
+             max_tokens,
+             num_logprobs,
+             tensor_parallel_size=tensor_parallel_size,
+             distributed_executor_backend=distributed_executor_backend)
\ No newline at end of file
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index 4abbc41c9c287..d83f8827a177d 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -9,6 +9,7 @@ Run `pytest tests/models/test_gptq_marlin.py`.
""" import os +from typing import Optional import pytest @@ -18,7 +19,6 @@ from .utils import check_logprobs_close os.environ["TOKENIZERS_PARALLELISM"] = "true" - MAX_MODEL_LEN = 1024 MODELS = [ @@ -46,22 +46,18 @@ ] -@pytest.mark.flaky(reruns=3) -@pytest.mark.skipif(not is_quant_method_supported("gptq_marlin"), - reason="gptq_marlin is not supported on this GPU type.") -@pytest.mark.parametrize("model", MODELS) -@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -def test_models( +def run_test( vllm_runner, example_prompts, model, dtype: str, max_tokens: int, num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ) -> None: model_name, revision = model + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") # Run marlin. with vllm_runner(model_name=model_name, @@ -69,7 +65,9 @@ def test_models( dtype=dtype, quantization="marlin", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_marlin_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_marlin_model: gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -84,7 +82,9 @@ def test_models( dtype="half", quantization="gptq", max_model_len=MAX_MODEL_LEN, - tensor_parallel_size=1) as gptq_model: + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend + ) as gptq_model: gptq_outputs = gptq_model.generate_greedy_logprobs( example_prompts[:-1], max_tokens, num_logprobs) @@ -94,3 +94,29 @@ def test_models( name_0="gptq", name_1="gptq_marlin", ) + + +@pytest.mark.flaky(reruns=3) +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half", "bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models( + vllm_runner, + example_prompts, + model, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + if not is_quant_method_supported("gptq_marlin"): + pytest.skip("gptq_marlin is not supported on this GPU type.") + run_test( + vllm_runner, + example_prompts, + model, + dtype, + max_tokens, + num_logprobs, + tensor_parallel_size=1, + ) \ No newline at end of file