diff --git a/tests/models/test_compressed.py b/tests/models/test_compressed.py index f9afe509bf25e..06e4f9dcd2e6b 100644 --- a/tests/models/test_compressed.py +++ b/tests/models/test_compressed.py @@ -9,6 +9,7 @@ import gc import pytest + from tests.models.utils import check_logprobs_close MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py index 12c735d4056d0..561d4a1756587 100644 --- a/tests/models/test_gptq_marlin.py +++ b/tests/models/test_gptq_marlin.py @@ -13,11 +13,10 @@ import pytest import torch +from tests.models.utils import check_logprobs_close from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.rotary_embedding import _ROPE_DICT -from tests.models.utils import check_logprobs_close - os.environ["TOKENIZERS_PARALLELISM"] = "true" MAX_MODEL_LEN = 1024 diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py index d34c06598ccfc..d3770fa69f6f1 100644 --- a/tests/models/test_marlin.py +++ b/tests/models/test_marlin.py @@ -20,9 +20,8 @@ import pytest import torch -from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS - from tests.models.utils import check_logprobs_close +from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS capability = torch.cuda.get_device_capability() capability = capability[0] * 10 + capability[1] diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py index d8210e86e827a..091620ca357b5 100644 --- a/tests/models/test_models_logprobs.py +++ b/tests/models/test_models_logprobs.py @@ -3,6 +3,7 @@ Run `pytest tests/models/test_models_logprobs.py --forked`. """ import pytest + from tests.models.utils import check_logprobs_close MODEL_MAX_LEN = 1024 diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py index 7af9bb1afd489..cc377247765e3 100644 --- a/tests/tensorizer_loader/test_tensorizer.py +++ b/tests/tensorizer_loader/test_tensorizer.py @@ -2,7 +2,6 @@ import json import os import subprocess -from tests.utils import ServerRunner from unittest.mock import MagicMock, patch import openai @@ -10,6 +9,7 @@ import ray import torch +from tests.utils import ServerRunner from vllm import SamplingParams # yapf: disable from vllm.model_executor.model_loader.tensorizer import (TensorizerConfig,