[CI/Build] Update CPU tests to include all "standard" tests #5481

Merged Nov 8, 2024 (29 commits). The diff below shows changes from 3 commits.

Commits (all by DarkLight1337):
597cb35  Enable LLaVA test in CPU (Jun 13, 2024)
845b465  Fix failing test on CPU due to unsupported dtype (Jun 13, 2024)
5f92d96  Merge branch 'upstream' into test-llava-cpu (Jun 15, 2024)
789b493  Merge branch 'upstream' into test-llava-cpu (Jun 19, 2024)
e50b808  Merge branch 'upstream' into test-llava-cpu (Jun 20, 2024)
8ba6e77  Merge branch 'upstream' into test-llava-cpu (Jun 21, 2024)
783cb76  Install torchvision (Jun 21, 2024)
e177bf8  Use CPU pypi index for torchvision (Jun 21, 2024)
7273b45  Merge branch 'upstream' into test-llava-cpu (Oct 31, 2024)
d926082  format (Oct 31, 2024)
fe0ef62  Use bfloat16 (Oct 31, 2024)
f12d39f  Update (Oct 31, 2024)
656a499  Update test dependencies (Oct 31, 2024)
649525f  Merge branch 'upstream' into test-llava-cpu (Nov 2, 2024)
8e33605  Remove unnecessary `is_cpu()` checks (Nov 7, 2024)
08e242e  Merge branch 'upstream' into test-llava-cpu (Nov 7, 2024)
1e63f85  Update (Nov 7, 2024)
c09b140  Remove unnecessary args (Nov 7, 2024)
6e6b838  Update (Nov 7, 2024)
7bc3ad1  Merge branch 'upstream' into test-llava-cpu (Nov 7, 2024)
e41db03  Fix missing library (Nov 7, 2024)
8e3cf44  Fix loading image embeds on CPU (Nov 7, 2024)
cd1cd15  Fix errors not being propagated to CI (Nov 7, 2024)
b401cb9  Fix missing libraries (Nov 7, 2024)
431a5c8  Embedding models are not supported for CPU backend (Nov 8, 2024)
0df552f  Merge branch 'upstream' into test-llava-cpu (Nov 8, 2024)
8c817e4  Chunked prefill not supported for CPU (Nov 8, 2024)
4c39939  Fix installation (Nov 8, 2024)
9ef98fa  Add `cpu_model` mark (Nov 8, 2024)
3 changes: 2 additions & 1 deletion .buildkite/run-cpu-test.sh
@@ -19,5 +19,6 @@ docker exec cpu-test bash -c "python3 examples/offline_inference.py"
 # Run basic model test
 docker exec cpu-test bash -c "cd tests;
   pip install pytest Pillow protobuf
+  bash ../.buildkite/download-images.sh
   cd ../
-  pytest -v -s tests/models -m \"not llava\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
+  pytest -v -s tests/models --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
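Note on the change above: the old command used pytest's -m "not llava" marker expression to deselect LLaVA tests on CPU (while --ignore skips whole files); dropping the -m filter lets those tests run, and a later commit in this PR (9ef98fa) adds a `cpu_model` mark instead. A minimal sketch of the marker mechanism, with the marker name taken from the old command and the file layout purely illustrative, not the actual vLLM test layout:

# conftest.py (illustrative): register the marker so pytest does not warn about it
def pytest_configure(config):
    config.addinivalue_line("markers", "llava: tests that require the LLaVA model")

# test_llava_example.py (illustrative)
import pytest

@pytest.mark.llava
def test_generate_with_llava():
    # Deselected by `pytest -m "not llava"`; runs again once the -m filter
    # is removed, as in the updated command above.
    assert True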
9 changes: 8 additions & 1 deletion tests/models/test_llava.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple

 import pytest
+import torch
 from transformers import AutoTokenizer

 from vllm.config import VisionLanguageConfig
@@ -65,9 +66,15 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_input_ids, hf_output_str


+# TODO: remove this after CPU float16 support ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
+
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
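Note: the `target_dtype` fallback added above keeps the LLaVA test runnable on hosts without CUDA, where float16 is not yet supported on the CPU backend (per the TODO comment). A standalone sketch of the same pattern; the test name and body are illustrative, not part of the PR:

import pytest
import torch

# Fall back to float32 on CPU-only machines; use float16 when CUDA is present.
target_dtype = "half" if torch.cuda.is_available() else "float"

@pytest.mark.parametrize("dtype", [target_dtype])
def test_dtype_fallback(dtype: str) -> None:
    torch_dtype = {"half": torch.float16, "float": torch.float32}[dtype]
    x = torch.zeros(2, 3, dtype=torch_dtype)
    assert x.dtype == torch_dtype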
9 changes: 8 additions & 1 deletion tests/models/test_llava_next.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple

 import pytest
+import torch
 from transformers import AutoTokenizer

 from vllm.config import VisionLanguageConfig
@@ -72,11 +73,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_input_ids, hf_output_str


+# TODO: remove this after CPU float16 support ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
+
 @pytest.mark.xfail(
     reason="Inconsistent image processor being used due to lack "
     "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
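Note: the LLaVA-NeXT test keeps its `xfail` mark, so it still runs but an expected failure does not break CI. A minimal illustration of the mechanism; the test contents are illustrative only:

import pytest

@pytest.mark.xfail(reason="known issue, tracked separately")
def test_expected_failure() -> None:
    # Reported as "xfail" when it fails and "xpass" if it unexpectedly passes;
    # by default neither outcome fails the suite.
    assert 1 + 1 == 3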