[BugFix] Fix test breakages from transformers 4.45 upgrade #8829

Merged Sep 26, 2024 (24 commits)

Changes from 21 commits

Commits (all Sep 26, 2024):
4f53397  [BugFix] Fix test breakages from transformers 4.45 upgrade (njhill)
e2ae1bb  Also fix llava OOM from @ywang96 (njhill)
66c0c19  Fix next failures (njhill)
a5b289c  Catch any Exception when attempting to load lora-specific tokenizer (njhill)
ce1d477  Change "default" rope scaling type back to "mrope" in HF config (njhill)
4eaa8e1  raise gpu mem (ywang96)
899003b  Merge branch 'main' into transformers-fixes (DarkLight1337)
562f816  Remove unnecessary overwrite (DarkLight1337)
51b9abc  Remove unnecessary version guards (DarkLight1337)
8e7f2b6  Update A100 distributed test with new file location (missed in #7820) (DarkLight1337)
57b7328  Replace legacy `tmpdir` with modern `tmp_path` fixture (DarkLight1337)
0ebd4fb  Reduce max_model_len in LLaVA-OneVision test to avoid OOM (DarkLight1337)
4a924c8  Patch `ChatGLMTokenizer._pad` (DarkLight1337)
0c30e87  Run OOT test in a clean process to solve OOM in AMD (DarkLight1337)
9f2fac8  Fix insufficient `max_model_len` (DarkLight1337)
2b6948c  Fix wrong test being updated (DarkLight1337)
45e2b54  Cleanup (DarkLight1337)
f0584fa  raise mem (ywang96)
27b96c1  format (ywang96)
cd105be  Merge remote-tracking branch 'upstream/main' into transformers-fixes (ywang96)
315ff90  remove comment (ywang96)
8fdad1c  skip test (ywang96)
6decd70  revert soft fail (ywang96)
59bc78d  Update tokenizer patch (DarkLight1337)
5 changes: 3 additions & 2 deletions .buildkite/test-pipeline.yaml
@@ -96,7 +96,8 @@ steps:
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

@@ -463,7 +464,7 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py
- TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
- TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m distributed_2_gpus
- pytest -v -s -x lora/test_mixtral.py

- label: LM Eval Large Models # optional
1 change: 0 additions & 1 deletion tests/conftest.py
@@ -699,7 +699,6 @@ def generate_w_logprobs(
if videos is not None:
for i, video in enumerate(videos):
inputs[i]["multi_modal_data"] = {"video": video}
print(f"[INPUTS!!!!]: {inputs}, {sampling_params}")

req_outputs = self.model.generate(inputs,
sampling_params=sampling_params)
7 changes: 0 additions & 7 deletions tests/distributed/test_pipeline_parallel.py
@@ -8,8 +8,6 @@
import os

import pytest
from packaging import version
from transformers import __version__ as transformers_version

from vllm.logger import init_logger

@@ -49,11 +47,6 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL,
pytest.skip("Skipping multi-node pipeline parallel test for "
"multiprocessing distributed backend")

# Skip tests that require transformers>=4.45.0
if "Qwen2-VL" in MODEL_NAME and version.parse(
transformers_version) < version.parse("4.45.0.dev0"):
pytest.skip("This test requires transformers>=4.45.0")

pp_args = [
# use half precision for speed and memory savings in CI environment
"--dtype",
8 changes: 4 additions & 4 deletions tests/engine/test_custom_executor.py
@@ -48,9 +48,9 @@ def test_custom_executor_type_checking(model):


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor(model, tmpdir):
def test_custom_executor(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")

@@ -68,9 +68,9 @@ def test_custom_executor(model, tmpdir):


@pytest.mark.parametrize("model", ["facebook/opt-125m"])
def test_custom_executor_async(model, tmpdir):
def test_custom_executor_async(model, tmp_path):
cwd = os.path.abspath(".")
os.chdir(tmpdir)
os.chdir(tmp_path)
try:
assert not os.path.exists(".marker")

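For reference, a minimal pytest sketch (not code from this PR) of what is behind the `tmpdir` to `tmp_path` swap above: `tmp_path` yields a standard `pathlib.Path` rather than the legacy `py.path.local`, so `str()` and `os.chdir()` keep working while the deprecated `py` dependency goes away.

```python
# Minimal pytest illustration, not part of this diff: `tmp_path` yields a
# standard pathlib.Path, whereas the legacy `tmpdir` fixture yields a
# py.path.local object from the deprecated `py` library.
from pathlib import Path


def test_with_tmp_path(tmp_path: Path) -> None:
    marker = tmp_path / ".marker"           # pathlib-style path composition
    assert not marker.exists()              # each test gets a fresh directory
    marker.write_text("done")
    assert marker.read_text() == "done"
    assert str(marker).endswith(".marker")  # str() still fine for os.chdir etc.
```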
6 changes: 6 additions & 0 deletions tests/entrypoints/openai/test_serving_chat.py
@@ -15,6 +15,11 @@
BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]


@dataclass
class MockHFConfig:
model_type: str = "any"


@dataclass
class MockModelConfig:
tokenizer = MODEL_NAME
@@ -24,6 +29,7 @@ class MockModelConfig:
tokenizer_revision = None
embedding_mode = False
multimodal_config = MultiModalConfig()
hf_config = MockHFConfig()


@dataclass
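For context, a self-contained sketch of the dataclass-mock pattern the new `MockHFConfig` follows (names and values here are hypothetical, not from this PR): a tiny stand-in exposes just the `hf_config.model_type` attribute chain the serving code presumably reads, so the test never has to load a real model config.

```python
# Hypothetical illustration of the mock-config pattern: a lightweight dataclass
# stands in for the real HF config object used by the serving layer.
from dataclasses import dataclass, field


@dataclass
class MockHFConfig:
    model_type: str = "any"


@dataclass
class MockModelConfig:
    tokenizer: str = "some/model"  # placeholder value
    hf_config: MockHFConfig = field(default_factory=MockHFConfig)


cfg = MockModelConfig()
assert cfg.hf_config.model_type == "any"  # the attribute chain under test
```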
4 changes: 2 additions & 2 deletions tests/lora/test_tokenizer_group.py
@@ -41,7 +41,7 @@ async def test_tokenizer_group_lora(sql_lora_files, tokenizer_group_type):
lora_request)


def test_get_lora_tokenizer(sql_lora_files, tmpdir):
def test_get_lora_tokenizer(sql_lora_files, tmp_path):
lora_request = None
tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer
@@ -50,6 +50,6 @@ def test_get_lora_tokenizer(sql_lora_files, tmpdir):
tokenizer = get_lora_tokenizer(lora_request)
assert tokenizer.get_added_vocab()

lora_request = LoRARequest("1", 1, str(tmpdir))
lora_request = LoRARequest("1", 1, str(tmp_path))
tokenizer = get_lora_tokenizer(lora_request)
assert not tokenizer
4 changes: 0 additions & 4 deletions tests/models/decoder_only/language/test_granite.py
@@ -3,7 +3,6 @@
Run `pytest tests/models/test_granite.py`.
"""
import pytest
import transformers

from ...utils import check_logprobs_close

@@ -12,9 +11,6 @@
]


# GraniteForCausalLM will be in transformers >= 4.45
@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="granite model test requires transformers >= 4.45")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [64])
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple, Type, overload

import pytest
import transformers
from transformers import AutoConfig, AutoModelForVision2Seq, AutoTokenizer

from vllm.multimodal.utils import (rescale_video_size, resize_video,
@@ -158,8 +157,6 @@ def run_test(
)


@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
@@ -203,8 +200,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
)


@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"sizes",
@@ -1,7 +1,6 @@
from typing import List, Optional, Tuple, Type, overload

import pytest
import transformers
from transformers import (AutoConfig, AutoModelForVision2Seq, AutoTokenizer,
BatchEncoding)

@@ -166,8 +165,6 @@ def process(hf_inputs: BatchEncoding):
)


@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
@@ -211,8 +208,6 @@ def test_models(hf_runner, vllm_runner, video_assets, model, size_factors,
)


@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"sizes",
@@ -259,7 +254,9 @@ def run_image_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(model,
dtype=dtype,
max_model_len=32768,
max_num_seqs=1,
max_model_len=16384,
gpu_memory_utilization=0.98,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True,
@@ -305,8 +302,6 @@ def process(hf_inputs: BatchEncoding):
)


@pytest.mark.skipif(transformers.__version__ < "4.45",
reason="Waiting for next transformers release")
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
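For reference, a rough sketch (assuming the public `vllm.LLM` constructor; the model id is only an example, not taken from this diff) of how the memory knobs changed above interact: lowering `max_model_len` and `max_num_seqs` shrinks the KV cache, while a higher `gpu_memory_utilization` lets vLLM claim more of the GPU.

```python
# Rough sketch, not code from this PR: the memory-related knobs changed in the
# test above, expressed through the public vllm.LLM constructor.
from vllm import LLM

llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",  # placeholder model id
    max_model_len=16384,          # shorter context window -> smaller KV cache
    max_num_seqs=1,               # one in-flight sequence at a time
    gpu_memory_utilization=0.98,  # let vLLM claim almost all GPU memory
    enforce_eager=True,           # skip CUDA graph capture to save memory
)
outputs = llm.generate("Describe the video.")  # usage is unchanged by these knobs
```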
6 changes: 0 additions & 6 deletions tests/models/test_registry.py
@@ -1,15 +1,9 @@
import pytest
import transformers

from vllm.model_executor.models import _MODELS, ModelRegistry


@pytest.mark.parametrize("model_cls", _MODELS)
def test_registry_imports(model_cls):
if (model_cls in ("LlavaOnevisionForConditionalGeneration",
"Qwen2VLForConditionalGeneration")
and transformers.__version__ < "4.45"):
pytest.skip("Waiting for next transformers release")

# Ensure all model classes can be imported successfully
ModelRegistry.resolve_model_cls([model_cls])
18 changes: 15 additions & 3 deletions tests/samplers/test_sampler.py
@@ -1,5 +1,6 @@
import itertools
import random
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple
from unittest.mock import Mock, patch

@@ -596,8 +597,19 @@ def test_sampler_top_k_top_p(seed: int, device: str):
generation_config = GenerationConfig(top_k=top_k,
top_p=top_p,
do_sample=True)
warpers = generation_model._get_logits_warper(generation_config, device)
assert len(warpers) == 2 # top_p and top_k

@dataclass
class MockConfig:
is_encoder_decoder: bool = False

generation_model.config = MockConfig() # needed by the following method
generation_model._prepare_special_tokens(generation_config, device=device)
processors = generation_model._get_logits_processor(generation_config,
Review comment (PR author):

`_get_logits_warper` was rolled into `_get_logits_processor`

None,
None,
None, [],
device=device)
assert len(processors) == 2 # top_p and top_k

seq_group_metadata_list: List[SequenceGroupMetadata] = []
seq_lens: List[int] = []
@@ -639,7 +651,7 @@ def mock_sample(probs, *args, **kwargs):

assert sample_probs is not None

hf_probs = warpers(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone())
hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float)
torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5)
assert torch.equal(hf_probs.eq(0), sample_probs.eq(0))
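For context on the change above (transformers folded its logits warpers into the processor machinery), here is a minimal sketch, not code from this PR, that builds and applies the same kind of top-k/top-p chain using public `transformers` classes:

```python
# Illustrative only: the same kind of top-k / top-p processor chain that
# `_get_logits_processor` returns internally, built from public classes.
import torch
from transformers import LogitsProcessorList
from transformers.generation.logits_process import (TopKLogitsWarper,
                                                     TopPLogitsWarper)

processors = LogitsProcessorList([
    TopKLogitsWarper(top_k=5),
    TopPLogitsWarper(top_p=0.9),
])

fake_logits = torch.randn(1, 32_000)             # (batch, vocab) dummy logits
input_ids = torch.zeros(1, 1, dtype=torch.long)  # ignored by these two warpers
filtered = processors(input_ids, fake_logits)    # -inf outside the allowed set
probs = torch.softmax(filtered, dim=-1)
assert int((probs > 0).sum()) <= 5               # at most top_k tokens survive
```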
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/serving_chat.py
@@ -152,13 +152,13 @@ async def create_chat_completion(
**(request.chat_template_kwargs or {}),
)
except Exception as e:
logger.error("Error in applying chat template from request: %s", e)
logger.exception("Error in applying chat template from request")
return self.create_error_response(str(e))

try:
mm_data = await mm_data_future
except Exception as e:
logger.error("Error in loading multi-modal data: %s", e)
logger.exception("Error in loading multi-modal data")
return self.create_error_response(str(e))

# validation for OpenAI tools
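A small generic-logging illustration (not vLLM code) of why `logger.exception` is preferred inside an `except` block: it logs at the same ERROR level as `logger.error`, but attaches the current traceback automatically, which the old `logger.error("... %s", e)` form did not.

```python
# Generic Python logging illustration, not vLLM code.
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("example")

try:
    {}["missing"]  # raises KeyError
except Exception:
    # Same severity as logger.error, but the full stack trace is included
    # in the log record automatically.
    logger.exception("Error in applying chat template from request")
```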
24 changes: 23 additions & 1 deletion vllm/transformers_utils/tokenizer.py
@@ -1,6 +1,7 @@
import os
import warnings
from pathlib import Path
from types import MethodType
from typing import Optional, Union

import huggingface_hub
@@ -152,6 +153,27 @@ def get_tokenizer(
else:
raise e

if type(tokenizer).__name__ == "ChatGLMTokenizer":
assert isinstance(tokenizer, PreTrainedTokenizer)
orig_pad = tokenizer._pad

# Patch _pad method to accept `padding_side`
def _pad(
self: PreTrainedTokenizer,
*args,
padding_side: Optional[str] = None,
**kwargs,
):
if (padding_side is not None
and padding_side != self.padding_side):
msg = ("`padding_side` argument is not supported by "
"ChatGLMTokenizer and will be ignored.")
warnings.warn(msg, stacklevel=2)

return orig_pad(*args, **kwargs)

tokenizer._pad = MethodType(_pad, tokenizer)

if not isinstance(tokenizer, PreTrainedTokenizerFast):
logger.warning(
"Using a slow tokenizer. This might cause a significant "
@@ -167,7 +189,7 @@ def get_lora_tokenizer(lora_request: LoRARequest, *args,
return None
try:
tokenizer = get_tokenizer(lora_request.lora_path, *args, **kwargs)
except OSError as e:
except Exception as e:
# No tokenizer was found in the LoRA folder,
# use base model tokenizer
logger.warning(
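For reference, a minimal self-contained sketch (a toy class, not the real `ChatGLMTokenizer`) of the `MethodType` re-binding pattern used in the tokenizer patch above: the wrapper accepts the `padding_side` keyword that newer transformers passes to `_pad`, warns if honoring it would change behaviour, and forwards everything else to the original bound method.

```python
# Toy stand-in for ChatGLMTokenizer; the shape of the patch is the same:
# wrap the original bound method, swallow the unsupported keyword, re-bind.
import warnings
from types import MethodType


class LegacyTokenizer:
    padding_side = "left"

    def _pad(self, encoded, max_length=None):
        pad_len = 0 if max_length is None else max_length - len(encoded)
        return encoded + ["<pad>"] * pad_len


tok = LegacyTokenizer()
orig_pad = tok._pad  # bound method, so `self` is already captured


def _pad(self, *args, padding_side=None, **kwargs):
    if padding_side is not None and padding_side != self.padding_side:
        warnings.warn("`padding_side` is ignored by this tokenizer",
                      stacklevel=2)
    return orig_pad(*args, **kwargs)


tok._pad = MethodType(_pad, tok)  # instance-level patch; the class is untouched

print(tok._pad(["a", "b"], max_length=4, padding_side="right"))
# -> warns, then returns ['a', 'b', '<pad>', '<pad>']
```

Binding with `MethodType` keeps the patch local to the one tokenizer instance, which is why the real change guards on `type(tokenizer).__name__` before applying it.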