[Misc] Remove deprecated code #12383

Merged: 4 commits, Jan 24, 2025
23 changes: 14 additions & 9 deletions tests/async_engine/test_api_server.py
@@ -25,27 +25,32 @@ def _query_server_long(prompt: str) -> dict:


 @pytest.fixture
-def api_server(tokenizer_pool_size: int, worker_use_ray: bool):
+def api_server(tokenizer_pool_size: int, distributed_executor_backend: str):
     script_path = Path(__file__).parent.joinpath(
         "api_server_async_engine.py").absolute()
     commands = [
-        sys.executable, "-u",
-        str(script_path), "--model", "facebook/opt-125m", "--host",
-        "127.0.0.1", "--tokenizer-pool-size",
-        str(tokenizer_pool_size)
+        sys.executable,
+        "-u",
+        str(script_path),
+        "--model",
+        "facebook/opt-125m",
+        "--host",
+        "127.0.0.1",
+        "--tokenizer-pool-size",
+        str(tokenizer_pool_size),
+        "--distributed-executor-backend",
+        distributed_executor_backend,
     ]

-    if worker_use_ray:
-        commands.append("--worker-use-ray")
     uvicorn_process = subprocess.Popen(commands)
     yield
     uvicorn_process.terminate()


 @pytest.mark.parametrize("tokenizer_pool_size", [0, 2])
-@pytest.mark.parametrize("worker_use_ray", [False, True])
+@pytest.mark.parametrize("distributed_executor_backend", ["mp", "ray"])
 def test_api_server(api_server, tokenizer_pool_size: int,
-                    worker_use_ray: bool):
+                    distributed_executor_backend: str):
     """
     Run the API server and test it.

18 changes: 9 additions & 9 deletions tests/basic_correctness/test_preemption.py
@@ -29,10 +29,10 @@ def check_settings():


 @pytest.fixture
-def worker_use_ray() -> bool:
-    # When SPMD worker is used, use ray_use_worker=True
+def distributed_executor_backend() -> str:
+    # When SPMD worker is used, use distributed_executor_backend="ray"
     # to test delta input optimization works with preemption.
-    return envs.VLLM_USE_RAY_SPMD_WORKER
+    return "ray" if envs.VLLM_USE_RAY_SPMD_WORKER else "mp"


 @pytest.mark.parametrize("model", MODELS)
@@ -47,7 +47,7 @@ def test_chunked_prefill_recompute(
     dtype: str,
     max_tokens: int,
     chunked_prefill_token_size: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Ensure that chunked prefill works with preemption."""
     max_num_seqs = min(chunked_prefill_token_size, 256)
@@ -66,7 +66,7 @@ def test_chunked_prefill_recompute(
             max_num_batched_tokens=max_num_batched_tokens,
             enable_chunked_prefill=enable_chunked_prefill,
             max_num_seqs=max_num_seqs,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
             disable_log_stats=False,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -93,7 +93,7 @@ def test_preemption(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """By default, recompute preemption is enabled"""

@@ -104,7 +104,7 @@ def test_preemption(
             model,
             dtype=dtype,
             disable_log_stats=False,
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
         assert (vllm_model.model.llm_engine.scheduler[0].artificial_preempt_cnt
@@ -144,7 +144,7 @@ def test_preemption_infeasible(
     model: str,
     dtype: str,
     max_tokens: int,
-    worker_use_ray: bool,
+    distributed_executor_backend: str,
 ) -> None:
     """Verify infeasible preemption request will be ignored."""
     BLOCK_SIZE = 16
@@ -159,7 +159,7 @@ def test_preemption_infeasible(
             # ignored instead of hanging forever.
             num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
             max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-            worker_use_ray=worker_use_ray,
+            distributed_executor_backend=distributed_executor_backend,
     ) as vllm_model:
         sampling_params = SamplingParams(max_tokens=max_tokens,
                                          ignore_eos=True)
3 changes: 2 additions & 1 deletion tests/multi_step/test_correctness_async_llm.py
@@ -16,7 +16,8 @@
 NUM_PROMPTS = [10]

 DEFAULT_SERVER_ARGS: List[str] = [
-    "--worker-use-ray",
+    "--distributed-executor-backend",
+    "ray",
     "--gpu-memory-utilization",
     "0.85",
     "--swap-space",
10 changes: 0 additions & 10 deletions vllm/config.py
@@ -1227,9 +1227,6 @@ class ParallelConfig:
     pipeline_parallel_size: int = 1  # Number of pipeline parallel groups.
     tensor_parallel_size: int = 1  # Number of tensor parallel groups.

-    # Deprecated, use distributed_executor_backend instead.
-    worker_use_ray: Optional[bool] = None
-
     # Maximum number of multiple batches
     # when load model sequentially. To avoid RAM OOM when using tensor
     # parallel and large models.
@@ -1283,13 +1280,6 @@ def __post_init__(self) -> None:
         self.world_size = self.pipeline_parallel_size * \
             self.tensor_parallel_size

-        if self.worker_use_ray:
-            if self.distributed_executor_backend is None:
-                self.distributed_executor_backend = "ray"
-            elif not self.use_ray:
-                raise ValueError(f"worker-use-ray can't be used with "
-                                 f"distributed executor backend "
-                                 f"'{self.distributed_executor_backend}'.")
         ray_only_devices = ["tpu"]
         from vllm.platforms import current_platform
         if (current_platform.device_type in ray_only_devices
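With the fallback mapping removed from `ParallelConfig.__post_init__`, the executor backend is named explicitly rather than inferred from `worker_use_ray`. A minimal migration sketch (illustrative only, not part of this diff; in normal use the config is built for you by `EngineArgs`):

```python
from vllm.config import ParallelConfig

# Before (deprecated): ParallelConfig(worker_use_ray=True)
# After: name the backend directly; "mp" and "ray" are the values
# exercised by the updated tests above.
parallel_config = ParallelConfig(
    tensor_parallel_size=2,
    distributed_executor_backend="ray",
)
```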
6 changes: 0 additions & 6 deletions vllm/engine/arg_utils.py
@@ -100,7 +100,6 @@ class EngineArgs:
     kv_cache_dtype: str = 'auto'
     seed: int = 0
     max_model_len: Optional[int] = None
-    worker_use_ray: bool = False
     # Note: Specifying a custom executor backend by passing a class
     # is intended for expert use only. The API may change without
     # notice.
@@ -389,10 +388,6 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
             'to "ray" if Ray is installed and fail otherwise. Note that tpu '
             'only supports Ray for distributed inference.')

-        parser.add_argument(
-            '--worker-use-ray',
-            action='store_true',
-            help='Deprecated, use ``--distributed-executor-backend=ray``.')
         parser.add_argument('--pipeline-parallel-size',
                             '-pp',
                             type=int,
@@ -1071,7 +1066,6 @@ def create_engine_config(self,
         parallel_config = ParallelConfig(
             pipeline_parallel_size=self.pipeline_parallel_size,
             tensor_parallel_size=self.tensor_parallel_size,
-            worker_use_ray=self.worker_use_ray,
             max_parallel_loading_workers=self.max_parallel_loading_workers,
             disable_custom_all_reduce=self.disable_custom_all_reduce,
             tokenizer_pool_config=TokenizerPoolConfig.create_config(
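Since `--worker-use-ray` no longer exists as a CLI flag or `EngineArgs` field, the backend is selected with `--distributed-executor-backend` (as the updated `DEFAULT_SERVER_ARGS` above already does). A rough Python equivalent, assuming the usual `EngineArgs` entry point:

```python
from vllm.engine.arg_utils import EngineArgs

# CLI migration: `--worker-use-ray`  ->  `--distributed-executor-backend ray`
# The same choice through the dataclass; "mp" and "ray" are the backends
# exercised by the updated tests in this PR.
engine_args = EngineArgs(
    model="facebook/opt-125m",
    distributed_executor_backend="ray",
)
```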
43 changes: 0 additions & 43 deletions vllm/engine/metrics.py
@@ -259,21 +259,6 @@ def __init__(self, labelnames: List[str], vllm_config: VllmConfig):
                 documentation="Number of emitted tokens.",
                 labelnames=labelnames))

-        # Deprecated in favor of vllm:prompt_tokens_total
-        self.gauge_avg_prompt_throughput = self._gauge_cls(
-            name="vllm:avg_prompt_throughput_toks_per_s",
-            documentation="Average prefill throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-        # Deprecated in favor of vllm:generation_tokens_total
-        self.gauge_avg_generation_throughput = self._gauge_cls(
-            name="vllm:avg_generation_throughput_toks_per_s",
-            documentation="Average generation throughput in tokens/s.",
-            labelnames=labelnames,
-            multiprocess_mode="sum",
-        )
-

 # end-metrics-definitions

@@ -635,20 +620,6 @@ def _log_prometheus(self, stats: Stats) -> None:
         self._log_histogram(self.metrics.histogram_max_tokens_request,
                             stats.max_tokens_requests)

-    def _log_prometheus_interval(self, prompt_throughput: float,
-                                 generation_throughput: float) -> None:
-        # Logs metrics to prometheus that are computed every logging_interval.
-        # Support legacy gauge metrics that make throughput calculations on
-        # the vLLM side. Moving forward, we should use counters like
-        # counter_prompt_tokens, counter_generation_tokens
-        # Which log raw data and calculate summaries using rate() on the
-        # grafana/prometheus side. See
-        # https://github.com/vllm-project/vllm/pull/2316#discussion_r1464204666
-        self.metrics.gauge_avg_prompt_throughput.labels(
-            **self.labels).set(prompt_throughput)
-        self.metrics.gauge_avg_generation_throughput.labels(
-            **self.labels).set(generation_throughput)
-
     def log(self, stats: Stats):
         """Logs to prometheus and tracked stats every iteration."""
         # Log to prometheus.
@@ -664,20 +635,6 @@ def log(self, stats: Stats):
         # Log locally every local_interval seconds.
         if local_interval_elapsed(stats.now, self.last_local_log,
                                   self.local_interval):
-            # Compute summary metrics for tracked stats (and log them
-            # to promethus if applicable).
-            prompt_throughput = get_throughput(self.num_prompt_tokens,
-                                               now=stats.now,
-                                               last_log=self.last_local_log)
-            generation_throughput = get_throughput(
-                self.num_generation_tokens,
-                now=stats.now,
-                last_log=self.last_local_log)
-
-            self._log_prometheus_interval(
-                prompt_throughput=prompt_throughput,
-                generation_throughput=generation_throughput)
-
             if self.spec_decode_metrics is not None:
                 self._log_gauge(
                     self.metrics.gauge_spec_decode_draft_acceptance_rate,
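The deleted interval logging depended on the removed gauges; per the deleted comment, throughput is instead derived downstream from the monotonic counters (`vllm:prompt_tokens_total`, `vllm:generation_tokens_total`), for example with Prometheus `rate()`. A small illustrative sketch (not vLLM code) of the same calculation from two counter samples:

```python
def throughput_from_counter(prev_total: float, curr_total: float,
                            interval_s: float) -> float:
    """Tokens/s over a scrape interval, the hand-rolled analogue of
    rate(vllm:prompt_tokens_total[interval])."""
    if interval_s <= 0:
        return 0.0
    return (curr_total - prev_total) / interval_s


# Example: 6_000 new prompt tokens counted over a 30 s window -> 200.0 tok/s
assert throughput_from_counter(12_000, 18_000, 30.0) == 200.0
```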