[core][distributed] support n layers % pp size != 0 (vllm-project#6115)
youkaichao authored and jimpang committed Jul 24, 2024
1 parent 330e502 commit 6584397
Showing 7 changed files with 19 additions and 10 deletions.
1 change: 1 addition & 0 deletions .buildkite/test-pipeline.yaml
@@ -80,6 +80,7 @@ steps:
   commands:
   - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
   - TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
+  - TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
   - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
   - PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py

15 changes: 6 additions & 9 deletions vllm/config.py
@@ -265,8 +265,6 @@ def verify_with_parallel_config(
                 " must be divisible by tensor parallel size "
                 f"({tensor_parallel_size}).")
 
-        total_num_hidden_layers = getattr(self.hf_text_config,
-                                          "num_hidden_layers", 0)
         pipeline_parallel_size = parallel_config.pipeline_parallel_size
         architectures = getattr(self.hf_config, "architectures", [])
         if not all(arch in _PP_SUPPORTED_MODELS
@@ -275,12 +273,6 @@ def verify_with_parallel_config(
                 "Pipeline parallelism is only supported for the following "
                 f" architectures: {_PP_SUPPORTED_MODELS}.")
 
-        if total_num_hidden_layers % pipeline_parallel_size != 0:
-            raise ValueError(
-                f"Total number of hidden layers ({total_num_hidden_layers}) "
-                "must be divisible by pipeline parallel size "
-                f"({pipeline_parallel_size}).")
-
         if self.quantization == "bitsandbytes" and (
                 parallel_config.tensor_parallel_size > 1
                 or parallel_config.pipeline_parallel_size > 1):
@@ -385,9 +377,13 @@ def get_num_attention_heads(self,
         return num_heads // parallel_config.tensor_parallel_size
 
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
+        from vllm.distributed.utils import get_pp_indices
         total_num_hidden_layers = getattr(self.hf_text_config,
                                           "num_hidden_layers", 0)
-        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
+        pp_rank = parallel_config.rank // parallel_config.tensor_parallel_size
+        pp_size = parallel_config.pipeline_parallel_size
+        start, end = get_pp_indices(total_num_hidden_layers, pp_rank, pp_size)
+        return end - start
 
     def contains_seqlen_agnostic_layers(
             self, parallel_config: "ParallelConfig") -> bool:
@@ -709,6 +705,7 @@ def __init__(
             {"CUDA_VISIBLE_DEVICES": envs.CUDA_VISIBLE_DEVICES})
 
         self._verify_args()
+        self.rank = 0
 
     def _verify_args(self) -> None:
         if (self.pipeline_parallel_size > 1
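
A note on the get_num_layers change above: a worker's pipeline-parallel rank is derived from its global rank and the tensor-parallel size, and the stage's layer count is whatever slice get_pp_indices returns for that rank. A minimal sketch of that rank arithmetic (pp_rank_of is a hypothetical name used only for illustration, not a vLLM helper):

```python
# Hypothetical helper mirroring the arithmetic in get_num_layers: with
# tensor-parallel ranks grouped contiguously, the pipeline stage is rank // tp_size.
def pp_rank_of(global_rank: int, tensor_parallel_size: int) -> int:
    return global_rank // tensor_parallel_size


# Example with TP=2, PP=2 (4 workers): ranks 0-1 form stage 0, ranks 2-3 form stage 1.
assert [pp_rank_of(r, 2) for r in range(4)] == [0, 0, 1, 1]
```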
9 changes: 8 additions & 1 deletion vllm/distributed/utils.py
@@ -50,8 +50,15 @@ def split_tensor_along_last_dim(
 
 def get_pp_indices(num_hidden_layers: int, pp_rank: int,
                    pp_size: int) -> Tuple[int, int]:
-    layers_per_partition = divide(num_hidden_layers, pp_size)
+    """Try to evenly distribute layers across partitions.
+    If the number of layers is not divisible by the number of partitions,
+    the last partition will have the remaining layers.
+    """
+    layers_per_partition = num_hidden_layers // pp_size
     start_layer = pp_rank * layers_per_partition
     end_layer = start_layer + layers_per_partition
+
+    if pp_rank == pp_size - 1:
+        end_layer = num_hidden_layers
 
     return (start_layer, end_layer)
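
To make the new splitting behavior concrete: every stage gets floor(num_hidden_layers / pp_size) layers and the last stage absorbs the remainder, so a layer count that is not divisible by the pipeline size no longer raises. A small usage sketch:

```python
from vllm.distributed.utils import get_pp_indices

# 7 hidden layers across 3 pipeline stages: the first two stages take 2 layers
# each, and the last stage takes the remaining 3.
print([get_pp_indices(7, pp_rank, 3) for pp_rank in range(3)])
# [(0, 2), (2, 4), (4, 7)]
```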
1 change: 1 addition & 0 deletions vllm/worker/openvino_worker.py
@@ -154,6 +154,7 @@ def __init__(
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
1 change: 1 addition & 0 deletions vllm/worker/tpu_worker.py
@@ -39,6 +39,7 @@ def __init__(
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
1 change: 1 addition & 0 deletions vllm/worker/worker.py
@@ -50,6 +50,7 @@ def __init__(
     ) -> None:
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
1 change: 1 addition & 0 deletions vllm/worker/xpu_worker.py
@@ -54,6 +54,7 @@ def __init__(
 
         self.model_config = model_config
         self.parallel_config = parallel_config
+        self.parallel_config.rank = rank
         self.scheduler_config = scheduler_config
         self.device_config = device_config
         self.cache_config = cache_config
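
The four worker edits above are identical: each worker records its global rank on the ParallelConfig it was given (which now defaults rank to 0 in __init__), so that later calls such as ModelConfig.get_num_layers can work out which pipeline stage they are sizing. A rough sketch of that flow with simplified stand-ins (ToyParallelConfig and ToyWorker are illustrative only, not vLLM's actual classes):

```python
# Illustrative only: simplified stand-ins showing why each worker writes its
# global rank into parallel_config (the real vLLM classes are more involved).
from vllm.distributed.utils import get_pp_indices


class ToyParallelConfig:
    def __init__(self, tensor_parallel_size: int, pipeline_parallel_size: int):
        self.tensor_parallel_size = tensor_parallel_size
        self.pipeline_parallel_size = pipeline_parallel_size
        self.rank = 0  # default, as now set in ParallelConfig.__init__


class ToyWorker:
    def __init__(self, parallel_config: ToyParallelConfig, rank: int):
        self.parallel_config = parallel_config
        self.parallel_config.rank = rank  # the one-line change each worker gains here


def layers_on_this_worker(total_layers: int, cfg: ToyParallelConfig) -> int:
    # Mirrors ModelConfig.get_num_layers after this commit.
    pp_rank = cfg.rank // cfg.tensor_parallel_size
    start, end = get_pp_indices(total_layers, pp_rank, cfg.pipeline_parallel_size)
    return end - start


cfg = ToyParallelConfig(tensor_parallel_size=1, pipeline_parallel_size=3)
ToyWorker(cfg, rank=2)                 # last pipeline stage
print(layers_on_this_worker(10, cfg))  # 4  (stages hold 3, 3, 4 of 10 layers)
```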
