From 4a67fb97102919c42b8a34fe8006179e2b2137e6 Mon Sep 17 00:00:00 2001
From: "jiang1.li"
Date: Fri, 12 Jul 2024 06:49:53 +0000
Subject: [PATCH] Add IPEX Allreduce

---
 Dockerfile.cpu                                   | 5 ++---
 docs/source/getting_started/cpu-installation.rst | 2 --
 requirements-cpu.txt                             | 4 ++--
 vllm/distributed/parallel_state.py               | 3 +++
 vllm/executor/cpu_executor.py                    | 4 ++++
 5 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index 10729049b8690..c13ebb6af6118 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -13,10 +13,9 @@ RUN pip install intel-openmp
 
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so:$LD_PRELOAD"
 
-
 RUN echo 'ulimit -c 0' >> ~/.bashrc
 
-RUN pip install https://intel-extension-for-pytorch.s3.amazonaws.com/ipex_dev/cpu/intel_extension_for_pytorch-2.3.100%2Bgit0eb3473-cp310-cp310-linux_x86_64.whl
+RUN pip install --proxy http://child-prc.intel.com:913 http://mlpc.intel.com/downloads/cpu/ipex-2.4/rc0/intel_extension_for_pytorch-2.4.0-cp310-cp310-manylinux2014_x86_64.whl
 
 RUN pip install --upgrade pip \
     && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
@@ -27,7 +26,7 @@ COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
-RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/test/cpu
 
 # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
 ARG VLLM_CPU_DISABLE_AVX512
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index 0230c9e31205a..a5fb9a5682b56 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -88,8 +88,6 @@ Intel Extension for PyTorch
 
 - `Intel Extension for PyTorch (IPEX) `_ extends PyTorch with up-to-date features optimizations for an extra performance boost on Intel hardware.
 
-- IPEX after the ``2.3.0`` can be enabled in the CPU backend by default if it is installed.
-
 .. _cpu_backend_performance_tips:
 
 Performance tips
diff --git a/requirements-cpu.txt b/requirements-cpu.txt
index 754070df21c0a..a8ce104d83290 100644
--- a/requirements-cpu.txt
+++ b/requirements-cpu.txt
@@ -2,6 +2,6 @@
 -r requirements-common.txt
 
 # Dependencies for x86_64 CPUs
-torch == 2.3.1+cpu; platform_machine != "ppc64le"
-torchvision == 0.18.1+cpu; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
+torch == 2.4.0; platform_machine != "ppc64le"
+torchvision; platform_machine != "ppc64le"  # required for the image processor of phi3v, this must be updated alongside torch
 triton >= 2.2.0 # FIXME(woosuk): This is a hack to avoid import error.
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 66ffe6e8a9fa9..438f501fd53df 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -288,6 +288,9 @@ def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
         pynccl_comm = self.pynccl_comm
         if (pynccl_comm is not None and not pynccl_comm.disabled):
             pynccl_comm.all_reduce(input_)
+        elif input_.is_cpu:
+            import intel_extension_for_pytorch as ipex
+            ipex.distributed.all_reduce(input_, group=self.device_group)
         else:
             torch.distributed.all_reduce(input_, group=self.device_group)
         return input_
diff --git a/vllm/executor/cpu_executor.py b/vllm/executor/cpu_executor.py
index 15fd91505d489..819d39379f7f8 100644
--- a/vllm/executor/cpu_executor.py
+++ b/vllm/executor/cpu_executor.py
@@ -48,6 +48,10 @@ def _init_executor(self) -> None:
         os.environ['KMP_PLAIN_BARRIER_PATTERN'] = "dist,dist"
         os.environ['KMP_REDUCTION_BARRIER_PATTERN'] = "dist,dist"
 
+        # To hint IPEX uses shared memory based AllReduce
+        os.environ["LOCAL_WORLD_SIZE"] = str(
+            self.parallel_config.tensor_parallel_size)
+
         self.model_config = _verify_and_get_model_config(self.model_config)
         self.cache_config = _verify_and_get_cache_config(self.cache_config)
         self.scheduler_config = _verify_and_get_scheduler_config(
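
Below is a minimal, hypothetical standalone sketch of the CPU all-reduce path this patch introduces; it mirrors the new elif branch in GroupCoordinator.all_reduce and the LOCAL_WORLD_SIZE hint set in CPUExecutor._init_executor. The gloo process-group initialization, the torchrun-style launch, and the cpu_all_reduce helper name are illustrative assumptions only and are not part of the patch.

# Hypothetical sketch (not part of the patch): dispatch CPU tensors to
# ipex.distributed.all_reduce and use LOCAL_WORLD_SIZE as the shared-memory
# AllReduce hint, mirroring the two code changes above.
import os

import torch
import torch.distributed as dist


def cpu_all_reduce(input_: torch.Tensor, group=None) -> torch.Tensor:
    """Route CPU tensors through ipex.distributed, others through torch.distributed."""
    if input_.is_cpu:
        # Lazy import, as in the patch, so non-CPU deployments do not need IPEX.
        import intel_extension_for_pytorch as ipex
        ipex.distributed.all_reduce(input_, group=group)
    else:
        dist.all_reduce(input_, group=group)
    return input_


if __name__ == "__main__":
    # Assumption: launched with torchrun, so RANK and WORLD_SIZE are set.
    # Setting LOCAL_WORLD_SIZE mirrors what CPUExecutor._init_executor now
    # does to hint IPEX toward its shared-memory based AllReduce.
    os.environ.setdefault("LOCAL_WORLD_SIZE", os.environ.get("WORLD_SIZE", "1"))
    dist.init_process_group(backend="gloo")
    t = torch.ones(4)
    cpu_all_reduce(t)
    print(t)  # every element should equal the world size

Under these assumptions the sketch could be run with, for example, torchrun --nproc-per-node=2 on a single CPU host, where each rank would print a tensor of twos after the reduction.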