diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index af438f7d5820c..e83bce4283555 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -132,7 +132,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
-        torch.cuda.synchronize()
         free_gpu_memory, _ = torch.cuda.mem_get_info()
 
         # NOTE(woosuk): Here we assume that the other processes using the same
diff --git a/vllm/worker/worker.py b/vllm/worker/worker.py
index 0f12549e3f3fd..a3e377ef2b19d 100644
--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -200,7 +200,6 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
                 weights_memory_in_bytes=self.model_runner.
                 model_memory_usage) as result:
             self.model_runner.profile_run()
-            torch.cuda.synchronize()
 
         self._assert_memory_footprint_increased_during_profiling()