From d919e450e58834099b1820570f57af161928a8ae Mon Sep 17 00:00:00 2001 From: Adam Tilghman Date: Wed, 17 Apr 2024 21:56:26 -0700 Subject: [PATCH 1/2] [Bugfix] Fix pcie nvlink topology detection (#3974) --- vllm/distributed/device_communicators/custom_all_reduce.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index f83caef879da3..c4434ec77fa5b 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -145,8 +145,9 @@ def _is_full_nvlink(rank, world_size): for i in range(world_size): if i != rank: try: - link_state = pynvml.nvmlDeviceGetNvLinkState(handle, i) - if not link_state: + peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i) + p2p_status = pynvml.nvmlDeviceGetP2PStatus(handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + if p2p_status != pynvml.NVML_P2P_STATUS_OK: return False except pynvml.NVMLError as error: logger.info( From 78f10fd0f040a7a825c8dce2b60c90c2d8aac43b Mon Sep 17 00:00:00 2001 From: Adam Tilghman Date: Wed, 17 Apr 2024 22:53:42 -0700 Subject: [PATCH 2/2] formatting --- vllm/distributed/device_communicators/custom_all_reduce.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index c4434ec77fa5b..7602897d3dd8f 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -146,7 +146,8 @@ def _is_full_nvlink(rank, world_size): if i != rank: try: peer_handle = pynvml.nvmlDeviceGetHandleByIndex(i) - p2p_status = pynvml.nvmlDeviceGetP2PStatus(handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK) + p2p_status = pynvml.nvmlDeviceGetP2PStatus( + handle, peer_handle, pynvml.NVML_P2P_CAPS_INDEX_NVLINK) if p2p_status != pynvml.NVML_P2P_STATUS_OK: return False except pynvml.NVMLError as error: