Update CI - Bump vllm to v0.4.2 (#43)
pskiran1 authored May 29, 2024
1 parent 861a198 · commit f77614e
Showing 2 changed files with 7 additions and 9 deletions.
ci/L0_multi_gpu/vllm_backend/test.sh (2 changes: 0 additions & 2 deletions)
@@ -42,8 +42,6 @@ rm -rf models && mkdir -p models
 cp -r ${SAMPLE_MODELS_REPO}/vllm_model models/vllm_opt
 sed -i '3s/^/ "tensor_parallel_size": 2,\n/' models/vllm_opt/1/model.json
 
-python3 -m pip install --upgrade pip && pip3 install tritonclient[grpc] nvidia-ml-py3
-
 RET=0
 
 run_server
ci/L0_multi_gpu/vllm_backend/vllm_multi_gpu_test.py (14 changes: 7 additions & 7 deletions)
@@ -28,7 +28,7 @@
 import unittest
 from functools import partial
 
-import nvidia_smi
+import pynvml
 import tritonclient.grpc as grpcclient
 from tritonclient.utils import *
 
@@ -38,20 +38,20 @@
 
 class VLLMMultiGPUTest(TestResultCollector):
     def setUp(self):
-        nvidia_smi.nvmlInit()
+        pynvml.nvmlInit()
         self.triton_client = grpcclient.InferenceServerClient(url="localhost:8001")
         self.vllm_model_name = "vllm_opt"
 
     def get_gpu_memory_utilization(self, gpu_id):
-        handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
-        info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
+        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
+        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
         return info.used
 
     def get_available_gpu_ids(self):
-        device_count = nvidia_smi.nvmlDeviceGetCount()
+        device_count = pynvml.nvmlDeviceGetCount()
         available_gpus = []
         for gpu_id in range(device_count):
-            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(gpu_id)
+            handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
             if handle:
                 available_gpus.append(gpu_id)
         return available_gpus
@@ -119,7 +119,7 @@ def _test_vllm_model(self, send_parameters_as_tensor=True):
         self.triton_client.stop_stream()
 
     def tearDown(self):
-        nvidia_smi.nvmlShutdown()
+        pynvml.nvmlShutdown()
         self.triton_client.close()
 
 
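
For context, this commit swaps the nvidia_smi module (from the nvidia-ml-py3 package, whose explicit install is also dropped from test.sh above) for pynvml. Below is a minimal standalone sketch, not part of the commit, of the NVML calls the updated test relies on; it assumes a machine with an NVIDIA driver and a pynvml-providing package installed:

import pynvml

# Initialize NVML before making any other calls.
pynvml.nvmlInit()
try:
    # Enumerate GPUs and report per-device memory usage, mirroring
    # get_available_gpu_ids() and get_gpu_memory_utilization() in the test.
    for gpu_id in range(pynvml.nvmlDeviceGetCount()):
        handle = pynvml.nvmlDeviceGetHandleByIndex(gpu_id)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        print(f"GPU {gpu_id}: {info.used / 1024**2:.0f} MiB used of {info.total / 1024**2:.0f} MiB")
finally:
    # Always release NVML resources, as tearDown() does.
    pynvml.nvmlShutdown()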
