Validate CUDA SHM region registration size #7178

Merged
13 commits merged on May 7, 2024
313 changes: 161 additions & 152 deletions qa/L0_cuda_shared_memory/cuda_shared_memory_test.py

Large diffs are not rendered by default.
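
Since the full diff for cuda_shared_memory_test.py is not rendered, the sketch below shows the general shape of the out-of-bound registration check that the new test_register_out_of_bound case (added to test.sh below) exercises. The region name, byte sizes, endpoint, and error handling are illustrative assumptions, not the PR's exact code.

```python
# Hypothetical sketch: register a CUDA shared memory region with a byte size
# larger than the underlying allocation and expect the server to reject it.
import tritonclient.http as httpclient
import tritonclient.utils.cuda_shared_memory as cudashm
from tritonclient.utils import InferenceServerException

client = httpclient.InferenceServerClient("localhost:8000")

# Allocate a 64-byte CUDA shared memory region on device 0.
shm_handle = cudashm.create_shared_memory_region("input_oob", 64, 0)

try:
    # Request registration of 128 bytes against the 64-byte allocation;
    # with this change the server should reject the oversized registration.
    client.register_cuda_shared_memory(
        "input_oob", cudashm.get_raw_handle(shm_handle), 0, 128
    )
except InferenceServerException as ex:
    print("registration rejected as expected:", ex)
finally:
    cudashm.destroy_shared_memory_region(shm_handle)
```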

5 changes: 3 additions & 2 deletions qa/L0_cuda_shared_memory/test.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright 2019-2024, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -47,7 +47,8 @@ for i in \
test_register_after_inference \
test_too_big_shm \
test_mixed_raw_shm \
test_unregisterall; do
test_unregisterall \
test_register_out_of_bound; do
for client_type in http grpc; do
SERVER_ARGS="--model-repository=`pwd`/models --log-verbose=1"
SERVER_LOG="./$i.$client_type.server.log"
4 changes: 2 additions & 2 deletions qa/L0_grpc/python_grpc_aio_test.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python
# Copyright 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -113,7 +113,7 @@ async def test_get_cuda_shared_memory_status(self):
async def test_register_cuda_shared_memory(self):
with self.assertRaisesRegex(
InferenceServerException,
"\[StatusCode\.INVALID_ARGUMENT\] failed to register CUDA shared memory region '': failed to open CUDA IPC handle: invalid argument",
"failed to register shared memory region.*invalid args",
):
await self._triton_client.register_cuda_shared_memory("", b"", 0, 0)

127 changes: 49 additions & 78 deletions qa/L0_shared_memory/shared_memory_test.py
@@ -33,6 +33,7 @@
import os
import unittest

import infer_util as iu
import numpy as np
import test_util as tu
import tritonclient.grpc as grpcclient
@@ -186,84 +187,20 @@ def _cleanup_server(self, shm_handles):
for shm_handle in shm_handles:
shm.destroy_shared_memory_region(shm_handle)

def _basic_inference(
self,
shm_ip0_handle,
shm_ip1_handle,
shm_op0_handle,
shm_op1_handle,
error_msg,
big_shm_name="",
big_shm_size=DEFAULT_SHM_BYTE_SIZE,
shm_output_offset=0,
shm_output_byte_size=DEFAULT_SHM_BYTE_SIZE,
default_shm_byte_size=DEFAULT_SHM_BYTE_SIZE,
):
input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)
inputs = []
outputs = []
if self.protocol == "http":
inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
outputs.append(
httpclient.InferRequestedOutput("OUTPUT1", binary_data=False)
)
else:
inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))

inputs[0].set_shared_memory("input0_data", default_shm_byte_size)

if type(shm_ip1_handle) == np.array:
inputs[1].set_data_from_numpy(input0_data, binary_data=True)
elif big_shm_name != "":
inputs[1].set_shared_memory(big_shm_name, big_shm_size)
else:
inputs[1].set_shared_memory("input1_data", default_shm_byte_size)

outputs[0].set_shared_memory(
"output0_data", shm_output_byte_size, offset=shm_output_offset
)
outputs[1].set_shared_memory(
"output1_data", shm_output_byte_size, offset=shm_output_offset
)

try:
results = self.triton_client.infer(
"simple", inputs, model_version="", outputs=outputs
)
output = results.get_output("OUTPUT0")
if self.protocol == "http":
output_datatype = output["datatype"]
output_shape = output["shape"]
else:
output_datatype = output.datatype
output_shape = output.shape
output_dtype = utils.triton_to_np_dtype(output_datatype)
output_data = shm.get_contents_as_numpy(
shm_op0_handle, output_dtype, output_shape
)
self.assertTrue(
(output_data[0] == (input0_data + input1_data)).all(),
"Model output does not match expected output",
)
except Exception as ex:
error_msg.append(str(ex))

def test_unregister_after_inference(self):
# Unregister after inference
error_msg = []
shm_handles = self._configure_server()
self._basic_inference(
iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
shm_handles[1],
shm_handles[2],
shm_handles[3],
error_msg,
protocol=self.protocol,
use_system_shared_memory=True,
)
if len(error_msg) > 0:
raise Exception(str(error_msg))
@@ -279,9 +216,19 @@ def test_register_after_inference(self):
# Register after inference
error_msg = []
shm_handles = self._configure_server()
self._basic_inference(
shm_handles[0], shm_handles[1], shm_handles[2], shm_handles[3], error_msg

iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
shm_handles[1],
shm_handles[2],
shm_handles[3],
error_msg,
protocol=self.protocol,
use_system_shared_memory=True,
)

if len(error_msg) > 0:
raise Exception(str(error_msg))
shm_ip2_handle = shm.create_shared_memory_region(
@@ -308,14 +255,19 @@ def test_too_big_shm(self):
self.triton_client.register_system_shared_memory(
"input2_data", "/input2_data", 128
)
self._basic_inference(

iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
shm_ip2_handle,
shm_handles[2],
shm_handles[3],
error_msg,
"input2_data",
128,
big_shm_name="input2_data",
big_shm_size=128,
protocol=self.protocol,
use_system_shared_memory=True,
)
if len(error_msg) > 0:
self.assertTrue(
@@ -330,8 +282,17 @@ def test_mixed_raw_shm(self):
error_msg = []
shm_handles = self._configure_server()
input1_data = np.ones(shape=16, dtype=np.int32)
self._basic_inference(
shm_handles[0], [input1_data], shm_handles[2], shm_handles[3], error_msg

iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
[input1_data],
shm_handles[2],
shm_handles[3],
error_msg,
protocol=self.protocol,
use_system_shared_memory=True,
)
if len(error_msg) > 0:
raise Exception(error_msg[-1])
@@ -365,14 +326,20 @@ def test_infer_offset_out_of_bound(self):
# gRPC will throw an error if the offset is > 2**63 - 1, so instead test for
# exceeding the shm region size by 1 byte, given that its size is 64 bytes
offset = 64
self._basic_inference(

iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
shm_handles[1],
shm_handles[2],
shm_handles[3],
error_msg,
shm_output_offset=offset,
protocol=self.protocol,
use_system_shared_memory=True,
)

self.assertEqual(len(error_msg), 1)
self.assertIn("Invalid offset for shared memory region", error_msg[0])
self._cleanup_server(shm_handles)
@@ -384,14 +351,18 @@ def test_infer_byte_size_out_of_bound(self):
offset = 60
byte_size = self.DEFAULT_SHM_BYTE_SIZE

self._basic_inference(
iu.shm_basic_infer(
self,
self.triton_client,
shm_handles[0],
shm_handles[1],
shm_handles[2],
shm_handles[3],
error_msg,
shm_output_offset=offset,
shm_output_byte_size=byte_size,
protocol=self.protocol,
use_system_shared_memory=True,
)
self.assertEqual(len(error_msg), 1)
self.assertIn(
89 changes: 88 additions & 1 deletion qa/common/infer_util.py
@@ -1,6 +1,6 @@
#!/usr/bin/env python3

# Copyright 2018-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Copyright 2018-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
@@ -1306,3 +1306,90 @@ def infer_zero(
shm.destroy_shared_memory_region(shm_op_handles[io_num])

return results


# Perform basic inference for shared memory tests
def shm_basic_infer(
tester,
triton_client,
shm_ip0_handle,
shm_ip1_handle,
shm_op0_handle,
shm_op1_handle,
error_msg,
big_shm_name="",
big_shm_size=64,
default_shm_byte_size=64,
shm_output_offset=0,
shm_output_byte_size=64,
protocol="http",
use_system_shared_memory=False,
use_cuda_shared_memory=False,
):
# Import the shared memory utilities lazily so only the requested backend is needed
if use_system_shared_memory:
import tritonclient.utils.shared_memory as shm
elif use_cuda_shared_memory:
import tritonclient.utils.cuda_shared_memory as cudashm
else:
raise Exception("No shared memory type specified")

input0_data = np.arange(start=0, stop=16, dtype=np.int32)
input1_data = np.ones(shape=16, dtype=np.int32)
inputs = []
outputs = []
if protocol == "http":
inputs.append(httpclient.InferInput("INPUT0", [1, 16], "INT32"))
inputs.append(httpclient.InferInput("INPUT1", [1, 16], "INT32"))
outputs.append(httpclient.InferRequestedOutput("OUTPUT0", binary_data=True))
outputs.append(httpclient.InferRequestedOutput("OUTPUT1", binary_data=False))
else:
inputs.append(grpcclient.InferInput("INPUT0", [1, 16], "INT32"))
inputs.append(grpcclient.InferInput("INPUT1", [1, 16], "INT32"))
outputs.append(grpcclient.InferRequestedOutput("OUTPUT0"))
outputs.append(grpcclient.InferRequestedOutput("OUTPUT1"))

inputs[0].set_shared_memory("input0_data", default_shm_byte_size)

if type(shm_ip1_handle) == np.array:
inputs[1].set_data_from_numpy(input0_data, binary_data=True)
elif big_shm_name != "":
inputs[1].set_shared_memory(big_shm_name, big_shm_size)
else:
inputs[1].set_shared_memory("input1_data", default_shm_byte_size)

outputs[0].set_shared_memory(
"output0_data", shm_output_byte_size, offset=shm_output_offset
)
outputs[1].set_shared_memory(
"output1_data", shm_output_byte_size, offset=shm_output_offset
)

try:
results = triton_client.infer(
"simple", inputs, model_version="", outputs=outputs
)
output = results.get_output("OUTPUT0")
if protocol == "http":
output_datatype = output["datatype"]
output_shape = output["shape"]
else:
output_datatype = output.datatype
output_shape = output.shape
output_dtype = triton_to_np_dtype(output_datatype)

if use_system_shared_memory:
output_data = shm.get_contents_as_numpy(
shm_op0_handle, output_dtype, output_shape
)
elif use_cuda_shared_memory:
output_data = cudashm.get_contents_as_numpy(
shm_op0_handle, output_dtype, output_shape
)

tester.assertTrue(
(output_data[0] == (input0_data + input1_data)).all(),
"Model output does not match expected output",
)
except Exception as ex:
error_msg.append(str(ex))
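
The CUDA shared memory tests (whose diff is not rendered above) can drive the same helper by passing use_cuda_shared_memory=True. Below is a minimal usage sketch, assuming 64-byte regions for the "simple" model and a gRPC endpoint on localhost:8001; the class name and wiring are illustrative, not the PR's exact test code.

```python
# Hypothetical usage of iu.shm_basic_infer from a CUDA shared-memory test.
import unittest

import numpy as np

import infer_util as iu
import tritonclient.grpc as grpcclient
import tritonclient.utils.cuda_shared_memory as cudashm


class CudaShmBasicInfer(unittest.TestCase):
    def test_basic_infer(self):
        client = grpcclient.InferenceServerClient("localhost:8001")

        # Create and register four 64-byte CUDA regions on device 0:
        # two inputs and two outputs for the "simple" model.
        handles = []
        for name in ("input0_data", "input1_data", "output0_data", "output1_data"):
            handle = cudashm.create_shared_memory_region(name, 64, 0)
            client.register_cuda_shared_memory(
                name, cudashm.get_raw_handle(handle), 0, 64
            )
            handles.append(handle)

        # Populate the input regions with the values the helper checks against.
        cudashm.set_shared_memory_region(handles[0], [np.arange(16, dtype=np.int32)])
        cudashm.set_shared_memory_region(handles[1], [np.ones(16, dtype=np.int32)])

        error_msg = []
        iu.shm_basic_infer(
            self,
            client,
            handles[0],
            handles[1],
            handles[2],
            handles[3],
            error_msg,
            protocol="grpc",
            use_cuda_shared_memory=True,
        )
        self.assertEqual(error_msg, [])
```
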
1 change: 1 addition & 0 deletions src/CMakeLists.txt
@@ -180,6 +180,7 @@ if(${TRITON_ENABLE_GPU})
main
PRIVATE
CUDA::cudart
-lcuda
)
endif() # TRITON_ENABLE_GPU
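
The single CMakeLists.txt change links the main executable against the CUDA driver library (-lcuda) in addition to the runtime (CUDA::cudart). A plausible reason, sketched below, is that querying how large the allocation behind a registered device pointer actually is relies on driver-API calls such as cuMemGetAddressRange, which live in libcuda rather than libcudart. This is an illustrative assumption about the mechanism, not the server's actual C++ implementation.

```python
# Illustrative sketch (via ctypes) of the driver-API size query that motivates
# linking -lcuda; assumes libcuda.so and libcudart.so are on the loader path.
import ctypes

libcuda = ctypes.CDLL("libcuda.so")      # CUDA driver API  (-lcuda)
libcudart = ctypes.CDLL("libcudart.so")  # CUDA runtime API (CUDA::cudart)

assert libcuda.cuInit(0) == 0

# Allocate a small device buffer with the runtime API.
dev_ptr = ctypes.c_void_p()
assert libcudart.cudaMalloc(ctypes.byref(dev_ptr), ctypes.c_size_t(64)) == 0

# Ask the driver API for the base address and size of the backing allocation.
base = ctypes.c_void_p()
size = ctypes.c_size_t()
assert libcuda.cuMemGetAddressRange(ctypes.byref(base), ctypes.byref(size), dev_ptr) == 0

# A registration request larger than the backing allocation can now be rejected.
requested_byte_size = 128
if requested_byte_size > size.value:
    print(f"requested {requested_byte_size} bytes exceeds the {size.value}-byte allocation")
```
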
