triton-inference-server · oandreeva-nv · Dec 4, 2024 · Nov 21, 2024 · Nov 25, 2024 · Nov 26, 2024
diff --git a/src/test/sequence/CMakeLists.txt b/src/test/sequence/CMakeLists.txt
@@ -1,4 +1,4 @@
-# Copyright 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2021-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -43,7 +43,7 @@ add_library(
   TritonSequenceBackend::triton-sequence-backend ALIAS triton-sequence-backend
 )
 
-target_compile_features(triton-sequence-backend PRIVATE cxx_std_11)
+target_compile_features(triton-sequence-backend PRIVATE cxx_std_17)
 target_compile_options(
   triton-sequence-backend PRIVATE
   $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:

diff --git a/src/test/sequence/src/sequence.cc b/src/test/sequence/src/sequence.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2021-2024, NVIDIA CORPORATION. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -847,9 +847,15 @@ TRITONBACKEND_ModelInstanceExecute(
     if (input_memory_type == TRITONSERVER_MEMORY_GPU) {
       ipbuffer_vec.resize(input_element_cnt);
       ipbuffer_int = ipbuffer_vec.data();
-      cudaMemcpy(
-          const_cast<int32_t*>(ipbuffer_int), input_buffer, input_byte_size,
-          cudaMemcpyDeviceToHost);
+      LOG_IF_CUDA_ERROR(
+          cudaMemcpyAsync(
+              const_cast<int32_t*>(ipbuffer_int), input_buffer, input_byte_size,
+              cudaMemcpyDeviceToHost, instance_state->CudaStream()),
+          "failed to copy buffer from Device to Host");
+
+      LOG_IF_CUDA_ERROR(
+          cudaStreamSynchronize(instance_state->CudaStream()),
+          "failed to perform synchronization on cuda stream");
     } else {
       ipbuffer_int = reinterpret_cast<const int32_t*>(input_buffer);
     }
@@ -939,9 +945,15 @@ TRITONBACKEND_ModelInstanceExecute(
         }
 
         if (output_memory_type == TRITONSERVER_MEMORY_GPU) {
-          cudaMemcpy(
-              output_buffer, const_cast<int32_t*>(obuffer_int),
-              buffer_byte_size, cudaMemcpyHostToDevice);
+          LOG_IF_CUDA_ERROR(
+              cudaMemcpyAsync(
+                  output_buffer, const_cast<int32_t*>(obuffer_int),
+                  buffer_byte_size, cudaMemcpyHostToDevice,
+                  instance_state->CudaStream()),
+              "failed to copy buffer from Host to Device");
+          LOG_IF_CUDA_ERROR(
+              cudaStreamSynchronize(instance_state->CudaStream()),
+              "failed to perform synchronization on cuda stream");
         }
       }
     }