[Backend] TRT backend & PP-Infer backend support pinned memory #403

Merged
10 commits merged on Oct 21, 2022
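For context, the user-facing switch added by this PR is RuntimeOption::EnableTrtPinnedMemory() (see the runtime.h / runtime.cc hunks below). A rough usage sketch follows; the surrounding setup calls (UseGpu, UseTrtBackend, Runtime::Init, model loading) are assumed from the existing FastDeploy API and are not part of this diff:

```cpp
#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Model files would be configured here via the existing model-loading API (omitted).
  option.UseGpu(0);                // assumed existing API: run on GPU 0
  option.UseTrtBackend();          // assumed existing API: select the TensorRT backend
  option.EnableTrtPinnedMemory();  // added in this PR: host-side output buffers are
                                   // allocated with cudaMallocHost (pinned memory)

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {     // assumed existing API
    return -1;
  }
  return 0;
}
```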
29 changes: 19 additions & 10 deletions fastdeploy/backends/tensorrt/trt_backend.cc
@@ -306,17 +306,21 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,

SetInputs(inputs);
AllocateOutputsBuffer(outputs);

if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
FDERROR << "Failed to Infer with TensorRT." << std::endl;
return false;
}
for (size_t i = 0; i < outputs->size(); ++i) {
FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
outputs_buffer_[(*outputs)[i].name].data(),
outputs_device_buffer_[(*outputs)[i].name].data(),
(*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
stream_) == 0,
"[ERROR] Error occurs while copy memory from GPU to CPU.");
}
FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
"[ERROR] Error occurs while sync cuda stream.");

return true;
}

@@ -332,10 +336,10 @@ void TrtBackend::GetInputOutputInfo() {
auto dtype = engine_->getBindingDataType(i);
if (engine_->bindingIsInput(i)) {
inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
inputs_buffer_[name] = FDDeviceBuffer(dtype);
inputs_device_buffer_[name] = FDDeviceBuffer(dtype);
} else {
outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
outputs_buffer_[name] = FDDeviceBuffer(dtype);
outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
}
}
bindings_.resize(num_binds);
@@ -357,30 +361,31 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
"please use INT32 input");
} else {
// no copy
inputs_buffer_[item.name].SetExternalData(dims, item.Data());
inputs_device_buffer_[item.name].SetExternalData(dims, item.Data());
}
} else {
// Allocate input buffer memory
inputs_buffer_[item.name].resize(dims);
inputs_device_buffer_[item.name].resize(dims);

// copy from cpu to gpu
if (item.dtype == FDDataType::INT64) {
int64_t* data = static_cast<int64_t*>(const_cast<void*>(item.Data()));
std::vector<int32_t> casted_data(data, data + item.Numel());
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(),
FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
static_cast<void*>(casted_data.data()),
item.Nbytes() / 2, cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
} else {
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(), item.Data(),
FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
item.Data(),
item.Nbytes(), cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
}
}
// binding input buffer
bindings_[idx] = inputs_buffer_[item.name].data();
bindings_[idx] = inputs_device_buffer_[item.name].data();
}
}

@@ -399,15 +404,19 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
"Cannot find output: %s of tensorrt network from the original model.",
outputs_desc_[i].name.c_str());
auto ori_idx = iter->second;

// set user's outputs info
std::vector<int64_t> shape(output_dims.d,
output_dims.d + output_dims.nbDims);
(*outputs)[ori_idx].is_pinned_memory = option_.enable_pinned_memory;
(*outputs)[ori_idx].Resize(shape, GetFDDataType(outputs_desc_[i].dtype),
outputs_desc_[i].name);

// Allocate output buffer memory
outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);

// binding output buffer
bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data();
}
}

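Why pinned memory matters for the copies above: cudaMemcpyAsync can only overlap with other work when the host buffer is page-locked; with ordinary pageable memory the runtime stages the transfer and the call behaves more or less synchronously. A standalone sketch of the pattern used in Infer() (illustrative, not FastDeploy code):

```cpp
#include <cuda_runtime.h>

int main() {
  const size_t nbytes = 1 << 20;
  void* device_buf = nullptr;
  void* pinned_host = nullptr;
  cudaStream_t stream;

  cudaStreamCreate(&stream);
  cudaMalloc(&device_buf, nbytes);
  // cudaMallocHost returns page-locked memory, so the copy below can run as a
  // true asynchronous DMA on `stream`; with pageable malloc() memory the
  // runtime falls back to a staged, effectively synchronous copy.
  cudaMallocHost(&pinned_host, nbytes);

  cudaMemcpyAsync(pinned_host, device_buf, nbytes, cudaMemcpyDeviceToHost, stream);
  // The async copy is only guaranteed to be complete after the stream is
  // synchronized, which is why Infer() calls cudaStreamSynchronize(stream_)
  // after queuing the device-to-host copies.
  cudaStreamSynchronize(stream);

  cudaFreeHost(pinned_host);
  cudaFree(device_buf);
  cudaStreamDestroy(stream);
  return 0;
}
```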
5 changes: 3 additions & 2 deletions fastdeploy/backends/tensorrt/trt_backend.h
@@ -70,6 +70,7 @@ struct TrtBackendOption {
std::map<std::string, std::vector<int32_t>> min_shape;
std::map<std::string, std::vector<int32_t>> opt_shape;
std::string serialize_file = "";
bool enable_pinned_memory = false;

// inside parameter, maybe remove next version
bool remove_multiclass_nms_ = false;
@@ -118,8 +119,8 @@ class TrtBackend : public BaseBackend {
std::vector<void*> bindings_;
std::vector<TrtValueInfo> inputs_desc_;
std::vector<TrtValueInfo> outputs_desc_;
std::map<std::string, FDDeviceBuffer> inputs_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_buffer_;
std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;

std::string calibration_str_;

2 changes: 2 additions & 0 deletions fastdeploy/backends/tensorrt/utils.h
@@ -206,6 +206,8 @@ class FDGenericBuffer {
};

using FDDeviceBuffer = FDGenericBuffer<FDDeviceAllocator, FDDeviceFree>;
using FDDeviceHostBuffer = FDGenericBuffer<FDDeviceHostAllocator,
FDDeviceHostFree>;

class FDTrtLogger : public nvinfer1::ILogger {
public:
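The new alias pairs the pinned-memory allocator and deleter the same way FDDeviceBuffer pairs the device-side ones. As a reading aid, below is a stripped-down sketch of how a generic buffer can compose two such functors through template parameters; the real FDGenericBuffer interface (resize with TensorRT dims, SetExternalData, etc.) is richer than this:

```cpp
#include <cstddef>

// Illustrative only, not the FastDeploy implementation. AllocFunc is expected
// to return bool and take (void**, size_t); FreeFunc takes (void*), matching
// the functor contract used by FDDeviceHostAllocator / FDDeviceHostFree.
template <typename AllocFunc, typename FreeFunc>
class GenericBuffer {
 public:
  GenericBuffer() = default;
  ~GenericBuffer() { FreeFunc()(ptr_); }

  // Grow-only resize: keeps the current block if it is already large enough.
  bool Resize(size_t nbytes) {
    if (nbytes <= capacity_) return true;
    FreeFunc()(ptr_);
    if (!AllocFunc()(&ptr_, nbytes)) {
      ptr_ = nullptr;
      capacity_ = 0;
      return false;
    }
    capacity_ = nbytes;
    return true;
  }

  void* data() const { return ptr_; }

 private:
  void* ptr_ = nullptr;
  size_t capacity_ = 0;
};
```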
6 changes: 6 additions & 0 deletions fastdeploy/core/allocate.cc
@@ -34,6 +34,12 @@ bool FDDeviceAllocator::operator()(void** ptr, size_t size) const {

void FDDeviceFree::operator()(void* ptr) const { cudaFree(ptr); }

bool FDDeviceHostAllocator::operator()(void** ptr, size_t size) const {
return cudaMallocHost(ptr, size) == cudaSuccess;
}

void FDDeviceHostFree::operator()(void* ptr) const { cudaFreeHost(ptr); }

#endif

} // namespace fastdeploy
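A minimal sketch of the contract these two functors implement, assuming a -DWITH_GPU=ON build (they are only compiled inside the WITH_GPU block); the include path is taken from the file header above, and the helper names are hypothetical:

```cpp
#include <cstddef>

#include "fastdeploy/core/allocate.h"

// FDDeviceHostAllocator reports failure by returning false rather than
// throwing, so callers check the result; FDDeviceHostFree releases the block
// with cudaFreeHost.
void* AllocPinned(size_t nbytes) {
  void* ptr = nullptr;
  if (!fastdeploy::FDDeviceHostAllocator()(&ptr, nbytes)) {
    return nullptr;  // cudaMallocHost failed (e.g. pinned memory exhausted)
  }
  return ptr;
}

void FreePinned(void* ptr) {
  if (ptr != nullptr) {
    fastdeploy::FDDeviceHostFree()(ptr);
  }
}
```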
10 changes: 10 additions & 0 deletions fastdeploy/core/allocate.h
@@ -45,6 +45,16 @@ class FASTDEPLOY_DECL FDDeviceFree {
void operator()(void* ptr) const;
};

class FASTDEPLOY_DECL FDDeviceHostAllocator {
public:
bool operator()(void** ptr, size_t size) const;
};

class FASTDEPLOY_DECL FDDeviceHostFree {
public:
void operator()(void* ptr) const;
};

#endif

} // namespace fastdeploy
45 changes: 40 additions & 5 deletions fastdeploy/core/fd_tensor.cc
@@ -207,9 +207,27 @@ bool FDTensor::ReallocFn(size_t nbytes) {
"-DWITH_GPU=ON,"
"so this is an unexpected problem happend.");
#endif
} else {
if (is_pinned_memory) {
#ifdef WITH_GPU
size_t original_nbytes = Nbytes();
if (nbytes > original_nbytes) {
if (buffer_ != nullptr) {
FDDeviceHostFree()(buffer_);
}
FDDeviceHostAllocator()(&buffer_, nbytes);
}
return buffer_ != nullptr;
#else
FDASSERT(false,
"The FastDeploy FDTensor allocator didn't compile under "
"-DWITH_GPU=ON,"
"so this is an unexpected problem happend.");
#endif
}
buffer_ = realloc(buffer_, nbytes);
return buffer_ != nullptr;
}
buffer_ = realloc(buffer_, nbytes);
return buffer_ != nullptr;
}
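Note the allocation strategy in the new pinned branch: it reallocates only when the requested size grows, and unlike realloc it does not carry the previous contents over into the new block. A condensed, illustrative version of that logic (not the FastDeploy implementation):

```cpp
#include <cstddef>

#include <cuda_runtime.h>

bool ReallocPinned(void** buffer, size_t current_nbytes, size_t requested_nbytes) {
  if (requested_nbytes > current_nbytes) {
    if (*buffer != nullptr) {
      cudaFreeHost(*buffer);  // release the old pinned block
    }
    // Note: unlike realloc, the old contents are NOT copied into the new block.
    cudaMallocHost(buffer, requested_nbytes);
  }
  return *buffer != nullptr;
}
```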

void FDTensor::FreeFn() {
@@ -220,7 +238,13 @@ void FDTensor::FreeFn() {
FDDeviceFree()(buffer_);
#endif
} else {
FDHostFree()(buffer_);
if (is_pinned_memory) {
#ifdef WITH_GPU
FDDeviceHostFree()(buffer_);
#endif
} else {
FDHostFree()(buffer_);
}
}
buffer_ = nullptr;
}
@@ -231,15 +255,26 @@ void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
"[ERROR] Error occurs while copy memory from GPU to GPU");

#else
FDASSERT(false,
"The FastDeploy didn't compile under -DWITH_GPU=ON, so copying "
"gpu buffer is "
"an unexpected problem happend.");
#endif
} else {
std::memcpy(dst, src, nbytes);
if (is_pinned_memory) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) == 0,
"[ERROR] Error occurs while copy memory from host to host");
#else
FDASSERT(false,
"The FastDeploy didn't compile under -DWITH_GPU=ON, so copying "
"gpu buffer is "
"an unexpected problem happend.");
#endif
} else {
std::memcpy(dst, src, nbytes);
}
}
}

4 changes: 4 additions & 0 deletions fastdeploy/core/fd_tensor.h
@@ -40,6 +40,10 @@ struct FASTDEPLOY_DECL FDTensor {
// so we can skip data transfer, which may improve the efficience
Device device = Device::CPU;

// Whether the data buffer is in pinned memory, which is allocated
// with cudaMallocHost()
bool is_pinned_memory = false;

// if the external data is not on CPU, we use this temporary buffer
// to transfer data to CPU at some cases we need to visit the
// other devices' data
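Callers opt into pinned storage per tensor by setting this flag before resizing, which is what TrtBackend::AllocateOutputsBuffer does earlier in this diff. A small sketch; the Resize signature and the name member are taken from the code above, FDDataType::FP32 is assumed from the library's dtype enum, and the helper itself is hypothetical:

```cpp
#include <cstdint>
#include <vector>

#include "fastdeploy/core/fd_tensor.h"

// Flag the tensor before Resize() so that ReallocFn() takes the
// cudaMallocHost path instead of plain realloc.
void AllocatePinnedOutput(fastdeploy::FDTensor* tensor,
                          const std::vector<int64_t>& shape) {
  tensor->is_pinned_memory = true;  // request page-locked host storage
  tensor->Resize(shape, fastdeploy::FDDataType::FP32, tensor->name);
}
```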
5 changes: 5 additions & 0 deletions fastdeploy/runtime.cc
@@ -356,6 +356,10 @@ void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }

void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }

void RuntimeOption::EnableTrtPinnedMemory() { trt_enable_pinned_memory = true; }

void RuntimeOption::DisableTrtPinnedMemory() { trt_enable_pinned_memory = false; }

void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
trt_serialize_file = cache_file_path;
}
@@ -606,6 +610,7 @@ void Runtime::CreateTrtBackend() {
trt_option.min_shape = option.trt_min_shape;
trt_option.opt_shape = option.trt_opt_shape;
trt_option.serialize_file = option.trt_serialize_file;
trt_option.enable_pinned_memory = option.trt_enable_pinned_memory;

// TODO(jiangjiajun): inside usage, maybe remove this later
trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
10 changes: 10 additions & 0 deletions fastdeploy/runtime.h
@@ -204,6 +204,15 @@ struct FASTDEPLOY_DECL RuntimeOption {
*/
void SetTrtCacheFile(const std::string& cache_file_path);

/**
* @brief Enable pinned memory in trt backend
*/
void EnableTrtPinnedMemory();

/**
* @brief Disable pinned memory in trt backend
*/
void DisableTrtPinnedMemory();

/**
* @brief Enable to collect shape in paddle trt backend
@@ -259,6 +268,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
bool trt_enable_int8 = false;
size_t trt_max_batch_size = 32;
size_t trt_max_workspace_size = 1 << 30;
bool trt_enable_pinned_memory = false;

// ======Only for Poros Backend=======
bool is_dynamic = false;