[Backend] TRT backend & PP-Infer backend support pinned memory #403

Merged
10 commits merged on Oct 21, 2022
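For context, the user-facing switch added by this PR is RuntimeOption::EnableTrtPinnedMemory() (see the runtime.h / runtime.cc hunks below). A rough usage sketch follows; the surrounding setup calls (UseGpu, UseTrtBackend, Runtime::Init, model loading) are assumed from the existing FastDeploy API and are not part of this diff:

```cpp
#include "fastdeploy/runtime.h"

int main() {
  fastdeploy::RuntimeOption option;
  // Model files would be configured here via the existing model-loading API (omitted).
  option.UseGpu(0);                // assumed existing API: run on GPU 0
  option.UseTrtBackend();          // assumed existing API: select the TensorRT backend
  option.EnableTrtPinnedMemory();  // added in this PR: host-side output buffers are
                                   // allocated with cudaMallocHost (pinned memory)

  fastdeploy::Runtime runtime;
  if (!runtime.Init(option)) {     // assumed existing API
    return -1;
  }
  return 0;
}
```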
29 changes: 19 additions & 10 deletions fastdeploy/backends/tensorrt/trt_backend.cc
@@ -306,17 +306,21 @@ bool TrtBackend::Infer(std::vector<FDTensor>& inputs,

SetInputs(inputs);
AllocateOutputsBuffer(outputs);

if (!context_->enqueueV2(bindings_.data(), stream_, nullptr)) {
FDERROR << "Failed to Infer with TensorRT." << std::endl;
return false;
}
for (size_t i = 0; i < outputs->size(); ++i) {
FDASSERT(cudaMemcpyAsync((*outputs)[i].Data(),
outputs_buffer_[(*outputs)[i].name].data(),
outputs_device_buffer_[(*outputs)[i].name].data(),
(*outputs)[i].Nbytes(), cudaMemcpyDeviceToHost,
stream_) == 0,
"[ERROR] Error occurs while copy memory from GPU to CPU.");
}
FDASSERT(cudaStreamSynchronize(stream_) == cudaSuccess,
"[ERROR] Error occurs while sync cuda stream.");

return true;
}

@@ -332,10 +336,10 @@ void TrtBackend::GetInputOutputInfo() {
auto dtype = engine_->getBindingDataType(i);
if (engine_->bindingIsInput(i)) {
inputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
inputs_buffer_[name] = FDDeviceBuffer(dtype);
inputs_device_buffer_[name] = FDDeviceBuffer(dtype);
} else {
outputs_desc_.emplace_back(TrtValueInfo{name, shape, dtype});
outputs_buffer_[name] = FDDeviceBuffer(dtype);
outputs_device_buffer_[name] = FDDeviceBuffer(dtype);
}
}
bindings_.resize(num_binds);
@@ -357,30 +361,31 @@ void TrtBackend::SetInputs(const std::vector<FDTensor>& inputs) {
"please use INT32 input");
} else {
// no copy
inputs_buffer_[item.name].SetExternalData(dims, item.Data());
inputs_device_buffer_[item.name].SetExternalData(dims, item.Data());
}
} else {
// Allocate input buffer memory
inputs_buffer_[item.name].resize(dims);
inputs_device_buffer_[item.name].resize(dims);

// copy from cpu to gpu
if (item.dtype == FDDataType::INT64) {
int64_t* data = static_cast<int64_t*>(const_cast<void*>(item.Data()));
std::vector<int32_t> casted_data(data, data + item.Numel());
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(),
FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
static_cast<void*>(casted_data.data()),
item.Nbytes() / 2, cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
} else {
FDASSERT(cudaMemcpyAsync(inputs_buffer_[item.name].data(), item.Data(),
FDASSERT(cudaMemcpyAsync(inputs_device_buffer_[item.name].data(),
item.Data(),
item.Nbytes(), cudaMemcpyHostToDevice,
stream_) == 0,
"Error occurs while copy memory from CPU to GPU.");
}
}
// binding input buffer
bindings_[idx] = inputs_buffer_[item.name].data();
bindings_[idx] = inputs_device_buffer_[item.name].data();
}
}

@@ -399,15 +404,19 @@ void TrtBackend::AllocateOutputsBuffer(std::vector<FDTensor>* outputs) {
"Cannot find output: %s of tensorrt network from the original model.",
outputs_desc_[i].name.c_str());
auto ori_idx = iter->second;

// set user's outputs info
std::vector<int64_t> shape(output_dims.d,
output_dims.d + output_dims.nbDims);
(*outputs)[ori_idx].is_pinned_memory = option_.enable_pinned_memory;
(*outputs)[ori_idx].Resize(shape, GetFDDataType(outputs_desc_[i].dtype),
outputs_desc_[i].name);

// Allocate output buffer memory
outputs_buffer_[outputs_desc_[i].name].resize(output_dims);
outputs_device_buffer_[outputs_desc_[i].name].resize(output_dims);

// binding output buffer
bindings_[idx] = outputs_buffer_[outputs_desc_[i].name].data();
bindings_[idx] = outputs_device_buffer_[outputs_desc_[i].name].data();
}
}

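Why pinned memory matters for the copies above: cudaMemcpyAsync can only overlap with other work when the host buffer is page-locked; with ordinary pageable memory the runtime stages the transfer and the call behaves more or less synchronously. A standalone sketch of the pattern used in Infer() (illustrative, not FastDeploy code):

```cpp
#include <cuda_runtime.h>

int main() {
  const size_t nbytes = 1 << 20;
  void* device_buf = nullptr;
  void* pinned_host = nullptr;
  cudaStream_t stream;

  cudaStreamCreate(&stream);
  cudaMalloc(&device_buf, nbytes);
  // cudaMallocHost returns page-locked memory, so the copy below can run as a
  // true asynchronous DMA on `stream`; with pageable malloc() memory the
  // runtime falls back to a staged, effectively synchronous copy.
  cudaMallocHost(&pinned_host, nbytes);

  cudaMemcpyAsync(pinned_host, device_buf, nbytes, cudaMemcpyDeviceToHost, stream);
  // The async copy is only guaranteed to be complete after the stream is
  // synchronized, which is why Infer() calls cudaStreamSynchronize(stream_)
  // after queuing the device-to-host copies.
  cudaStreamSynchronize(stream);

  cudaFreeHost(pinned_host);
  cudaFree(device_buf);
  cudaStreamDestroy(stream);
  return 0;
}
```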
5 changes: 3 additions & 2 deletions fastdeploy/backends/tensorrt/trt_backend.h
@@ -70,6 +70,7 @@ struct TrtBackendOption {
std::map<std::string, std::vector<int32_t>> min_shape;
std::map<std::string, std::vector<int32_t>> opt_shape;
std::string serialize_file = "";
bool enable_pinned_memory = false;

// inside parameter, maybe remove next version
bool remove_multiclass_nms_ = false;
@@ -118,8 +119,8 @@ class TrtBackend : public BaseBackend {
std::vector<void*> bindings_;
std::vector<TrtValueInfo> inputs_desc_;
std::vector<TrtValueInfo> outputs_desc_;
std::map<std::string, FDDeviceBuffer> inputs_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_buffer_;
std::map<std::string, FDDeviceBuffer> inputs_device_buffer_;
std::map<std::string, FDDeviceBuffer> outputs_device_buffer_;

std::string calibration_str_;

2 changes: 2 additions & 0 deletions fastdeploy/backends/tensorrt/utils.h
@@ -206,6 +206,8 @@ class FDGenericBuffer {
};

using FDDeviceBuffer = FDGenericBuffer<FDDeviceAllocator, FDDeviceFree>;
using FDDeviceHostBuffer = FDGenericBuffer<FDDeviceHostAllocator,
FDDeviceHostFree>;

class FDTrtLogger : public nvinfer1::ILogger {
public:
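The new alias pairs the pinned-memory allocator and deleter the same way FDDeviceBuffer pairs the device-side ones. As a reading aid, below is a stripped-down sketch of how a generic buffer can compose two such functors through template parameters; the real FDGenericBuffer interface (resize with TensorRT dims, SetExternalData, etc.) is richer than this:

```cpp
#include <cstddef>

// Illustrative only, not the FastDeploy implementation. AllocFunc is expected
// to return bool and take (void**, size_t); FreeFunc takes (void*), matching
// the functor contract used by FDDeviceHostAllocator / FDDeviceHostFree.
template <typename AllocFunc, typename FreeFunc>
class GenericBuffer {
 public:
  GenericBuffer() = default;
  ~GenericBuffer() { FreeFunc()(ptr_); }

  // Grow-only resize: keeps the current block if it is already large enough.
  bool Resize(size_t nbytes) {
    if (nbytes <= capacity_) return true;
    FreeFunc()(ptr_);
    if (!AllocFunc()(&ptr_, nbytes)) {
      ptr_ = nullptr;
      capacity_ = 0;
      return false;
    }
    capacity_ = nbytes;
    return true;
  }

  void* data() const { return ptr_; }

 private:
  void* ptr_ = nullptr;
  size_t capacity_ = 0;
};
```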
6 changes: 6 additions & 0 deletions fastdeploy/core/allocate.cc
@@ -34,6 +34,12 @@ bool FDDeviceAllocator::operator()(void** ptr, size_t size) const {

void FDDeviceFree::operator()(void* ptr) const { cudaFree(ptr); }

bool FDDeviceHostAllocator::operator()(void** ptr, size_t size) const {
return cudaMallocHost(ptr, size) == cudaSuccess;
}

void FDDeviceHostFree::operator()(void* ptr) const { cudaFreeHost(ptr); }

#endif

} // namespace fastdeploy
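A minimal sketch of the contract these two functors implement, assuming a -DWITH_GPU=ON build (they are only compiled inside the WITH_GPU block); the include path is taken from the file header above, and the helper names are hypothetical:

```cpp
#include <cstddef>

#include "fastdeploy/core/allocate.h"

// FDDeviceHostAllocator reports failure by returning false rather than
// throwing, so callers check the result; FDDeviceHostFree releases the block
// with cudaFreeHost.
void* AllocPinned(size_t nbytes) {
  void* ptr = nullptr;
  if (!fastdeploy::FDDeviceHostAllocator()(&ptr, nbytes)) {
    return nullptr;  // cudaMallocHost failed (e.g. pinned memory exhausted)
  }
  return ptr;
}

void FreePinned(void* ptr) {
  if (ptr != nullptr) {
    fastdeploy::FDDeviceHostFree()(ptr);
  }
}
```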
10 changes: 10 additions & 0 deletions fastdeploy/core/allocate.h
@@ -45,6 +45,16 @@ class FASTDEPLOY_DECL FDDeviceFree {
void operator()(void* ptr) const;
};

class FASTDEPLOY_DECL FDDeviceHostAllocator {
public:
bool operator()(void** ptr, size_t size) const;
};

class FASTDEPLOY_DECL FDDeviceHostFree {
public:
void operator()(void* ptr) const;
};

#endif

} // namespace fastdeploy
45 changes: 40 additions & 5 deletions fastdeploy/core/fd_tensor.cc
@@ -207,9 +207,27 @@ bool FDTensor::ReallocFn(size_t nbytes) {
"-DWITH_GPU=ON,"
"so this is an unexpected problem happend.");
#endif
} else {
if (is_pinned_memory) {
#ifdef WITH_GPU
size_t original_nbytes = Nbytes();
if (nbytes > original_nbytes) {
if (buffer_ != nullptr) {
FDDeviceHostFree()(buffer_);
}
FDDeviceHostAllocator()(&buffer_, nbytes);
}
return buffer_ != nullptr;
#else
FDASSERT(false,
"The FastDeploy FDTensor allocator didn't compile under "
"-DWITH_GPU=ON,"
"so this is an unexpected problem happend.");
#endif
}
buffer_ = realloc(buffer_, nbytes);
return buffer_ != nullptr;
}
buffer_ = realloc(buffer_, nbytes);
return buffer_ != nullptr;
}
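Note the allocation strategy in the new pinned branch: it reallocates only when the requested size grows, and unlike realloc it does not carry the previous contents over into the new block. A condensed, illustrative version of that logic (not the FastDeploy implementation):

```cpp
#include <cstddef>

#include <cuda_runtime.h>

bool ReallocPinned(void** buffer, size_t current_nbytes, size_t requested_nbytes) {
  if (requested_nbytes > current_nbytes) {
    if (*buffer != nullptr) {
      cudaFreeHost(*buffer);  // release the old pinned block
    }
    // Note: unlike realloc, the old contents are NOT copied into the new block.
    cudaMallocHost(buffer, requested_nbytes);
  }
  return *buffer != nullptr;
}
```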

void FDTensor::FreeFn() {
@@ -220,7 +238,13 @@ void FDTensor::FreeFn() {
FDDeviceFree()(buffer_);
#endif
} else {
FDHostFree()(buffer_);
if (is_pinned_memory) {
#ifdef WITH_GPU
FDDeviceHostFree()(buffer_);
#endif
} else {
FDHostFree()(buffer_);
}
}
buffer_ = nullptr;
}
@@ -231,15 +255,26 @@ void FDTensor::CopyBuffer(void* dst, const void* src, size_t nbytes) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToDevice) == 0,
"[ERROR] Error occurs while copy memory from GPU to GPU");

#else
FDASSERT(false,
"The FastDeploy didn't compile under -DWITH_GPU=ON, so copying "
"gpu buffer is "
"an unexpected problem happend.");
#endif
} else {
std::memcpy(dst, src, nbytes);
if (is_pinned_memory) {
#ifdef WITH_GPU
FDASSERT(cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToHost) == 0,
"[ERROR] Error occurs while copy memory from host to host");
#else
FDASSERT(false,
"The FastDeploy didn't compile under -DWITH_GPU=ON, so copying "
"gpu buffer is "
"an unexpected problem happend.");
#endif
} else {
std::memcpy(dst, src, nbytes);
}
}
}

4 changes: 4 additions & 0 deletions fastdeploy/core/fd_tensor.h
@@ -40,6 +40,10 @@ struct FASTDEPLOY_DECL FDTensor {
// so we can skip data transfer, which may improve the efficience
Device device = Device::CPU;

// Whether the data buffer is in pinned memory, which is allocated
// with cudaMallocHost()
bool is_pinned_memory = false;

// if the external data is not on CPU, we use this temporary buffer
// to transfer data to CPU at some cases we need to visit the
// other devices' data
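Callers opt into pinned storage per tensor by setting this flag before resizing, which is what TrtBackend::AllocateOutputsBuffer does earlier in this diff. A small sketch; the Resize signature and the name member are taken from the code above, FDDataType::FP32 is assumed from the library's dtype enum, and the helper itself is hypothetical:

```cpp
#include <cstdint>
#include <vector>

#include "fastdeploy/core/fd_tensor.h"

// Flag the tensor before Resize() so that ReallocFn() takes the
// cudaMallocHost path instead of plain realloc.
void AllocatePinnedOutput(fastdeploy::FDTensor* tensor,
                          const std::vector<int64_t>& shape) {
  tensor->is_pinned_memory = true;  // request page-locked host storage
  tensor->Resize(shape, fastdeploy::FDDataType::FP32, tensor->name);
}
```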
5 changes: 5 additions & 0 deletions fastdeploy/runtime.cc
@@ -356,6 +356,10 @@ void RuntimeOption::EnableTrtFP16() { trt_enable_fp16 = true; }

void RuntimeOption::DisableTrtFP16() { trt_enable_fp16 = false; }

void RuntimeOption::EnableTrtPinnedMemory() { trt_enable_pinned_memory = true; }

void RuntimeOption::DisableTrtPinnedMemory() { trt_enable_pinned_memory = false; }

void RuntimeOption::SetTrtCacheFile(const std::string& cache_file_path) {
trt_serialize_file = cache_file_path;
}
@@ -606,6 +610,7 @@ void Runtime::CreateTrtBackend() {
trt_option.min_shape = option.trt_min_shape;
trt_option.opt_shape = option.trt_opt_shape;
trt_option.serialize_file = option.trt_serialize_file;
trt_option.enable_pinned_memory = option.trt_enable_pinned_memory;

// TODO(jiangjiajun): inside usage, maybe remove this later
trt_option.remove_multiclass_nms_ = option.remove_multiclass_nms_;
10 changes: 10 additions & 0 deletions fastdeploy/runtime.h
@@ -204,6 +204,15 @@ struct FASTDEPLOY_DECL RuntimeOption {
*/
void SetTrtCacheFile(const std::string& cache_file_path);

/**
* @brief Enable pinned memory in trt backend
*/
void EnableTrtPinnedMemory();

/**
* @brief Disable pinned memory in trt backend
*/
void DisableTrtPinnedMemory();

/**
* @brief Enable to collect shape in paddle trt backend
@@ -259,6 +268,7 @@ struct FASTDEPLOY_DECL RuntimeOption {
bool trt_enable_int8 = false;
size_t trt_max_batch_size = 32;
size_t trt_max_workspace_size = 1 << 30;
bool trt_enable_pinned_memory = false;

// ======Only for Poros Backend=======
bool is_dynamic = false;