[TVM EP] Support zero copying TVM EP output tensor to ONNX Runtime output tensor (microsoft#12593)

**Description**:
Support the new TVM VirtualMachine feature (method `set_outputs`) on the TVM
Execution Provider side. It avoids excess copying from the TVM EP output
tensor to the ONNX Runtime one.

**Motivation and Context**
Tests with multi-output topologies and large output tensors show that there
is overhead spent on copying from the TVM EP to ONNX Runtime.
Returning output(s) in preallocated memory for the VirtualMachine was
implemented on the TVM side.

**Details**
The `set_output_zero_copy` provider option for the TVM EP switches this
feature on or off; it is true by default.
The feature works for both the GraphExecutor and the VirtualMachine from TVM;
a sketch of how the option can be passed to the EP is shown below.
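
A minimal usage sketch (not part of this commit): it assumes the TVM EP entry
point `OrtSessionOptionsAppendExecutionProvider_Tvm` and the comma-separated
`key:value` options-string format parsed by the TVM EP options helper; the
header name and model path are hypothetical.

```cpp
#include <onnxruntime_cxx_api.h>
#include "tvm_provider_factory.h"  // assumed header declaring the TVM EP entry point

int main() {
  Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "tvm_ep_zero_copy_demo"};
  Ort::SessionOptions session_options;

  // set_output_zero_copy:True (the default) selects the zero-copy output path;
  // set_output_zero_copy:False restores the copying path.
  const char* tvm_options = "executor:vm,set_output_zero_copy:True";
  Ort::ThrowOnError(
      OrtSessionOptionsAppendExecutionProvider_Tvm(session_options, tvm_options));

  Ort::Session session{env, "model.onnx", session_options};  // hypothetical model
  return 0;
}
```

With zero copy enabled, the runner binds output DLTensors to ONNX Runtime's
preallocated output buffers before inference, so TVM writes results in place
instead of copying them afterwards.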

---------

Co-authored-by: Valery Chernov <[email protected]>
2 people authored and preetha-intel committed Feb 15, 2023
1 parent b8901c3 commit 8a7e605
Showing 8 changed files with 123 additions and 47 deletions.
2 changes: 1 addition & 1 deletion cmake/external/tvm.cmake
@@ -21,4 +21,4 @@ if (onnxruntime_USE_TVM)

   set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include)

-endif()
+endif()
49 changes: 33 additions & 16 deletions onnxruntime/core/providers/tvm/tvm_api.cc
@@ -33,8 +33,7 @@ TvmModule TVMCompile(const TvmEPOptions& options,
                      const std::string& onnx_txt,
                      const std::string& model_path,
                      int opset,
-                     const TVMTensorShapes& input_shapes)
-{
+                     const TVMTensorShapes& input_shapes) {
   ::tvm::Array<TvmIntArray> shapes;
   for (size_t i = 0; i < input_shapes.size(); ++i)
   {
@@ -203,8 +202,7 @@ TvmModule TVMSoCompile(const TvmEPOptions& options) {

 void TVMSetInputs(TvmModule& mod,
                   std::vector<size_t>& inds,
-                  std::vector<DLTensor>& inputs)
-{
+                  std::vector<DLTensor>& inputs) {
   TvmPackedFunc set_input = mod.GetFunction("set_input", false);
   TvmPackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false);
   for (size_t i = 0; i < inds.size(); ++i) {
@@ -218,8 +216,7 @@ void TVMSetInputs(TvmModule& mod,

 void TVM_VM_SetInputs(TvmModule& mod,
                       std::vector<size_t>& inds,
-                      std::vector<DLTensor>& inputs)
-{
+                      std::vector<DLTensor>& inputs) {
   size_t num_total_args = inputs.size() + 1;
   std::vector<TVMValue> tvm_values(num_total_args);
   std::vector<int> tvm_type_codes(num_total_args);
@@ -235,18 +232,41 @@ void TVM_VM_SetInputs(TvmModule& mod,
   set_input.CallPacked(::tvm::runtime::TVMArgs(tvm_values.data(), tvm_type_codes.data(), int(num_total_args)), &rv);
 }

+void TVMSetOutputsZeroCopy(TvmModule& mod,
+                           std::vector<DLTensor>& outputs) {
+  TvmPackedFunc set_output = mod.GetFunction("set_output_zero_copy", false);
+  for (size_t i = 0; i < outputs.size(); ++i) {
+    set_output(i, &outputs[i]);
+  }
+}
+
+void TVM_VM_SetOutputsZeroCopy(TvmModule& mod,
+                               std::vector<DLTensor>& outputs) {
+  size_t num_total_args = outputs.size() + 1;
+  std::vector<TVMValue> tvm_values(num_total_args);
+  std::vector<int> tvm_type_codes(num_total_args);
+  tvm_rt::TVMArgsSetter setter(tvm_values.data(), tvm_type_codes.data());
+  const std::string func_name = "main";
+  setter(0, func_name.c_str());
+  for (size_t k = 0; k < num_total_args - 1; ++k) {
+    setter(k+1, &outputs[k]);
+  }
+
+  TvmPackedFunc set_output = mod.GetFunction("set_outputs", false);
+  tvm_rt::TVMRetValue rv;
+  set_output.CallPacked(tvm_rt::TVMArgs(tvm_values.data(), tvm_type_codes.data(), num_total_args), &rv);
+}

 void TVMGetOutputs(TvmModule& mod,
-                   std::vector<DLTensor>& outputs)
-{
+                   std::vector<DLTensor>& outputs) {
   TvmPackedFunc get_output = mod.GetFunction("get_output", false);
   for (size_t i = 0; i < outputs.size(); ++i) {
     get_output(i, &outputs[i]);
   }
 }

 void TVM_VM_GetOutputs(TvmModule& mod,
-                       std::vector<DLTensor>& outputs)
-{
+                       std::vector<DLTensor>& outputs) {
   TvmPackedFunc get_output = mod.GetFunction("get_output", false);
   for (size_t i = 0; i < outputs.size(); ++i) {
     // TODO(vvchernov): think about improvement of memory management
@@ -256,8 +276,7 @@ void TVM_VM_GetOutputs(TvmModule& mod,
 }

 void TVMGetOutputShapes(TvmModule& mod,
-                        TVMTensorShapes& output_shapes)
-{
+                        TVMTensorShapes& output_shapes) {
   size_t size = output_shapes.size();
   TvmPackedFunc get_output = mod.GetFunction("get_output", false);
   for (size_t i = 0; i < size; ++i) {
@@ -272,15 +291,13 @@ void TVMGetOutputShapes(TvmModule& mod,
   }
 }

-void TVMRun(TvmModule& mod)
-{
+void TVMRun(TvmModule& mod) {
   TvmPackedFunc run = mod.GetFunction("run", false);
   ORT_ENFORCE(run != nullptr, "Unable to retrieve graph executor run.");
   run();
 }

-void TVM_VM_Run(TvmModule& mod)
-{
+void TVM_VM_Run(TvmModule& mod) {
   TvmPackedFunc run = mod.GetFunction("invoke", false);
   ORT_ENFORCE(run != nullptr, "Unable to retrieve virtual machine invoke.");
   run("main");
2 changes: 2 additions & 0 deletions onnxruntime/core/providers/tvm/tvm_api.h
@@ -24,6 +24,8 @@ namespace tvm {

 void TVMSetInputs(TvmModule& mod, std::vector<size_t>& inds, std::vector<DLTensor>& inputs);
 void TVM_VM_SetInputs(TvmModule& mod, std::vector<size_t>& inds, std::vector<DLTensor>& inputs);
+void TVMSetOutputsZeroCopy(TvmModule& mod, std::vector<DLTensor>& outputs);
+void TVM_VM_SetOutputsZeroCopy(TvmModule& mod, std::vector<DLTensor>& outputs);
 void TVMGetOutputs(TvmModule& mod, std::vector<DLTensor>& outputs);
 void TVM_VM_GetOutputs(TvmModule& mod, std::vector<DLTensor>& outputs);
 void TVMGetOutputShapes(TvmModule& mod,
4 changes: 4 additions & 0 deletions onnxruntime/core/providers/tvm/tvm_ep_options.cc
@@ -23,6 +23,7 @@ constexpr const char* kTarget = "target";
 constexpr const char* kTargetHost = "target_host";
 constexpr const char* kOptLevel = "opt_level";
 constexpr const char* kFreezeWeights = "freeze_weights";
+constexpr const char* kSetOutputZeroCopy = "set_output_zero_copy";
 constexpr const char* kToNHWC = "to_nhwc";
 constexpr const char* kTuningFilePath = "tuning_file_path";
 constexpr const char* kTuningType = "tuning_type";
@@ -38,6 +39,7 @@ static const std::unordered_set<std::string> valid_keys {
     std::string{kTargetHost},
     std::string{kOptLevel},
     std::string{kFreezeWeights},
+    std::string{kSetOutputZeroCopy},
     std::string{kToNHWC},
     std::string{kTuningFilePath},
     std::string{kTuningType},
@@ -124,6 +126,7 @@ TvmEPOptions TvmEPOptionsHelper::FromProviderOptions(const ProviderOptions& pr_o
       .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, options.target_host)
       .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, options.opt_level)
       .AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, options.freeze_weights)
+      .AddAssignmentToReference(tvm::provider_option_names::kSetOutputZeroCopy, options.set_output_zero_copy)
       .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, options.to_nhwc)
       .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, options.tuning_file_path)
       .AddAssignmentToReference(tvm::provider_option_names::kTuningType, options.tuning_type)
@@ -261,6 +264,7 @@ std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options) {
      "target_host: " << options.target_host << "\n" <<
      "opt level: " << options.opt_level << "\n" <<
      "freeze weights: " << options.freeze_weights << "\n" <<
+     "set_output_zero_copy: " << options.set_output_zero_copy << "\n" <<
      "tuning file path: " << options.tuning_file_path << "\n" <<
      "tuning type: " << options.tuning_type << "\n" <<
      "convert layout to NHWC: " << options.to_nhwc << "\n" <<
1 change: 1 addition & 0 deletions onnxruntime/core/providers/tvm/tvm_ep_options.h
@@ -41,6 +41,7 @@ struct TvmEPOptions {
   unsigned int opt_level{tvm::default_opt_level};
   bool freeze_weights = true;
   bool to_nhwc = false;
+  bool set_output_zero_copy = true;
   std::string tuning_file_path{""};
   std::string tuning_type{tvm::default_tuning_type};
   std::string input_names_str{""};
5 changes: 3 additions & 2 deletions onnxruntime/core/providers/tvm/tvm_runner.cc
@@ -18,8 +18,9 @@ TVMRunner::TVMRunner(const TvmEPOptions& options,
   runner_ = getTVMRunnerImpl(mod, options, inputs_info, output_tensors);
 }

-common::Status TVMRunner::operator()(FunctionState state, const OrtApi* api, OrtKernelContext* context) {
-  return runner_->run(api, context);
+common::Status TVMRunner::operator()(FunctionState state, const OrtApi* /*api*/, OrtKernelContext* context) {
+  Ort::KernelContext ctx(context);
+  return runner_->run(ctx);
 }

 }  // namespace tvm
47 changes: 35 additions & 12 deletions onnxruntime/core/providers/tvm/tvm_runner_impl.cc
@@ -18,9 +18,11 @@ std::shared_ptr<RunnerImpl> getTVMRunnerImpl(const std::shared_ptr<TvmModule>& m
                                              const std::vector<DLTensor> output_tensors) {
   const std::string& name = options.executor;
   if (name == "graph") {
-    return std::make_shared<GERunnerImpl>(mod, inputs_info, options.output_shapes, output_tensors);
+    return std::make_shared<GERunnerImpl>(mod, inputs_info, options.output_shapes,
+                                          output_tensors, options.set_output_zero_copy);
   } else if (name == "vm") {
-    return std::make_shared<VMRunnerImpl>(mod, inputs_info, options.output_shapes, output_tensors);
+    return std::make_shared<VMRunnerImpl>(mod, inputs_info, options.output_shapes,
+                                          output_tensors, options.set_output_zero_copy);
   }
   return nullptr;
 }
@@ -30,10 +32,12 @@ std::shared_ptr<RunnerImpl> getTVMRunnerImpl(const std::shared_ptr<TvmModule>& m
 RunnerImpl::RunnerImpl(const std::shared_ptr<TvmModule>& mod,
                        const InputsInfoMap& inputs_info,
                        const TVMTensorShapes output_shapes,
-                       const std::vector<DLTensor> output_tensors) : mod_(mod),
-                                                                     inputs_info_(inputs_info),
-                                                                     output_shapes_(output_shapes),
-                                                                     output_tensors_(output_tensors) {
+                       const std::vector<DLTensor> output_tensors,
+                       bool set_output_zero_copy) : mod_(mod),
+                                                    inputs_info_(inputs_info),
+                                                    output_shapes_(output_shapes),
+                                                    output_tensors_(output_tensors),
+                                                    set_output_zero_copy_(set_output_zero_copy) {
 }

 void RunnerImpl::convert_input_tensors2dl_tensors(Ort::KernelContext& context,
@@ -88,7 +92,9 @@ void RunnerImpl::add_device_type_data2output_tensors(Ort::KernelContext& context
 GERunnerImpl::GERunnerImpl(const std::shared_ptr<TvmModule>& mod,
                            const InputsInfoMap& inputs_info,
                            const TVMTensorShapes output_shapes,
-                           const std::vector<DLTensor> output_tensors) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors) {
+                           const std::vector<DLTensor> output_tensors,
+                           bool set_output_zero_copy) :
+    RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) {
 }

 void GERunnerImpl::set_input(Ort::KernelContext& context) {
@@ -103,8 +109,15 @@ void GERunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) {
   add_device_type_data2output_tensors(context);
 }

-void GERunnerImpl::run_and_get_output() {
+void GERunnerImpl::set_output_zero_copy() {
+  tvm::TVMSetOutputsZeroCopy(*mod_, output_tensors_);
+}
+
+void GERunnerImpl::run() {
   tvm::TVMRun(*mod_);
+}
+
+void GERunnerImpl::get_outputs() {
   tvm::TVMGetOutputs(*mod_, output_tensors_);
 }

@@ -113,7 +126,9 @@ void GERunnerImpl::run_and_get_output() {
 VMRunnerImpl::VMRunnerImpl(const std::shared_ptr<TvmModule>& mod,
                            const InputsInfoMap& inputs_info,
                            const TVMTensorShapes output_shapes,
-                           const std::vector<DLTensor> output_tensors) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors) {
+                           const std::vector<DLTensor> output_tensors,
+                           bool set_output_zero_copy) :
+    RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) {
 }

 void VMRunnerImpl::set_input(Ort::KernelContext& context) {
@@ -125,20 +140,28 @@ void VMRunnerImpl::set_input(Ort::KernelContext& context) {
 }

 void VMRunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) {
-  if (!probe_infer_) {
+  // TODO(vvchernov): try to find more flexible solution
+  if(!probe_infer_) {
     infer_once_to_get_output_shapes();
   }

   add_device_type_data2output_tensors(context);
 }

-void VMRunnerImpl::run_and_get_output() {
+void VMRunnerImpl::set_output_zero_copy() {
+  tvm::TVM_VM_SetOutputsZeroCopy(*mod_, output_tensors_);
+}
+
+void VMRunnerImpl::run() {
   tvm::TVM_VM_Run(*mod_);
+}
+
+void VMRunnerImpl::get_outputs() {
   tvm::TVM_VM_GetOutputs(*mod_, output_tensors_);
 }

 void VMRunnerImpl::infer_once_to_get_output_shapes() {
-  tvm::TVM_VM_Run(*mod_);
+  run();
   size_t num_outputs = output_tensors_.size();
   // TODO(vvchernov): check it
   output_shapes_.resize(num_outputs);
60 changes: 44 additions & 16 deletions onnxruntime/core/providers/tvm/tvm_runner_impl.h
@@ -24,22 +24,43 @@ class RunnerImpl {
   RunnerImpl(const std::shared_ptr<TvmModule>& mod,
              const InputsInfoMap& inputs_info,
              const TVMTensorShapes output_shapes,
-             const std::vector<DLTensor> tensors_outputs);
+             const std::vector<DLTensor> tensors_outputs,
+             bool set_output_zero_copy);
   virtual ~RunnerImpl() = default;

-  virtual common::Status run(const OrtApi* /* api */, OrtKernelContext* context) {
-    Ort::KernelContext ctx{context};
-    set_input(ctx);
-    connect_output_tensors2ort(ctx);
-    run_and_get_output();
+  virtual common::Status run(Ort::KernelContext& context) {
+    common::Status res;
+    if (set_output_zero_copy_) {
+      res = run_without_output_copying(context);
+    } else {
+      res = run_with_output_copying(context);
+    }
+    return res;
+  }
+
+  virtual common::Status run_without_output_copying(Ort::KernelContext& context) {
+    set_input(context);
+    connect_output_tensors2ort(context);
+    set_output_zero_copy();
+    run();
+
+    return Status::OK();
+  }
+
+  virtual common::Status run_with_output_copying(Ort::KernelContext& context) {
+    set_input(context);
+    connect_output_tensors2ort(context);
+    run();
+    get_outputs();

     return Status::OK();
   }

-  virtual void set_input(Ort::KernelContext& ctx) = 0;
+  virtual void set_input(Ort::KernelContext& context) = 0;
   virtual void connect_output_tensors2ort(Ort::KernelContext& context) = 0;
-  virtual void run_and_get_output() = 0;
+  virtual void set_output_zero_copy() = 0;
+  virtual void run() = 0;
+  virtual void get_outputs() = 0;

 protected:
   void convert_input_tensors2dl_tensors(Ort::KernelContext& context,
@@ -52,6 +73,7 @@ class RunnerImpl {
   InputsInfoMap inputs_info_;
   TVMTensorShapes output_shapes_;
   std::vector<DLTensor> output_tensors_;
+  bool set_output_zero_copy_;
 };


@@ -61,12 +83,15 @@ class GERunnerImpl : public RunnerImpl {
   GERunnerImpl(const std::shared_ptr<TvmModule>& mod,
                const InputsInfoMap& inputs_info,
                const TVMTensorShapes output_shapes,
-               const std::vector<DLTensor> tensors_outputs);
+               const std::vector<DLTensor> tensors_outputs,
+               bool set_output_zero_copy);
   virtual ~GERunnerImpl() = default;

-  void set_input(Ort::KernelContext& context) override final;
-  void connect_output_tensors2ort(Ort::KernelContext& context) override final;
-  void run_and_get_output() override final;
+  void set_input(Ort::KernelContext& context) final;
+  void connect_output_tensors2ort(Ort::KernelContext& context) final;
+  void set_output_zero_copy() final;
+  void run() final;
+  void get_outputs() final;
 };


@@ -76,12 +101,15 @@ class VMRunnerImpl : public RunnerImpl {
   VMRunnerImpl(const std::shared_ptr<TvmModule>& mod,
                const InputsInfoMap& inputs_info,
                const TVMTensorShapes output_shapes,
-               const std::vector<DLTensor> tensors_outputs);
+               const std::vector<DLTensor> tensors_outputs,
+               bool set_output_zero_copy);
   virtual ~VMRunnerImpl() = default;

-  void set_input(Ort::KernelContext& context) override final;
-  void connect_output_tensors2ort(Ort::KernelContext& context) override final;
-  void run_and_get_output() override final;
+  void set_input(Ort::KernelContext& context) final;
+  void connect_output_tensors2ort(Ort::KernelContext& context) final;
+  void set_output_zero_copy() final;
+  void run() final;
+  void get_outputs() final;

 private:
   void infer_once_to_get_output_shapes();
