inference overheads optimizations #1392

Merged 5 commits on Jul 19, 2019. Changes from 1 commit.
14 changes: 7 additions & 7 deletions onnxruntime/core/framework/execution_frame.cc
@@ -24,9 +24,9 @@ IExecutionFrame::IExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, cons
const OrtValueNameIdxMap& ort_value_idx_map, const NodeIndexInfo& node_index_info)
: node_index_info_{node_index_info}, fetch_mlvalue_idxs_{fetch_mlvalue_idxs} {
ORT_ENFORCE(feeds.size() == feed_mlvalue_idxs.size());
ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs.size());
ORT_ENFORCE(fetches.empty() || fetches.size() == fetch_mlvalue_idxs_.size());

Init(feed_mlvalue_idxs, feeds, initializers, fetch_mlvalue_idxs, fetches, ort_value_idx_map);
Init(feed_mlvalue_idxs, feeds, initializers, fetches, ort_value_idx_map);
}

IExecutionFrame::~IExecutionFrame() = default;
@@ -104,17 +104,17 @@ int IExecutionFrame::GetNodeIdxToMLValueIdx(int index) const {

void IExecutionFrame::Init(const std::vector<int>& feed_mlvalue_idxs, const std::vector<OrtValue>& feeds,
const std::unordered_map<int, OrtValue>& initializers,
const std::vector<int>& fetch_mlvalue_idxs, const std::vector<OrtValue>& fetches,
const std::vector<OrtValue>& fetches,
const OrtValueNameIdxMap& ort_value_idx_map) {
// 1. resize the all_value_ vector
all_values_.resize(ort_value_idx_map.MaxIdx() + 1);

// 2. Handle non-empty output vector
if (!fetches.empty()) {
auto num_fetches = fetch_mlvalue_idxs.size();
auto num_fetches = fetch_mlvalue_idxs_.size();

for (size_t idx = 0; idx < num_fetches; ++idx) {
int ort_value_idx = fetch_mlvalue_idxs[idx];
int ort_value_idx = fetch_mlvalue_idxs_[idx];
all_values_[ort_value_idx] = fetches[idx];
}
}
@@ -189,7 +189,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const
// and we have execution plan generated, try to setup
// memory pattern optimization.
if (session_state.GetEnableMemoryPattern() && session_state.GetExecutionPlan()) {
std::vector<TensorShape> input_shapes;
std::vector<std::reference_wrapper<const TensorShape>> input_shapes;
bool all_tensors = true;
Contributor: This is awesome. Can you please close Task 3344 on aiinfra when this is checked in? There are a number of other potential small improvements in tasks under the same parent if you're interested.

// Reserve mem to avoid re-allocation.
input_shapes.reserve(feeds.size());
@@ -199,7 +199,7 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const
break;
}
auto& tensor = feed.Get<Tensor>();
input_shapes.push_back(tensor.Shape());
input_shapes.push_back(std::cref(tensor.Shape()));
}

//if there are some traditional ml value type in inputs disable the memory pattern optimization.
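
The change above collects const references to the feed shapes instead of copying each TensorShape into the vector. A minimal standalone sketch of that pattern follows; it uses a plain std::vector<int64_t> as a stand-in for TensorShape, so the types are illustrative rather than the actual ORT ones.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Stand-in for onnxruntime::TensorShape, for illustration only.
using Shape = std::vector<int64_t>;

int main() {
  std::vector<Shape> feeds = {{1, 3, 224, 224}, {1, 1000}};

  // Collect const references instead of copying each shape's dims buffer.
  std::vector<std::reference_wrapper<const Shape>> input_shapes;
  input_shapes.reserve(feeds.size());
  for (const auto& s : feeds) {
    input_shapes.push_back(std::cref(s));  // no allocation, no copy
  }

  // Consumers go through .get(); the referenced shapes must outlive input_shapes.
  for (const auto& s : input_shapes) {
    std::cout << s.get().size() << " dims\n";
  }
}
```

The trade-off is a lifetime requirement: the feeds must outlive input_shapes, which appears to hold here since the shape vector is consumed before the feeds go out of scope.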
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/execution_frame.h
@@ -73,7 +73,7 @@ class IExecutionFrame {
ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(IExecutionFrame);

void Init(const std::vector<int>& feed_mlvalue_idxs, const std::vector<OrtValue>& feeds,
const std::unordered_map<int, OrtValue>& initializers, const std::vector<int>& fetch_mlvalue_idxs,
const std::unordered_map<int, OrtValue>& initializers,
const std::vector<OrtValue>& fetches, const OrtValueNameIdxMap& ort_value_idx_map);

const OrtValue& GetMLValue(int ort_value_index) const {
4 changes: 1 addition & 3 deletions onnxruntime/core/framework/feeds_fetches_manager.cc
@@ -43,9 +43,7 @@ Status FeedsFetchesManager::Create(const std::vector<std::string>& feed_names,
const std::vector<std::string>& output_names,
const OrtValueNameIdxMap& ort_value_name_idx_map,
std::unique_ptr<FeedsFetchesManager>& feed_fetch_manager) {
FeedsFetchesInfo info;
info.feed_names = feed_names;
info.output_names = output_names;
FeedsFetchesInfo info{feed_names, output_names};

ORT_RETURN_IF_ERROR(info.SetMLValueIdxs(ort_value_name_idx_map));
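
The replacement above initializes the FeedsFetchesInfo members directly instead of default-constructing the object and then assigning into it. A generic sketch of the difference, using a hypothetical Info struct rather than the real FeedsFetchesInfo:

```cpp
#include <string>
#include <vector>

struct Info {  // hypothetical stand-in for FeedsFetchesInfo
  std::vector<std::string> feed_names;
  std::vector<std::string> output_names;
};

Info MakeAssign(const std::vector<std::string>& feeds, const std::vector<std::string>& outputs) {
  Info info;                // default-constructs two empty vectors first
  info.feed_names = feeds;  // then copy-assigns over them
  info.output_names = outputs;
  return info;
}

Info MakeDirect(const std::vector<std::string>& feeds, const std::vector<std::string>& outputs) {
  return Info{feeds, outputs};  // members are copy-constructed in place
}
```

The name vectors are copied once either way; the direct form just skips the intermediate empty state and reads more clearly.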

3 changes: 2 additions & 1 deletion onnxruntime/core/framework/onnxruntime_typeinfo.cc
@@ -111,7 +111,8 @@ OrtStatus* OrtTypeInfo::FromDataTypeImpl(const ONNX_NAMESPACE::TypeProto* input,
auto& t = s.dim(i);
shape_data[i] = t.has_dim_value() ? t.dim_value() : -1;
}
st = GetTensorShapeAndType(reinterpret_cast<const TensorShape*>(&shape_data), type, &info);
Member: The original code is good. Casting is a no-op at runtime.

Member (in reply): The original code is tricky. It's error-prone.

Member: But that is the design.

Member: See https://github.com/microsoft/onnxruntime/blob/master/include/onnxruntime/core/framework/tensor_shape.h#L135

This function has been there for a long time. It was added by a WinML developer and is widely used in WinML. We must guarantee it always works.

Contributor Author: TensorShape is a std::vector; they have the same memory layout, and you can reinterpret_cast one to the other, so there is no real use case for a move. You can still do it, it's cheap, but it's not the best way.

This is an implementation detail of the TensorShape type. Since TensorShape privately inherits from std::vector, the detail is hidden from users of the class. Anyone who uses TensorShape does not need to know, and should not assume, this detail.

Member: @fs-eire In general, you're right. But in this project we can treat this class, TensorShape, as special. It is so important that every dev should know it is just an alias of std::vector<int64_t>, like a typedef.

For example, you may find this class strange: normally a class you inherit from should have a virtual destructor, otherwise objects may not be deleted correctly. But this class doesn't have one, and std::vector<int64_t> doesn't either. It somewhat violates the usual rule, but that doesn't mean the code is bad; it is designed that way, and every user should know the contract and obey it. Otherwise, what would be the point of inheriting from std::vector? We could easily implement this class without it; we use the inheritance only because we want to do the casting.

Contributor (pranavsharma, Jul 18, 2019): The virtual-destructor rule applies to public inheritance only, so no rule is violated here. Publicly inheriting without a virtual destructor is indeed bad code, unless you know for sure that no one will store a derived-class pointer in a base-class pointer. Private inheritance means "is implemented in terms of": the vector is purely an implementation detail of the TensorShape class.

TensorShape shape(std::move(shape_data));
st = GetTensorShapeAndType(&shape, type, &info);
} else {
st = GetTensorShapeAndType(nullptr, type, &info);
}
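
To make the trade-off discussed in the thread above concrete, here is a simplified sketch of the two approaches. The Shape class is a hypothetical stand-in for TensorShape (which privately inherits std::vector<int64_t>); the real class has a larger interface.

```cpp
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

// Hypothetical stand-in that, like TensorShape, privately inherits std::vector<int64_t>.
class Shape : private std::vector<int64_t> {
 public:
  explicit Shape(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {}
  size_t NumDims() const { return size(); }
};

size_t Use(const Shape* s) { return s ? s->NumDims() : 0; }

// Cast approach: relies on Shape and std::vector<int64_t> sharing one memory layout.
// That is an implementation detail (and formally undefined behavior), which is why
// it was called error-prone above.
size_t CastApproach(std::vector<int64_t>& shape_data) {
  return Use(reinterpret_cast<const Shape*>(&shape_data));
}

// Move approach: build a real Shape by stealing the vector's buffer; no copy and
// no layout assumption.
size_t MoveApproach(std::vector<int64_t>&& shape_data) {
  Shape shape(std::move(shape_data));
  return Use(&shape);
}
```

The PR takes the second form: shape_data appears to be a local std::vector<int64_t>, so moving it into a TensorShape costs no allocation and does not depend on the private-inheritance layout.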
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/parallel_executor.cc
@@ -84,15 +84,15 @@ Status ParallelExecutor::Execute(const SessionState& session_state, const std::v
VLOGS(logger, 1) << "Done execution.";

if (root_frame_->HasMemoryPatternPlanner()) {
std::vector<TensorShape> input_shapes;
std::vector<std::reference_wrapper<const TensorShape>> input_shapes;
bool all_tensors = true;
for (const auto& feed : feeds) {
if (!(feed.IsTensor())) {
all_tensors = false;
break;
}
auto& tensor = feed.Get<Tensor>();
input_shapes.push_back(tensor.Shape());
input_shapes.push_back(std::cref(tensor.Shape()));
}

if (all_tensors) {
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/sequential_executor.cc
@@ -181,15 +181,15 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
VLOGS(logger, 1) << "Done with execution.";

if (frame.HasMemoryPatternPlanner()) {
std::vector<TensorShape> input_shapes;
std::vector<std::reference_wrapper<const TensorShape>> input_shapes;
bool all_tensors = true;
for (const auto& feed : feeds) {
if (!(feed.IsTensor())) {
all_tensors = false;
break;
}
auto& tensor = feed.Get<Tensor>();
input_shapes.push_back(tensor.Shape());
input_shapes.push_back(std::cref(tensor.Shape()));
}

if (all_tensors) {
15 changes: 8 additions & 7 deletions onnxruntime/core/framework/session_state.cc
@@ -78,26 +78,27 @@ void SessionState::SetProfiler(profiling::Profiler& profiler) { profiler_ = &pro

::onnxruntime::profiling::Profiler& SessionState::Profiler() const { return *profiler_; }

static int64_t CalculateMemoryPatternsKey(const std::vector<TensorShape>& shapes) {
static int64_t CalculateMemoryPatternsKey(const std::vector<std::reference_wrapper<const TensorShape>>& shapes) {
int64_t key = 0;
for (auto& shape : shapes) {
for (auto dim : shape.GetDims()) key ^= dim;
for (auto shape : shapes) {
for (auto dim : shape.get().GetDims()) key ^= dim;
}
return key;
}

const MemoryPatternGroup* SessionState::GetMemoryPatternGroup(const std::vector<TensorShape>& input_shapes) const {
std::lock_guard<OrtMutex> lock(mem_patterns_lock_);
const MemoryPatternGroup* SessionState::GetMemoryPatternGroup(const std::vector<std::reference_wrapper<const TensorShape>>& input_shapes) const {
int64_t key = CalculateMemoryPatternsKey(input_shapes);

std::lock_guard<OrtMutex> lock(mem_patterns_lock_);
auto it = mem_patterns_.find(key);
if (it == mem_patterns_.end()) return nullptr;

return it->second.get();
}

Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
Status SessionState::UpdateMemoryPatternGroupCache(const std::vector<std::reference_wrapper<const TensorShape>>& input_shapes,
std::unique_ptr<MemoryPatternGroup> mem_patterns) const {
int64_t key = CalculateMemoryPatternsKey(input_shape);
int64_t key = CalculateMemoryPatternsKey(input_shapes);

std::lock_guard<OrtMutex> lock(mem_patterns_lock_);
auto it = mem_patterns_.find(key);
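
Besides switching the parameter to reference_wrapper, the diff also moves the lock_guard below the key computation, so the XOR hashing runs outside the critical section. A rough sketch of that structure with simplified types (std::vector<int64_t> in place of TensorShape, std::mutex in place of OrtMutex):

```cpp
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <vector>

struct PatternGroup {};  // stand-in for MemoryPatternGroup

class PatternCache {
 public:
  const PatternGroup* Find(const std::vector<std::vector<int64_t>>& shapes) const {
    const int64_t key = CalculateKey(shapes);  // lock-free work first
    std::lock_guard<std::mutex> lock(mutex_);  // lock only around the map access
    auto it = patterns_.find(key);
    return it == patterns_.end() ? nullptr : it->second.get();
  }

 private:
  static int64_t CalculateKey(const std::vector<std::vector<int64_t>>& shapes) {
    int64_t key = 0;
    for (const auto& shape : shapes)
      for (int64_t dim : shape) key ^= dim;
    return key;
  }

  mutable std::mutex mutex_;
  std::unordered_map<int64_t, std::unique_ptr<PatternGroup>> patterns_;
};
```

Only the map lookup is serialized, so concurrent Execute calls no longer contend with each other while hashing their input shapes.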
4 changes: 2 additions & 2 deletions onnxruntime/core/framework/session_state.h
@@ -120,13 +120,13 @@ class SessionState {
/**
Get cached memory pattern based on input shapes
*/
const MemoryPatternGroup* GetMemoryPatternGroup(const std::vector<TensorShape>& input_shapes) const;
const MemoryPatternGroup* GetMemoryPatternGroup(const std::vector<std::reference_wrapper<const TensorShape>>& input_shapes) const;

/**
Set generated memory pattern with a given input shapes.
Const as it's an internal cache update only.
*/
Status UpdateMemoryPatternGroupCache(const std::vector<TensorShape>& input_shape,
Status UpdateMemoryPatternGroupCache(const std::vector<std::reference_wrapper<const TensorShape>>& input_shape,
std::unique_ptr<MemoryPatternGroup> mem_patterns) const;

/**
2 changes: 1 addition & 1 deletion onnxruntime/core/framework/tensor_shape.cc
@@ -11,7 +11,7 @@ namespace onnxruntime {
TensorShape::TensorShape(const std::vector<int64_t>& dims) : std::vector<int64_t>(dims) {
}

TensorShape::TensorShape(std::vector<int64_t>&& dims) : std::vector<int64_t>(dims) {
TensorShape::TensorShape(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {
}

TensorShape::TensorShape(const std::initializer_list<int64_t>& dims) : std::vector<int64_t>(dims) {
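
The change above is the classic named-rvalue-reference pitfall: inside the constructor the parameter dims has a name and is therefore an lvalue, so forwarding it without std::move copy-constructs the base vector. A minimal sketch with illustrative classes (not the real TensorShape):

```cpp
#include <cstdint>
#include <utility>
#include <vector>

class ShapeCopy : private std::vector<int64_t> {
 public:
  // 'dims' is named, hence an lvalue here: this selects the vector *copy* constructor.
  explicit ShapeCopy(std::vector<int64_t>&& dims) : std::vector<int64_t>(dims) {}
};

class ShapeMove : private std::vector<int64_t> {
 public:
  // std::move restores rvalue-ness, so the vector's buffer is stolen instead of copied.
  explicit ShapeMove(std::vector<int64_t>&& dims) : std::vector<int64_t>(std::move(dims)) {}
};

int main() {
  std::vector<int64_t> dims = {1, 3, 224, 224};
  ShapeCopy a(std::vector<int64_t>(dims));  // allocates a second buffer internally
  ShapeMove b(std::move(dims));             // reuses the existing buffer; dims is left moved-from
  (void)a;
  (void)b;
}
```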
3 changes: 1 addition & 2 deletions onnxruntime/core/framework/tensorprotoutils.cc
@@ -466,8 +466,7 @@ Status TensorProtoToMLValue(const Env& env, const ORTCHAR_T* tensor_proto_path,
}
std::vector<int64_t> tensor_shape_vec = GetTensorShapeFromTensorProto(tensor_proto);
// Note: We permit an empty tensor_shape_vec, and treat it as a scalar (a tensor of size 1).
TensorShape tensor_shape{tensor_shape_vec};
value.Init(new Tensor(type, tensor_shape, tensor_data, allocator), DataTypeImpl::GetType<Tensor>(),
value.Init(new Tensor(type, TensorShape(std::move(tensor_shape_vec)), tensor_data, allocator), DataTypeImpl::GetType<Tensor>(),
Member: I would say Pranav created a bad example there. If he hadn't added the move constructor for TensorShape, you wouldn't be using it.

Contributor Author: From a performance point of view, I am neutral on std::move vs. reinterpret_cast<>. However, the copy constructor of std::vector is much heavier than either of them and should be avoided when possible.

Contributor (pranavsharma, Jul 18, 2019): What's so "bad example" about the move constructor? What's bad is exposing the implementation details of a class like this, using reinterpret_cast, and then doing additional checks to ensure the cast succeeded. Really?

Member: We don't need the move constructor. If you didn't add it, nobody would use it.

Contributor Author (fs-eire, Jul 18, 2019): The change on this line shows exactly why a move constructor is needed.

I don't have to use a move constructor; my goal is to avoid the unnecessary copy of a std::vector object. Is there a better way to do that here?

BTW, profiling shows that the major overhead comes from the memory allocation when copying the std::vector; the few instructions in the move constructor are trivial.
DataTypeImpl::GetType<Tensor>()->GetDeleteFunc());
return Status::OK();
}
10 changes: 2 additions & 8 deletions onnxruntime/core/session/onnxruntime_c_api.cc
@@ -165,12 +165,8 @@ ORT_API_STATUS_IMPL(OrtFillStringTensor, _In_ OrtValue* value, _In_ const char*
template <typename T>
OrtStatus* CreateTensorImpl(const int64_t* shape, size_t shape_len, OrtAllocator* allocator,
std::unique_ptr<Tensor>* out) {
std::vector<int64_t> shapes(shape_len);
for (size_t i = 0; i != shape_len; ++i) {
shapes[i] = shape[i];
}
std::shared_ptr<IAllocator> alloc_ptr = std::make_shared<onnxruntime::AllocatorWrapper>(allocator);
*out = std::make_unique<Tensor>(DataTypeImpl::GetType<T>(), onnxruntime::TensorShape(shapes), alloc_ptr);
*out = std::make_unique<Tensor>(DataTypeImpl::GetType<T>(), onnxruntime::TensorShape(shape, shape_len), alloc_ptr);
return nullptr;
}

@@ -182,10 +178,8 @@ template <typename T>
OrtStatus* CreateTensorImpl(const int64_t* shape, size_t shape_len, const OrtAllocatorInfo* info,
void* p_data, size_t p_data_len, std::unique_ptr<Tensor>* out) {
size_t elem_count = 1;
std::vector<int64_t> shapes(shape_len);
for (size_t i = 0; i != shape_len; ++i) {
Contributor: Given TensorShape has code to do this, can we create a TensorShape instance here, get Size() from it, and then std::move it when calling the Tensor ctor?

Should we also have a check somewhere to ensure there's no symbolic dimension with a value of -1 leading to a negative total size?

Contributor Author: I will figure this out in a separate PR.

elem_count *= shape[i];
shapes[i] = shape[i];
}

size_t size_to_allocate;
Expand All @@ -197,7 +191,7 @@ OrtStatus* CreateTensorImpl(const int64_t* shape, size_t shape_len, const OrtAll
oss << "not enough space: expected " << size_to_allocate << ", got " << p_data_len;
return OrtCreateStatus(ORT_INVALID_ARGUMENT, oss.str().c_str());
}
*out = std::make_unique<Tensor>(DataTypeImpl::GetType<T>(), onnxruntime::TensorShape(shapes), p_data, *info);
*out = std::make_unique<Tensor>(DataTypeImpl::GetType<T>(), onnxruntime::TensorShape(shape, shape_len), p_data, *info);
return nullptr;
}
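
For the reviewer's suggestion above (deferred to a follow-up PR), here is a rough sketch of what it might look like. Shape and its methods are hypothetical stand-ins; the real TensorShape::Size() semantics and the exact validation policy may differ.

```cpp
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <stdexcept>
#include <vector>

class Shape {  // stand-in for onnxruntime::TensorShape
 public:
  Shape(const int64_t* dims, size_t len) : dims_(dims, dims + len) {}
  int64_t Size() const {  // product of dimensions
    return std::accumulate(dims_.begin(), dims_.end(), int64_t{1},
                           [](int64_t a, int64_t b) { return a * b; });
  }
  const std::vector<int64_t>& Dims() const { return dims_; }

 private:
  std::vector<int64_t> dims_;
};

// Build the shape once, reject symbolic (-1) dimensions so the element count
// cannot go negative, and return the validated element count.
size_t ValidatedElementCount(const int64_t* shape, size_t shape_len) {
  Shape s(shape, shape_len);
  for (int64_t d : s.Dims()) {
    if (d < 0) throw std::invalid_argument("negative (symbolic) dimension in shape");
  }
  return static_cast<size_t>(s.Size());
}
```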
