CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_… #1428

Merged · 2 commits · Jul 18, 2019
3 changes: 2 additions & 1 deletion onnxruntime/core/providers/cuda/cudnn_common.cc
@@ -83,7 +83,8 @@ Status CudnnDataTensor::Set(cudnnDataType_t dataType,
const int32_t* seq_lengths) {
ORT_RETURN_IF_ERROR(CreateTensorIfNeeded());

-  cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED;
+  // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that cuDNN automatically fills 0 for the shorter sequences
+  cudnnRNNDataLayout_t layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED;
float padding_fill = 0.0f;
CUDNN_RETURN_IF_ERROR(cudnnSetRNNDataDescriptor(tensor_, dataType, layout,
static_cast<int>(max_seq_length),
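For context, here is a minimal sketch of how the unpacked layout pairs with the padding-fill value in the cuDNN API. This is illustrative only — `DescribePaddedInput` and its parameter names are invented, not code from this PR; `cudnnSetRNNDataDescriptor` and the enum values are the real cuDNN API:

```cpp
#include <cudnn.h>
#include <vector>

// Sketch: describe a sequence-major, padded ("unpacked") RNN data tensor.
cudnnStatus_t DescribePaddedInput(cudnnRNNDataDescriptor_t data_desc,
                                  int max_seq_len, int batch_size, int input_size,
                                  const std::vector<int>& seq_lengths) {
  // cuDNN writes this value into every position past seq_lengths[i].
  float padding_fill = 0.0f;
  return cudnnSetRNNDataDescriptor(data_desc,
                                   CUDNN_DATA_FLOAT,
                                   CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED,
                                   max_seq_len,
                                   batch_size,
                                   input_size,
                                   seq_lengths.data(),  // per-batch valid lengths
                                   &padding_fill);
}
```

With the previous `CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED` layout, no fill value is applied past the shorter sequences, which appears to be why the old code below had to zero the output buffer manually.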
11 changes: 8 additions & 3 deletions onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
@@ -220,7 +220,6 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
x_data = x_reversed_data.get();
}

-  auto byte_size = X->DataType()->Size();
const T* hx_data = (initial_h == nullptr) ? nullptr : initial_h->template Data<T>();
const T* cx_data = (initial_c == nullptr) ? nullptr : initial_c->template Data<T>();
T* y_h_data = (Y_h == nullptr) ? nullptr : Y_h->template MutableData<T>();
@@ -234,10 +233,12 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
y_alloc_data = GetScratchBuffer<T>(output_size);
y_data = y_alloc_data.get();
}
-  // Cudnn library doesn't guarantee the data beyond the shorter sequence will be initialized to 0, so we need to do it manually.
-  cudaMemset(y_data, 0, output_size * byte_size);

const int32_t* sequence_lens_data = (sequence_lens == nullptr) ? nullptr : sequence_lens->template Data<int32_t>();

+  // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that cuDNN automatically fills 0 for the shorter sequences
+  CUDNN_RETURN_IF_ERROR(cudnnSetRNNPaddingMode(rnn_desc_, CUDNN_RNN_PADDED_IO_ENABLED));

size_t workspace_bytes;
CUDNN_RETURN_IF_ERROR(cudnnGetRNNWorkspaceSize(CudnnHandle(), rnn_desc_, gsl::narrow_cast<int>(seq_length), x_desc.data(), &workspace_bytes));
auto workspace_cuda = GetScratchBuffer<void>(workspace_bytes);
@@ -288,6 +289,10 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
nullptr, nullptr, nullptr, nullptr,
workspace_cuda.get(),
workspace_bytes));
+    // Terminate early here: Y is not required and Y_h has already been obtained correctly, so the code below that retrieves Y_h from Y is not needed.
+    if (nullptr == Y) {
+      return Status::OK();
+    }
}

IAllocatorUniquePtr<T> y_reorganized_data;
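The `cudnnSetRNNPaddingMode` call above is the other half of the change: the `*_UNPACKED` layout set in cudnn_common.cc only takes effect once padded IO is enabled on the RNN descriptor. A minimal sketch of that call in isolation (`EnablePaddedIO` is an invented name; the descriptor is assumed to be already configured via `cudnnSetRNNDescriptor`):

```cpp
#include <cudnn.h>

// Sketch only: turn on padded IO for an already-configured RNN descriptor.
// With this mode enabled, the RNN calls accept the *_UNPACKED data layouts
// and write the data descriptor's padding-fill value (0.0f above) into
// output positions past each sequence's valid length.
cudnnStatus_t EnablePaddedIO(cudnnRNNDescriptor_t rnn_desc) {
  return cudnnSetRNNPaddingMode(rnn_desc, CUDNN_RNN_PADDED_IO_ENABLED);
}
```

This is what makes the removed `cudaMemset` redundant: cuDNN itself now zero-fills the output beyond each sequence's end.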
31 changes: 31 additions & 0 deletions onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
@@ -1090,6 +1090,37 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMSequenceLengthShorterThanInputSequenceLength)
LstmOpContext2x1x2x2 context(direction);
context.RunTest(X_data, batch_size, seq_len, &initial_h, &initial_c, Y_data, Y_h_data, {}, &sequence_length);
}

+TEST(LSTMTest, ONNXRuntime_TestLSTMSequenceLengthShorterThanInputSequenceLengthNoP) {
+  const int seq_len = 2;
+  const int batch_size = 1;
+
+  std::vector<float> X_data = {-0.455351f, -0.276391f,
+                               -0.185934f, -0.269585f};
+
+  std::vector<int> sequence_length = {1};
+
+  std::vector<float> initial_h = {0.0f, 0.0f,
+                                  -0.0306872f, 0.028035f};
+
+  std::vector<float> initial_c = {0.0f, 0.0f,
+                                  -0.07243599f, 0.0467052f};
+
+  std::vector<float> Y_data = {0.0415416f, 0.0196912f,
+                               0.0295027f, 0.0334400f,
+
+                               0.0f, 0.0f,
+                               0.0f, 0.0f};
+
+  std::vector<float> Y_h_data = {0.0415416f, 0.0196912f,
+                                 0.0295027f, 0.0334400f};
+
+  std::string direction = "bidirectional";
+
+  LstmOpContext2x1x2x2 context(direction);
+  // CUDA implementation doesn't support peephole
+  context.RunTest(X_data, batch_size, seq_len, &initial_h, &initial_c, Y_data, Y_h_data, {}, &sequence_length, false);
+}
#endif // USE_NGRAPH

} // namespace test
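The zero rows in `Y_data` are the point of the new test: with ONNX's `Y` layout of `[seq_length, num_directions, batch_size, hidden_size]` and `sequence_lens = {1}`, every element at time step 1 must come back as exactly zero for both directions. A small illustrative checker for that invariant (`PaddingIsZero` is an invented helper, not part of the test suite):

```cpp
#include <cstddef>

// Returns true if every element of y at a time step >= seq_lens[b] is zero,
// assuming the ONNX LSTM output layout
// [seq_length, num_directions, batch_size, hidden_size].
bool PaddingIsZero(const float* y, int seq_length, int num_directions,
                   int batch_size, int hidden_size, const int* seq_lens) {
  for (int t = 0; t < seq_length; ++t) {
    for (int d = 0; d < num_directions; ++d) {
      for (int b = 0; b < batch_size; ++b) {
        if (t < seq_lens[b]) continue;  // still inside the valid range
        for (int h = 0; h < hidden_size; ++h) {
          size_t idx = ((static_cast<size_t>(t) * num_directions + d) * batch_size + b) * hidden_size + h;
          if (y[idx] != 0.0f) return false;
        }
      }
    }
  }
  return true;
}
```

For the data above, `PaddingIsZero(Y_data.data(), 2, 2, 1, 2, sequence_length.data())` would be expected to hold once the padded-IO path zero-fills the second time step.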