From fe2a7d9d016420eabedb9c70274735f07177ce51 Mon Sep 17 00:00:00 2001
From: Przemek Tredak
Date: Tue, 8 Oct 2019 16:56:44 -0700
Subject: [PATCH 01/19] Adding second NMS op

---
 src/operator/contrib/bounding_box-inl.cuh |   2 +
 src/operator/contrib/bounding_box.cc      |  73 ++++++
 src/operator/contrib/bounding_box.cu      | 289 ++++++++++++++++++++++
 3 files changed, 364 insertions(+)

diff --git a/src/operator/contrib/bounding_box-inl.cuh b/src/operator/contrib/bounding_box-inl.cuh
index de215ce35a98..128d3cbcecc2 100644
--- a/src/operator/contrib/bounding_box-inl.cuh
+++ b/src/operator/contrib/bounding_box-inl.cuh
@@ -202,7 +202,9 @@ void NMSApply(mshadow::Stream<gpu> *s,
               int in_format) {
   using namespace mxnet_op;
   constexpr int THRESHOLD = 1024;
+  std::cout << "TOPK " << topk << std::endl;
   for (int ref = 0; ref < topk; ref += THRESHOLD) {
+    std::cout << "REF: " << ref << std::endl;
     constexpr int block_size = 512;
     constexpr int N = THRESHOLD / block_size;
     auto stream = mshadow::Stream<gpu>::GetStream(s);
diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc
index 62b7c2e0bf4b..9363eb09d5a4 100644
--- a/src/operator/contrib/bounding_box.cc
+++ b/src/operator/contrib/bounding_box.cc
@@ -34,6 +34,79 @@ DMLC_REGISTER_PARAMETER(BoxOverlapParam);
 DMLC_REGISTER_PARAMETER(BipartiteMatchingParam);
 DMLC_REGISTER_PARAMETER(BoxDecodeParam);
 
+NNVM_REGISTER_OP(_contrib_box_nms2)
+.describe(R"code(Apply non-maximum suppression to input.
+
+The output will be sorted in descending order according to `score`. Boxes with
+overlaps larger than `overlap_thresh`, smaller scores and background boxes
+will be removed and filled with -1, and the corresponding position will be recorded
+for backward propagation.
+
+During back-propagation, the gradient will be copied to the original
+position according to the input index. For positions that have been suppressed,
+the in_grad will be assigned 0.
+In summary, gradients are stuck to their boxes and will either be moved or discarded
+according to their original index in the input.
+
+Input requirements::
+
+  1. Input tensor must have at least 2 dimensions, (n, k); any higher dims will be regarded
+     as batch, e.g. (a, b, c, d, n, k) == (a*b*c*d, n, k)
+  2. n is the number of boxes in each batch
+  3. k is the width of each box item.
+
+By default, a box is [id, score, xmin, ymin, xmax, ymax, ...],
+additional elements are allowed.
+
+- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means
+  we will skip highly overlapped boxes if one is `apple` while the other is `car`.
+
+- `background_id`: optional, default=-1, class id for background boxes, useful
+  when `id_index >= 0` which means boxes with background id will be filtered before nms.
+
+- `coord_start`: required, default=2, the starting index of the 4 coordinates.
+  Two formats are supported:
+
+    - `corner`: [xmin, ymin, xmax, ymax]
+
+    - `center`: [x, y, width, height]
+
+- `score_index`: required, default=1, box score/confidence.
+  When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed.
+
+- `in_format` and `out_format`: default='corner', specify in/out box formats.
+ +Examples:: + + x = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2], + [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]] + box_nms(x, overlap_thresh=0.1, coord_start=2, score_index=1, id_index=0, + force_suppress=True, in_format='corner', out_typ='corner') = + [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]] + out_grad = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3, 0.3, 0.3], [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]] + # exe.backward + in_grad = [[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]] + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FNumVisibleOutputs", BoxNMSNumVisibleOutputs) +.set_attr("FInferShape", BoxNMSShape) +.set_attr("FInferType", ElemwiseType<1, 2>) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("THasDeterministicOutput", true) +.set_attr("FCompute", BoxNMSForward) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_contrib_box_nms"}) +.add_argument("data", "NDArray-or-Symbol", "The input") +.add_arguments(BoxNMSParam::__FIELDS__()); + NNVM_REGISTER_OP(_contrib_box_nms) .add_alias("_contrib_box_non_maximum_suppression") .describe(R"code(Apply non-maximum suppression to input. diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index b20c570ea417..c5ea0e9e7db4 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -27,9 +27,298 @@ #include "./bounding_box-inl.cuh" #include "./bounding_box-inl.h" #include "../elemwise_op_common.h" +#include namespace mxnet { namespace op { + +namespace { + +using mshadow::Tensor; +using mshadow::Stream; + +template +struct TempWorkspace { + index_t scores_temp_space; + DType* scores; + index_t batch_temp_space; + index_t* batches; + index_t scratch_space; + uint8_t* scratch; + index_t nms_scratch_space; + uint32_t* nms_scratch; + index_t indices_temp_spaces; + index_t* indices; +}; + +inline index_t ceil_div(index_t x, index_t y) { + return (x + y - 1) / y; +} + +inline index_t align(index_t x, index_t alignment) { + return ceil_div(x, alignment) * alignment; +} + +template +__global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, + index_t* batches, index_t num_elements_per_batch, + const index_t element_width, const float threshold, + const int id_index, const int score_index, + const int background_id) { + index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + bool first_in_element = (tid % element_width == 0); + index_t my_batch = tid / (num_elements_per_batch * element_width); + index_t start_of_my_element = tid - (tid % element_width); + + DType my_score = data[start_of_my_element + score_index]; + bool filtered_out = my_score <= threshold; + if (id_index != -1 && background_id != -1) { + DType my_id = data[start_of_my_element + id_index]; + filtered_out = filtered_out || (my_id == background_id); + } + if (!filtered_out) { + out[tid] = data[tid]; + } else { + out[tid] = -1; + my_score = -1; + } + + if (first_in_element) { + index_t offset = tid / element_width; + scores[offset] = my_score; + batches[offset] = my_batch; + } +} + +template +void FilterAndPrepareAuxData(const Tensor& data, + Tensor* out, + const TempWorkspace& workspace, + const BoxNMSParam& param, + Stream* s) { + const int n_threads = 
512; + index_t N = data.shape_.Size(); + const auto blocks = ceil_div(N, n_threads); + FilterAndPrepareAuxData_kernel<<::GetStream(s)>>>( + data.dptr_, out->dptr_, workspace.scores, + workspace.batches, data.shape_[1], data.shape_[2], + param.valid_thresh, param.id_index, + param.score_index, param.background_id); +} + +template +__global__ void CompactData_kernel(const index_t* indices, const DType* source, + DType* destination, const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const index_t N) { + const index_t tid_start = blockIdx.x * blockDim.x + threadIdx.x; + for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { + const index_t my_element = tid / element_width; + const index_t my_element_in_batch = my_element % num_elements_per_batch; + if (write_whole_output && my_element_in_batch >= topk) { + destination[tid] = -1; + } else { + const index_t source_element = indices[my_element]; + destination[tid] = source[source_element * element_width + tid % element_width]; + } + } +} + +template +void CompactData(const Tensor& indices, + const Tensor& source, + Tensor* destination, + const index_t topk, + Stream* s) { + const int n_threads = 512; + const int max_blocks = 320; + index_t N = source.shape_.Size(); + const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); +} + +template +void WorkspaceForSort(const int num_batch, + const int num_elem, + const int width_elem, + const int alignment, + TempWorkspace* workspace) { + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); + const index_t sort_batch_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); + workspace->scratch_space = align(std::max(sort_scores_temp_space, + sort_batch_temp_space), + alignment); +} + +template +__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, + const index_t current_start); + +template +struct NMS { + static const int THRESHOLD = 1024; + + void operator()(Tensor* data, + Tensor* scratch, + const index_t topk, + const BoxNMSParam& param, + Stream* s) { + const int n_threads = 512; + const index_t n_batch = data->shape_[0]; + for (index_t current_start = 0; current_start < topk; current_start += THRESHOLD) { + const index_t n_elems = topk - current_start; + const int n_blocks = ceil_div(THRESHOLD / (sizeof(uint32_t) * 8) * n_elems * n_batch, n_threads); + CalculateGreedyNMSResults_kernel<<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start); + } + } +}; + +template +__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, + const index_t current_start) { +} + +template +TempWorkspace GetWorkspace(const int num_batch, + const int num_elem, + const int width_elem, + const index_t topk, + const OpContext& ctx) { + TempWorkspace workspace; + Stream *s = ctx.get_stream(); + const int alignment = 128; + + // Get the workspace size + workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); + workspace.batch_temp_space = align(num_batch * num_elem * sizeof(index_t), alignment); + workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); + WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); + // Place for a buffer + workspace.scratch_space = std::max(workspace.scratch_space, + align(num_batch * 
num_elem * width_elem * sizeof(DType), + alignment)); + workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * + num_batch * topk * sizeof(uint32_t), alignment); + + const index_t workspace_size = workspace.scores_temp_space + + workspace.batch_temp_space + + workspace.scratch_space + + workspace.nms_scratch_space + + workspace.indices_temp_spaces; + + // Obtain the memory for workspace + Tensor scratch_memory = ctx.requested[box_nms_enum::kTempSpace] + .get_space_typed(mshadow::Shape1(workspace_size), s); + + // Populate workspace pointers + workspace.scores = scratch_memory.dptr_; + workspace.batches = reinterpret_cast(reinterpret_cast(workspace.scores) + + workspace.scores_temp_space); + workspace.scratch = reinterpret_cast(workspace.batches) + + workspace.batch_temp_space; + workspace.nms_scratch = reinterpret_cast(workspace.scratch + + workspace.scratch_space); + workspace.indices = reinterpret_cast( + reinterpret_cast(workspace.nms_scratch) + workspace.nms_scratch_space); + return workspace; +} + +} // namespace + +void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using mshadow::Shape1; + using mshadow::Shape2; + using mshadow::Shape3; + CHECK_NE(req[0], kAddTo) << "BoxNMS does not support kAddTo"; + CHECK_NE(req[0], kWriteInplace) << "BoxNMS does not support in place computation"; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; + const BoxNMSParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + mxnet::TShape in_shape = inputs[box_nms_enum::kData].shape_; + int indim = in_shape.ndim(); + int num_batch = indim <= 2? 1 : in_shape.ProdShape(0, indim - 2); + int num_elem = in_shape[indim - 2]; + int width_elem = in_shape[indim - 1]; + + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + Tensor data = inputs[box_nms_enum::kData] + .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + Tensor out = outputs[box_nms_enum::kOut] + .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + // Special case for topk == 0 + if (param.topk == 0) { + if (req[0] != kNullOp && + req[0] != kWriteInplace) { + out = mshadow::expr::F(data); + } + return; + } + + index_t topk = param.topk > 0 ? 
std::min(param.topk, num_elem) : num_elem; + const auto& workspace = GetWorkspace(num_batch, num_elem, + width_elem, topk, ctx); + + FilterAndPrepareAuxData(data, &out, workspace, param, s); + Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); + Tensor batches(workspace.batches, Shape1(num_batch * num_elem), s); + Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); + Tensor scratch(reinterpret_cast(workspace.scratch), + Shape1(workspace.scratch_space), s); + Tensor nms_scratch(workspace.nms_scratch, + Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), topk * num_batch), s); + indices = mshadow::expr::range(0, num_batch * num_elem); + mxnet::op::SortByKey(scores, indices, false, &scratch); + batches = indices / mshadow::expr::ScalarExp(num_elem); + mxnet::op::SortByKey(batches, indices, true, &scratch); + Tensor buffer(reinterpret_cast(workspace.scratch), + Shape3(num_batch, num_elem, width_elem), s); + CompactData(indices, out, &buffer, topk, s); + NMS nms; + nms(&buffer, &nms_scratch, topk, param, s); + mshadow::Copy(out, buffer, s); + }); +} + +void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; + std::cout << "Reqs" << std::endl; + for (const auto& r : req) { + std::cout << r << std::endl; + } + std::cout << "END: Reqs" << std::endl; + if (req[1] == kNullOp) { + BoxNMSForwardGPU_notemp(attrs, ctx, inputs, req, outputs); + return; + } + BoxNMSForward(attrs, ctx, inputs, req, outputs); +} + + +NNVM_REGISTER_OP(_contrib_box_nms2) +.set_attr("FCompute", BoxNMSForwardGPU); + NNVM_REGISTER_OP(_contrib_box_nms) .set_attr("FCompute", BoxNMSForward); From 5d0a43049d8e284371b61d1c99269022c33428ce Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Oct 2019 15:41:23 -0700 Subject: [PATCH 02/19] NMS kernel --- src/operator/contrib/bounding_box.cu | 402 +++++++++++++++++++++++++-- 1 file changed, 376 insertions(+), 26 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index c5ea0e9e7db4..f824f6d58e25 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -45,6 +45,8 @@ struct TempWorkspace { index_t* batches; index_t scratch_space; uint8_t* scratch; + index_t buffer_space; + DType* buffer; index_t nms_scratch_space; uint32_t* nms_scratch; index_t indices_temp_spaces; @@ -109,7 +111,7 @@ void FilterAndPrepareAuxData(const Tensor& data, param.score_index, param.background_id); } -template +template __global__ void CompactData_kernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, @@ -119,7 +121,7 @@ __global__ void CompactData_kernel(const index_t* indices, const DType* source, for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { const index_t my_element = tid / element_width; const index_t my_element_in_batch = my_element % num_elements_per_batch; - if (write_whole_output && my_element_in_batch >= topk) { + if (check_topk && my_element_in_batch >= topk) { destination[tid] = -1; } else { const index_t source_element = indices[my_element]; @@ -138,11 +140,19 @@ void CompactData(const Tensor& indices, const int max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, 
n_threads), max_blocks); - CompactData_kernel<<::GetStream(s)>>>( - indices.dptr_, source.dptr_, - destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + if (topk > 0) { + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); + } else { + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); + } } template @@ -158,13 +168,45 @@ void WorkspaceForSort(const int num_batch, alignment); } -template +template __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, - const index_t current_start); + const index_t current_start, + const index_t num_elems, + const index_t num_batches, + const index_t num_blocks_per_row_batch, + const index_t num_blocks_per_row, + const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const int coord_index, + const int class_index, + const int score_index, + const float threshold); + +template +__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, + DType * data, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elems, + const index_t start_index, + const index_t topk); + +template +__global__ void ReduceNMSResultRest_kernel(DType* data, + const uint32_t* nms_results, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk, + const index_t num_blocks_per_batch); template struct NMS { - static const int THRESHOLD = 1024; + static constexpr int THRESHOLD = 512; void operator()(Tensor* data, Tensor* scratch, @@ -172,19 +214,292 @@ struct NMS { const BoxNMSParam& param, Stream* s) { const int n_threads = 512; - const index_t n_batch = data->shape_[0]; + const index_t num_batches = data->shape_[0]; + const index_t num_elements_per_batch = data->shape_[1]; + const index_t element_width = data->shape_[2]; for (index_t current_start = 0; current_start < topk; current_start += THRESHOLD) { const index_t n_elems = topk - current_start; - const int n_blocks = ceil_div(THRESHOLD / (sizeof(uint32_t) * 8) * n_elems * n_batch, n_threads); - CalculateGreedyNMSResults_kernel<<::GetStream(s)>>>( - data->dptr_, scratch->dptr_, current_start); + const index_t num_blocks_per_row_batch = ceil_div(n_elems, n_threads); + const index_t num_blocks_per_row = num_blocks_per_row_batch * num_batches; + const index_t n_blocks = THRESHOLD / (sizeof(uint32_t) * 8) * num_blocks_per_row; + if (param.in_format == box_common_enum::kCorner) { + CalculateGreedyNMSResults_kernel + <<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, + num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, + num_elements_per_batch, param.coord_start, param.id_index, + param.score_index, param.overlap_thresh); + } else { + CalculateGreedyNMSResults_kernel + <<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, + num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, + num_elements_per_batch, param.coord_start, param.id_index, + param.score_index, param.overlap_thresh); + } + ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( + scratch->dptr_, data->dptr_, param.score_index, + element_width, num_batches, num_elements_per_batch, + current_start, topk); + const index_t n_rest_elems = 
n_elems - THRESHOLD; + const index_t num_rest_blocks_per_batch = ceil_div(n_rest_elems, n_threads); + const index_t num_rest_blocks = num_rest_blocks_per_batch * num_batches; + if (n_rest_elems > 0) { + ReduceNMSResultRest_kernel<<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, param.score_index, element_width, + num_batches, num_elements_per_batch, current_start, topk, + num_rest_blocks_per_batch); + } } } }; -template +template +__device__ __forceinline__ DType calculate_area(const DType b0, const DType b1, + const DType b2, const DType b3) { + DType width = b2; + DType height = b3; + if (encode == box_common_enum::kCorner) { + width -= b0; + height -= b1; + } + if (width < 0 || height < 0) return 0; + return width * height; +} + +template +__device__ __forceinline__ DType calculate_intersection(const DType a0, const DType a1, + const DType a2, const DType a3, + const DType b0, const DType b1, + const DType b2, const DType b3) { + DType wx, wy; + if (encode == box_common_enum::kCorner) { + const DType left = a0 > b0 ? a0 : b0; + const DType bottom = a1 > b1 ? a1 : b1; + const DType right = a2 < b2 ? a2 : b2; + const DType top = a3 < b3 ? a3 : b3; + wx = right - left; + wy = top - bottom; + } else { + const DType al = 2 * a0 - a2; + const DType ar = 2 * a0 + a2; + const DType bl = 2 * b0 - b2; + const DType br = 2 * b0 + b2; + const DType left = bl > al ? bl : al; + const DType right = br < ar ? br : ar; + wx = right - left; + const DType ab = 2 * a1 - a3; + const DType at = 2 * a1 + a3; + const DType bb = 2 * b1 - b3; + const DType bt = 2 * b1 + b3; + const DType bottom = bb > ab ? bb : ab; + const DType top = bt < at ? bt : at; + wy = top - bottom; + wy = wy / 4; // To compensate for both wx and wy being 2x too large + } + if (wx <= 0 || wy <= 0) { + return 0; + } else { + return (wx * wy); + } +} + +template +__launch_bounds__(512) __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, - const index_t current_start) { + const index_t current_start, + const index_t num_elems, + const index_t num_batches, + const index_t num_blocks_per_row_batch, + const index_t num_blocks_per_row, + const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const int coord_index, + const int class_index, + const int score_index, + const float threshold) { + constexpr int max_elem_width = 20; + constexpr int num_other_boxes = sizeof(uint32_t) * 8; + __shared__ DType other_boxes[max_elem_width * num_other_boxes]; + const index_t my_row = blockIdx.x / num_blocks_per_row; + const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; + const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; + const index_t my_batch = (my_block_offset_in_row) / num_blocks_per_row_batch; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + current_start + threadIdx.x; + + // Load other boxes + const index_t offset = (my_batch * num_elements_per_batch + + current_start + my_row * num_other_boxes) * + element_width; + for (int i = threadIdx.x; i < element_width * num_other_boxes; i += blockDim.x) { + other_boxes[i] = data[offset + i]; + } + __syncthreads(); + + if (my_element_in_batch >= topk) return; + + DType my_box[4]; + DType my_class = -1; + DType my_score = -1; + const index_t my_offset = (my_batch * num_elements_per_batch + my_element_in_batch) * + element_width; + my_score = data[my_offset + score_index]; +#pragma unroll + for (int i = 0; i < 4; ++i) { + my_box[i] = data[my_offset + 
coord_index + i]; + } + if (class_index != -1) { + my_class = data[my_offset + class_index]; + } + DType my_area = calculate_area(my_box[0], my_box[1], my_box[2], my_box[3]); + + /*if (my_element_in_batch == 2) {*/ + /*printf("My score: %f\n", my_score);*/ + /*}*/ + uint32_t ret = 0; + if (my_score != -1) { +#pragma unroll + for (int i = 0; i < num_other_boxes; ++i) { + const int other_boxes_offset = element_width * i; + if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && + other_boxes[other_boxes_offset + score_index] != -1){ + const DType their_area = calculate_area( + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + + const DType intersect = calculate_intersection( + my_box[0], my_box[1], my_box[2], my_box[3], + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + const DType iou = intersect / (my_area + their_area - intersect); + if (iou > threshold) { + ret = ret | (1u << i); + } + } + } + } + result[my_row * topk * num_batches + my_element_in_batch] = ~ret; + /*if (my_element_in_batch == 2) {*/ + /*printf("myRow: %d, ret: %x\n", my_row, ~ret);*/ + /*}*/ + /*if (ret) {*/ + /*printf("HAHA %d %d, %x\n", blockIdx.x, threadIdx.x, ret);*/ + /*}*/ +} + +template +__launch_bounds__(NMS::THRESHOLD) +__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, + DType * data, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk) { + constexpr int n_threads = NMS::THRESHOLD; + constexpr int warp_size = 32; + const index_t my_batch = blockIdx.x; + const index_t my_element_in_batch = threadIdx.x + start_index; + const index_t my_element = my_batch * topk + my_element_in_batch; + const int my_warp = threadIdx.x / warp_size; + const int my_lane = threadIdx.x % warp_size; + + __shared__ uint32_t current_valid_boxes; + bool valid = true; + const uint32_t full_mask = 0xFFFFFFFF; + uint32_t valid_boxes; + +#pragma unroll + for (int i = 0; i < n_threads / warp_size; ++i) { + const uint32_t my_mask = my_element_in_batch < topk ? 
+ nms_results[i * topk * num_batches + my_element]: + full_mask; + if (my_warp == i) { + valid_boxes = __ballot_sync(full_mask, valid); + /*if (i == 0) {*/ + /*printf("After ballot %d: %x\n", threadIdx.x, valid_boxes);*/ + /*}*/ +#pragma unroll + for (int j = 0; j < warp_size; ++j) { +// if ((valid_boxes & (1 << j)) != 0) { + const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); + const uint32_t p = (1 << (j+1)) - 1; + const uint32_t mp = mask | p; + valid_boxes = valid_boxes & mp; + valid = (valid_boxes & (1 << my_lane)) != 0; + /*if (i == 0) {*/ + /*printf("After %d %d: %x %x %x\n",j, threadIdx.x, mask, p,valid_boxes);*/ + /*}*/ +// } + } + valid = (valid_boxes & (1 << my_lane)) != 0; + /*if (i == 0) {*/ + /*printf("After everything %d: %x\n", threadIdx.x, valid_boxes);*/ + /*}*/ + /*printf("%d, %d: %u\n", i, threadIdx.x, valid_boxes);*/ + if (my_lane == 0) { + current_valid_boxes = valid_boxes; + } + } + __syncthreads(); + if (my_warp > i) { + valid = valid && (((~my_mask) & current_valid_boxes) == 0); + } + __syncthreads(); + } + if (my_lane == 0) { + nms_results[my_element] = valid_boxes; + } + valid = (valid_boxes & (1 << my_lane)) != 0; + if (!valid) { + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + } +} + +template +__launch_bounds__(512) +__global__ void ReduceNMSResultRest_kernel(DType* data, + const uint32_t* nms_results, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk, + const index_t num_blocks_per_batch) { + constexpr int num_other_boxes = sizeof(uint32_t) * 8; + constexpr int num_iterations = NMS::THRESHOLD / num_other_boxes; + constexpr int warp_size = 32; + const index_t my_block_offset_in_batch = blockIdx.x % num_blocks_per_batch; + const index_t my_batch = blockIdx.x / num_blocks_per_batch; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + start_index + NMS::THRESHOLD + threadIdx.x; + const index_t my_element = my_batch * topk + my_element_in_batch; + + if (my_element_in_batch >= topk) return; + + bool valid = true; + +#pragma unroll + for (int i = 0; i < num_iterations; ++i) { + const uint32_t my_mask = nms_results[i * topk * num_batches + my_element]; + const uint32_t valid_boxes = nms_results[my_batch * topk + i * warp_size + start_index]; + + const bool no_hit = (valid_boxes & (~my_mask)) == 0; + valid = valid && no_hit; + /*if (my_element_in_batch == 645) {*/ + /*printf("my_mask %x valid_boxes %x no_hit %d valid %d\n", my_mask, valid_boxes, (int)no_hit, (int)valid);*/ + /*}*/ + } + + if (!valid) { + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + } } template @@ -203,9 +518,7 @@ TempWorkspace GetWorkspace(const int num_batch, workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); // Place for a buffer - workspace.scratch_space = std::max(workspace.scratch_space, - align(num_batch * num_elem * width_elem * sizeof(DType), - alignment)); + workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * num_batch * topk * sizeof(uint32_t), alignment); @@ -225,13 +538,48 @@ TempWorkspace GetWorkspace(const int num_batch, workspace.scores_temp_space); workspace.scratch = 
reinterpret_cast(workspace.batches) + workspace.batch_temp_space; - workspace.nms_scratch = reinterpret_cast(workspace.scratch + - workspace.scratch_space); + workspace.buffer = reinterpret_cast(workspace.scratch + + workspace.scratch_space); + workspace.nms_scratch = reinterpret_cast( + reinterpret_cast(workspace.buffer) + + workspace.buffer_space); workspace.indices = reinterpret_cast( - reinterpret_cast(workspace.nms_scratch) + workspace.nms_scratch_space); + reinterpret_cast(workspace.nms_scratch) + + workspace.nms_scratch_space); return workspace; } +template +__global__ void ExtractScores_kernel(const DType* data, DType* scores, + const index_t N, const int element_width, + const int score_index) { + const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) { + scores[tid] = data[tid * element_width + score_index]; + } +} + +template +void CompactNMSResults(const Tensor& data, + Tensor* out, + Tensor* indices, + Tensor* scores, + const int score_index, + Stream* s) { + constexpr int n_threads = 512; + const index_t num_elements = scores->shape_.Size(); + const index_t num_elements_per_batch = data.shape_[1]; + const int element_width = data.shape_[2]; + const index_t n_blocks = ceil_div(num_elements, n_threads); + ExtractScores_kernel<<::GetStream(s)>>>( + data.dptr_, scores->dptr_, num_elements, element_width, score_index); + *indices = mshadow::expr::range(0, num_elements); + //mxnet::op::SortByKey(scores, indices, false, &scratch); + //batches = indices / mshadow::expr::ScalarExp(num_elements_per_batch); + //mxnet::op::SortByKey(batches, indices, true, &scratch); + CompactData(*indices, data, out, -1, s); +} + } // namespace void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, @@ -278,18 +626,20 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); + Tensor buffer(workspace.buffer, + Shape3(num_batch, num_elem, width_elem), s); Tensor nms_scratch(workspace.nms_scratch, - Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), topk * num_batch), s); + Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), + topk * num_batch), + s); indices = mshadow::expr::range(0, num_batch * num_elem); mxnet::op::SortByKey(scores, indices, false, &scratch); batches = indices / mshadow::expr::ScalarExp(num_elem); mxnet::op::SortByKey(batches, indices, true, &scratch); - Tensor buffer(reinterpret_cast(workspace.scratch), - Shape3(num_batch, num_elem, width_elem), s); CompactData(indices, out, &buffer, topk, s); NMS nms; nms(&buffer, &nms_scratch, topk, param, s); - mshadow::Copy(out, buffer, s); + CompactNMSResults(buffer, &out, &indices, &scores, param.score_index, s); }); } From 0829222f56f8317f1e2a814500ecff1bf58e352c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 11:10:40 -0700 Subject: [PATCH 03/19] Removing second sort --- src/operator/contrib/bounding_box-inl.cuh | 2 - src/operator/contrib/bounding_box.cu | 154 +++++++++++----------- 2 files changed, 80 insertions(+), 76 deletions(-) diff --git a/src/operator/contrib/bounding_box-inl.cuh b/src/operator/contrib/bounding_box-inl.cuh index 128d3cbcecc2..de215ce35a98 100644 --- a/src/operator/contrib/bounding_box-inl.cuh +++ b/src/operator/contrib/bounding_box-inl.cuh @@ -202,9 +202,7 @@ void NMSApply(mshadow::Stream *s, int in_format) { using namespace mxnet_op; constexpr int THRESHOLD = 1024; - std::cout << "TOPK " << topk << std::endl; for (int ref = 
0; ref < topk; ref += THRESHOLD) { - std::cout << "REF: " << ref << std::endl; constexpr int block_size = 512; constexpr int N = THRESHOLD / block_size; auto stream = mshadow::Stream::GetStream(s); diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f824f6d58e25..f87ce148313a 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -41,8 +41,6 @@ template struct TempWorkspace { index_t scores_temp_space; DType* scores; - index_t batch_temp_space; - index_t* batches; index_t scratch_space; uint8_t* scratch; index_t buffer_space; @@ -63,13 +61,12 @@ inline index_t align(index_t x, index_t alignment) { template __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, - index_t* batches, index_t num_elements_per_batch, + index_t num_elements_per_batch, const index_t element_width, const float threshold, const int id_index, const int score_index, const int background_id) { index_t tid = blockIdx.x * blockDim.x + threadIdx.x; bool first_in_element = (tid % element_width == 0); - index_t my_batch = tid / (num_elements_per_batch * element_width); index_t start_of_my_element = tid - (tid % element_width); DType my_score = data[start_of_my_element + score_index]; @@ -88,7 +85,6 @@ __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DT if (first_in_element) { index_t offset = tid / element_width; scores[offset] = my_score; - batches[offset] = my_batch; } } @@ -106,16 +102,17 @@ void FilterAndPrepareAuxData(const Tensor& data, 0, Stream::GetStream(s)>>>( data.dptr_, out->dptr_, workspace.scores, - workspace.batches, data.shape_[1], data.shape_[2], + data.shape_[1], data.shape_[2], param.valid_thresh, param.id_index, param.score_index, param.background_id); } -template +template __global__ void CompactData_kernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, const index_t num_elements_per_batch, + const int score_index, const index_t N) { const index_t tid_start = blockIdx.x * blockDim.x + threadIdx.x; for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { @@ -124,47 +121,57 @@ __global__ void CompactData_kernel(const index_t* indices, const DType* source, if (check_topk && my_element_in_batch >= topk) { destination[tid] = -1; } else { + DType ret; const index_t source_element = indices[my_element]; - destination[tid] = source[source_element * element_width + tid % element_width]; + DType score = 0; + if (check_score) { + score = source[source_element * element_width + score_index]; + } + if (score >= 0) { + ret = source[source_element * element_width + tid % element_width]; + } else { + ret = -1; + } + destination[tid] = ret; } } } -template +template void CompactData(const Tensor& indices, const Tensor& source, Tensor* destination, const index_t topk, + const int score_index, Stream* s) { const int n_threads = 512; const int max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) { - CompactData_kernel<<::GetStream(s)>>>( + CompactData_kernel<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + source.shape_[2], source.shape_[1], + score_index, N); } else { - CompactData_kernel<<::GetStream(s)>>>( + CompactData_kernel<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + 
source.shape_[2], source.shape_[1], + score_index, N); } } template -void WorkspaceForSort(const int num_batch, - const int num_elem, +void WorkspaceForSort(const int num_elem, const int width_elem, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); - const index_t sort_batch_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); - workspace->scratch_space = align(std::max(sort_scores_temp_space, - sort_batch_temp_space), + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem); + workspace->scratch_space = align(sort_scores_temp_space, alignment); } @@ -227,14 +234,16 @@ struct NMS { <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, - num_elements_per_batch, param.coord_start, param.id_index, + num_elements_per_batch, param.coord_start, + param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } else { CalculateGreedyNMSResults_kernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, - num_elements_per_batch, param.coord_start, param.id_index, + num_elements_per_batch, param.coord_start, + param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( @@ -354,9 +363,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } DType my_area = calculate_area(my_box[0], my_box[1], my_box[2], my_box[3]); - /*if (my_element_in_batch == 2) {*/ - /*printf("My score: %f\n", my_score);*/ - /*}*/ uint32_t ret = 0; if (my_score != -1) { #pragma unroll @@ -384,12 +390,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } } result[my_row * topk * num_batches + my_element_in_batch] = ~ret; - /*if (my_element_in_batch == 2) {*/ - /*printf("myRow: %d, ret: %x\n", my_row, ~ret);*/ - /*}*/ - /*if (ret) {*/ - /*printf("HAHA %d %d, %x\n", blockIdx.x, threadIdx.x, ret);*/ - /*}*/ } template @@ -413,6 +413,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, __shared__ uint32_t current_valid_boxes; bool valid = true; const uint32_t full_mask = 0xFFFFFFFF; + const uint32_t my_warp_mask = 1 << my_lane; uint32_t valid_boxes; #pragma unroll @@ -422,27 +423,15 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, full_mask; if (my_warp == i) { valid_boxes = __ballot_sync(full_mask, valid); - /*if (i == 0) {*/ - /*printf("After ballot %d: %x\n", threadIdx.x, valid_boxes);*/ - /*}*/ #pragma unroll for (int j = 0; j < warp_size; ++j) { -// if ((valid_boxes & (1 << j)) != 0) { const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); const uint32_t p = (1 << (j+1)) - 1; const uint32_t mp = mask | p; valid_boxes = valid_boxes & mp; - valid = (valid_boxes & (1 << my_lane)) != 0; - /*if (i == 0) {*/ - /*printf("After %d %d: %x %x %x\n",j, threadIdx.x, mask, p,valid_boxes);*/ - /*}*/ -// } + valid = (valid_boxes & my_warp_mask) != 0; } - valid = (valid_boxes & (1 << my_lane)) != 0; - /*if (i == 0) {*/ - /*printf("After everything %d: %x\n", threadIdx.x, valid_boxes);*/ - /*}*/ - /*printf("%d, %d: %u\n", i, threadIdx.x, valid_boxes);*/ + // valid = (valid_boxes & my_warp_mask) != 0; if (my_lane == 0) { current_valid_boxes = valid_boxes; } @@ -456,7 +445,7 @@ 
__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - valid = (valid_boxes & (1 << my_lane)) != 0; + // valid = (valid_boxes & my_warp_mask) != 0; if (!valid) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } @@ -492,9 +481,6 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, const bool no_hit = (valid_boxes & (~my_mask)) == 0; valid = valid && no_hit; - /*if (my_element_in_batch == 645) {*/ - /*printf("my_mask %x valid_boxes %x no_hit %d valid %d\n", my_mask, valid_boxes, (int)no_hit, (int)valid);*/ - /*}*/ } if (!valid) { @@ -514,16 +500,14 @@ TempWorkspace GetWorkspace(const int num_batch, // Get the workspace size workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); - workspace.batch_temp_space = align(num_batch * num_elem * sizeof(index_t), alignment); workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); - WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); + WorkspaceForSort(num_elem, width_elem, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * num_batch * topk * sizeof(uint32_t), alignment); const index_t workspace_size = workspace.scores_temp_space + - workspace.batch_temp_space + workspace.scratch_space + workspace.nms_scratch_space + workspace.indices_temp_spaces; @@ -534,10 +518,8 @@ TempWorkspace GetWorkspace(const int num_batch, // Populate workspace pointers workspace.scores = scratch_memory.dptr_; - workspace.batches = reinterpret_cast(reinterpret_cast(workspace.scores) + - workspace.scores_temp_space); - workspace.scratch = reinterpret_cast(workspace.batches) + - workspace.batch_temp_space; + workspace.scratch = reinterpret_cast(workspace.scores) + + workspace.scores_temp_space; workspace.buffer = reinterpret_cast(workspace.scratch + workspace.scratch_space); workspace.nms_scratch = reinterpret_cast( @@ -564,20 +546,30 @@ void CompactNMSResults(const Tensor& data, Tensor* out, Tensor* indices, Tensor* scores, + Tensor* scratch, const int score_index, Stream* s) { + using mshadow::Shape1; constexpr int n_threads = 512; const index_t num_elements = scores->shape_.Size(); const index_t num_elements_per_batch = data.shape_[1]; + const index_t num_batches = data.shape_[0]; const int element_width = data.shape_[2]; const index_t n_blocks = ceil_div(num_elements, n_threads); ExtractScores_kernel<<::GetStream(s)>>>( data.dptr_, scores->dptr_, num_elements, element_width, score_index); *indices = mshadow::expr::range(0, num_elements); - //mxnet::op::SortByKey(scores, indices, false, &scratch); - //batches = indices / mshadow::expr::ScalarExp(num_elements_per_batch); - //mxnet::op::SortByKey(batches, indices, true, &scratch); - CompactData(*indices, data, out, -1, s); + for (index_t i = 0; i < num_batches; ++i) { + // Sort each batch separately + Tensor scores_batch(scores->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch); + } + CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -602,7 +594,9 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, int num_elem = 
in_shape[indim - 2]; int width_elem = in_shape[indim - 1]; - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + //MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, + using DType = float; + { Tensor data = inputs[box_nms_enum::kData] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); Tensor out = outputs[box_nms_enum::kOut] @@ -622,7 +616,6 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); - Tensor batches(workspace.batches, Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); @@ -633,14 +626,32 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, topk * num_batch), s); indices = mshadow::expr::range(0, num_batch * num_elem); - mxnet::op::SortByKey(scores, indices, false, &scratch); - batches = indices / mshadow::expr::ScalarExp(num_elem); - mxnet::op::SortByKey(batches, indices, true, &scratch); - CompactData(indices, out, &buffer, topk, s); + for (index_t i = 0; i < num_batch; ++i) { + // Sort each batch separately + Tensor scores_batch(scores.dptr_ + i * num_elem, + Shape1(num_elem), + s); + Tensor indices_batch(indices.dptr_ + i * num_elem, + Shape1(num_elem), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch); + } + CompactData(indices, out, &buffer, topk, -1, s); NMS nms; nms(&buffer, &nms_scratch, topk, param, s); - CompactNMSResults(buffer, &out, &indices, &scores, param.score_index, s); - }); + CompactNMSResults(buffer, &out, &indices, &scores, &scratch, param.score_index, s); + + // convert encoding + if (param.in_format != param.out_format) { + if (box_common_enum::kCenter == param.out_format) { + mxnet::op::mxnet_op::Kernel::Launch(s, num_batch * num_elem, + out.dptr_ + param.coord_start, width_elem); + } else { + mxnet::op::mxnet_op::Kernel::Launch(s, num_batch * num_elem, + out.dptr_ + param.coord_start, width_elem); + } + } + };//); } void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, @@ -653,11 +664,6 @@ void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; - std::cout << "Reqs" << std::endl; - for (const auto& r : req) { - std::cout << r << std::endl; - } - std::cout << "END: Reqs" << std::endl; if (req[1] == kNullOp) { BoxNMSForwardGPU_notemp(attrs, ctx, inputs, req, outputs); return; From f3445f872f7908cb6bb896dce4008bbb2a3ffb4b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 13:11:16 -0700 Subject: [PATCH 04/19] Optimization --- src/operator/contrib/bounding_box.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f87ce148313a..228c1273e156 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -416,10 +416,15 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const uint32_t my_warp_mask = 1 << my_lane; uint32_t valid_boxes; + uint32_t my_next_mask = my_element_in_batch < topk ? + nms_results[my_element]: + full_mask; #pragma unroll for (int i = 0; i < n_threads / warp_size; ++i) { - const uint32_t my_mask = my_element_in_batch < topk ? 
- nms_results[i * topk * num_batches + my_element]: + const uint32_t my_mask = my_next_mask; + my_next_mask = (((i + 1) < n_threads / warp_size) && + (my_element_in_batch < topk)) ? + nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; if (my_warp == i) { valid_boxes = __ballot_sync(full_mask, valid); @@ -431,7 +436,6 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, valid_boxes = valid_boxes & mp; valid = (valid_boxes & my_warp_mask) != 0; } - // valid = (valid_boxes & my_warp_mask) != 0; if (my_lane == 0) { current_valid_boxes = valid_boxes; } @@ -445,7 +449,6 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - // valid = (valid_boxes & my_warp_mask) != 0; if (!valid) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } From e653f58bd9027bfc6ac2c979c3f8b816cc5cc4dc Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 14:45:46 -0700 Subject: [PATCH 05/19] Adding out-of-place ability to SortByKey --- src/operator/tensor/sort_op-inl.cuh | 130 +++++++++++++++++++++------- src/operator/tensor/sort_op.h | 50 +++++++++-- 2 files changed, 145 insertions(+), 35 deletions(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index b20b466d9c2b..63ccc5752fb0 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -95,13 +95,22 @@ SortPairsWorkspaceSize(const size_t num_keys) { template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys) { +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place, + const bool values_in_place) { #ifdef SORT_WITH_THRUST return 0; #else size_t keys_bytes, values_bytes; WorkspaceSize4KeysAndValues(num_keys, &keys_bytes, &values_bytes); - return keys_bytes + values_bytes + SortPairsWorkspaceSize(num_keys); + size_t ret = SortPairsWorkspaseSize(num_keys); + if (keys_in_place) { + ret += keys_bytes; + } + if (values_in_place) { + ret += values_bytes; + } + return ret; #endif } @@ -111,7 +120,9 @@ inline typename std::enable_if::val SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 7000 @@ -135,11 +146,23 @@ SortByKeyImpl(mshadow::Tensor keys, NULL, NULL, NULL, NULL, keys.size(0), begin_bit, end_bit, stream); } + + size_t required_storage = sortpairs_bytes + + (sorted_keys == nullptr ? keys_bytes : 0) + + (sorted_values == nullptr ? values_bytes : 0); + // Check that we have enough storage - CHECK_GE(workspace->size(0), keys_bytes + values_bytes + sortpairs_bytes); - // - KDType* keys_out_ptr = reinterpret_cast(workspace->dptr_); - VDType* values_out_ptr = reinterpret_cast(workspace->dptr_ + keys_bytes); + CHECK_GE(workspace->size(0), required_storage) + << "Workspace given to SortByKey is too small: requested " << required_storage << + " B and got " << workspace->size(0) << " B."; + + KDType* keys_out_ptr = sorted_keys == nullptr ? + reinterpret_cast(workspace->dptr_) : + sorted_keys->dptr_; + VDType* values_out_ptr = sorted_values == nullptr ? 
+ reinterpret_cast(workspace->dptr_ + keys_bytes) : + sorted_values->dptr_; + void* temp_storage = reinterpret_cast(workspace->dptr_ + keys_bytes + values_bytes); // Sort if (is_ascend) { @@ -152,17 +175,31 @@ SortByKeyImpl(mshadow::Tensor keys, keys.size(0), begin_bit, end_bit, stream); } // Copy result back to [keys, values] - mshadow::Tensor keys_out(keys_out_ptr, mshadow::Shape1(keys.size(0)), - keys.stream_); - mshadow::Tensor values_out(values_out_ptr, mshadow::Shape1(keys.size(0)), - keys.stream_); - mshadow::Copy(keys, keys_out, keys.stream_); - mshadow::Copy(values, values_out, values.stream_); + if (sorted_keys == nullptr) { + mshadow::Tensor keys_out(keys_out_ptr, mshadow::Shape1(keys.size(0)), + keys.stream_); + mshadow::Copy(keys, keys_out, keys.stream_); + } + if (sorted_values == nullptr) { + mshadow::Tensor values_out(values_out_ptr, mshadow::Shape1(keys.size(0)), + keys.stream_); + mshadow::Copy(values, values_out, values.stream_); + } } else { #endif // SORT_WITH_THRUST // No workspace, sort using thrust - thrust::device_ptr key_iter = thrust::device_pointer_cast(keys.dptr_); - thrust::device_ptr value_iter = thrust::device_pointer_cast(values.dptr_); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(k->dptr_); + const auto value_iter = thrust::device_pointer_cast(v->dptr_); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -187,14 +224,25 @@ inline typename std::enable_if<((!std::is_same::va SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr key_iter = thrust::device_pointer_cast(keys.dptr_); - thrust::device_ptr<__half> value_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(values.dptr_)); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(k->dptr_); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -216,14 +264,25 @@ inline typename std::enable_if<(std::is_same::valu SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr<__half> key_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(keys.dptr_)); - thrust::device_ptr value_iter = thrust::device_pointer_cast(values.dptr_); + auto* k = &keys; + auto* v 
= &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(v->dptr_); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -246,15 +305,25 @@ inline typename std::enable_if<(std::is_same::valu SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr<__half> key_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(keys.dptr_)); - thrust::device_ptr<__half> value_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(values.dptr_)); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -274,7 +343,10 @@ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, const int begin_bit, const int end_bit) { - SortByKeyImpl(keys, values, is_ascend, workspace, begin_bit, end_bit); + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { + SortByKeyImpl(keys, values, is_ascend, workspace, begin_bit, end_bit, sorted_keys, sorted_values); } } // namespace op diff --git a/src/operator/tensor/sort_op.h b/src/operator/tensor/sort_op.h index 6d4675a0775a..11aea9db09ec 100644 --- a/src/operator/tensor/sort_op.h +++ b/src/operator/tensor/sort_op.h @@ -49,11 +49,17 @@ namespace op { * \param keys the keys to sort * \param values the values that sorts w.r.t the key * \param is_ascend whether to sort key in ascending order + * \param begin_bit The beginning bit of the different values in keys. Default 0. + * \param end_bit The ending bit of the different values in keys. Default to 8 * sizeof(dtype of key). + * \param sorted_keys If specified, keys will be sorted out of place. + * \param sorted_values If specified, values will be sorted out of place. 
*/ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend = true, mshadow::Tensor* workspace = NULL, - const int begin_bit = 0, const int end_bit = sizeof(KDType)*8) { + const int begin_bit = 0, const int end_bit = sizeof(KDType)*8, + mshadow::Tensor* sorted_keys = nullptr, + mshadow::Tensor* sorted_values = nullptr) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); CHECK_EQ(keys.size(0), values.size(0)) @@ -62,6 +68,12 @@ inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor idx(keys.size(0)); std::vector keys_vec(keys.size(0)); std::vector values_vec(values.size(0)); + if (sorted_keys == nullptr) { + sorted_keys = &keys; + } + if (sorted_values == nullptr) { + sorted_values = &values; + } for (index_t i = 0; i < keys.size(0); i++) { idx[i] = i; keys_vec[i] = keys[i]; @@ -77,18 +89,28 @@ inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor keys_vec[i2]; }); } for (index_t i = 0; i < values.size(0); i++) { - keys[i] = keys_vec[idx[i]]; - values[i] = values_vec[idx[i]]; + (*sorted_keys)[i] = keys_vec[idx[i]]; + (*sorted_values)[i] = values_vec[idx[i]]; } } /*! * \brief CPU/GPU: Return the amount of temporary storage in bytes required for SortByKey * \param num_keys number of keys to sort + * \param keys_in_place Whether the sorting of keys will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_keys parameter. + * \param values_in_place Whether the sorting of values will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_values parameter. */ template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys) { +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place = true, + const bool values_in_place = true) { return 0; } @@ -97,18 +119,34 @@ SortByKeyWorkspaceSize(const size_t num_keys) { * \param keys the keys to sort * \param values the values that sorts w.r.t the key * \param is_ascend whether to sort key in ascending order + * \param begin_bit The beginning bit of the different values in keys. Default 0. + * \param end_bit The ending bit of the different values in keys. Default to 8 * sizeof(dtype of key). + * \param sorted_keys If specified, keys will be sorted out of place. + * \param sorted_values If specified, values will be sorted out of place. */ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend = true, mshadow::Tensor* workspace = NULL, - const int begin_bit = 0, const int end_bit = sizeof(KDType)*8); + const int begin_bit = 0, const int end_bit = sizeof(KDType)*8, + mshadow::Tensor* sorted_keys = nullptr, + mshadow::Tensor* sorted_values = nullptr); /*! * \brief CPU/GPU: Return the amount of temporary storage in bytes required for SortByKey * \param num_keys number of keys to sort + * \param keys_in_place Whether the sorting of keys will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_keys parameter. + * \param values_in_place Whether the sorting of values will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_values parameter. 
*/ template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys); +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place = true, + const bool values_in_place = true); } // namespace op } // namespace mxnet From 49ded5a64e2d1a7b3f0ff2b6a3d2fcde2971e480 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 15:36:44 -0700 Subject: [PATCH 06/19] Optimization pt2 --- src/operator/contrib/bounding_box.cu | 66 +++++++++++++++++++--------- src/operator/tensor/sort_op-inl.cuh | 11 +++-- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 228c1273e156..bcd31a7ad657 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -170,7 +170,7 @@ void WorkspaceForSort(const int num_elem, const int width_elem, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem); + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); workspace->scratch_space = align(sort_scores_temp_space, alignment); } @@ -411,9 +411,10 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const int my_lane = threadIdx.x % warp_size; __shared__ uint32_t current_valid_boxes; - bool valid = true; const uint32_t full_mask = 0xFFFFFFFF; - const uint32_t my_warp_mask = 1 << my_lane; + const uint32_t my_lane_mask = 1 << my_lane; + const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; + uint32_t valid = my_lane_mask; uint32_t valid_boxes; uint32_t my_next_mask = my_element_in_batch < topk ? @@ -421,35 +422,35 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, full_mask; #pragma unroll for (int i = 0; i < n_threads / warp_size; ++i) { - const uint32_t my_mask = my_next_mask; + uint32_t my_mask = my_next_mask; my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) ? nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; if (my_warp == i) { - valid_boxes = __ballot_sync(full_mask, valid); + my_mask = my_mask | my_warp_mask; + // Loop over warp_size - 1 because the last + // thread does not contribute to the mask anyway #pragma unroll - for (int j = 0; j < warp_size; ++j) { - const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); - const uint32_t p = (1 << (j+1)) - 1; - const uint32_t mp = mask | p; - valid_boxes = valid_boxes & mp; - valid = (valid_boxes & my_warp_mask) != 0; + for (int j = 0; j < warp_size - 1; ++j) { + const uint32_t mask = __shfl_sync(full_mask, valid ? 
my_mask : full_mask, j); + valid = valid & mask; } + valid_boxes = __ballot_sync(full_mask, valid); if (my_lane == 0) { current_valid_boxes = valid_boxes; } } __syncthreads(); - if (my_warp > i) { - valid = valid && (((~my_mask) & current_valid_boxes) == 0); + if ((my_warp > i) && (((~my_mask) & current_valid_boxes) != 0)) { + valid = 0; } __syncthreads(); } if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - if (!valid) { + if (valid == 0) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } } @@ -502,8 +503,8 @@ TempWorkspace GetWorkspace(const int num_batch, const int alignment = 128; // Get the workspace size - workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); - workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); + workspace.scores_temp_space = 2 * align(num_batch * num_elem * sizeof(DType), alignment); + workspace.indices_temp_spaces = 2 * align(num_batch * num_elem * sizeof(index_t), alignment); WorkspaceForSort(num_elem, width_elem, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); @@ -549,6 +550,8 @@ void CompactNMSResults(const Tensor& data, Tensor* out, Tensor* indices, Tensor* scores, + Tensor* sorted_indices, + Tensor* sorted_scores, Tensor* scratch, const int score_index, Stream* s) { @@ -570,9 +573,18 @@ void CompactNMSResults(const Tensor& data, Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, Shape1(num_elements_per_batch), s); - mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch); + Tensor sorted_scores_batch(sorted_scores->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + Tensor sorted_indices_batch(sorted_indices->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch, + 0, 8 * sizeof(DType), &sorted_scores_batch, + &sorted_indices_batch); } - CompactData(*indices, data, out, -1, score_index, s); + CompactData(*sorted_indices, data, out, -1, score_index, s); + //CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -619,7 +631,9 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); + Tensor sorted_scores(workspace.scores + scores.MSize(), Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); + Tensor sorted_indices(workspace.indices + indices.MSize(), Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); Tensor buffer(workspace.buffer, @@ -637,12 +651,22 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, Tensor indices_batch(indices.dptr_ + i * num_elem, Shape1(num_elem), s); - mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch); + Tensor sorted_scores_batch(sorted_scores.dptr_ + i * num_elem, + Shape1(num_elem), + s); + Tensor sorted_indices_batch(sorted_indices.dptr_ + i * num_elem, + Shape1(num_elem), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch, 0, + 8 * sizeof(DType), &sorted_scores_batch, + &sorted_indices_batch); } - CompactData(indices, out, &buffer, topk, -1, s); + CompactData(sorted_indices, out, &buffer, topk, -1, s); + /*CompactData(indices, out, &buffer, topk, -1, s);*/ NMS 
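The doubled `scores_temp_space` and `indices_temp_spaces` above make room for both the unsorted tensors and their out-of-place sorted copies (`sorted_scores`, `sorted_indices`), all carved out of one temporary arena. A stand-alone sketch of that carve-out pattern, mirroring the `ceil_div`/`align` helpers in this file; the sizes and the 128-byte alignment are only illustrative.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t ceil_div(std::size_t x, std::size_t y) { return (x + y - 1) / y; }
constexpr std::size_t align(std::size_t x, std::size_t a) { return ceil_div(x, a) * a; }

int main() {
  const std::size_t alignment = 128;
  const std::size_t num_batch = 4, num_elem = 1000, width_elem = 6;
  // Two copies of scores and indices: the unsorted input plus the out-of-place sorted output.
  const std::size_t scores_bytes  = 2 * align(num_batch * num_elem * sizeof(float), alignment);
  const std::size_t indices_bytes = 2 * align(num_batch * num_elem * sizeof(std::int64_t), alignment);
  const std::size_t buffer_bytes  = align(num_batch * num_elem * width_elem * sizeof(float), alignment);
  std::size_t offset = 0;
  const std::size_t scores_offset  = offset; offset += scores_bytes;
  const std::size_t indices_offset = offset; offset += indices_bytes;
  const std::size_t buffer_offset  = offset; offset += buffer_bytes;
  std::printf("total %zu B; scores@%zu, indices@%zu, buffer@%zu\n",
              offset, scores_offset, indices_offset, buffer_offset);
}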
nms; nms(&buffer, &nms_scratch, topk, param, s); - CompactNMSResults(buffer, &out, &indices, &scores, &scratch, param.score_index, s); + CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, + &sorted_scores, &scratch, param.score_index, s); // convert encoding if (param.in_format != param.out_format) { diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 63ccc5752fb0..1a2f2db9b953 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -156,14 +156,19 @@ SortByKeyImpl(mshadow::Tensor keys, << "Workspace given to SortByKey is too small: requested " << required_storage << " B and got " << workspace->size(0) << " B."; + size_t start_keys = 0; + size_t start_values = start_keys + + sorted_keys == nullptr ? keys_bytes : 0; + size_t start_scratch = start_values + + sorted_values == nullptr ? values_bytes : 0; KDType* keys_out_ptr = sorted_keys == nullptr ? - reinterpret_cast(workspace->dptr_) : + reinterpret_cast(workspace->dptr_ + start_keys) : sorted_keys->dptr_; VDType* values_out_ptr = sorted_values == nullptr ? - reinterpret_cast(workspace->dptr_ + keys_bytes) : + reinterpret_cast(workspace->dptr_ + start_values) : sorted_values->dptr_; - void* temp_storage = reinterpret_cast(workspace->dptr_ + keys_bytes + values_bytes); + void* temp_storage = reinterpret_cast(workspace->dptr_ + start_scratch); // Sort if (is_ascend) { cub::DeviceRadixSort::SortPairs(temp_storage, sortpairs_bytes, From 3680e1d8ca8187c7770a82c134a7e792efa47e8e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 13:28:18 -0700 Subject: [PATCH 07/19] Optimizations pt3 --- src/operator/contrib/bounding_box.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index bcd31a7ad657..17115291250e 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -410,7 +410,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const int my_warp = threadIdx.x / warp_size; const int my_lane = threadIdx.x % warp_size; - __shared__ uint32_t current_valid_boxes; + __shared__ uint32_t current_valid_boxes[n_threads / warp_size]; const uint32_t full_mask = 0xFFFFFFFF; const uint32_t my_lane_mask = 1 << my_lane; const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; @@ -437,15 +437,14 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, valid = valid & mask; } valid_boxes = __ballot_sync(full_mask, valid); - if (my_lane == 0) { - current_valid_boxes = valid_boxes; - } + } + if (my_lane == 0 && my_warp == i) { + current_valid_boxes[i] = valid_boxes; } __syncthreads(); - if ((my_warp > i) && (((~my_mask) & current_valid_boxes) != 0)) { + if ((my_warp > i) && (((~my_mask) & current_valid_boxes[i]) != 0)) { valid = 0; } - __syncthreads(); } if (my_lane == 0) { nms_results[my_element] = valid_boxes; From 1978b07c005d9f5816af4aaae88898dd24626730 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 13:46:10 -0700 Subject: [PATCH 08/19] Do not recompute other boxes area every time --- src/operator/contrib/bounding_box.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 17115291250e..ce672db1416a 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -331,6 +331,7 @@ __global__ void 
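Setting PATCH 08 aside for a moment, here is a scalar model, not part of the patch, of what the reworked `ReduceNMSResultTriangle_kernel` from PATCH 06/07 computes for one 32-box chunk: each box carries a bitmask of the boxes that overlap it above the threshold, and a box survives only if no earlier surviving box suppresses it. The warp code reaches the same result with `__shfl_sync`/`__ballot_sync` instead of this serial scan.

#include <cstdint>
#include <cstdio>

// suppress[i] has bit j set when box j (j < i, i.e. higher score) overlaps box i
// above the NMS threshold. Returns the mask of boxes that survive greedy NMS.
std::uint32_t greedy_keep_mask(const std::uint32_t suppress[32]) {
  std::uint32_t kept = 0;
  for (int i = 0; i < 32; ++i) {
    const bool removed = (suppress[i] & kept) != 0;  // killed by an earlier surviving box?
    if (!removed) kept |= (1u << i);
  }
  return kept;
}

int main() {
  std::uint32_t suppress[32] = {0};
  suppress[1] = 1u << 0;  // box 1 overlaps box 0 -> box 1 is dropped
  suppress[2] = 1u << 1;  // box 2 only overlaps the already-dropped box 1 -> box 2 survives
  std::printf("kept mask = 0x%08x\n",
              static_cast<unsigned>(greedy_keep_mask(suppress)));  // expect 0xfffffffd
}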
CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re constexpr int max_elem_width = 20; constexpr int num_other_boxes = sizeof(uint32_t) * 8; __shared__ DType other_boxes[max_elem_width * num_other_boxes]; + __shared__ DType other_boxes_areas[num_other_boxes]; const index_t my_row = blockIdx.x / num_blocks_per_row; const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; @@ -346,6 +347,17 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } __syncthreads(); + if (threadIdx.x < num_other_boxes) { + const int other_boxes_offset = element_width * threadIdx.x; + const DType their_area = calculate_area( + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + other_boxes_areas[threadIdx.x] = their_area; + } + __syncthreads(); + if (my_element_in_batch >= topk) return; DType my_box[4]; @@ -368,13 +380,9 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re #pragma unroll for (int i = 0; i < num_other_boxes; ++i) { const int other_boxes_offset = element_width * i; - if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && + if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && other_boxes[other_boxes_offset + score_index] != -1){ - const DType their_area = calculate_area( - other_boxes[other_boxes_offset + coord_index + 0], - other_boxes[other_boxes_offset + coord_index + 1], - other_boxes[other_boxes_offset + coord_index + 2], - other_boxes[other_boxes_offset + coord_index + 3]); + const DType their_area = other_boxes_areas[i]; const DType intersect = calculate_intersection( my_box[0], my_box[1], my_box[2], my_box[3], @@ -382,8 +390,8 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re other_boxes[other_boxes_offset + coord_index + 1], other_boxes[other_boxes_offset + coord_index + 2], other_boxes[other_boxes_offset + coord_index + 3]); - const DType iou = intersect / (my_area + their_area - intersect); - if (iou > threshold) { + //const DType iou = intersect / (my_area + their_area - intersect); + if (intersect > threshold * (my_area + their_area - intersect)) { ret = ret | (1u << i); } } From dfaa4c5d114c689cb6f90aff7d48c9a71020eda1 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 14:55:11 -0700 Subject: [PATCH 09/19] Sort only topk results during second sorting --- src/operator/contrib/bounding_box.cu | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index ce672db1416a..331646fb3c68 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -166,12 +166,13 @@ void CompactData(const Tensor& indices, } template -void WorkspaceForSort(const int num_elem, - const int width_elem, +void WorkspaceForSort(const index_t num_elem, + const index_t topk, const int alignment, TempWorkspace* workspace) { const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); - workspace->scratch_space = align(sort_scores_temp_space, + const index_t sort_topk_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(topk, false, false); + workspace->scratch_space = 
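The `intersect > threshold * (my_area + their_area - intersect)` test introduced in PATCH 08 above is the IoU threshold with the division removed: for a non-negative threshold and a positive union area, `intersect / union > threshold` and `intersect > threshold * union` accept exactly the same pairs, so the per-pair divide can be dropped. A tiny check of that equivalence; the numbers are made up.

#include <cassert>

template <typename T>
bool iou_above(T intersect, T area_a, T area_b, T threshold) {
  const T union_area = area_a + area_b - intersect;
  return intersect > threshold * union_area;  // same decision as intersect / union_area > threshold
}

int main() {
  assert(iou_above(0.06f, 0.10f, 0.10f, 0.4f));   // IoU ~= 0.43 > 0.4
  assert(!iou_above(0.02f, 0.10f, 0.10f, 0.4f));  // IoU ~= 0.11 < 0.4
  return 0;
}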
align(std::max(sort_scores_temp_space, sort_topk_scores_temp_space), alignment); } @@ -500,8 +501,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, } template -TempWorkspace GetWorkspace(const int num_batch, - const int num_elem, +TempWorkspace GetWorkspace(const index_t num_batch, + const index_t num_elem, const int width_elem, const index_t topk, const OpContext& ctx) { @@ -512,7 +513,7 @@ TempWorkspace GetWorkspace(const int num_batch, // Get the workspace size workspace.scores_temp_space = 2 * align(num_batch * num_elem * sizeof(DType), alignment); workspace.indices_temp_spaces = 2 * align(num_batch * num_elem * sizeof(index_t), alignment); - WorkspaceForSort(num_elem, width_elem, alignment, &workspace); + WorkspaceForSort(num_elem, topk, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * @@ -561,6 +562,7 @@ void CompactNMSResults(const Tensor& data, Tensor* sorted_scores, Tensor* scratch, const int score_index, + const index_t topk, Stream* s) { using mshadow::Shape1; constexpr int n_threads = 512; @@ -575,22 +577,22 @@ void CompactNMSResults(const Tensor& data, for (index_t i = 0; i < num_batches; ++i) { // Sort each batch separately Tensor scores_batch(scores->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor sorted_scores_batch(sorted_scores->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor sorted_indices_batch(sorted_indices->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch, 0, 8 * sizeof(DType), &sorted_scores_batch, &sorted_indices_batch); } - CompactData(*sorted_indices, data, out, -1, score_index, s); + CompactData(*sorted_indices, data, out, topk, score_index, s); //CompactData(*indices, data, out, -1, score_index, s); } @@ -673,7 +675,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, NMS nms; nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, - &sorted_scores, &scratch, param.score_index, s); + &sorted_scores, &scratch, param.score_index, topk, s); // convert encoding if (param.in_format != param.out_format) { From 9069192cc0818f6996bee3824f913e509f9fc1eb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 15:29:40 -0700 Subject: [PATCH 10/19] Cleaning --- src/operator/contrib/bounding_box.cu | 33 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 331646fb3c68..33698e7abeeb 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -170,8 +170,10 @@ void WorkspaceForSort(const index_t num_elem, const index_t topk, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); - const index_t sort_topk_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(topk, false, false); + const index_t sort_scores_temp_space = + mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); + const index_t sort_topk_scores_temp_space = + 
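PATCH 09 above narrows the second sort to the first `topk` entries of every batch segment, since positions past `topk` were already compacted away. A plain-C++ stand-in, not the mshadow code, for those per-batch `SortByKey` calls over a `topk`-long prefix; the batch sizes and scores are invented.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int num_batches = 2, num_elements_per_batch = 6, topk = 3;
  std::vector<float> scores = {0.2f, 0.9f, 0.7f, -1.f, -1.f, -1.f,   // batch 0
                               0.4f, 0.1f, 0.8f, -1.f, -1.f, -1.f};  // batch 1
  std::vector<int> indices(num_batches * num_elements_per_batch);
  for (int b = 0; b < num_batches; ++b) {
    float* s = &scores[b * num_elements_per_batch];
    int* idx = &indices[b * num_elements_per_batch];
    std::iota(idx, idx + topk, 0);
    // Descending by score, touching only the topk-long prefix of this batch.
    std::stable_sort(idx, idx + topk, [s](int a, int c) { return s[a] > s[c]; });
    std::printf("batch %d best: index %d (score %.1f)\n", b, idx[0], s[idx[0]]);
  }
}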
mxnet::op::SortByKeyWorkspaceSize(topk, false, false); workspace->scratch_space = align(std::max(sort_scores_temp_space, sort_topk_scores_temp_space), alignment); } @@ -337,7 +339,8 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; const index_t my_batch = (my_block_offset_in_row) / num_blocks_per_row_batch; - const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + current_start + threadIdx.x; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + + current_start + threadIdx.x; // Load other boxes const index_t offset = (my_batch * num_elements_per_batch + @@ -459,7 +462,8 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, nms_results[my_element] = valid_boxes; } if (valid == 0) { - data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + + score_index] = -1; } } @@ -479,7 +483,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, constexpr int warp_size = 32; const index_t my_block_offset_in_batch = blockIdx.x % num_blocks_per_batch; const index_t my_batch = blockIdx.x / num_blocks_per_batch; - const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + start_index + NMS::THRESHOLD + threadIdx.x; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + + start_index + NMS::THRESHOLD + threadIdx.x; const index_t my_element = my_batch * topk + my_element_in_batch; if (my_element_in_batch >= topk) return; @@ -496,7 +501,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, } if (!valid) { - data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + + score_index] = -1; } } @@ -593,7 +599,6 @@ void CompactNMSResults(const Tensor& data, &sorted_indices_batch); } CompactData(*sorted_indices, data, out, topk, score_index, s); - //CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -618,13 +623,12 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, int num_elem = in_shape[indim - 2]; int width_elem = in_shape[indim - 1]; - //MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, - using DType = float; - { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { Tensor data = inputs[box_nms_enum::kData] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); Tensor out = outputs[box_nms_enum::kOut] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + // Special case for topk == 0 if (param.topk == 0) { if (req[0] != kNullOp && @@ -640,9 +644,11 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); - Tensor sorted_scores(workspace.scores + scores.MSize(), Shape1(num_batch * num_elem), s); + Tensor sorted_scores(workspace.scores + scores.MSize(), + Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); - Tensor sorted_indices(workspace.indices + indices.MSize(), Shape1(num_batch * num_elem), s); + Tensor sorted_indices(workspace.indices + indices.MSize(), + Shape1(num_batch * num_elem), s); Tensor 
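Restoring `MSHADOW_REAL_TYPE_SWITCH` above removes the temporary `using DType = float` pin, so the forward path is again instantiated once per runtime dtype. A hand-rolled stand-in for that dispatch pattern; the enum values and the set of handled types are illustrative, not mshadow's.

#include <cstdio>

enum class TypeFlag { kFloat32, kFloat64, kFloat16 };

template <typename F>
void real_type_switch(TypeFlag flag, F&& body) {
  switch (flag) {
    case TypeFlag::kFloat32: body(float{});  break;
    case TypeFlag::kFloat64: body(double{}); break;
    default: std::printf("dtype not handled in this sketch\n"); break;
  }
}

int main() {
  real_type_switch(TypeFlag::kFloat64, [](auto zero) {
    using DType = decltype(zero);
    // The templated NMS body would run here with DType = double.
    std::printf("sizeof(DType) = %zu\n", sizeof(DType));
  });
}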
scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); Tensor buffer(workspace.buffer, @@ -671,7 +677,6 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, &sorted_indices_batch); } CompactData(sorted_indices, out, &buffer, topk, -1, s); - /*CompactData(indices, out, &buffer, topk, -1, s);*/ NMS nms; nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, @@ -687,7 +692,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, out.dptr_ + param.coord_start, width_elem); } } - };//); + }); } void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, From ad60b565ea4a5780ae92433dfc75946e5331778f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 20:51:47 -0700 Subject: [PATCH 11/19] Fixes from rebase --- src/operator/contrib/bounding_box.cc | 72 ---------------------------- src/operator/contrib/bounding_box.cu | 5 +- 2 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc index 9363eb09d5a4..5fd8a08e5f9e 100644 --- a/src/operator/contrib/bounding_box.cc +++ b/src/operator/contrib/bounding_box.cc @@ -34,78 +34,6 @@ DMLC_REGISTER_PARAMETER(BoxOverlapParam); DMLC_REGISTER_PARAMETER(BipartiteMatchingParam); DMLC_REGISTER_PARAMETER(BoxDecodeParam); -NNVM_REGISTER_OP(_contrib_box_nms2) -.describe(R"code(Apply non-maximum suppression to input. - -The output will be sorted in descending order according to `score`. Boxes with -overlaps larger than `overlap_thresh`, smaller scores and background boxes -will be removed and filled with -1, the corresponding position will be recorded -for backward propogation. - -During back-propagation, the gradient will be copied to the original -position according to the input index. For positions that have been suppressed, -the in_grad will be assigned 0. -In summary, gradients are sticked to its boxes, will either be moved or discarded -according to its original index in input. - -Input requirements:: - - 1. Input tensor have at least 2 dimensions, (n, k), any higher dims will be regarded - as batch, e.g. (a, b, c, d, n, k) == (a*b*c*d, n, k) - 2. n is the number of boxes in each batch - 3. k is the width of each box item. - -By default, a box is [id, score, xmin, ymin, xmax, ymax, ...], -additional elements are allowed. - -- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means - we will skip highly overlapped boxes if one is `apple` while the other is `car`. - -- `background_id`: optional, default=-1, class id for background boxes, useful - when `id_index >= 0` which means boxes with background id will be filtered before nms. - -- `coord_start`: required, default=2, the starting index of the 4 coordinates. - Two formats are supported: - - - `corner`: [xmin, ymin, xmax, ymax] - - - `center`: [x, y, width, height] - -- `score_index`: required, default=1, box score/confidence. - When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed. - -- `in_format` and `out_format`: default='corner', specify in/out box formats. 
- -Examples:: - - x = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2], - [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]] - box_nms(x, overlap_thresh=0.1, coord_start=2, score_index=1, id_index=0, - force_suppress=True, in_format='corner', out_typ='corner') = - [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2], - [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]] - out_grad = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], - [0.3, 0.3, 0.3, 0.3, 0.3, 0.3], [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]] - # exe.backward - in_grad = [[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]] - -)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FNumVisibleOutputs", BoxNMSNumVisibleOutputs) -.set_attr("FInferShape", BoxNMSShape) -.set_attr("FInferType", ElemwiseType<1, 2>) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("THasDeterministicOutput", true) -.set_attr("FCompute", BoxNMSForward) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_contrib_box_nms"}) -.add_argument("data", "NDArray-or-Symbol", "The input") -.add_arguments(BoxNMSParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_box_nms) .add_alias("_contrib_box_non_maximum_suppression") diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 33698e7abeeb..f6aeea3efd2d 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -713,11 +713,8 @@ void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, } -NNVM_REGISTER_OP(_contrib_box_nms2) -.set_attr("FCompute", BoxNMSForwardGPU); - NNVM_REGISTER_OP(_contrib_box_nms) -.set_attr("FCompute", BoxNMSForward); +.set_attr("FCompute", BoxNMSForwardGPU); NNVM_REGISTER_OP(_backward_contrib_box_nms) .set_attr("FCompute", BoxNMSBackward); From eb89b0730ca05d62294ba727a3e03fbe0a49e956 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:10:35 -0700 Subject: [PATCH 12/19] Fix lint and more fixes from rebase --- src/operator/contrib/bounding_box.cu | 6 +++--- src/operator/tensor/sort_op-inl.cuh | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f6aeea3efd2d..b45cae1b2d27 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -24,10 +24,11 @@ * \author Joshua Zhang */ +#include + #include "./bounding_box-inl.cuh" #include "./bounding_box-inl.h" #include "../elemwise_op_common.h" -#include namespace mxnet { namespace op { @@ -385,7 +386,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re for (int i = 0; i < num_other_boxes; ++i) { const int other_boxes_offset = element_width * i; if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && - other_boxes[other_boxes_offset + score_index] != -1){ + other_boxes[other_boxes_offset + score_index] != -1) { const DType their_area = other_boxes_areas[i]; const DType intersect = calculate_intersection( @@ -394,7 +395,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re other_boxes[other_boxes_offset + coord_index + 1], other_boxes[other_boxes_offset + coord_index + 2], other_boxes[other_boxes_offset + coord_index + 3]); - //const DType iou = intersect / (my_area + their_area - intersect); if (intersect > threshold * (my_area + 
their_area - intersect)) { ret = ret | (1u << i); } diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 1a2f2db9b953..39f7e0d7aabb 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -347,7 +347,6 @@ SortByKeyImpl(mshadow::Tensor keys, template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { const int begin_bit, const int end_bit, mshadow::Tensor* sorted_keys, mshadow::Tensor* sorted_values) { From 2567266356e1b65d2f14472c7be645624ac5f719 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:58:40 -0700 Subject: [PATCH 13/19] Fix typo --- src/operator/tensor/sort_op-inl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 39f7e0d7aabb..4ecf0994e7b1 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -103,7 +103,7 @@ SortByKeyWorkspaceSize(const size_t num_keys, #else size_t keys_bytes, values_bytes; WorkspaceSize4KeysAndValues(num_keys, &keys_bytes, &values_bytes); - size_t ret = SortPairsWorkspaseSize(num_keys); + size_t ret = SortPairsWorkspaceSize(num_keys); if (keys_in_place) { ret += keys_bytes; } From 6ac371cafad3597ff82dbac47c8673da3f507935 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:19:14 -0700 Subject: [PATCH 14/19] Early exit in Triangle kernel --- src/operator/contrib/bounding_box.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index b45cae1b2d27..45da87e4617b 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -425,9 +425,9 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, __shared__ uint32_t current_valid_boxes[n_threads / warp_size]; const uint32_t full_mask = 0xFFFFFFFF; const uint32_t my_lane_mask = 1 << my_lane; - const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; + const uint32_t earlier_threads_mask = (1 << (my_lane + 1)) - 1; uint32_t valid = my_lane_mask; - uint32_t valid_boxes; + uint32_t valid_boxes = full_mask; uint32_t my_next_mask = my_element_in_batch < topk ? nms_results[my_element]: @@ -439,8 +439,8 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, (my_element_in_batch < topk)) ? 
nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; - if (my_warp == i) { - my_mask = my_mask | my_warp_mask; + if (my_warp == i && !__all_sync(full_mask, my_mask == full_mask)) { + my_mask = my_mask | earlier_threads_mask; // Loop over warp_size - 1 because the last // thread does not contribute to the mask anyway #pragma unroll From b0ce511c43bfb683f106afd5bd8abba1d5d7665e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 22:02:42 -0700 Subject: [PATCH 15/19] Fixes --- src/operator/contrib/bounding_box.cu | 42 +++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 45da87e4617b..eb6e269e41a1 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -63,29 +63,33 @@ inline index_t align(index_t x, index_t alignment) { template __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, index_t num_elements_per_batch, - const index_t element_width, const float threshold, + const index_t element_width, + const index_t N, + const float threshold, const int id_index, const int score_index, const int background_id) { index_t tid = blockIdx.x * blockDim.x + threadIdx.x; bool first_in_element = (tid % element_width == 0); index_t start_of_my_element = tid - (tid % element_width); - DType my_score = data[start_of_my_element + score_index]; - bool filtered_out = my_score <= threshold; - if (id_index != -1 && background_id != -1) { - DType my_id = data[start_of_my_element + id_index]; - filtered_out = filtered_out || (my_id == background_id); - } - if (!filtered_out) { - out[tid] = data[tid]; - } else { - out[tid] = -1; - my_score = -1; - } + if (tid < N) { + DType my_score = data[start_of_my_element + score_index]; + bool filtered_out = my_score <= threshold; + if (id_index != -1 && background_id != -1) { + DType my_id = data[start_of_my_element + id_index]; + filtered_out = filtered_out || (my_id == background_id); + } + if (!filtered_out) { + out[tid] = data[tid]; + } else { + out[tid] = -1; + my_score = -1; + } - if (first_in_element) { - index_t offset = tid / element_width; - scores[offset] = my_score; + if (first_in_element) { + index_t offset = tid / element_width; + scores[offset] = my_score; + } } } @@ -103,7 +107,7 @@ void FilterAndPrepareAuxData(const Tensor& data, 0, Stream::GetStream(s)>>>( data.dptr_, out->dptr_, workspace.scores, - data.shape_[1], data.shape_[2], + data.shape_[1], data.shape_[2], N, param.valid_thresh, param.id_index, param.score_index, param.background_id); } @@ -401,7 +405,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } } } - result[my_row * topk * num_batches + my_element_in_batch] = ~ret; + result[(my_row * num_batches + my_batch) * topk + my_element_in_batch] = ~ret; } template @@ -678,7 +682,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, } CompactData(sorted_indices, out, &buffer, topk, -1, s); NMS nms; - nms(&buffer, &nms_scratch, topk, param, s); + nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, &sorted_scores, &scratch, param.score_index, topk, s); From 753e7c0eea2b28c6cdfd131b0b1c77ee9e7b7f37 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:23:52 -0700 Subject: [PATCH 16/19] Fix sort --- src/operator/tensor/sort_op-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 4ecf0994e7b1..f30bbf3b8433 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -158,9 +158,9 @@ SortByKeyImpl(mshadow::Tensor keys, size_t start_keys = 0; size_t start_values = start_keys + - sorted_keys == nullptr ? keys_bytes : 0; + (sorted_keys == nullptr ? keys_bytes : 0); size_t start_scratch = start_values + - sorted_values == nullptr ? values_bytes : 0; + (sorted_values == nullptr ? values_bytes : 0); KDType* keys_out_ptr = sorted_keys == nullptr ? reinterpret_cast(workspace->dptr_ + start_keys) : sorted_keys->dptr_; From 8cb9a5bbb7f99f1ceb893ddc86b8b34d5f0c9b61 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:50:02 -0700 Subject: [PATCH 17/19] Fix from rebase --- src/operator/tensor/sort_op-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index f30bbf3b8433..c157de99a4e1 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -327,8 +327,8 @@ SortByKeyImpl(mshadow::Tensor keys, v = sorted_values; mshadow::Copy(*sorted_values, values, values.stream_); } - const auto key_iter = thrust::device_pointer_cast(reinterpret_cast(k->dptr_)); - const auto value_iter = thrust::device_pointer_cast(reinterpret_cast(v->dptr_)); + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), From dc1a5992c90703c5901081923a066322fa44d532 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:52:15 -0700 Subject: [PATCH 18/19] Fix for the mixed naming convention --- src/operator/contrib/bounding_box.cu | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index eb6e269e41a1..9d6d8bfb5951 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -61,7 +61,7 @@ inline index_t align(index_t x, index_t alignment) { } template -__global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, +__global__ void FilterAndPrepareAuxDataKernel(const DType* data, DType* out, DType* scores, index_t num_elements_per_batch, const index_t element_width, const index_t N, @@ -102,7 +102,7 @@ void FilterAndPrepareAuxData(const Tensor& data, const int n_threads = 512; index_t N = data.shape_.Size(); const auto blocks = ceil_div(N, n_threads); - FilterAndPrepareAuxData_kernel<<::GetStream(s)>>>( @@ -113,7 +113,7 @@ void FilterAndPrepareAuxData(const Tensor& data, } template -__global__ void CompactData_kernel(const index_t* indices, const DType* source, +__global__ void CompactDataKernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, const index_t num_elements_per_batch, @@ -154,14 +154,14 @@ void CompactData(const Tensor& indices, index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) { - CompactData_kernel<<<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, source.shape_[2], source.shape_[1], score_index, N); } else { - CompactData_kernel<<<<::GetStream(s)>>>( indices.dptr_, source.dptr_, 
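The parentheses added in PATCH 16 matter because `+` binds tighter than `==` and `?:`: without them, `start_keys + sorted_keys == nullptr ? keys_bytes : 0` parses as a pointer comparison and the offset stops accumulating. The intended grouping in isolation, with stand-in values.

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t keys_bytes = 256, values_bytes = 512;
  const void* sorted_keys = nullptr;    // stand-ins for the optional output tensors
  const void* sorted_values = nullptr;  // (nullptr means "stage the sort in the workspace")
  const std::size_t start_keys = 0;
  const std::size_t start_values  = start_keys   + (sorted_keys   == nullptr ? keys_bytes   : 0);
  const std::size_t start_scratch = start_values + (sorted_values == nullptr ? values_bytes : 0);
  std::printf("values staged at %zu B, sort scratch at %zu B\n", start_values, start_scratch);
  // Without the parentheses the first expression would read
  //   (start_keys + sorted_keys) == nullptr ? keys_bytes : 0
  // which compares a pointer against nullptr instead of accumulating an offset.
}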
destination->dptr_, topk, @@ -184,7 +184,7 @@ void WorkspaceForSort(const index_t num_elem, } template -__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, +__global__ void CalculateGreedyNMSResultsKernel(const DType* data, uint32_t* result, const index_t current_start, const index_t num_elems, const index_t num_batches, @@ -199,7 +199,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re const float threshold); template -__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, +__global__ void ReduceNMSResultTriangleKernel(uint32_t* nms_results, DType * data, const index_t score_index, const index_t element_width, @@ -209,7 +209,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const index_t topk); template -__global__ void ReduceNMSResultRest_kernel(DType* data, +__global__ void ReduceNMSResultRestKernel(DType* data, const uint32_t* nms_results, const index_t score_index, const index_t element_width, @@ -238,7 +238,7 @@ struct NMS { const index_t num_blocks_per_row = num_blocks_per_row_batch * num_batches; const index_t n_blocks = THRESHOLD / (sizeof(uint32_t) * 8) * num_blocks_per_row; if (param.in_format == box_common_enum::kCorner) { - CalculateGreedyNMSResults_kernel + CalculateGreedyNMSResultsKernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, @@ -246,7 +246,7 @@ struct NMS { param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } else { - CalculateGreedyNMSResults_kernel + CalculateGreedyNMSResultsKernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, @@ -254,7 +254,7 @@ struct NMS { param.force_suppress ? 
-1 : param.id_index, param.score_index, param.overlap_thresh); } - ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( + ReduceNMSResultTriangleKernel<<::GetStream(s)>>>( scratch->dptr_, data->dptr_, param.score_index, element_width, num_batches, num_elements_per_batch, current_start, topk); @@ -262,7 +262,7 @@ struct NMS { const index_t num_rest_blocks_per_batch = ceil_div(n_rest_elems, n_threads); const index_t num_rest_blocks = num_rest_blocks_per_batch * num_batches; if (n_rest_elems > 0) { - ReduceNMSResultRest_kernel<<::GetStream(s)>>>( + ReduceNMSResultRestKernel<<::GetStream(s)>>>( data->dptr_, scratch->dptr_, param.score_index, element_width, num_batches, num_elements_per_batch, current_start, topk, num_rest_blocks_per_batch); @@ -323,7 +323,7 @@ __device__ __forceinline__ DType calculate_intersection(const DType a0, const DT template __launch_bounds__(512) -__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, +__global__ void CalculateGreedyNMSResultsKernel(const DType* data, uint32_t* result, const index_t current_start, const index_t num_elems, const index_t num_batches, @@ -410,7 +410,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re template __launch_bounds__(NMS::THRESHOLD) -__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, +__global__ void ReduceNMSResultTriangleKernel(uint32_t* nms_results, DType * data, const index_t score_index, const index_t element_width, @@ -473,7 +473,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, template __launch_bounds__(512) -__global__ void ReduceNMSResultRest_kernel(DType* data, +__global__ void ReduceNMSResultRestKernel(DType* data, const uint32_t* nms_results, const index_t score_index, const index_t element_width, @@ -554,7 +554,7 @@ TempWorkspace GetWorkspace(const index_t num_batch, } template -__global__ void ExtractScores_kernel(const DType* data, DType* scores, +__global__ void ExtractScoresKernel(const DType* data, DType* scores, const index_t N, const int element_width, const int score_index) { const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -581,7 +581,7 @@ void CompactNMSResults(const Tensor& data, const index_t num_batches = data.shape_[0]; const int element_width = data.shape_[2]; const index_t n_blocks = ceil_div(num_elements, n_threads); - ExtractScores_kernel<<::GetStream(s)>>>( + ExtractScoresKernel<<::GetStream(s)>>>( data.dptr_, scores->dptr_, num_elements, element_width, score_index); *indices = mshadow::expr::range(0, num_elements); for (index_t i = 0; i < num_batches; ++i) { From 3dfca4dc3500c270d2f3ee501151cda85b2d33de Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 25 Oct 2019 09:48:26 -0700 Subject: [PATCH 19/19] Fix the index_t with int comparisoon --- src/operator/contrib/bounding_box.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 9d6d8bfb5951..ffc48d7d5a44 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -150,7 +150,7 @@ void CompactData(const Tensor& indices, const int score_index, Stream* s) { const int n_threads = 512; - const int max_blocks = 320; + const index_t max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) {
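PATCH 19 switches `max_blocks` to `index_t` because `std::min` deduces a single template argument: once `ceil_div(N, n_threads)` returns `index_t`, `std::min(index_t, int)` no longer compiles without a cast. A small reproduction, assuming a 64-bit `index_t` (the type can also be 32-bit depending on the build).

#include <algorithm>
#include <cstdint>

using index_t = std::int64_t;  // assumption: an index type wider than int

index_t ceil_div(index_t x, index_t y) { return (x + y - 1) / y; }

index_t blocks_for(index_t N, int n_threads) {
  const index_t max_blocks = 320;  // was `int`, which broke the std::min call below
  return std::min(ceil_div(N, n_threads), max_blocks);
  // return std::min(ceil_div(N, n_threads), 320);  // error: std::min(index_t, int)
  //                                                // cannot deduce a single type
}

int main() { return blocks_for(1 << 20, 512) == 320 ? 0 : 1; }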