From fe2a7d9d016420eabedb9c70274735f07177ce51 Mon Sep 17 00:00:00 2001
From: Przemek Tredak
Date: Tue, 8 Oct 2019 16:56:44 -0700
Subject: [PATCH 01/19] Adding second NMS op

---
 src/operator/contrib/bounding_box-inl.cuh |   2 +
 src/operator/contrib/bounding_box.cc      |  73 ++++++
 src/operator/contrib/bounding_box.cu      | 289 ++++++++++++++++++++++
 3 files changed, 364 insertions(+)

diff --git a/src/operator/contrib/bounding_box-inl.cuh b/src/operator/contrib/bounding_box-inl.cuh
index de215ce35a98..128d3cbcecc2 100644
--- a/src/operator/contrib/bounding_box-inl.cuh
+++ b/src/operator/contrib/bounding_box-inl.cuh
@@ -202,7 +202,9 @@ void NMSApply(mshadow::Stream<gpu> *s,
               int in_format) {
   using namespace mxnet_op;
   constexpr int THRESHOLD = 1024;
+  std::cout << "TOPK " << topk << std::endl;
   for (int ref = 0; ref < topk; ref += THRESHOLD) {
+    std::cout << "REF: " << ref << std::endl;
     constexpr int block_size = 512;
     constexpr int N = THRESHOLD / block_size;
     auto stream = mshadow::Stream<gpu>::GetStream(s);
diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc
index 62b7c2e0bf4b..9363eb09d5a4 100644
--- a/src/operator/contrib/bounding_box.cc
+++ b/src/operator/contrib/bounding_box.cc
@@ -34,6 +34,79 @@ DMLC_REGISTER_PARAMETER(BoxOverlapParam);
 DMLC_REGISTER_PARAMETER(BipartiteMatchingParam);
 DMLC_REGISTER_PARAMETER(BoxDecodeParam);
 
+NNVM_REGISTER_OP(_contrib_box_nms2)
+.describe(R"code(Apply non-maximum suppression to input.
+
+The output will be sorted in descending order according to `score`. Boxes with
+overlaps larger than `overlap_thresh`, smaller scores and background boxes
+will be removed and filled with -1, and the corresponding position will be recorded
+for backward propagation.
+
+During back-propagation, the gradient will be copied to the original
+position according to the input index. For positions that have been suppressed,
+the in_grad will be assigned 0.
+In summary, gradients are stuck to their boxes and will either be moved or discarded
+according to their original index in the input.
+
+Input requirements::
+
+  1. Input tensor must have at least 2 dimensions, (n, k); any higher dims will be regarded
+     as batch, e.g. (a, b, c, d, n, k) == (a*b*c*d, n, k)
+  2. n is the number of boxes in each batch
+  3. k is the width of each box item.
+
+By default, a box is [id, score, xmin, ymin, xmax, ymax, ...],
+additional elements are allowed.
+
+- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means
+  we will skip highly overlapped boxes if one is `apple` while the other is `car`.
+
+- `background_id`: optional, default=-1, class id for background boxes, useful
+  when `id_index >= 0` which means boxes with background id will be filtered before nms.
+
+- `coord_start`: required, default=2, the starting index of the 4 coordinates.
+  Two formats are supported:
+
+    - `corner`: [xmin, ymin, xmax, ymax]
+
+    - `center`: [x, y, width, height]
+
+- `score_index`: required, default=1, box score/confidence.
+  When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed.
+
+- `in_format` and `out_format`: default='corner', specify in/out box formats.
+ +Examples:: + + x = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2], + [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]] + box_nms(x, overlap_thresh=0.1, coord_start=2, score_index=1, id_index=0, + force_suppress=True, in_format='corner', out_typ='corner') = + [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]] + out_grad = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], + [0.3, 0.3, 0.3, 0.3, 0.3, 0.3], [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]] + # exe.backward + in_grad = [[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], [0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0], [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]] + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FNumVisibleOutputs", BoxNMSNumVisibleOutputs) +.set_attr("FInferShape", BoxNMSShape) +.set_attr("FInferType", ElemwiseType<1, 2>) +.set_attr("FResourceRequest", + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace}; + }) +.set_attr("THasDeterministicOutput", true) +.set_attr("FCompute", BoxNMSForward) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_contrib_box_nms"}) +.add_argument("data", "NDArray-or-Symbol", "The input") +.add_arguments(BoxNMSParam::__FIELDS__()); + NNVM_REGISTER_OP(_contrib_box_nms) .add_alias("_contrib_box_non_maximum_suppression") .describe(R"code(Apply non-maximum suppression to input. diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index b20c570ea417..c5ea0e9e7db4 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -27,9 +27,298 @@ #include "./bounding_box-inl.cuh" #include "./bounding_box-inl.h" #include "../elemwise_op_common.h" +#include namespace mxnet { namespace op { + +namespace { + +using mshadow::Tensor; +using mshadow::Stream; + +template +struct TempWorkspace { + index_t scores_temp_space; + DType* scores; + index_t batch_temp_space; + index_t* batches; + index_t scratch_space; + uint8_t* scratch; + index_t nms_scratch_space; + uint32_t* nms_scratch; + index_t indices_temp_spaces; + index_t* indices; +}; + +inline index_t ceil_div(index_t x, index_t y) { + return (x + y - 1) / y; +} + +inline index_t align(index_t x, index_t alignment) { + return ceil_div(x, alignment) * alignment; +} + +template +__global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, + index_t* batches, index_t num_elements_per_batch, + const index_t element_width, const float threshold, + const int id_index, const int score_index, + const int background_id) { + index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + bool first_in_element = (tid % element_width == 0); + index_t my_batch = tid / (num_elements_per_batch * element_width); + index_t start_of_my_element = tid - (tid % element_width); + + DType my_score = data[start_of_my_element + score_index]; + bool filtered_out = my_score <= threshold; + if (id_index != -1 && background_id != -1) { + DType my_id = data[start_of_my_element + id_index]; + filtered_out = filtered_out || (my_id == background_id); + } + if (!filtered_out) { + out[tid] = data[tid]; + } else { + out[tid] = -1; + my_score = -1; + } + + if (first_in_element) { + index_t offset = tid / element_width; + scores[offset] = my_score; + batches[offset] = my_batch; + } +} + +template +void FilterAndPrepareAuxData(const Tensor& data, + Tensor* out, + const TempWorkspace& workspace, + const BoxNMSParam& param, + Stream* s) { + const int n_threads = 
512; + index_t N = data.shape_.Size(); + const auto blocks = ceil_div(N, n_threads); + FilterAndPrepareAuxData_kernel<<::GetStream(s)>>>( + data.dptr_, out->dptr_, workspace.scores, + workspace.batches, data.shape_[1], data.shape_[2], + param.valid_thresh, param.id_index, + param.score_index, param.background_id); +} + +template +__global__ void CompactData_kernel(const index_t* indices, const DType* source, + DType* destination, const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const index_t N) { + const index_t tid_start = blockIdx.x * blockDim.x + threadIdx.x; + for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { + const index_t my_element = tid / element_width; + const index_t my_element_in_batch = my_element % num_elements_per_batch; + if (write_whole_output && my_element_in_batch >= topk) { + destination[tid] = -1; + } else { + const index_t source_element = indices[my_element]; + destination[tid] = source[source_element * element_width + tid % element_width]; + } + } +} + +template +void CompactData(const Tensor& indices, + const Tensor& source, + Tensor* destination, + const index_t topk, + Stream* s) { + const int n_threads = 512; + const int max_blocks = 320; + index_t N = source.shape_.Size(); + const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); +} + +template +void WorkspaceForSort(const int num_batch, + const int num_elem, + const int width_elem, + const int alignment, + TempWorkspace* workspace) { + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); + const index_t sort_batch_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); + workspace->scratch_space = align(std::max(sort_scores_temp_space, + sort_batch_temp_space), + alignment); +} + +template +__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, + const index_t current_start); + +template +struct NMS { + static const int THRESHOLD = 1024; + + void operator()(Tensor* data, + Tensor* scratch, + const index_t topk, + const BoxNMSParam& param, + Stream* s) { + const int n_threads = 512; + const index_t n_batch = data->shape_[0]; + for (index_t current_start = 0; current_start < topk; current_start += THRESHOLD) { + const index_t n_elems = topk - current_start; + const int n_blocks = ceil_div(THRESHOLD / (sizeof(uint32_t) * 8) * n_elems * n_batch, n_threads); + CalculateGreedyNMSResults_kernel<<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start); + } + } +}; + +template +__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, + const index_t current_start) { +} + +template +TempWorkspace GetWorkspace(const int num_batch, + const int num_elem, + const int width_elem, + const index_t topk, + const OpContext& ctx) { + TempWorkspace workspace; + Stream *s = ctx.get_stream(); + const int alignment = 128; + + // Get the workspace size + workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); + workspace.batch_temp_space = align(num_batch * num_elem * sizeof(index_t), alignment); + workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); + WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); + // Place for a buffer + workspace.scratch_space = std::max(workspace.scratch_space, + align(num_batch * 
num_elem * width_elem * sizeof(DType), + alignment)); + workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * + num_batch * topk * sizeof(uint32_t), alignment); + + const index_t workspace_size = workspace.scores_temp_space + + workspace.batch_temp_space + + workspace.scratch_space + + workspace.nms_scratch_space + + workspace.indices_temp_spaces; + + // Obtain the memory for workspace + Tensor scratch_memory = ctx.requested[box_nms_enum::kTempSpace] + .get_space_typed(mshadow::Shape1(workspace_size), s); + + // Populate workspace pointers + workspace.scores = scratch_memory.dptr_; + workspace.batches = reinterpret_cast(reinterpret_cast(workspace.scores) + + workspace.scores_temp_space); + workspace.scratch = reinterpret_cast(workspace.batches) + + workspace.batch_temp_space; + workspace.nms_scratch = reinterpret_cast(workspace.scratch + + workspace.scratch_space); + workspace.indices = reinterpret_cast( + reinterpret_cast(workspace.nms_scratch) + workspace.nms_scratch_space); + return workspace; +} + +} // namespace + +void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using mshadow::Shape1; + using mshadow::Shape2; + using mshadow::Shape3; + CHECK_NE(req[0], kAddTo) << "BoxNMS does not support kAddTo"; + CHECK_NE(req[0], kWriteInplace) << "BoxNMS does not support in place computation"; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; + const BoxNMSParam& param = nnvm::get(attrs.parsed); + Stream *s = ctx.get_stream(); + mxnet::TShape in_shape = inputs[box_nms_enum::kData].shape_; + int indim = in_shape.ndim(); + int num_batch = indim <= 2? 1 : in_shape.ProdShape(0, indim - 2); + int num_elem = in_shape[indim - 2]; + int width_elem = in_shape[indim - 1]; + + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + Tensor data = inputs[box_nms_enum::kData] + .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + Tensor out = outputs[box_nms_enum::kOut] + .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + // Special case for topk == 0 + if (param.topk == 0) { + if (req[0] != kNullOp && + req[0] != kWriteInplace) { + out = mshadow::expr::F(data); + } + return; + } + + index_t topk = param.topk > 0 ? 
std::min(param.topk, num_elem) : num_elem; + const auto& workspace = GetWorkspace(num_batch, num_elem, + width_elem, topk, ctx); + + FilterAndPrepareAuxData(data, &out, workspace, param, s); + Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); + Tensor batches(workspace.batches, Shape1(num_batch * num_elem), s); + Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); + Tensor scratch(reinterpret_cast(workspace.scratch), + Shape1(workspace.scratch_space), s); + Tensor nms_scratch(workspace.nms_scratch, + Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), topk * num_batch), s); + indices = mshadow::expr::range(0, num_batch * num_elem); + mxnet::op::SortByKey(scores, indices, false, &scratch); + batches = indices / mshadow::expr::ScalarExp(num_elem); + mxnet::op::SortByKey(batches, indices, true, &scratch); + Tensor buffer(reinterpret_cast(workspace.scratch), + Shape3(num_batch, num_elem, width_elem), s); + CompactData(indices, out, &buffer, topk, s); + NMS nms; + nms(&buffer, &nms_scratch, topk, param, s); + mshadow::Copy(out, buffer, s); + }); +} + +void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + using namespace mxnet_op; + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; + std::cout << "Reqs" << std::endl; + for (const auto& r : req) { + std::cout << r << std::endl; + } + std::cout << "END: Reqs" << std::endl; + if (req[1] == kNullOp) { + BoxNMSForwardGPU_notemp(attrs, ctx, inputs, req, outputs); + return; + } + BoxNMSForward(attrs, ctx, inputs, req, outputs); +} + + +NNVM_REGISTER_OP(_contrib_box_nms2) +.set_attr("FCompute", BoxNMSForwardGPU); + NNVM_REGISTER_OP(_contrib_box_nms) .set_attr("FCompute", BoxNMSForward); From 5d0a43049d8e284371b61d1c99269022c33428ce Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Mon, 14 Oct 2019 15:41:23 -0700 Subject: [PATCH 02/19] NMS kernel --- src/operator/contrib/bounding_box.cu | 402 +++++++++++++++++++++++++-- 1 file changed, 376 insertions(+), 26 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index c5ea0e9e7db4..f824f6d58e25 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -45,6 +45,8 @@ struct TempWorkspace { index_t* batches; index_t scratch_space; uint8_t* scratch; + index_t buffer_space; + DType* buffer; index_t nms_scratch_space; uint32_t* nms_scratch; index_t indices_temp_spaces; @@ -109,7 +111,7 @@ void FilterAndPrepareAuxData(const Tensor& data, param.score_index, param.background_id); } -template +template __global__ void CompactData_kernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, @@ -119,7 +121,7 @@ __global__ void CompactData_kernel(const index_t* indices, const DType* source, for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { const index_t my_element = tid / element_width; const index_t my_element_in_batch = my_element % num_elements_per_batch; - if (write_whole_output && my_element_in_batch >= topk) { + if (check_topk && my_element_in_batch >= topk) { destination[tid] = -1; } else { const index_t source_element = indices[my_element]; @@ -138,11 +140,19 @@ void CompactData(const Tensor& indices, const int max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, 
n_threads), max_blocks); - CompactData_kernel<<::GetStream(s)>>>( - indices.dptr_, source.dptr_, - destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + if (topk > 0) { + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); + } else { + CompactData_kernel<<::GetStream(s)>>>( + indices.dptr_, source.dptr_, + destination->dptr_, topk, + source.shape_[2], source.shape_[1], N); + } } template @@ -158,13 +168,45 @@ void WorkspaceForSort(const int num_batch, alignment); } -template +template __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, - const index_t current_start); + const index_t current_start, + const index_t num_elems, + const index_t num_batches, + const index_t num_blocks_per_row_batch, + const index_t num_blocks_per_row, + const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const int coord_index, + const int class_index, + const int score_index, + const float threshold); + +template +__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, + DType * data, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elems, + const index_t start_index, + const index_t topk); + +template +__global__ void ReduceNMSResultRest_kernel(DType* data, + const uint32_t* nms_results, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk, + const index_t num_blocks_per_batch); template struct NMS { - static const int THRESHOLD = 1024; + static constexpr int THRESHOLD = 512; void operator()(Tensor* data, Tensor* scratch, @@ -172,19 +214,292 @@ struct NMS { const BoxNMSParam& param, Stream* s) { const int n_threads = 512; - const index_t n_batch = data->shape_[0]; + const index_t num_batches = data->shape_[0]; + const index_t num_elements_per_batch = data->shape_[1]; + const index_t element_width = data->shape_[2]; for (index_t current_start = 0; current_start < topk; current_start += THRESHOLD) { const index_t n_elems = topk - current_start; - const int n_blocks = ceil_div(THRESHOLD / (sizeof(uint32_t) * 8) * n_elems * n_batch, n_threads); - CalculateGreedyNMSResults_kernel<<::GetStream(s)>>>( - data->dptr_, scratch->dptr_, current_start); + const index_t num_blocks_per_row_batch = ceil_div(n_elems, n_threads); + const index_t num_blocks_per_row = num_blocks_per_row_batch * num_batches; + const index_t n_blocks = THRESHOLD / (sizeof(uint32_t) * 8) * num_blocks_per_row; + if (param.in_format == box_common_enum::kCorner) { + CalculateGreedyNMSResults_kernel + <<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, + num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, + num_elements_per_batch, param.coord_start, param.id_index, + param.score_index, param.overlap_thresh); + } else { + CalculateGreedyNMSResults_kernel + <<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, + num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, + num_elements_per_batch, param.coord_start, param.id_index, + param.score_index, param.overlap_thresh); + } + ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( + scratch->dptr_, data->dptr_, param.score_index, + element_width, num_batches, num_elements_per_batch, + current_start, topk); + const index_t n_rest_elems = 
n_elems - THRESHOLD; + const index_t num_rest_blocks_per_batch = ceil_div(n_rest_elems, n_threads); + const index_t num_rest_blocks = num_rest_blocks_per_batch * num_batches; + if (n_rest_elems > 0) { + ReduceNMSResultRest_kernel<<::GetStream(s)>>>( + data->dptr_, scratch->dptr_, param.score_index, element_width, + num_batches, num_elements_per_batch, current_start, topk, + num_rest_blocks_per_batch); + } } } }; -template +template +__device__ __forceinline__ DType calculate_area(const DType b0, const DType b1, + const DType b2, const DType b3) { + DType width = b2; + DType height = b3; + if (encode == box_common_enum::kCorner) { + width -= b0; + height -= b1; + } + if (width < 0 || height < 0) return 0; + return width * height; +} + +template +__device__ __forceinline__ DType calculate_intersection(const DType a0, const DType a1, + const DType a2, const DType a3, + const DType b0, const DType b1, + const DType b2, const DType b3) { + DType wx, wy; + if (encode == box_common_enum::kCorner) { + const DType left = a0 > b0 ? a0 : b0; + const DType bottom = a1 > b1 ? a1 : b1; + const DType right = a2 < b2 ? a2 : b2; + const DType top = a3 < b3 ? a3 : b3; + wx = right - left; + wy = top - bottom; + } else { + const DType al = 2 * a0 - a2; + const DType ar = 2 * a0 + a2; + const DType bl = 2 * b0 - b2; + const DType br = 2 * b0 + b2; + const DType left = bl > al ? bl : al; + const DType right = br < ar ? br : ar; + wx = right - left; + const DType ab = 2 * a1 - a3; + const DType at = 2 * a1 + a3; + const DType bb = 2 * b1 - b3; + const DType bt = 2 * b1 + b3; + const DType bottom = bb > ab ? bb : ab; + const DType top = bt < at ? bt : at; + wy = top - bottom; + wy = wy / 4; // To compensate for both wx and wy being 2x too large + } + if (wx <= 0 || wy <= 0) { + return 0; + } else { + return (wx * wy); + } +} + +template +__launch_bounds__(512) __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, - const index_t current_start) { + const index_t current_start, + const index_t num_elems, + const index_t num_batches, + const index_t num_blocks_per_row_batch, + const index_t num_blocks_per_row, + const index_t topk, + const index_t element_width, + const index_t num_elements_per_batch, + const int coord_index, + const int class_index, + const int score_index, + const float threshold) { + constexpr int max_elem_width = 20; + constexpr int num_other_boxes = sizeof(uint32_t) * 8; + __shared__ DType other_boxes[max_elem_width * num_other_boxes]; + const index_t my_row = blockIdx.x / num_blocks_per_row; + const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; + const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; + const index_t my_batch = (my_block_offset_in_row) / num_blocks_per_row_batch; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + current_start + threadIdx.x; + + // Load other boxes + const index_t offset = (my_batch * num_elements_per_batch + + current_start + my_row * num_other_boxes) * + element_width; + for (int i = threadIdx.x; i < element_width * num_other_boxes; i += blockDim.x) { + other_boxes[i] = data[offset + i]; + } + __syncthreads(); + + if (my_element_in_batch >= topk) return; + + DType my_box[4]; + DType my_class = -1; + DType my_score = -1; + const index_t my_offset = (my_batch * num_elements_per_batch + my_element_in_batch) * + element_width; + my_score = data[my_offset + score_index]; +#pragma unroll + for (int i = 0; i < 4; ++i) { + my_box[i] = data[my_offset + 
coord_index + i]; + } + if (class_index != -1) { + my_class = data[my_offset + class_index]; + } + DType my_area = calculate_area(my_box[0], my_box[1], my_box[2], my_box[3]); + + /*if (my_element_in_batch == 2) {*/ + /*printf("My score: %f\n", my_score);*/ + /*}*/ + uint32_t ret = 0; + if (my_score != -1) { +#pragma unroll + for (int i = 0; i < num_other_boxes; ++i) { + const int other_boxes_offset = element_width * i; + if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && + other_boxes[other_boxes_offset + score_index] != -1){ + const DType their_area = calculate_area( + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + + const DType intersect = calculate_intersection( + my_box[0], my_box[1], my_box[2], my_box[3], + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + const DType iou = intersect / (my_area + their_area - intersect); + if (iou > threshold) { + ret = ret | (1u << i); + } + } + } + } + result[my_row * topk * num_batches + my_element_in_batch] = ~ret; + /*if (my_element_in_batch == 2) {*/ + /*printf("myRow: %d, ret: %x\n", my_row, ~ret);*/ + /*}*/ + /*if (ret) {*/ + /*printf("HAHA %d %d, %x\n", blockIdx.x, threadIdx.x, ret);*/ + /*}*/ +} + +template +__launch_bounds__(NMS::THRESHOLD) +__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, + DType * data, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk) { + constexpr int n_threads = NMS::THRESHOLD; + constexpr int warp_size = 32; + const index_t my_batch = blockIdx.x; + const index_t my_element_in_batch = threadIdx.x + start_index; + const index_t my_element = my_batch * topk + my_element_in_batch; + const int my_warp = threadIdx.x / warp_size; + const int my_lane = threadIdx.x % warp_size; + + __shared__ uint32_t current_valid_boxes; + bool valid = true; + const uint32_t full_mask = 0xFFFFFFFF; + uint32_t valid_boxes; + +#pragma unroll + for (int i = 0; i < n_threads / warp_size; ++i) { + const uint32_t my_mask = my_element_in_batch < topk ? 
+ nms_results[i * topk * num_batches + my_element]: + full_mask; + if (my_warp == i) { + valid_boxes = __ballot_sync(full_mask, valid); + /*if (i == 0) {*/ + /*printf("After ballot %d: %x\n", threadIdx.x, valid_boxes);*/ + /*}*/ +#pragma unroll + for (int j = 0; j < warp_size; ++j) { +// if ((valid_boxes & (1 << j)) != 0) { + const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); + const uint32_t p = (1 << (j+1)) - 1; + const uint32_t mp = mask | p; + valid_boxes = valid_boxes & mp; + valid = (valid_boxes & (1 << my_lane)) != 0; + /*if (i == 0) {*/ + /*printf("After %d %d: %x %x %x\n",j, threadIdx.x, mask, p,valid_boxes);*/ + /*}*/ +// } + } + valid = (valid_boxes & (1 << my_lane)) != 0; + /*if (i == 0) {*/ + /*printf("After everything %d: %x\n", threadIdx.x, valid_boxes);*/ + /*}*/ + /*printf("%d, %d: %u\n", i, threadIdx.x, valid_boxes);*/ + if (my_lane == 0) { + current_valid_boxes = valid_boxes; + } + } + __syncthreads(); + if (my_warp > i) { + valid = valid && (((~my_mask) & current_valid_boxes) == 0); + } + __syncthreads(); + } + if (my_lane == 0) { + nms_results[my_element] = valid_boxes; + } + valid = (valid_boxes & (1 << my_lane)) != 0; + if (!valid) { + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + } +} + +template +__launch_bounds__(512) +__global__ void ReduceNMSResultRest_kernel(DType* data, + const uint32_t* nms_results, + const index_t score_index, + const index_t element_width, + const index_t num_batches, + const index_t num_elements_per_batch, + const index_t start_index, + const index_t topk, + const index_t num_blocks_per_batch) { + constexpr int num_other_boxes = sizeof(uint32_t) * 8; + constexpr int num_iterations = NMS::THRESHOLD / num_other_boxes; + constexpr int warp_size = 32; + const index_t my_block_offset_in_batch = blockIdx.x % num_blocks_per_batch; + const index_t my_batch = blockIdx.x / num_blocks_per_batch; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + start_index + NMS::THRESHOLD + threadIdx.x; + const index_t my_element = my_batch * topk + my_element_in_batch; + + if (my_element_in_batch >= topk) return; + + bool valid = true; + +#pragma unroll + for (int i = 0; i < num_iterations; ++i) { + const uint32_t my_mask = nms_results[i * topk * num_batches + my_element]; + const uint32_t valid_boxes = nms_results[my_batch * topk + i * warp_size + start_index]; + + const bool no_hit = (valid_boxes & (~my_mask)) == 0; + valid = valid && no_hit; + /*if (my_element_in_batch == 645) {*/ + /*printf("my_mask %x valid_boxes %x no_hit %d valid %d\n", my_mask, valid_boxes, (int)no_hit, (int)valid);*/ + /*}*/ + } + + if (!valid) { + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + } } template @@ -203,9 +518,7 @@ TempWorkspace GetWorkspace(const int num_batch, workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); // Place for a buffer - workspace.scratch_space = std::max(workspace.scratch_space, - align(num_batch * num_elem * width_elem * sizeof(DType), - alignment)); + workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * num_batch * topk * sizeof(uint32_t), alignment); @@ -225,13 +538,48 @@ TempWorkspace GetWorkspace(const int num_batch, workspace.scores_temp_space); workspace.scratch = 
reinterpret_cast(workspace.batches) + workspace.batch_temp_space; - workspace.nms_scratch = reinterpret_cast(workspace.scratch + - workspace.scratch_space); + workspace.buffer = reinterpret_cast(workspace.scratch + + workspace.scratch_space); + workspace.nms_scratch = reinterpret_cast( + reinterpret_cast(workspace.buffer) + + workspace.buffer_space); workspace.indices = reinterpret_cast( - reinterpret_cast(workspace.nms_scratch) + workspace.nms_scratch_space); + reinterpret_cast(workspace.nms_scratch) + + workspace.nms_scratch_space); return workspace; } +template +__global__ void ExtractScores_kernel(const DType* data, DType* scores, + const index_t N, const int element_width, + const int score_index) { + const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid < N) { + scores[tid] = data[tid * element_width + score_index]; + } +} + +template +void CompactNMSResults(const Tensor& data, + Tensor* out, + Tensor* indices, + Tensor* scores, + const int score_index, + Stream* s) { + constexpr int n_threads = 512; + const index_t num_elements = scores->shape_.Size(); + const index_t num_elements_per_batch = data.shape_[1]; + const int element_width = data.shape_[2]; + const index_t n_blocks = ceil_div(num_elements, n_threads); + ExtractScores_kernel<<::GetStream(s)>>>( + data.dptr_, scores->dptr_, num_elements, element_width, score_index); + *indices = mshadow::expr::range(0, num_elements); + //mxnet::op::SortByKey(scores, indices, false, &scratch); + //batches = indices / mshadow::expr::ScalarExp(num_elements_per_batch); + //mxnet::op::SortByKey(batches, indices, true, &scratch); + CompactData(*indices, data, out, -1, s); +} + } // namespace void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, @@ -278,18 +626,20 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); + Tensor buffer(workspace.buffer, + Shape3(num_batch, num_elem, width_elem), s); Tensor nms_scratch(workspace.nms_scratch, - Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), topk * num_batch), s); + Shape2(NMS::THRESHOLD / (sizeof(uint32_t) * 8), + topk * num_batch), + s); indices = mshadow::expr::range(0, num_batch * num_elem); mxnet::op::SortByKey(scores, indices, false, &scratch); batches = indices / mshadow::expr::ScalarExp(num_elem); mxnet::op::SortByKey(batches, indices, true, &scratch); - Tensor buffer(reinterpret_cast(workspace.scratch), - Shape3(num_batch, num_elem, width_elem), s); CompactData(indices, out, &buffer, topk, s); NMS nms; nms(&buffer, &nms_scratch, topk, param, s); - mshadow::Copy(out, buffer, s); + CompactNMSResults(buffer, &out, &indices, &scores, param.score_index, s); }); } From 0829222f56f8317f1e2a814500ecff1bf58e352c Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 11:10:40 -0700 Subject: [PATCH 03/19] Removing second sort --- src/operator/contrib/bounding_box-inl.cuh | 2 - src/operator/contrib/bounding_box.cu | 154 +++++++++++----------- 2 files changed, 80 insertions(+), 76 deletions(-) diff --git a/src/operator/contrib/bounding_box-inl.cuh b/src/operator/contrib/bounding_box-inl.cuh index 128d3cbcecc2..de215ce35a98 100644 --- a/src/operator/contrib/bounding_box-inl.cuh +++ b/src/operator/contrib/bounding_box-inl.cuh @@ -202,9 +202,7 @@ void NMSApply(mshadow::Stream *s, int in_format) { using namespace mxnet_op; constexpr int THRESHOLD = 1024; - std::cout << "TOPK " << topk << std::endl; for (int ref = 
0; ref < topk; ref += THRESHOLD) { - std::cout << "REF: " << ref << std::endl; constexpr int block_size = 512; constexpr int N = THRESHOLD / block_size; auto stream = mshadow::Stream::GetStream(s); diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f824f6d58e25..f87ce148313a 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -41,8 +41,6 @@ template struct TempWorkspace { index_t scores_temp_space; DType* scores; - index_t batch_temp_space; - index_t* batches; index_t scratch_space; uint8_t* scratch; index_t buffer_space; @@ -63,13 +61,12 @@ inline index_t align(index_t x, index_t alignment) { template __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, - index_t* batches, index_t num_elements_per_batch, + index_t num_elements_per_batch, const index_t element_width, const float threshold, const int id_index, const int score_index, const int background_id) { index_t tid = blockIdx.x * blockDim.x + threadIdx.x; bool first_in_element = (tid % element_width == 0); - index_t my_batch = tid / (num_elements_per_batch * element_width); index_t start_of_my_element = tid - (tid % element_width); DType my_score = data[start_of_my_element + score_index]; @@ -88,7 +85,6 @@ __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DT if (first_in_element) { index_t offset = tid / element_width; scores[offset] = my_score; - batches[offset] = my_batch; } } @@ -106,16 +102,17 @@ void FilterAndPrepareAuxData(const Tensor& data, 0, Stream::GetStream(s)>>>( data.dptr_, out->dptr_, workspace.scores, - workspace.batches, data.shape_[1], data.shape_[2], + data.shape_[1], data.shape_[2], param.valid_thresh, param.id_index, param.score_index, param.background_id); } -template +template __global__ void CompactData_kernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, const index_t num_elements_per_batch, + const int score_index, const index_t N) { const index_t tid_start = blockIdx.x * blockDim.x + threadIdx.x; for (index_t tid = tid_start; tid < N; tid += blockDim.x * gridDim.x) { @@ -124,47 +121,57 @@ __global__ void CompactData_kernel(const index_t* indices, const DType* source, if (check_topk && my_element_in_batch >= topk) { destination[tid] = -1; } else { + DType ret; const index_t source_element = indices[my_element]; - destination[tid] = source[source_element * element_width + tid % element_width]; + DType score = 0; + if (check_score) { + score = source[source_element * element_width + score_index]; + } + if (score >= 0) { + ret = source[source_element * element_width + tid % element_width]; + } else { + ret = -1; + } + destination[tid] = ret; } } } -template +template void CompactData(const Tensor& indices, const Tensor& source, Tensor* destination, const index_t topk, + const int score_index, Stream* s) { const int n_threads = 512; const int max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) { - CompactData_kernel<<::GetStream(s)>>>( + CompactData_kernel<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + source.shape_[2], source.shape_[1], + score_index, N); } else { - CompactData_kernel<<::GetStream(s)>>>( + CompactData_kernel<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, - source.shape_[2], source.shape_[1], N); + 
source.shape_[2], source.shape_[1], + score_index, N); } } template -void WorkspaceForSort(const int num_batch, - const int num_elem, +void WorkspaceForSort(const int num_elem, const int width_elem, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); - const index_t sort_batch_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_batch * num_elem); - workspace->scratch_space = align(std::max(sort_scores_temp_space, - sort_batch_temp_space), + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem); + workspace->scratch_space = align(sort_scores_temp_space, alignment); } @@ -227,14 +234,16 @@ struct NMS { <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, - num_elements_per_batch, param.coord_start, param.id_index, + num_elements_per_batch, param.coord_start, + param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } else { CalculateGreedyNMSResults_kernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, - num_elements_per_batch, param.coord_start, param.id_index, + num_elements_per_batch, param.coord_start, + param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( @@ -354,9 +363,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } DType my_area = calculate_area(my_box[0], my_box[1], my_box[2], my_box[3]); - /*if (my_element_in_batch == 2) {*/ - /*printf("My score: %f\n", my_score);*/ - /*}*/ uint32_t ret = 0; if (my_score != -1) { #pragma unroll @@ -384,12 +390,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } } result[my_row * topk * num_batches + my_element_in_batch] = ~ret; - /*if (my_element_in_batch == 2) {*/ - /*printf("myRow: %d, ret: %x\n", my_row, ~ret);*/ - /*}*/ - /*if (ret) {*/ - /*printf("HAHA %d %d, %x\n", blockIdx.x, threadIdx.x, ret);*/ - /*}*/ } template @@ -413,6 +413,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, __shared__ uint32_t current_valid_boxes; bool valid = true; const uint32_t full_mask = 0xFFFFFFFF; + const uint32_t my_warp_mask = 1 << my_lane; uint32_t valid_boxes; #pragma unroll @@ -422,27 +423,15 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, full_mask; if (my_warp == i) { valid_boxes = __ballot_sync(full_mask, valid); - /*if (i == 0) {*/ - /*printf("After ballot %d: %x\n", threadIdx.x, valid_boxes);*/ - /*}*/ #pragma unroll for (int j = 0; j < warp_size; ++j) { -// if ((valid_boxes & (1 << j)) != 0) { const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); const uint32_t p = (1 << (j+1)) - 1; const uint32_t mp = mask | p; valid_boxes = valid_boxes & mp; - valid = (valid_boxes & (1 << my_lane)) != 0; - /*if (i == 0) {*/ - /*printf("After %d %d: %x %x %x\n",j, threadIdx.x, mask, p,valid_boxes);*/ - /*}*/ -// } + valid = (valid_boxes & my_warp_mask) != 0; } - valid = (valid_boxes & (1 << my_lane)) != 0; - /*if (i == 0) {*/ - /*printf("After everything %d: %x\n", threadIdx.x, valid_boxes);*/ - /*}*/ - /*printf("%d, %d: %u\n", i, threadIdx.x, valid_boxes);*/ + // valid = (valid_boxes & my_warp_mask) != 0; if (my_lane == 0) { current_valid_boxes = valid_boxes; } @@ -456,7 +445,7 @@ 
__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - valid = (valid_boxes & (1 << my_lane)) != 0; + // valid = (valid_boxes & my_warp_mask) != 0; if (!valid) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } @@ -492,9 +481,6 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, const bool no_hit = (valid_boxes & (~my_mask)) == 0; valid = valid && no_hit; - /*if (my_element_in_batch == 645) {*/ - /*printf("my_mask %x valid_boxes %x no_hit %d valid %d\n", my_mask, valid_boxes, (int)no_hit, (int)valid);*/ - /*}*/ } if (!valid) { @@ -514,16 +500,14 @@ TempWorkspace GetWorkspace(const int num_batch, // Get the workspace size workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); - workspace.batch_temp_space = align(num_batch * num_elem * sizeof(index_t), alignment); workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); - WorkspaceForSort(num_batch, num_elem, width_elem, alignment, &workspace); + WorkspaceForSort(num_elem, width_elem, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * num_batch * topk * sizeof(uint32_t), alignment); const index_t workspace_size = workspace.scores_temp_space + - workspace.batch_temp_space + workspace.scratch_space + workspace.nms_scratch_space + workspace.indices_temp_spaces; @@ -534,10 +518,8 @@ TempWorkspace GetWorkspace(const int num_batch, // Populate workspace pointers workspace.scores = scratch_memory.dptr_; - workspace.batches = reinterpret_cast(reinterpret_cast(workspace.scores) + - workspace.scores_temp_space); - workspace.scratch = reinterpret_cast(workspace.batches) + - workspace.batch_temp_space; + workspace.scratch = reinterpret_cast(workspace.scores) + + workspace.scores_temp_space; workspace.buffer = reinterpret_cast(workspace.scratch + workspace.scratch_space); workspace.nms_scratch = reinterpret_cast( @@ -564,20 +546,30 @@ void CompactNMSResults(const Tensor& data, Tensor* out, Tensor* indices, Tensor* scores, + Tensor* scratch, const int score_index, Stream* s) { + using mshadow::Shape1; constexpr int n_threads = 512; const index_t num_elements = scores->shape_.Size(); const index_t num_elements_per_batch = data.shape_[1]; + const index_t num_batches = data.shape_[0]; const int element_width = data.shape_[2]; const index_t n_blocks = ceil_div(num_elements, n_threads); ExtractScores_kernel<<::GetStream(s)>>>( data.dptr_, scores->dptr_, num_elements, element_width, score_index); *indices = mshadow::expr::range(0, num_elements); - //mxnet::op::SortByKey(scores, indices, false, &scratch); - //batches = indices / mshadow::expr::ScalarExp(num_elements_per_batch); - //mxnet::op::SortByKey(batches, indices, true, &scratch); - CompactData(*indices, data, out, -1, s); + for (index_t i = 0; i < num_batches; ++i) { + // Sort each batch separately + Tensor scores_batch(scores->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch); + } + CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -602,7 +594,9 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, int num_elem = 
in_shape[indim - 2]; int width_elem = in_shape[indim - 1]; - MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { + //MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, + using DType = float; + { Tensor data = inputs[box_nms_enum::kData] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); Tensor out = outputs[box_nms_enum::kOut] @@ -622,7 +616,6 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); - Tensor batches(workspace.batches, Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); @@ -633,14 +626,32 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, topk * num_batch), s); indices = mshadow::expr::range(0, num_batch * num_elem); - mxnet::op::SortByKey(scores, indices, false, &scratch); - batches = indices / mshadow::expr::ScalarExp(num_elem); - mxnet::op::SortByKey(batches, indices, true, &scratch); - CompactData(indices, out, &buffer, topk, s); + for (index_t i = 0; i < num_batch; ++i) { + // Sort each batch separately + Tensor scores_batch(scores.dptr_ + i * num_elem, + Shape1(num_elem), + s); + Tensor indices_batch(indices.dptr_ + i * num_elem, + Shape1(num_elem), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch); + } + CompactData(indices, out, &buffer, topk, -1, s); NMS nms; nms(&buffer, &nms_scratch, topk, param, s); - CompactNMSResults(buffer, &out, &indices, &scores, param.score_index, s); - }); + CompactNMSResults(buffer, &out, &indices, &scores, &scratch, param.score_index, s); + + // convert encoding + if (param.in_format != param.out_format) { + if (box_common_enum::kCenter == param.out_format) { + mxnet::op::mxnet_op::Kernel::Launch(s, num_batch * num_elem, + out.dptr_ + param.coord_start, width_elem); + } else { + mxnet::op::mxnet_op::Kernel::Launch(s, num_batch * num_elem, + out.dptr_ + param.coord_start, width_elem); + } + } + };//); } void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, @@ -653,11 +664,6 @@ void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, using namespace mxnet_op; CHECK_EQ(inputs.size(), 1U); CHECK_EQ(outputs.size(), 2U) << "BoxNMS output: [output, temp]"; - std::cout << "Reqs" << std::endl; - for (const auto& r : req) { - std::cout << r << std::endl; - } - std::cout << "END: Reqs" << std::endl; if (req[1] == kNullOp) { BoxNMSForwardGPU_notemp(attrs, ctx, inputs, req, outputs); return; From f3445f872f7908cb6bb896dce4008bbb2a3ffb4b Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 13:11:16 -0700 Subject: [PATCH 04/19] Optimization --- src/operator/contrib/bounding_box.cu | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f87ce148313a..228c1273e156 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -416,10 +416,15 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const uint32_t my_warp_mask = 1 << my_lane; uint32_t valid_boxes; + uint32_t my_next_mask = my_element_in_batch < topk ? + nms_results[my_element]: + full_mask; #pragma unroll for (int i = 0; i < n_threads / warp_size; ++i) { - const uint32_t my_mask = my_element_in_batch < topk ? 
- nms_results[i * topk * num_batches + my_element]: + const uint32_t my_mask = my_next_mask; + my_next_mask = (((i + 1) < n_threads / warp_size) && + (my_element_in_batch < topk)) ? + nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; if (my_warp == i) { valid_boxes = __ballot_sync(full_mask, valid); @@ -431,7 +436,6 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, valid_boxes = valid_boxes & mp; valid = (valid_boxes & my_warp_mask) != 0; } - // valid = (valid_boxes & my_warp_mask) != 0; if (my_lane == 0) { current_valid_boxes = valid_boxes; } @@ -445,7 +449,6 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - // valid = (valid_boxes & my_warp_mask) != 0; if (!valid) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } From e653f58bd9027bfc6ac2c979c3f8b816cc5cc4dc Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 14:45:46 -0700 Subject: [PATCH 05/19] Adding out-of-place ability to SortByKey --- src/operator/tensor/sort_op-inl.cuh | 130 +++++++++++++++++++++------- src/operator/tensor/sort_op.h | 50 +++++++++-- 2 files changed, 145 insertions(+), 35 deletions(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index b20b466d9c2b..63ccc5752fb0 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -95,13 +95,22 @@ SortPairsWorkspaceSize(const size_t num_keys) { template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys) { +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place, + const bool values_in_place) { #ifdef SORT_WITH_THRUST return 0; #else size_t keys_bytes, values_bytes; WorkspaceSize4KeysAndValues(num_keys, &keys_bytes, &values_bytes); - return keys_bytes + values_bytes + SortPairsWorkspaceSize(num_keys); + size_t ret = SortPairsWorkspaseSize(num_keys); + if (keys_in_place) { + ret += keys_bytes; + } + if (values_in_place) { + ret += values_bytes; + } + return ret; #endif } @@ -111,7 +120,9 @@ inline typename std::enable_if::val SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 7000 @@ -135,11 +146,23 @@ SortByKeyImpl(mshadow::Tensor keys, NULL, NULL, NULL, NULL, keys.size(0), begin_bit, end_bit, stream); } + + size_t required_storage = sortpairs_bytes + + (sorted_keys == nullptr ? keys_bytes : 0) + + (sorted_values == nullptr ? values_bytes : 0); + // Check that we have enough storage - CHECK_GE(workspace->size(0), keys_bytes + values_bytes + sortpairs_bytes); - // - KDType* keys_out_ptr = reinterpret_cast(workspace->dptr_); - VDType* values_out_ptr = reinterpret_cast(workspace->dptr_ + keys_bytes); + CHECK_GE(workspace->size(0), required_storage) + << "Workspace given to SortByKey is too small: requested " << required_storage << + " B and got " << workspace->size(0) << " B."; + + KDType* keys_out_ptr = sorted_keys == nullptr ? + reinterpret_cast(workspace->dptr_) : + sorted_keys->dptr_; + VDType* values_out_ptr = sorted_values == nullptr ? 
+ reinterpret_cast(workspace->dptr_ + keys_bytes) : + sorted_values->dptr_; + void* temp_storage = reinterpret_cast(workspace->dptr_ + keys_bytes + values_bytes); // Sort if (is_ascend) { @@ -152,17 +175,31 @@ SortByKeyImpl(mshadow::Tensor keys, keys.size(0), begin_bit, end_bit, stream); } // Copy result back to [keys, values] - mshadow::Tensor keys_out(keys_out_ptr, mshadow::Shape1(keys.size(0)), - keys.stream_); - mshadow::Tensor values_out(values_out_ptr, mshadow::Shape1(keys.size(0)), - keys.stream_); - mshadow::Copy(keys, keys_out, keys.stream_); - mshadow::Copy(values, values_out, values.stream_); + if (sorted_keys == nullptr) { + mshadow::Tensor keys_out(keys_out_ptr, mshadow::Shape1(keys.size(0)), + keys.stream_); + mshadow::Copy(keys, keys_out, keys.stream_); + } + if (sorted_values == nullptr) { + mshadow::Tensor values_out(values_out_ptr, mshadow::Shape1(keys.size(0)), + keys.stream_); + mshadow::Copy(values, values_out, values.stream_); + } } else { #endif // SORT_WITH_THRUST // No workspace, sort using thrust - thrust::device_ptr key_iter = thrust::device_pointer_cast(keys.dptr_); - thrust::device_ptr value_iter = thrust::device_pointer_cast(values.dptr_); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(k->dptr_); + const auto value_iter = thrust::device_pointer_cast(v->dptr_); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -187,14 +224,25 @@ inline typename std::enable_if<((!std::is_same::va SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr key_iter = thrust::device_pointer_cast(keys.dptr_); - thrust::device_ptr<__half> value_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(values.dptr_)); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(k->dptr_); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -216,14 +264,25 @@ inline typename std::enable_if<(std::is_same::valu SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr<__half> key_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(keys.dptr_)); - thrust::device_ptr value_iter = thrust::device_pointer_cast(values.dptr_); + auto* k = &keys; + auto* v 
= &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(v->dptr_); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -246,15 +305,25 @@ inline typename std::enable_if<(std::is_same::valu SortByKeyImpl(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); #if CUDA_VERSION >= 9000 cudaStream_t stream = mshadow::Stream::GetStream(keys.stream_); - thrust::device_ptr<__half> key_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(keys.dptr_)); - thrust::device_ptr<__half> value_iter = thrust::device_pointer_cast( - reinterpret_cast<__half*>(values.dptr_)); + auto* k = &keys; + auto* v = &values; + if (sorted_keys != nullptr) { + k = sorted_keys; + mshadow::Copy(*sorted_keys, keys, keys.stream_); + } + if (sorted_values != nullptr) { + v = sorted_values; + mshadow::Copy(*sorted_values, values, values.stream_); + } + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), @@ -274,7 +343,10 @@ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, const int begin_bit, const int end_bit) { - SortByKeyImpl(keys, values, is_ascend, workspace, begin_bit, end_bit); + const int begin_bit, const int end_bit, + mshadow::Tensor* sorted_keys, + mshadow::Tensor* sorted_values) { + SortByKeyImpl(keys, values, is_ascend, workspace, begin_bit, end_bit, sorted_keys, sorted_values); } } // namespace op diff --git a/src/operator/tensor/sort_op.h b/src/operator/tensor/sort_op.h index 6d4675a0775a..11aea9db09ec 100644 --- a/src/operator/tensor/sort_op.h +++ b/src/operator/tensor/sort_op.h @@ -49,11 +49,17 @@ namespace op { * \param keys the keys to sort * \param values the values that sorts w.r.t the key * \param is_ascend whether to sort key in ascending order + * \param begin_bit The beginning bit of the different values in keys. Default 0. + * \param end_bit The ending bit of the different values in keys. Default to 8 * sizeof(dtype of key). + * \param sorted_keys If specified, keys will be sorted out of place. + * \param sorted_values If specified, values will be sorted out of place. 
*/ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend = true, mshadow::Tensor* workspace = NULL, - const int begin_bit = 0, const int end_bit = sizeof(KDType)*8) { + const int begin_bit = 0, const int end_bit = sizeof(KDType)*8, + mshadow::Tensor* sorted_keys = nullptr, + mshadow::Tensor* sorted_values = nullptr) { CHECK_EQ(keys.CheckContiguous(), true); CHECK_EQ(values.CheckContiguous(), true); CHECK_EQ(keys.size(0), values.size(0)) @@ -62,6 +68,12 @@ inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor idx(keys.size(0)); std::vector keys_vec(keys.size(0)); std::vector values_vec(values.size(0)); + if (sorted_keys == nullptr) { + sorted_keys = &keys; + } + if (sorted_values == nullptr) { + sorted_values = &values; + } for (index_t i = 0; i < keys.size(0); i++) { idx[i] = i; keys_vec[i] = keys[i]; @@ -77,18 +89,28 @@ inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor keys_vec[i2]; }); } for (index_t i = 0; i < values.size(0); i++) { - keys[i] = keys_vec[idx[i]]; - values[i] = values_vec[idx[i]]; + (*sorted_keys)[i] = keys_vec[idx[i]]; + (*sorted_values)[i] = values_vec[idx[i]]; } } /*! * \brief CPU/GPU: Return the amount of temporary storage in bytes required for SortByKey * \param num_keys number of keys to sort + * \param keys_in_place Whether the sorting of keys will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_keys parameter. + * \param values_in_place Whether the sorting of values will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_values parameter. */ template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys) { +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place = true, + const bool values_in_place = true) { return 0; } @@ -97,18 +119,34 @@ SortByKeyWorkspaceSize(const size_t num_keys) { * \param keys the keys to sort * \param values the values that sorts w.r.t the key * \param is_ascend whether to sort key in ascending order + * \param begin_bit The beginning bit of the different values in keys. Default 0. + * \param end_bit The ending bit of the different values in keys. Default to 8 * sizeof(dtype of key). + * \param sorted_keys If specified, keys will be sorted out of place. + * \param sorted_values If specified, values will be sorted out of place. */ template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend = true, mshadow::Tensor* workspace = NULL, - const int begin_bit = 0, const int end_bit = sizeof(KDType)*8); + const int begin_bit = 0, const int end_bit = sizeof(KDType)*8, + mshadow::Tensor* sorted_keys = nullptr, + mshadow::Tensor* sorted_values = nullptr); /*! * \brief CPU/GPU: Return the amount of temporary storage in bytes required for SortByKey * \param num_keys number of keys to sort + * \param keys_in_place Whether the sorting of keys will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_keys parameter. + * \param values_in_place Whether the sorting of values will happen in place. + * Default true. If set to false, subsequent + * call to SortByKey needs to specify the + * sorted_values parameter. 
*/ template inline typename std::enable_if::value, size_t>::type -SortByKeyWorkspaceSize(const size_t num_keys); +SortByKeyWorkspaceSize(const size_t num_keys, + const bool keys_in_place = true, + const bool values_in_place = true); } // namespace op } // namespace mxnet From 49ded5a64e2d1a7b3f0ff2b6a3d2fcde2971e480 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 17 Oct 2019 15:36:44 -0700 Subject: [PATCH 06/19] Optimization pt2 --- src/operator/contrib/bounding_box.cu | 66 +++++++++++++++++++--------- src/operator/tensor/sort_op-inl.cuh | 11 +++-- 2 files changed, 53 insertions(+), 24 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 228c1273e156..bcd31a7ad657 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -170,7 +170,7 @@ void WorkspaceForSort(const int num_elem, const int width_elem, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem); + const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); workspace->scratch_space = align(sort_scores_temp_space, alignment); } @@ -411,9 +411,10 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const int my_lane = threadIdx.x % warp_size; __shared__ uint32_t current_valid_boxes; - bool valid = true; const uint32_t full_mask = 0xFFFFFFFF; - const uint32_t my_warp_mask = 1 << my_lane; + const uint32_t my_lane_mask = 1 << my_lane; + const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; + uint32_t valid = my_lane_mask; uint32_t valid_boxes; uint32_t my_next_mask = my_element_in_batch < topk ? @@ -421,35 +422,35 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, full_mask; #pragma unroll for (int i = 0; i < n_threads / warp_size; ++i) { - const uint32_t my_mask = my_next_mask; + uint32_t my_mask = my_next_mask; my_next_mask = (((i + 1) < n_threads / warp_size) && (my_element_in_batch < topk)) ? nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; if (my_warp == i) { - valid_boxes = __ballot_sync(full_mask, valid); + my_mask = my_mask | my_warp_mask; + // Loop over warp_size - 1 because the last + // thread does not contribute to the mask anyway #pragma unroll - for (int j = 0; j < warp_size; ++j) { - const uint32_t mask = __shfl_sync(full_mask, valid?my_mask:full_mask, j); - const uint32_t p = (1 << (j+1)) - 1; - const uint32_t mp = mask | p; - valid_boxes = valid_boxes & mp; - valid = (valid_boxes & my_warp_mask) != 0; + for (int j = 0; j < warp_size - 1; ++j) { + const uint32_t mask = __shfl_sync(full_mask, valid ? 
my_mask : full_mask, j); + valid = valid & mask; } + valid_boxes = __ballot_sync(full_mask, valid); if (my_lane == 0) { current_valid_boxes = valid_boxes; } } __syncthreads(); - if (my_warp > i) { - valid = valid && (((~my_mask) & current_valid_boxes) == 0); + if ((my_warp > i) && (((~my_mask) & current_valid_boxes) != 0)) { + valid = 0; } __syncthreads(); } if (my_lane == 0) { nms_results[my_element] = valid_boxes; } - if (!valid) { + if (valid == 0) { data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; } } @@ -502,8 +503,8 @@ TempWorkspace GetWorkspace(const int num_batch, const int alignment = 128; // Get the workspace size - workspace.scores_temp_space = align(num_batch * num_elem * sizeof(DType), alignment); - workspace.indices_temp_spaces = align(num_batch * num_elem * sizeof(index_t), alignment); + workspace.scores_temp_space = 2 * align(num_batch * num_elem * sizeof(DType), alignment); + workspace.indices_temp_spaces = 2 * align(num_batch * num_elem * sizeof(index_t), alignment); WorkspaceForSort(num_elem, width_elem, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); @@ -549,6 +550,8 @@ void CompactNMSResults(const Tensor& data, Tensor* out, Tensor* indices, Tensor* scores, + Tensor* sorted_indices, + Tensor* sorted_scores, Tensor* scratch, const int score_index, Stream* s) { @@ -570,9 +573,18 @@ void CompactNMSResults(const Tensor& data, Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, Shape1(num_elements_per_batch), s); - mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch); + Tensor sorted_scores_batch(sorted_scores->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + Tensor sorted_indices_batch(sorted_indices->dptr_ + i * num_elements_per_batch, + Shape1(num_elements_per_batch), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch, + 0, 8 * sizeof(DType), &sorted_scores_batch, + &sorted_indices_batch); } - CompactData(*indices, data, out, -1, score_index, s); + CompactData(*sorted_indices, data, out, -1, score_index, s); + //CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -619,7 +631,9 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); + Tensor sorted_scores(workspace.scores + scores.MSize(), Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); + Tensor sorted_indices(workspace.indices + indices.MSize(), Shape1(num_batch * num_elem), s); Tensor scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); Tensor buffer(workspace.buffer, @@ -637,12 +651,22 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, Tensor indices_batch(indices.dptr_ + i * num_elem, Shape1(num_elem), s); - mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch); + Tensor sorted_scores_batch(sorted_scores.dptr_ + i * num_elem, + Shape1(num_elem), + s); + Tensor sorted_indices_batch(sorted_indices.dptr_ + i * num_elem, + Shape1(num_elem), + s); + mxnet::op::SortByKey(scores_batch, indices_batch, false, &scratch, 0, + 8 * sizeof(DType), &sorted_scores_batch, + &sorted_indices_batch); } - CompactData(indices, out, &buffer, topk, -1, s); + CompactData(sorted_indices, out, &buffer, topk, -1, s); + /*CompactData(indices, out, &buffer, topk, -1, s);*/ NMS 
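The doubled `scores_temp_space` and `indices_temp_spaces` above make room for both the unsorted tensors and their out-of-place sorted copies (`sorted_scores`, `sorted_indices`), all carved out of one temporary arena. A stand-alone sketch of that carve-out pattern, mirroring the `ceil_div`/`align` helpers in this file; the sizes and the 128-byte alignment are only illustrative.

#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr std::size_t ceil_div(std::size_t x, std::size_t y) { return (x + y - 1) / y; }
constexpr std::size_t align(std::size_t x, std::size_t a) { return ceil_div(x, a) * a; }

int main() {
  const std::size_t alignment = 128;
  const std::size_t num_batch = 4, num_elem = 1000, width_elem = 6;
  // Two copies of scores and indices: the unsorted input plus the out-of-place sorted output.
  const std::size_t scores_bytes  = 2 * align(num_batch * num_elem * sizeof(float), alignment);
  const std::size_t indices_bytes = 2 * align(num_batch * num_elem * sizeof(std::int64_t), alignment);
  const std::size_t buffer_bytes  = align(num_batch * num_elem * width_elem * sizeof(float), alignment);
  std::size_t offset = 0;
  const std::size_t scores_offset  = offset; offset += scores_bytes;
  const std::size_t indices_offset = offset; offset += indices_bytes;
  const std::size_t buffer_offset  = offset; offset += buffer_bytes;
  std::printf("total %zu B; scores@%zu, indices@%zu, buffer@%zu\n",
              offset, scores_offset, indices_offset, buffer_offset);
}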
nms; nms(&buffer, &nms_scratch, topk, param, s); - CompactNMSResults(buffer, &out, &indices, &scores, &scratch, param.score_index, s); + CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, + &sorted_scores, &scratch, param.score_index, s); // convert encoding if (param.in_format != param.out_format) { diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 63ccc5752fb0..1a2f2db9b953 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -156,14 +156,19 @@ SortByKeyImpl(mshadow::Tensor keys, << "Workspace given to SortByKey is too small: requested " << required_storage << " B and got " << workspace->size(0) << " B."; + size_t start_keys = 0; + size_t start_values = start_keys + + sorted_keys == nullptr ? keys_bytes : 0; + size_t start_scratch = start_values + + sorted_values == nullptr ? values_bytes : 0; KDType* keys_out_ptr = sorted_keys == nullptr ? - reinterpret_cast(workspace->dptr_) : + reinterpret_cast(workspace->dptr_ + start_keys) : sorted_keys->dptr_; VDType* values_out_ptr = sorted_values == nullptr ? - reinterpret_cast(workspace->dptr_ + keys_bytes) : + reinterpret_cast(workspace->dptr_ + start_values) : sorted_values->dptr_; - void* temp_storage = reinterpret_cast(workspace->dptr_ + keys_bytes + values_bytes); + void* temp_storage = reinterpret_cast(workspace->dptr_ + start_scratch); // Sort if (is_ascend) { cub::DeviceRadixSort::SortPairs(temp_storage, sortpairs_bytes, From 3680e1d8ca8187c7770a82c134a7e792efa47e8e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 13:28:18 -0700 Subject: [PATCH 07/19] Optimizations pt3 --- src/operator/contrib/bounding_box.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index bcd31a7ad657..17115291250e 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -410,7 +410,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const int my_warp = threadIdx.x / warp_size; const int my_lane = threadIdx.x % warp_size; - __shared__ uint32_t current_valid_boxes; + __shared__ uint32_t current_valid_boxes[n_threads / warp_size]; const uint32_t full_mask = 0xFFFFFFFF; const uint32_t my_lane_mask = 1 << my_lane; const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; @@ -437,15 +437,14 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, valid = valid & mask; } valid_boxes = __ballot_sync(full_mask, valid); - if (my_lane == 0) { - current_valid_boxes = valid_boxes; - } + } + if (my_lane == 0 && my_warp == i) { + current_valid_boxes[i] = valid_boxes; } __syncthreads(); - if ((my_warp > i) && (((~my_mask) & current_valid_boxes) != 0)) { + if ((my_warp > i) && (((~my_mask) & current_valid_boxes[i]) != 0)) { valid = 0; } - __syncthreads(); } if (my_lane == 0) { nms_results[my_element] = valid_boxes; From 1978b07c005d9f5816af4aaae88898dd24626730 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 13:46:10 -0700 Subject: [PATCH 08/19] Do not recompute other boxes area every time --- src/operator/contrib/bounding_box.cu | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 17115291250e..ce672db1416a 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -331,6 +331,7 @@ __global__ void 
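Setting PATCH 08 aside for a moment, here is a scalar model, not part of the patch, of what the reworked `ReduceNMSResultTriangle_kernel` from PATCH 06/07 computes for one 32-box chunk: each box carries a bitmask of the boxes that overlap it above the threshold, and a box survives only if no earlier surviving box suppresses it. The warp code reaches the same result with `__shfl_sync`/`__ballot_sync` instead of this serial scan.

#include <cstdint>
#include <cstdio>

// suppress[i] has bit j set when box j (j < i, i.e. higher score) overlaps box i
// above the NMS threshold. Returns the mask of boxes that survive greedy NMS.
std::uint32_t greedy_keep_mask(const std::uint32_t suppress[32]) {
  std::uint32_t kept = 0;
  for (int i = 0; i < 32; ++i) {
    const bool removed = (suppress[i] & kept) != 0;  // killed by an earlier surviving box?
    if (!removed) kept |= (1u << i);
  }
  return kept;
}

int main() {
  std::uint32_t suppress[32] = {0};
  suppress[1] = 1u << 0;  // box 1 overlaps box 0 -> box 1 is dropped
  suppress[2] = 1u << 1;  // box 2 only overlaps the already-dropped box 1 -> box 2 survives
  std::printf("kept mask = 0x%08x\n",
              static_cast<unsigned>(greedy_keep_mask(suppress)));  // expect 0xfffffffd
}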
CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re constexpr int max_elem_width = 20; constexpr int num_other_boxes = sizeof(uint32_t) * 8; __shared__ DType other_boxes[max_elem_width * num_other_boxes]; + __shared__ DType other_boxes_areas[num_other_boxes]; const index_t my_row = blockIdx.x / num_blocks_per_row; const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; @@ -346,6 +347,17 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } __syncthreads(); + if (threadIdx.x < num_other_boxes) { + const int other_boxes_offset = element_width * threadIdx.x; + const DType their_area = calculate_area( + other_boxes[other_boxes_offset + coord_index + 0], + other_boxes[other_boxes_offset + coord_index + 1], + other_boxes[other_boxes_offset + coord_index + 2], + other_boxes[other_boxes_offset + coord_index + 3]); + other_boxes_areas[threadIdx.x] = their_area; + } + __syncthreads(); + if (my_element_in_batch >= topk) return; DType my_box[4]; @@ -368,13 +380,9 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re #pragma unroll for (int i = 0; i < num_other_boxes; ++i) { const int other_boxes_offset = element_width * i; - if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && + if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && other_boxes[other_boxes_offset + score_index] != -1){ - const DType their_area = calculate_area( - other_boxes[other_boxes_offset + coord_index + 0], - other_boxes[other_boxes_offset + coord_index + 1], - other_boxes[other_boxes_offset + coord_index + 2], - other_boxes[other_boxes_offset + coord_index + 3]); + const DType their_area = other_boxes_areas[i]; const DType intersect = calculate_intersection( my_box[0], my_box[1], my_box[2], my_box[3], @@ -382,8 +390,8 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re other_boxes[other_boxes_offset + coord_index + 1], other_boxes[other_boxes_offset + coord_index + 2], other_boxes[other_boxes_offset + coord_index + 3]); - const DType iou = intersect / (my_area + their_area - intersect); - if (iou > threshold) { + //const DType iou = intersect / (my_area + their_area - intersect); + if (intersect > threshold * (my_area + their_area - intersect)) { ret = ret | (1u << i); } } From dfaa4c5d114c689cb6f90aff7d48c9a71020eda1 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 14:55:11 -0700 Subject: [PATCH 09/19] Sort only topk results during second sorting --- src/operator/contrib/bounding_box.cu | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index ce672db1416a..331646fb3c68 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -166,12 +166,13 @@ void CompactData(const Tensor& indices, } template -void WorkspaceForSort(const int num_elem, - const int width_elem, +void WorkspaceForSort(const index_t num_elem, + const index_t topk, const int alignment, TempWorkspace* workspace) { const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); - workspace->scratch_space = align(sort_scores_temp_space, + const index_t sort_topk_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(topk, false, false); + workspace->scratch_space = 
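The `intersect > threshold * (my_area + their_area - intersect)` test introduced in PATCH 08 above is the IoU threshold with the division removed: for a non-negative threshold and a positive union area, `intersect / union > threshold` and `intersect > threshold * union` accept exactly the same pairs, so the per-pair divide can be dropped. A tiny check of that equivalence; the numbers are made up.

#include <cassert>

template <typename T>
bool iou_above(T intersect, T area_a, T area_b, T threshold) {
  const T union_area = area_a + area_b - intersect;
  return intersect > threshold * union_area;  // same decision as intersect / union_area > threshold
}

int main() {
  assert(iou_above(0.06f, 0.10f, 0.10f, 0.4f));   // IoU ~= 0.43 > 0.4
  assert(!iou_above(0.02f, 0.10f, 0.10f, 0.4f));  // IoU ~= 0.11 < 0.4
  return 0;
}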
align(std::max(sort_scores_temp_space, sort_topk_scores_temp_space), alignment); } @@ -500,8 +501,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, } template -TempWorkspace GetWorkspace(const int num_batch, - const int num_elem, +TempWorkspace GetWorkspace(const index_t num_batch, + const index_t num_elem, const int width_elem, const index_t topk, const OpContext& ctx) { @@ -512,7 +513,7 @@ TempWorkspace GetWorkspace(const int num_batch, // Get the workspace size workspace.scores_temp_space = 2 * align(num_batch * num_elem * sizeof(DType), alignment); workspace.indices_temp_spaces = 2 * align(num_batch * num_elem * sizeof(index_t), alignment); - WorkspaceForSort(num_elem, width_elem, alignment, &workspace); + WorkspaceForSort(num_elem, topk, alignment, &workspace); // Place for a buffer workspace.buffer_space = align(num_batch * num_elem * width_elem * sizeof(DType), alignment); workspace.nms_scratch_space = align(NMS::THRESHOLD / (sizeof(uint32_t) * 8) * @@ -561,6 +562,7 @@ void CompactNMSResults(const Tensor& data, Tensor* sorted_scores, Tensor* scratch, const int score_index, + const index_t topk, Stream* s) { using mshadow::Shape1; constexpr int n_threads = 512; @@ -575,22 +577,22 @@ void CompactNMSResults(const Tensor& data, for (index_t i = 0; i < num_batches; ++i) { // Sort each batch separately Tensor scores_batch(scores->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor indices_batch(indices->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor sorted_scores_batch(sorted_scores->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); Tensor sorted_indices_batch(sorted_indices->dptr_ + i * num_elements_per_batch, - Shape1(num_elements_per_batch), + Shape1(topk), s); mxnet::op::SortByKey(scores_batch, indices_batch, false, scratch, 0, 8 * sizeof(DType), &sorted_scores_batch, &sorted_indices_batch); } - CompactData(*sorted_indices, data, out, -1, score_index, s); + CompactData(*sorted_indices, data, out, topk, score_index, s); //CompactData(*indices, data, out, -1, score_index, s); } @@ -673,7 +675,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, NMS nms; nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, - &sorted_scores, &scratch, param.score_index, s); + &sorted_scores, &scratch, param.score_index, topk, s); // convert encoding if (param.in_format != param.out_format) { From 9069192cc0818f6996bee3824f913e509f9fc1eb Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 15:29:40 -0700 Subject: [PATCH 10/19] Cleaning --- src/operator/contrib/bounding_box.cu | 33 ++++++++++++++++------------ 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 331646fb3c68..33698e7abeeb 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -170,8 +170,10 @@ void WorkspaceForSort(const index_t num_elem, const index_t topk, const int alignment, TempWorkspace* workspace) { - const index_t sort_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); - const index_t sort_topk_scores_temp_space = mxnet::op::SortByKeyWorkspaceSize(topk, false, false); + const index_t sort_scores_temp_space = + mxnet::op::SortByKeyWorkspaceSize(num_elem, false, false); + const index_t sort_topk_scores_temp_space = + 
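PATCH 09 above narrows the second sort to the first `topk` entries of every batch segment, since positions past `topk` were already compacted away. A plain-C++ stand-in, not the mshadow code, for those per-batch `SortByKey` calls over a `topk`-long prefix; the batch sizes and scores are invented.

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int num_batches = 2, num_elements_per_batch = 6, topk = 3;
  std::vector<float> scores = {0.2f, 0.9f, 0.7f, -1.f, -1.f, -1.f,   // batch 0
                               0.4f, 0.1f, 0.8f, -1.f, -1.f, -1.f};  // batch 1
  std::vector<int> indices(num_batches * num_elements_per_batch);
  for (int b = 0; b < num_batches; ++b) {
    float* s = &scores[b * num_elements_per_batch];
    int* idx = &indices[b * num_elements_per_batch];
    std::iota(idx, idx + topk, 0);
    // Descending by score, touching only the topk-long prefix of this batch.
    std::stable_sort(idx, idx + topk, [s](int a, int c) { return s[a] > s[c]; });
    std::printf("batch %d best: index %d (score %.1f)\n", b, idx[0], s[idx[0]]);
  }
}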
mxnet::op::SortByKeyWorkspaceSize(topk, false, false); workspace->scratch_space = align(std::max(sort_scores_temp_space, sort_topk_scores_temp_space), alignment); } @@ -337,7 +339,8 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re const index_t my_block_offset_in_row = blockIdx.x % num_blocks_per_row; const index_t my_block_offset_in_batch = my_block_offset_in_row % num_blocks_per_row_batch; const index_t my_batch = (my_block_offset_in_row) / num_blocks_per_row_batch; - const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + current_start + threadIdx.x; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + + current_start + threadIdx.x; // Load other boxes const index_t offset = (my_batch * num_elements_per_batch + @@ -459,7 +462,8 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, nms_results[my_element] = valid_boxes; } if (valid == 0) { - data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + + score_index] = -1; } } @@ -479,7 +483,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, constexpr int warp_size = 32; const index_t my_block_offset_in_batch = blockIdx.x % num_blocks_per_batch; const index_t my_batch = blockIdx.x / num_blocks_per_batch; - const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + start_index + NMS::THRESHOLD + threadIdx.x; + const index_t my_element_in_batch = my_block_offset_in_batch * blockDim.x + + start_index + NMS::THRESHOLD + threadIdx.x; const index_t my_element = my_batch * topk + my_element_in_batch; if (my_element_in_batch >= topk) return; @@ -496,7 +501,8 @@ __global__ void ReduceNMSResultRest_kernel(DType* data, } if (!valid) { - data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + score_index] = -1; + data[(my_batch * num_elements_per_batch + my_element_in_batch) * element_width + + score_index] = -1; } } @@ -593,7 +599,6 @@ void CompactNMSResults(const Tensor& data, &sorted_indices_batch); } CompactData(*sorted_indices, data, out, topk, score_index, s); - //CompactData(*indices, data, out, -1, score_index, s); } } // namespace @@ -618,13 +623,12 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, int num_elem = in_shape[indim - 2]; int width_elem = in_shape[indim - 1]; - //MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, - using DType = float; - { + MSHADOW_REAL_TYPE_SWITCH(outputs[0].type_flag_, DType, { Tensor data = inputs[box_nms_enum::kData] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); Tensor out = outputs[box_nms_enum::kOut] .get_with_shape(Shape3(num_batch, num_elem, width_elem), s); + // Special case for topk == 0 if (param.topk == 0) { if (req[0] != kNullOp && @@ -640,9 +644,11 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, FilterAndPrepareAuxData(data, &out, workspace, param, s); Tensor scores(workspace.scores, Shape1(num_batch * num_elem), s); - Tensor sorted_scores(workspace.scores + scores.MSize(), Shape1(num_batch * num_elem), s); + Tensor sorted_scores(workspace.scores + scores.MSize(), + Shape1(num_batch * num_elem), s); Tensor indices(workspace.indices, Shape1(num_batch * num_elem), s); - Tensor sorted_indices(workspace.indices + indices.MSize(), Shape1(num_batch * num_elem), s); + Tensor sorted_indices(workspace.indices + indices.MSize(), + Shape1(num_batch * num_elem), s); Tensor 
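Restoring `MSHADOW_REAL_TYPE_SWITCH` above removes the temporary `using DType = float` pin, so the forward path is again instantiated once per runtime dtype. A hand-rolled stand-in for that dispatch pattern; the enum values and the set of handled types are illustrative, not mshadow's.

#include <cstdio>

enum class TypeFlag { kFloat32, kFloat64, kFloat16 };

template <typename F>
void real_type_switch(TypeFlag flag, F&& body) {
  switch (flag) {
    case TypeFlag::kFloat32: body(float{});  break;
    case TypeFlag::kFloat64: body(double{}); break;
    default: std::printf("dtype not handled in this sketch\n"); break;
  }
}

int main() {
  real_type_switch(TypeFlag::kFloat64, [](auto zero) {
    using DType = decltype(zero);
    // The templated NMS body would run here with DType = double.
    std::printf("sizeof(DType) = %zu\n", sizeof(DType));
  });
}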
scratch(reinterpret_cast(workspace.scratch), Shape1(workspace.scratch_space), s); Tensor buffer(workspace.buffer, @@ -671,7 +677,6 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, &sorted_indices_batch); } CompactData(sorted_indices, out, &buffer, topk, -1, s); - /*CompactData(indices, out, &buffer, topk, -1, s);*/ NMS nms; nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, @@ -687,7 +692,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, out.dptr_ + param.coord_start, width_elem); } } - };//); + }); } void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, From ad60b565ea4a5780ae92433dfc75946e5331778f Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 20:51:47 -0700 Subject: [PATCH 11/19] Fixes from rebase --- src/operator/contrib/bounding_box.cc | 72 ---------------------------- src/operator/contrib/bounding_box.cu | 5 +- 2 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/operator/contrib/bounding_box.cc b/src/operator/contrib/bounding_box.cc index 9363eb09d5a4..5fd8a08e5f9e 100644 --- a/src/operator/contrib/bounding_box.cc +++ b/src/operator/contrib/bounding_box.cc @@ -34,78 +34,6 @@ DMLC_REGISTER_PARAMETER(BoxOverlapParam); DMLC_REGISTER_PARAMETER(BipartiteMatchingParam); DMLC_REGISTER_PARAMETER(BoxDecodeParam); -NNVM_REGISTER_OP(_contrib_box_nms2) -.describe(R"code(Apply non-maximum suppression to input. - -The output will be sorted in descending order according to `score`. Boxes with -overlaps larger than `overlap_thresh`, smaller scores and background boxes -will be removed and filled with -1, the corresponding position will be recorded -for backward propogation. - -During back-propagation, the gradient will be copied to the original -position according to the input index. For positions that have been suppressed, -the in_grad will be assigned 0. -In summary, gradients are sticked to its boxes, will either be moved or discarded -according to its original index in input. - -Input requirements:: - - 1. Input tensor have at least 2 dimensions, (n, k), any higher dims will be regarded - as batch, e.g. (a, b, c, d, n, k) == (a*b*c*d, n, k) - 2. n is the number of boxes in each batch - 3. k is the width of each box item. - -By default, a box is [id, score, xmin, ymin, xmax, ymax, ...], -additional elements are allowed. - -- `id_index`: optional, use -1 to ignore, useful if `force_suppress=False`, which means - we will skip highly overlapped boxes if one is `apple` while the other is `car`. - -- `background_id`: optional, default=-1, class id for background boxes, useful - when `id_index >= 0` which means boxes with background id will be filtered before nms. - -- `coord_start`: required, default=2, the starting index of the 4 coordinates. - Two formats are supported: - - - `corner`: [xmin, ymin, xmax, ymax] - - - `center`: [x, y, width, height] - -- `score_index`: required, default=1, box score/confidence. - When two boxes overlap IOU > `overlap_thresh`, the one with smaller score will be suppressed. - -- `in_format` and `out_format`: default='corner', specify in/out box formats. 
- -Examples:: - - x = [[0, 0.5, 0.1, 0.1, 0.2, 0.2], [1, 0.4, 0.1, 0.1, 0.2, 0.2], - [0, 0.3, 0.1, 0.1, 0.14, 0.14], [2, 0.6, 0.5, 0.5, 0.7, 0.8]] - box_nms(x, overlap_thresh=0.1, coord_start=2, score_index=1, id_index=0, - force_suppress=True, in_format='corner', out_typ='corner') = - [[2, 0.6, 0.5, 0.5, 0.7, 0.8], [0, 0.5, 0.1, 0.1, 0.2, 0.2], - [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]] - out_grad = [[0.1, 0.1, 0.1, 0.1, 0.1, 0.1], [0.2, 0.2, 0.2, 0.2, 0.2, 0.2], - [0.3, 0.3, 0.3, 0.3, 0.3, 0.3], [0.4, 0.4, 0.4, 0.4, 0.4, 0.4]] - # exe.backward - in_grad = [[0.2, 0.2, 0.2, 0.2, 0.2, 0.2], [0, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 0], [0.1, 0.1, 0.1, 0.1, 0.1, 0.1]] - -)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FNumVisibleOutputs", BoxNMSNumVisibleOutputs) -.set_attr("FInferShape", BoxNMSShape) -.set_attr("FInferType", ElemwiseType<1, 2>) -.set_attr("FResourceRequest", - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace}; - }) -.set_attr("THasDeterministicOutput", true) -.set_attr("FCompute", BoxNMSForward) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_contrib_box_nms"}) -.add_argument("data", "NDArray-or-Symbol", "The input") -.add_arguments(BoxNMSParam::__FIELDS__()); NNVM_REGISTER_OP(_contrib_box_nms) .add_alias("_contrib_box_non_maximum_suppression") diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 33698e7abeeb..f6aeea3efd2d 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -713,11 +713,8 @@ void BoxNMSForwardGPU(const nnvm::NodeAttrs& attrs, } -NNVM_REGISTER_OP(_contrib_box_nms2) -.set_attr("FCompute", BoxNMSForwardGPU); - NNVM_REGISTER_OP(_contrib_box_nms) -.set_attr("FCompute", BoxNMSForward); +.set_attr("FCompute", BoxNMSForwardGPU); NNVM_REGISTER_OP(_backward_contrib_box_nms) .set_attr("FCompute", BoxNMSBackward); From eb89b0730ca05d62294ba727a3e03fbe0a49e956 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:10:35 -0700 Subject: [PATCH 12/19] Fix lint and more fixes from rebase --- src/operator/contrib/bounding_box.cu | 6 +++--- src/operator/tensor/sort_op-inl.cuh | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index f6aeea3efd2d..b45cae1b2d27 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -24,10 +24,11 @@ * \author Joshua Zhang */ +#include + #include "./bounding_box-inl.cuh" #include "./bounding_box-inl.h" #include "../elemwise_op_common.h" -#include namespace mxnet { namespace op { @@ -385,7 +386,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re for (int i = 0; i < num_other_boxes; ++i) { const int other_boxes_offset = element_width * i; if ((class_index == -1 || my_class == other_boxes[other_boxes_offset + class_index]) && - other_boxes[other_boxes_offset + score_index] != -1){ + other_boxes[other_boxes_offset + score_index] != -1) { const DType their_area = other_boxes_areas[i]; const DType intersect = calculate_intersection( @@ -394,7 +395,6 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re other_boxes[other_boxes_offset + coord_index + 1], other_boxes[other_boxes_offset + coord_index + 2], other_boxes[other_boxes_offset + coord_index + 3]); - //const DType iou = intersect / (my_area + their_area - intersect); if (intersect > threshold * (my_area + 
their_area - intersect)) { ret = ret | (1u << i); } diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 1a2f2db9b953..39f7e0d7aabb 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -347,7 +347,6 @@ SortByKeyImpl(mshadow::Tensor keys, template inline void SortByKey(mshadow::Tensor keys, mshadow::Tensor values, bool is_ascend, mshadow::Tensor* workspace, - const int begin_bit, const int end_bit) { const int begin_bit, const int end_bit, mshadow::Tensor* sorted_keys, mshadow::Tensor* sorted_values) { From 2567266356e1b65d2f14472c7be645624ac5f719 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:58:40 -0700 Subject: [PATCH 13/19] Fix typo --- src/operator/tensor/sort_op-inl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 39f7e0d7aabb..4ecf0994e7b1 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -103,7 +103,7 @@ SortByKeyWorkspaceSize(const size_t num_keys, #else size_t keys_bytes, values_bytes; WorkspaceSize4KeysAndValues(num_keys, &keys_bytes, &values_bytes); - size_t ret = SortPairsWorkspaseSize(num_keys); + size_t ret = SortPairsWorkspaceSize(num_keys); if (keys_in_place) { ret += keys_bytes; } From 6ac371cafad3597ff82dbac47c8673da3f507935 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 21:19:14 -0700 Subject: [PATCH 14/19] Early exit in Triangle kernel --- src/operator/contrib/bounding_box.cu | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index b45cae1b2d27..45da87e4617b 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -425,9 +425,9 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, __shared__ uint32_t current_valid_boxes[n_threads / warp_size]; const uint32_t full_mask = 0xFFFFFFFF; const uint32_t my_lane_mask = 1 << my_lane; - const uint32_t my_warp_mask = (1 << (my_lane + 1)) - 1; + const uint32_t earlier_threads_mask = (1 << (my_lane + 1)) - 1; uint32_t valid = my_lane_mask; - uint32_t valid_boxes; + uint32_t valid_boxes = full_mask; uint32_t my_next_mask = my_element_in_batch < topk ? nms_results[my_element]: @@ -439,8 +439,8 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, (my_element_in_batch < topk)) ? 
nms_results[(i + 1) * topk * num_batches + my_element]: full_mask; - if (my_warp == i) { - my_mask = my_mask | my_warp_mask; + if (my_warp == i && !__all_sync(full_mask, my_mask == full_mask)) { + my_mask = my_mask | earlier_threads_mask; // Loop over warp_size - 1 because the last // thread does not contribute to the mask anyway #pragma unroll From b0ce511c43bfb683f106afd5bd8abba1d5d7665e Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 18 Oct 2019 22:02:42 -0700 Subject: [PATCH 15/19] Fixes --- src/operator/contrib/bounding_box.cu | 42 +++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 45da87e4617b..eb6e269e41a1 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -63,29 +63,33 @@ inline index_t align(index_t x, index_t alignment) { template __global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, index_t num_elements_per_batch, - const index_t element_width, const float threshold, + const index_t element_width, + const index_t N, + const float threshold, const int id_index, const int score_index, const int background_id) { index_t tid = blockIdx.x * blockDim.x + threadIdx.x; bool first_in_element = (tid % element_width == 0); index_t start_of_my_element = tid - (tid % element_width); - DType my_score = data[start_of_my_element + score_index]; - bool filtered_out = my_score <= threshold; - if (id_index != -1 && background_id != -1) { - DType my_id = data[start_of_my_element + id_index]; - filtered_out = filtered_out || (my_id == background_id); - } - if (!filtered_out) { - out[tid] = data[tid]; - } else { - out[tid] = -1; - my_score = -1; - } + if (tid < N) { + DType my_score = data[start_of_my_element + score_index]; + bool filtered_out = my_score <= threshold; + if (id_index != -1 && background_id != -1) { + DType my_id = data[start_of_my_element + id_index]; + filtered_out = filtered_out || (my_id == background_id); + } + if (!filtered_out) { + out[tid] = data[tid]; + } else { + out[tid] = -1; + my_score = -1; + } - if (first_in_element) { - index_t offset = tid / element_width; - scores[offset] = my_score; + if (first_in_element) { + index_t offset = tid / element_width; + scores[offset] = my_score; + } } } @@ -103,7 +107,7 @@ void FilterAndPrepareAuxData(const Tensor& data, 0, Stream::GetStream(s)>>>( data.dptr_, out->dptr_, workspace.scores, - data.shape_[1], data.shape_[2], + data.shape_[1], data.shape_[2], N, param.valid_thresh, param.id_index, param.score_index, param.background_id); } @@ -401,7 +405,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re } } } - result[my_row * topk * num_batches + my_element_in_batch] = ~ret; + result[(my_row * num_batches + my_batch) * topk + my_element_in_batch] = ~ret; } template @@ -678,7 +682,7 @@ void BoxNMSForwardGPU_notemp(const nnvm::NodeAttrs& attrs, } CompactData(sorted_indices, out, &buffer, topk, -1, s); NMS nms; - nms(&buffer, &nms_scratch, topk, param, s); + nms(&buffer, &nms_scratch, topk, param, s); CompactNMSResults(buffer, &out, &indices, &scores, &sorted_indices, &sorted_scores, &scratch, param.score_index, topk, s); From 753e7c0eea2b28c6cdfd131b0b1c77ee9e7b7f37 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:23:52 -0700 Subject: [PATCH 16/19] Fix sort --- src/operator/tensor/sort_op-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index 4ecf0994e7b1..f30bbf3b8433 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -158,9 +158,9 @@ SortByKeyImpl(mshadow::Tensor keys, size_t start_keys = 0; size_t start_values = start_keys + - sorted_keys == nullptr ? keys_bytes : 0; + (sorted_keys == nullptr ? keys_bytes : 0); size_t start_scratch = start_values + - sorted_values == nullptr ? values_bytes : 0; + (sorted_values == nullptr ? values_bytes : 0); KDType* keys_out_ptr = sorted_keys == nullptr ? reinterpret_cast(workspace->dptr_ + start_keys) : sorted_keys->dptr_; From 8cb9a5bbb7f99f1ceb893ddc86b8b34d5f0c9b61 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:50:02 -0700 Subject: [PATCH 17/19] Fix from rebase --- src/operator/tensor/sort_op-inl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/operator/tensor/sort_op-inl.cuh b/src/operator/tensor/sort_op-inl.cuh index f30bbf3b8433..c157de99a4e1 100644 --- a/src/operator/tensor/sort_op-inl.cuh +++ b/src/operator/tensor/sort_op-inl.cuh @@ -327,8 +327,8 @@ SortByKeyImpl(mshadow::Tensor keys, v = sorted_values; mshadow::Copy(*sorted_values, values, values.stream_); } - const auto key_iter = thrust::device_pointer_cast(reinterpret_cast(k->dptr_)); - const auto value_iter = thrust::device_pointer_cast(reinterpret_cast(v->dptr_)); + const auto key_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(k->dptr_)); + const auto value_iter = thrust::device_pointer_cast(reinterpret_cast<__half*>(v->dptr_)); if (is_ascend) { thrust::stable_sort_by_key( thrust::cuda::par.on(stream), From dc1a5992c90703c5901081923a066322fa44d532 Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Thu, 24 Oct 2019 16:52:15 -0700 Subject: [PATCH 18/19] Fix for the mixed naming convention --- src/operator/contrib/bounding_box.cu | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index eb6e269e41a1..9d6d8bfb5951 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -61,7 +61,7 @@ inline index_t align(index_t x, index_t alignment) { } template -__global__ void FilterAndPrepareAuxData_kernel(const DType* data, DType* out, DType* scores, +__global__ void FilterAndPrepareAuxDataKernel(const DType* data, DType* out, DType* scores, index_t num_elements_per_batch, const index_t element_width, const index_t N, @@ -102,7 +102,7 @@ void FilterAndPrepareAuxData(const Tensor& data, const int n_threads = 512; index_t N = data.shape_.Size(); const auto blocks = ceil_div(N, n_threads); - FilterAndPrepareAuxData_kernel<<::GetStream(s)>>>( @@ -113,7 +113,7 @@ void FilterAndPrepareAuxData(const Tensor& data, } template -__global__ void CompactData_kernel(const index_t* indices, const DType* source, +__global__ void CompactDataKernel(const index_t* indices, const DType* source, DType* destination, const index_t topk, const index_t element_width, const index_t num_elements_per_batch, @@ -154,14 +154,14 @@ void CompactData(const Tensor& indices, index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) { - CompactData_kernel<<<<::GetStream(s)>>>( indices.dptr_, source.dptr_, destination->dptr_, topk, source.shape_[2], source.shape_[1], score_index, N); } else { - CompactData_kernel<<<<::GetStream(s)>>>( indices.dptr_, source.dptr_, 
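The parentheses added in PATCH 16 matter because `+` binds tighter than `==` and `?:`: without them, `start_keys + sorted_keys == nullptr ? keys_bytes : 0` parses as a pointer comparison and the offset stops accumulating. The intended grouping in isolation, with stand-in values.

#include <cstddef>
#include <cstdio>

int main() {
  const std::size_t keys_bytes = 256, values_bytes = 512;
  const void* sorted_keys = nullptr;    // stand-ins for the optional output tensors
  const void* sorted_values = nullptr;  // (nullptr means "stage the sort in the workspace")
  const std::size_t start_keys = 0;
  const std::size_t start_values  = start_keys   + (sorted_keys   == nullptr ? keys_bytes   : 0);
  const std::size_t start_scratch = start_values + (sorted_values == nullptr ? values_bytes : 0);
  std::printf("values staged at %zu B, sort scratch at %zu B\n", start_values, start_scratch);
  // Without the parentheses the first expression would read
  //   (start_keys + sorted_keys) == nullptr ? keys_bytes : 0
  // which compares a pointer against nullptr instead of accumulating an offset.
}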
destination->dptr_, topk, @@ -184,7 +184,7 @@ void WorkspaceForSort(const index_t num_elem, } template -__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, +__global__ void CalculateGreedyNMSResultsKernel(const DType* data, uint32_t* result, const index_t current_start, const index_t num_elems, const index_t num_batches, @@ -199,7 +199,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re const float threshold); template -__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, +__global__ void ReduceNMSResultTriangleKernel(uint32_t* nms_results, DType * data, const index_t score_index, const index_t element_width, @@ -209,7 +209,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, const index_t topk); template -__global__ void ReduceNMSResultRest_kernel(DType* data, +__global__ void ReduceNMSResultRestKernel(DType* data, const uint32_t* nms_results, const index_t score_index, const index_t element_width, @@ -238,7 +238,7 @@ struct NMS { const index_t num_blocks_per_row = num_blocks_per_row_batch * num_batches; const index_t n_blocks = THRESHOLD / (sizeof(uint32_t) * 8) * num_blocks_per_row; if (param.in_format == box_common_enum::kCorner) { - CalculateGreedyNMSResults_kernel + CalculateGreedyNMSResultsKernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, @@ -246,7 +246,7 @@ struct NMS { param.force_suppress ? -1 : param.id_index, param.score_index, param.overlap_thresh); } else { - CalculateGreedyNMSResults_kernel + CalculateGreedyNMSResultsKernel <<::GetStream(s)>>>( data->dptr_, scratch->dptr_, current_start, n_elems, num_batches, num_blocks_per_row_batch, num_blocks_per_row, topk, element_width, @@ -254,7 +254,7 @@ struct NMS { param.force_suppress ? 
-1 : param.id_index, param.score_index, param.overlap_thresh); } - ReduceNMSResultTriangle_kernel<<::GetStream(s)>>>( + ReduceNMSResultTriangleKernel<<::GetStream(s)>>>( scratch->dptr_, data->dptr_, param.score_index, element_width, num_batches, num_elements_per_batch, current_start, topk); @@ -262,7 +262,7 @@ struct NMS { const index_t num_rest_blocks_per_batch = ceil_div(n_rest_elems, n_threads); const index_t num_rest_blocks = num_rest_blocks_per_batch * num_batches; if (n_rest_elems > 0) { - ReduceNMSResultRest_kernel<<::GetStream(s)>>>( + ReduceNMSResultRestKernel<<::GetStream(s)>>>( data->dptr_, scratch->dptr_, param.score_index, element_width, num_batches, num_elements_per_batch, current_start, topk, num_rest_blocks_per_batch); @@ -323,7 +323,7 @@ __device__ __forceinline__ DType calculate_intersection(const DType a0, const DT template __launch_bounds__(512) -__global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* result, +__global__ void CalculateGreedyNMSResultsKernel(const DType* data, uint32_t* result, const index_t current_start, const index_t num_elems, const index_t num_batches, @@ -410,7 +410,7 @@ __global__ void CalculateGreedyNMSResults_kernel(const DType* data, uint32_t* re template __launch_bounds__(NMS::THRESHOLD) -__global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, +__global__ void ReduceNMSResultTriangleKernel(uint32_t* nms_results, DType * data, const index_t score_index, const index_t element_width, @@ -473,7 +473,7 @@ __global__ void ReduceNMSResultTriangle_kernel(uint32_t* nms_results, template __launch_bounds__(512) -__global__ void ReduceNMSResultRest_kernel(DType* data, +__global__ void ReduceNMSResultRestKernel(DType* data, const uint32_t* nms_results, const index_t score_index, const index_t element_width, @@ -554,7 +554,7 @@ TempWorkspace GetWorkspace(const index_t num_batch, } template -__global__ void ExtractScores_kernel(const DType* data, DType* scores, +__global__ void ExtractScoresKernel(const DType* data, DType* scores, const index_t N, const int element_width, const int score_index) { const index_t tid = blockIdx.x * blockDim.x + threadIdx.x; @@ -581,7 +581,7 @@ void CompactNMSResults(const Tensor& data, const index_t num_batches = data.shape_[0]; const int element_width = data.shape_[2]; const index_t n_blocks = ceil_div(num_elements, n_threads); - ExtractScores_kernel<<::GetStream(s)>>>( + ExtractScoresKernel<<::GetStream(s)>>>( data.dptr_, scores->dptr_, num_elements, element_width, score_index); *indices = mshadow::expr::range(0, num_elements); for (index_t i = 0; i < num_batches; ++i) { From 3dfca4dc3500c270d2f3ee501151cda85b2d33de Mon Sep 17 00:00:00 2001 From: Przemek Tredak Date: Fri, 25 Oct 2019 09:48:26 -0700 Subject: [PATCH 19/19] Fix the index_t with int comparisoon --- src/operator/contrib/bounding_box.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/operator/contrib/bounding_box.cu b/src/operator/contrib/bounding_box.cu index 9d6d8bfb5951..ffc48d7d5a44 100644 --- a/src/operator/contrib/bounding_box.cu +++ b/src/operator/contrib/bounding_box.cu @@ -150,7 +150,7 @@ void CompactData(const Tensor& indices, const int score_index, Stream* s) { const int n_threads = 512; - const int max_blocks = 320; + const index_t max_blocks = 320; index_t N = source.shape_.Size(); const auto blocks = std::min(ceil_div(N, n_threads), max_blocks); if (topk > 0) {
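PATCH 19 switches `max_blocks` to `index_t` because `std::min` deduces a single template argument: once `ceil_div(N, n_threads)` returns `index_t`, `std::min(index_t, int)` no longer compiles without a cast. A small reproduction, assuming a 64-bit `index_t` (the type can also be 32-bit depending on the build).

#include <algorithm>
#include <cstdint>

using index_t = std::int64_t;  // assumption: an index type wider than int

index_t ceil_div(index_t x, index_t y) { return (x + y - 1) / y; }

index_t blocks_for(index_t N, int n_threads) {
  const index_t max_blocks = 320;  // was `int`, which broke the std::min call below
  return std::min(ceil_div(N, n_threads), max_blocks);
  // return std::min(ceil_div(N, n_threads), 320);  // error: std::min(index_t, int)
  //                                                // cannot deduce a single type
}

int main() { return blocks_for(1 << 20, 512) == 320 ? 0 : 1; }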