[REVIEW] Improvements in feature sampling #4278

Merged

Commits (61 total, changes shown from 56)
25dbf45
Saving the changes made so far
vinaydes Sep 13, 2021
a7a6707
Added Thrust shuffle based colid generation
vinaydes Sep 24, 2021
b5658fb
Removing host copy of colids
vinaydes Sep 24, 2021
b05689b
Refactoring kernel arguments
vinaydes Sep 24, 2021
5dad925
Removing unused select function
vinaydes Sep 24, 2021
558dd46
Added print to distinguish code version
vinaydes Sep 27, 2021
9f3adbe
Timing measurement calls added
vinaydes Sep 30, 2021
7cc1676
Added count sort based sampling again for better comparison
vinaydes Oct 1, 2021
2a6f4a2
Minor changes
vinaydes Oct 1, 2021
669bb9c
Working adaptive sampling kernel
vinaydes Oct 11, 2021
9d2ffd0
Removing thrust and other unused code
vinaydes Oct 12, 2021
8b3704f
Making kiss99 static for now
vinaydes Oct 12, 2021
1c50ab7
Removing some more unused code
vinaydes Oct 12, 2021
b1c63bb
Formatting changes
vinaydes Oct 12, 2021
684c668
Fixing select kernel call format
vinaydes Oct 12, 2021
3490114
Merge branch 'branch-21.12' into enh-rf-better-feature-sampling
vinaydes Oct 12, 2021
3138301
Undo local build fix
vinaydes Oct 12, 2021
fbec82f
Merge branch 'branch-22.02' into enh-rf-better-feature-sampling
vinaydes Jan 20, 2022
7f82825
Changing the RAFT repo link
vinaydes Jan 20, 2022
ce44301
Correcting get_raft
vinaydes Jan 20, 2022
0409f57
Merge branch 'branch-22.04' into enh-rf-better-feature-sampling
vinaydes Mar 12, 2022
9c8f068
Fixing merge issue
vinaydes Mar 12, 2022
0beff3b
Merge branch 'branch-22.04' into enh-rf-better-feature-sampling
vinaydes Apr 13, 2022
fcbd074
Merge related changes
vinaydes Apr 13, 2022
eb5723d
DOC
raydouglass Mar 17, 2022
aac34a4
Replace cudf.logical_not with ~ (#4669)
canonizer Mar 31, 2022
28a027e
float64 support in multi-sum and child_index() (#4648)
canonizer Mar 31, 2022
a3ea64d
float64 support in FIL functions (#4655)
canonizer Apr 2, 2022
592f860
Add libcuml-tests package (#4635)
jjacobelli Apr 4, 2022
d2f64f5
float64 support in FIL core (#4646)
canonizer Apr 7, 2022
0c5463b
Replace 22.04.x with 22.06.x in yaml files (#4692)
daxiongshu Apr 8, 2022
19bddfc
unpin dask for development
galipremsagar Apr 8, 2022
e5829eb
Change "principals" to "principles" (#4695)
cakiki Apr 11, 2022
a4374d6
Fixing couple of run time issues
vinaydes Apr 14, 2022
097f359
Merge branch 'branch-22.06' into enh-rf-better-feature-sampling
venkywonka Apr 19, 2022
972bb1c
Replacing kiss99 with RNG from RAFT
vinaydes Apr 19, 2022
fb1bf91
Formatting minor change
vinaydes Apr 19, 2022
9ef9a60
Merge remote-tracking branch 'vinay/enh-rf-better-feature-sampling' i…
venkywonka Apr 19, 2022
26509e4
Merge branch 'branch-22.06' into enh-rf-better-feature-sampling
vinaydes Apr 20, 2022
7758b34
Tidying up memory requirement
vinaydes Apr 20, 2022
49c45b1
include select_kernel and adaptive_sample_kernel
venkywonka Apr 26, 2022
ab3bb4c
[debug] menu-driven sampling strategies
venkywonka May 12, 2022
347f616
menu driven debugging code
venkywonka May 19, 2022
32f9ac1
clean debug code
venkywonka May 19, 2022
000ab2f
add modifications and menu-driven feature-sampling for debug
venkywonka May 25, 2022
5ad8f94
predicate kernel launch based on required work-per-thread
venkywonka Jun 2, 2022
32ba8ce
clean some code
venkywonka Jun 2, 2022
d3c16fb
Merge branch 'branch-22.06' of https://github.com/rapidsai/cuml into …
venkywonka Jun 2, 2022
5594cde
clean more code
venkywonka Jun 2, 2022
ec928c9
revert CMakeLists changes
venkywonka Jun 2, 2022
80bfa4f
clang format
venkywonka Jun 2, 2022
ecddd5e
Merge remote-tracking branch 'vinay/enh-rf-better-feature-sampling' i…
venkywonka Jun 2, 2022
6d7badd
adding review changes
venkywonka Jun 8, 2022
1f79239
fix a memory bug + others
venkywonka Jun 13, 2022
ff92c6e
change seed for a corner case
venkywonka Jun 29, 2022
8d76a78
change seed for a corner case
venkywonka Jun 29, 2022
8e5990d
Merge branch 'branch-22.08' into enh-rf-better-feature-sampling
vinaydes Jul 29, 2022
c094f65
Addressing review comments about docstring
vinaydes Jul 29, 2022
aae286c
Update cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels…
vinaydes Aug 1, 2022
ea0428a
formatting changes
vinaydes Aug 1, 2022
b5751f1
Merge branch 'branch-22.08' into enh-rf-better-feature-sampling
vinaydes Aug 2, 2022
86 changes: 83 additions & 3 deletions cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -190,6 +190,7 @@ struct Builder {
int n_blks_for_cols = 10;
/** Memory alignment value */
const size_t align_value = 512;
IdxT* colids;
/** rmm device workspace buffer */
rmm::device_uvector<char> d_buff;
/** pinned host buffer to store the trained nodes */
@@ -281,6 +282,7 @@ struct Builder {
d_wsize += calculateAlignedBytes(sizeof(NodeWorkItem) * max_batch); // d_work_Items
d_wsize += // workload_info
calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks_dimx);
d_wsize += calculateAlignedBytes(sizeof(IdxT) * max_batch * dataset.n_sampled_cols); // colids

// all nodes in the tree
h_wsize += // h_workload_info
@@ -320,6 +322,8 @@ struct Builder {
d_wspace += calculateAlignedBytes(sizeof(NodeWorkItem) * max_batch);
workload_info = reinterpret_cast<WorkloadInfo<IdxT>*>(d_wspace);
d_wspace += calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks_dimx);
colids = reinterpret_cast<IdxT*>(d_wspace);
d_wspace += calculateAlignedBytes(sizeof(IdxT) * max_batch * dataset.n_sampled_cols);

RAFT_CUDA_TRY(
cudaMemsetAsync(done_count, 0, sizeof(int) * max_batch * n_col_blks, builder_stream));
@@ -378,7 +382,7 @@ struct Builder {

auto doSplit(const std::vector<NodeWorkItem>& work_items)
{
raft::common::nvtx::range fun_scope("Builder::doSplit @bulder_base.cuh [batched-levelalgo]");
raft::common::nvtx::range fun_scope("Builder::doSplit @builder.cuh [batched-levelalgo]");
// start fresh on the number of *new* nodes created in this batch
RAFT_CUDA_TRY(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), builder_stream));
initSplit<DataT, IdxT, TPB_DEFAULT>(splits, work_items.size(), builder_stream);
@@ -388,11 +392,86 @@

auto [n_blocks_dimx, n_large_nodes] = this->updateWorkloadInfo(work_items);

// do feature-sampling
if (dataset.n_sampled_cols != dataset.N) {
raft::common::nvtx::range fun_scope("feature-sampling");
constexpr int block_threads = 128;
constexpr int max_samples_per_thread = 72; // register spillage if more than this limit
// decide if the problem size is suitable for the excess-sampling strategy.
//
// our required shared memory is a function of number of samples we'll need to sample (in
// parallel, with replacement) in excess to get 'k' uniques out of 'n' features. estimated
// static shared memory required by cub's block-wide collectives:
// max_samples_per_thread * block_threads * sizeof(IdxT)
//
// The maximum items to sample (the constant `max_samples_per_thread`, set at
// compile-time) is calibrated so that:
// 1. There are no register spills or accesses to global memory
// 2. The required static shared memory (i.e., `max_samples_per_thread * block_threads *
// sizeof(IdxT)`) does not exceed 46KB.
//
// number of samples we'll need to sample (in parallel, with replacement), to expect 'k'
// unique samples from 'n' is given by the following equation: log(1 - k/n)/log(1 - 1/n) ref:
// https://stats.stackexchange.com/questions/296005/the-expected-number-of-unique-elements-drawn-with-replacement
IdxT n_parallel_samples =
std::ceil(raft::myLog(1 - double(dataset.n_sampled_cols) / double(dataset.N)) /
(raft::myLog(1 - 1.f / double(dataset.N))));
// maximum sampling work possible by all threads in a block:
// `max_samples_per_thread * block_threads`
// dynamically calculated sampling work to be done per block:
// `n_parallel_samples`
// the former must be greater than or equal to the latter for the excess-sampling-based strategy
if (max_samples_per_thread * block_threads >= n_parallel_samples) {
raft::common::nvtx::range fun_scope("excess-sampling-based approach");
dim3 grid;
grid.x = work_items.size();
grid.y = 1;
grid.z = 1;

if (n_parallel_samples <= block_threads)
// each thread randomly samples only 1 sample
excess_sample_with_replacement_kernel<IdxT, 1, block_threads>
<<<grid, block_threads, 0, builder_stream>>>(colids,
d_work_items,
work_items.size(),
treeid,
seed,
dataset.N,
dataset.n_sampled_cols,
n_parallel_samples);
else
// each thread does more work and samples `max_samples_per_thread` samples
excess_sample_with_replacement_kernel<IdxT, max_samples_per_thread, block_threads>
<<<grid, block_threads, 0, builder_stream>>>(colids,
d_work_items,
work_items.size(),
treeid,
seed,
dataset.N,
dataset.n_sampled_cols,
n_parallel_samples);
raft::common::nvtx::pop_range();
} else {
raft::common::nvtx::range fun_scope("reservoir-sampling-based approach");
// using algo-L (reservoir sampling) strategy to sample 'dataset.n_sampled_cols' unique
// features from 'dataset.N' total features
dim3 grid;
grid.x = (work_items.size() + 127) / 128;
grid.y = 1;
grid.z = 1;
algo_L_sample_kernel<<<grid, block_threads, 0, builder_stream>>>(
colids, d_work_items, work_items.size(), treeid, seed, dataset.N, dataset.n_sampled_cols);
raft::common::nvtx::pop_range();
}
RAFT_CUDA_TRY(cudaPeekAtLastError());
raft::common::nvtx::pop_range();
}

// iterate through a batch of columns (to reduce the memory pressure) and
// compute the best split at the end
for (IdxT c = 0; c < dataset.n_sampled_cols; c += n_blks_for_cols) {
computeSplit(c, n_blocks_dimx, n_large_nodes);
RAFT_CUDA_TRY(cudaGetLastError());
RAFT_CUDA_TRY(cudaPeekAtLastError());
}

// create child nodes (or make the current ones leaf)
@@ -407,7 +486,7 @@ struct Builder {
dataset,
d_work_items,
splits);
RAFT_CUDA_TRY(cudaGetLastError());
RAFT_CUDA_TRY(cudaPeekAtLastError());
raft::common::nvtx::pop_range();
raft::update_host(h_splits, splits, work_items.size(), builder_stream);
handle.sync_stream(builder_stream);
@@ -462,6 +541,7 @@ struct Builder {
quantiles,
d_work_items,
col,
colids,
done_count,
mutex,
splits,
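For reference, below is a minimal host-side C++ sketch (not part of this PR) of the strategy-selection arithmetic described in the comments above: it evaluates the expected number of with-replacement draws needed to obtain k unique features out of n, log(1 - k/n) / log(1 - 1/n), and picks between the excess-sampling and reservoir-sampling paths the same way doSplit() does. std::log/std::ceil stand in for raft::myLog, the standalone main() and the feature counts n and k are made-up example values.

// Illustrative sketch only: constants mirror builder.cuh, but this is not the PR's code path.
#include <cmath>
#include <cstdio>

int main()
{
  const int block_threads          = 128;  // threads per block, as in the kernel launch
  const int max_samples_per_thread = 72;   // calibrated limit from the comments above

  // n = total features, k = features to sample per node (example values; the real
  // builder only reaches this code when k < n, i.e. n_sampled_cols != N)
  const double n = 1000.0, k = 100.0;

  // expected number of with-replacement draws needed to see k uniques out of n
  const int n_parallel_samples =
    static_cast<int>(std::ceil(std::log(1.0 - k / n) / std::log(1.0 - 1.0 / n)));

  if (max_samples_per_thread * block_threads >= n_parallel_samples) {
    std::printf("excess-sampling strategy: %d parallel draws per node\n", n_parallel_samples);
  } else {
    std::printf("reservoir sampling (Algorithm L): %d draws exceed block capacity\n",
                n_parallel_samples);
  }
  return 0;
}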
215 changes: 215 additions & 0 deletions cpp/src/decisiontree/batched-levelalgo/kernels/builder_kernels.cuh
@@ -20,6 +20,10 @@
#include "../objectives.cuh"
#include "../quantiles.h"

#include <raft/random/rng.hpp>

#include <cub/cub.cuh>

namespace ML {
namespace DT {

@@ -60,6 +64,13 @@ HDI bool SplitNotValid(const SplitT& split,
(IdxT(num_rows) - split.nLeft) < min_samples_leaf;
}

/* Returns 'dataset' rounded up to a correctly-aligned pointer of type OutT* */
template <typename OutT, typename InT>
DI OutT* alignPointer(InT dataset)
{
return reinterpret_cast<OutT*>(raft::alignTo(reinterpret_cast<size_t>(dataset), sizeof(OutT)));
}

template <typename DataT, typename LabelT, typename IdxT, int TPB>
__global__ void nodeSplitKernel(const IdxT max_depth,
const IdxT min_samples_leaf,
@@ -111,6 +122,209 @@ HDI IdxT lower_bound(DataT* array, IdxT len, DataT element)
return start;
}

template <typename IdxT>
struct CustomDifference {
__device__ IdxT operator()(const IdxT& lhs, const IdxT& rhs)
{
if (lhs == rhs)
return 0;
else
return 1;
}
};

/**
* @brief Generates 'k' unique samples of features from 'n' feature sample-space.
* Does this for each work-item (node), feeding a unique seed for each (treeid, nodeid
* (=blockIdx.x), threadIdx.x). The method used is random, parallel sampling with replacement of an
* excess of 'k' samples (hence the name), then eliminating the duplicates by ordering them. The
* excess number of samples (=`n_parallel_samples`) is calculated such that after ordering there are
* at least 'k' uniques.
*/
template <typename IdxT, int MAX_SAMPLES_PER_THREAD, int BLOCK_THREADS = 128>
__global__ void excess_sample_with_replacement_kernel(
IdxT* colids,
const NodeWorkItem* work_items,
size_t work_items_size,
IdxT treeid,
uint64_t seed,
size_t n /* total cols to sample from*/,
size_t k /* number of unique cols to sample */,
int n_parallel_samples /* number of cols to sample with replacement */)
{
if (blockIdx.x >= work_items_size) return;

const uint32_t nodeid = work_items[blockIdx.x].idx;

uint64_t subsequence(fnv1a32_basis);
subsequence = fnv1a32(subsequence, uint32_t(threadIdx.x));
subsequence = fnv1a32(subsequence, uint32_t(treeid));
subsequence = fnv1a32(subsequence, uint32_t(nodeid));

raft::random::PCGenerator gen(seed, subsequence, uint64_t(0));
raft::random::UniformIntDistParams<IdxT, uint64_t> uniform_int_dist_params;

uniform_int_dist_params.start = 0;
uniform_int_dist_params.end = n;
uniform_int_dist_params.diff =
uint64_t(uniform_int_dist_params.end - uniform_int_dist_params.start);

IdxT n_uniques = 0;
IdxT items[MAX_SAMPLES_PER_THREAD];
IdxT col_indices[MAX_SAMPLES_PER_THREAD];
IdxT mask[MAX_SAMPLES_PER_THREAD];
// populate this
for (int i = 0; i < MAX_SAMPLES_PER_THREAD; ++i)
mask[i] = 0;

do {
// blocked arrangement
for (int cta_sample_idx = MAX_SAMPLES_PER_THREAD * threadIdx.x, thread_local_sample_idx = 0;
thread_local_sample_idx < MAX_SAMPLES_PER_THREAD;
++cta_sample_idx, ++thread_local_sample_idx) {
// the mask of the previous iteration, if it exists, is re-used here
// so previously generated unique random numbers are used.
// newly generated random numbers may or may not duplicate the previously generated ones,
// but this ensures some forward progress in order to generate at least 'k' unique random
// samples.
if (mask[thread_local_sample_idx] == 0 and cta_sample_idx < n_parallel_samples)
raft::random::custom_next(
gen, &items[thread_local_sample_idx], uniform_int_dist_params, IdxT(0), IdxT(0));
else if (mask[thread_local_sample_idx] ==
0) // indices that exceed `n_parallel_samples` will not generate
items[thread_local_sample_idx] = n - 1;
else
continue; // this case is for samples whose mask == 1 (keeping the previous iteration's
// generated random number)
}

// Specialize BlockRadixSort type for our thread block
typedef cub::BlockRadixSort<IdxT, BLOCK_THREADS, MAX_SAMPLES_PER_THREAD> BlockRadixSortT;
// BlockAdjacentDifference
typedef cub::BlockAdjacentDifference<IdxT, BLOCK_THREADS> BlockAdjacentDifferenceT;
// BlockScan
typedef cub::BlockScan<IdxT, BLOCK_THREADS> BlockScanT;

// Shared memory
__shared__ union TempStorage {
typename BlockRadixSortT::TempStorage sort;
typename BlockAdjacentDifferenceT::TempStorage diff;
typename BlockScanT::TempStorage scan;
} temp_storage;

// collectively sort items
BlockRadixSortT(temp_storage.sort).Sort(items);

__syncthreads();

// compute the mask
// compute the adjacent differences according to the functor
BlockAdjacentDifferenceT(temp_storage.diff)
.FlagHeads(mask, items, mask, CustomDifference<IdxT>());

__syncthreads();

// do a scan on the mask to get the indices for gathering
BlockScanT(temp_storage.scan).ExclusiveSum(mask, col_indices, n_uniques);

__syncthreads();

} while (n_uniques < k);

// write the items[] of only the ones with mask[]=1 to col[offset + col_idx[]]
IdxT col_offset = k * blockIdx.x;
for (int i = 0; i < MAX_SAMPLES_PER_THREAD; ++i) {
if (mask[i] and col_indices[i] < k) { colids[col_offset + col_indices[i]] = items[i]; }
}
}

// algo L of the reservoir sampling algorithm
/**
* @brief Generates 'k' unique samples of features from 'n' feature sample-space using the algo-L
* algorithm of reservoir sampling. wiki :
* https://en.wikipedia.org/wiki/Reservoir_sampling#An_optimal_algorithm
*/
template <typename IdxT>
__global__ void algo_L_sample_kernel(int* colids,
const NodeWorkItem* work_items,
size_t work_items_size,
IdxT treeid,
uint64_t seed,
size_t n /* total cols to sample from*/,
size_t k /* cols to sample */)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= work_items_size) return;
const uint32_t nodeid = work_items[tid].idx;
uint64_t subsequence = (uint64_t(treeid) << 32) | uint64_t(nodeid);
raft::random::PCGenerator gen(seed, subsequence, uint64_t(0));
raft::random::UniformIntDistParams<IdxT, uint64_t> uniform_int_dist_params;
uniform_int_dist_params.start = 0;
uniform_int_dist_params.end = k;
uniform_int_dist_params.diff =
uint64_t(uniform_int_dist_params.end - uniform_int_dist_params.start);
float fp_uniform_val;
IdxT int_uniform_val;
// fp_uniform_val will have a random value between 0 and 1
gen.next(fp_uniform_val);
double W = raft::myExp(raft::myLog(fp_uniform_val) / k);

size_t col(0);
// initially fill the reservoir array in increasing order of cols till k
while (1) {
colids[tid * k + col] = col;
if (col == k - 1)
break;
else
++col;
}
// randomly sample from a geometric distribution
while (col < n) {
// fp_uniform_val will have a random value between 0 and 1
gen.next(fp_uniform_val);
col += static_cast<int>(raft::myLog(fp_uniform_val) / raft::myLog(1 - W)) + 1;
if (col < n) {
// int_uniform_val will now have a random value between 0...k
raft::random::custom_next(gen, &int_uniform_val, uniform_int_dist_params, IdxT(0), IdxT(0));
colids[tid * k + int_uniform_val] = col; // the bad memory coalescing here is hidden
// fp_uniform_val will have a random value between 0 and 1
gen.next(fp_uniform_val);
W *= raft::myExp(raft::myLog(fp_uniform_val) / k);
}
}
}

template <typename IdxT>
__global__ void adaptive_sample_kernel(int* colids,
const NodeWorkItem* work_items,
size_t work_items_size,
IdxT treeid,
uint64_t seed,
int N,
int M)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid >= work_items_size) return;
const uint32_t nodeid = work_items[tid].idx;

uint64_t subsequence = (uint64_t(treeid) << 32) | uint64_t(nodeid);
raft::random::PCGenerator gen(seed, subsequence, uint64_t(0));

int selected_count = 0;
for (int i = 0; i < N; i++) {
uint32_t toss = 0;
gen.next(toss);
uint64_t lhs = uint64_t(M - selected_count);
lhs <<= 32;
uint64_t rhs = uint64_t(toss) * (N - i);
if (lhs > rhs) {
colids[tid * M + selected_count] = i;
selected_count++;
if (selected_count == M) break;
}
}
}

template <typename DataT,
typename LabelT,
typename IdxT,
@@ -126,6 +340,7 @@ __global__ void computeSplitKernel(BinT* histograms,
const Quantiles<DataT, IdxT> quantiles,
const NodeWorkItem* work_items,
IdxT colStart,
const IdxT* colids,
int* done_count,
int* mutex,
volatile Split<DataT, IdxT>* splits,
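To make the reservoir-sampling path easier to follow, here is a minimal single-threaded sketch (not from the PR) of Algorithm L, the same strategy algo_L_sample_kernel applies once per work-item on the GPU. The helper name sample_k_of_n is hypothetical, and std::mt19937 stands in for raft::random::PCGenerator.

// Hypothetical CPU illustration of Algorithm L reservoir sampling: pick k unique
// column indices from [0, n) using roughly k*(1 + log(n/k)) random draws on average.
#include <cmath>
#include <cstdio>
#include <random>
#include <vector>

std::vector<int> sample_k_of_n(int k, int n, unsigned seed)
{
  std::mt19937 gen(seed);
  std::uniform_real_distribution<double> uni01(0.0, 1.0);
  std::uniform_int_distribution<int> pick(0, k - 1);

  std::vector<int> reservoir(k);
  for (int i = 0; i < k; ++i) reservoir[i] = i;  // fill with the first k columns

  double w = std::exp(std::log(uni01(gen)) / k);
  int col  = k - 1;
  while (true) {
    // geometric skip: jump over columns that would have been rejected anyway
    col += static_cast<int>(std::log(uni01(gen)) / std::log(1.0 - w)) + 1;
    if (col >= n) break;
    reservoir[pick(gen)] = col;                  // replace a random reservoir slot
    w *= std::exp(std::log(uni01(gen)) / k);
  }
  return reservoir;
}

int main()
{
  for (int c : sample_k_of_n(10, 100, 42)) std::printf("%d ", c);
  std::printf("\n");
  return 0;
}

The geometric skip is what keeps this cheap when only a small fraction of the features is sampled: on average it needs about k*(1 + log(n/k)) random numbers rather than one per feature.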