From cf33a5acad4769b7434d22d9527d8b223761b311 Mon Sep 17 00:00:00 2001
From: Mark Harris
Date: Thu, 17 Feb 2022 19:51:51 +1100
Subject: [PATCH] New benchmark compares concurrent throughput of device_vector and device_uvector (#981)

Adds a new benchmark in `device_uvector_bench.cu` that compares using multiple
streams and concurrent kernels interleaved with vector creation. The benchmark
is parameterized on the type of the vector:

1. `thrust::device_vector` -- uses cudaMalloc allocation
2. `rmm::device_vector` -- uses RMM allocation
3. `rmm::device_uvector` -- uses RMM allocation and an uninitialized vector

The benchmark uses the `cuda_async_memory_resource` so that cudaMallocAsync is
used for allocation of the `rmm::` vector types.

The performance on V100 demonstrates that option 1 is slowest due to allocation
bottlenecks. Option 2 alleviates these by using `cudaMallocFromPoolAsync`, but
there is no concurrency among the kernels because `thrust::device_vector`
synchronizes the default stream. Option 3 is fastest and achieves full
concurrency (verified in `nsight-sys`).

```
------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                            Time             CPU      Iterations UserCounters...
------------------------------------------------------------------------------------------------------------------------------------
BM_VectorWorkflow<thrust_vector>/100000/manual_time                242 us          267 us            2962 bytes_per_second=13.8375G/s
BM_VectorWorkflow<thrust_vector>/1000000/manual_time              1441 us         1465 us             472 bytes_per_second=23.273G/s
BM_VectorWorkflow<thrust_vector>/10000000/manual_time            10483 us        10498 us              68 bytes_per_second=31.9829G/s
BM_VectorWorkflow<thrust_vector>/100000000/manual_time           63583 us        63567 us              12 bytes_per_second=52.7303G/s
BM_VectorWorkflow<rmm_vector>/100000/manual_time                  82.0 us          105 us            8181 bytes_per_second=40.8661G/s
BM_VectorWorkflow<rmm_vector>/1000000/manual_time                  502 us          527 us            1357 bytes_per_second=66.8029G/s
BM_VectorWorkflow<rmm_vector>/10000000/manual_time                4714 us         4746 us             148 bytes_per_second=71.1222G/s
BM_VectorWorkflow<rmm_vector>/100000000/manual_time              46451 us        46478 us              13 bytes_per_second=72.1784G/s
BM_VectorWorkflow<rmm_uvector>/100000/manual_time                 39.0 us         59.9 us           17970 bytes_per_second=85.8733G/s
BM_VectorWorkflow<rmm_uvector>/1000000/manual_time                 135 us          159 us            5253 bytes_per_second=248.987G/s
BM_VectorWorkflow<rmm_uvector>/10000000/manual_time               1319 us         1351 us             516 bytes_per_second=254.169G/s
BM_VectorWorkflow<rmm_uvector>/100000000/manual_time             12841 us        12865 us              54 bytes_per_second=261.099G/s
```

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/rmm/pull/981
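The pattern that lets option 3 overlap its kernels can be distilled to a few lines: allocate each
output on its own stream with `rmm::device_uvector` (stream-ordered allocation, no initialization)
and launch the kernel on that same stream, so no work is issued on the default stream. The
standalone sketch below is illustrative only and is not part of the patch; the kernel name,
element count, and launch dimensions are arbitrary, and it relies on the same RMM APIs the
benchmark uses (`cuda_async_memory_resource`, `cuda_stream`, `device_uvector`).

```
// Illustrative sketch (not part of this patch): per-stream allocation with
// rmm::device_uvector and per-stream kernel launches, the pattern that lets the
// rmm_uvector case of BM_VectorWorkflow below run its kernels concurrently.
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime_api.h>

#include <cstdint>
#include <vector>

__global__ void square(std::int32_t const* in, std::int32_t* out, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    out[i] = in[i] * in[i];
  }
}

int main()
{
  // Stream-ordered allocation through cudaMallocAsync, as in the benchmark.
  rmm::mr::cuda_async_memory_resource mr{};
  rmm::mr::set_current_device_resource(&mr);

  std::size_t const num_elements = 1 << 20;  // arbitrary size for illustration

  // Zero-initialize the shared input on its own stream, then wait for it.
  rmm::cuda_stream input_stream;
  rmm::device_uvector<std::int32_t> input(num_elements, input_stream);
  cudaMemsetAsync(input.data(), 0, num_elements * sizeof(std::int32_t), input_stream.value());
  input_stream.synchronize();

  // One uninitialized output vector and one kernel launch per stream. Nothing
  // here touches the default stream, so the four kernels are free to overlap.
  std::vector<rmm::cuda_stream> streams(4);
  for (auto& stream : streams) {
    rmm::device_uvector<std::int32_t> output(num_elements, stream);
    square<<<16, 256, 0, stream.value()>>>(input.data(), output.data(), num_elements);
    // `output` is destroyed at the end of this iteration, but its deallocation is
    // enqueued on `stream` after the kernel, so no synchronization is needed here.
  }

  for (auto& stream : streams) {
    stream.synchronize();
  }

  rmm::mr::set_current_device_resource(nullptr);
  return 0;
}
```

Built with nvcc against RMM, running this under a profiler should show the four `square` launches
overlapping, which is the difference the workflow benchmark quantifies.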
---
 .../device_uvector/device_uvector_bench.cu    | 126 +++++++++++++++++-
 1 file changed, 119 insertions(+), 7 deletions(-)

diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu
index 7e73451e6..701e5fceb 100644
--- a/benchmarks/device_uvector/device_uvector_bench.cu
+++ b/benchmarks/device_uvector/device_uvector_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,14 @@
  * limitations under the License.
  */
 
+#include "../synchronization/synchronization.hpp"
+#include "thrust/detail/raw_pointer_cast.h"
+
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -24,18 +30,21 @@
 
 #include
 
-static void BM_UvectorSizeConstruction(benchmark::State& state)
+#include
+#include
+
+void BM_UvectorSizeConstruction(benchmark::State& state)
 {
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
   rmm::mr::set_current_device_resource(&mr);
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_uvector<int32_t> vec(state.range(0), rmm::cuda_stream_view{});
+    rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
     cudaDeviceSynchronize();
   }
 
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));
 
   rmm::mr::set_current_device_resource(nullptr);
 }
@@ -45,18 +54,18 @@ BENCHMARK(BM_UvectorSizeConstruction)
   ->Range(10'000, 1'000'000'000)  // NOLINT
   ->Unit(benchmark::kMicrosecond);
 
-static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
+void BM_ThrustVectorSizeConstruction(benchmark::State& state)
 {
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
   rmm::mr::set_current_device_resource(&mr);
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_vector<int32_t> vec(state.range(0));
+    rmm::device_vector<std::int32_t> vec(state.range(0));
     cudaDeviceSynchronize();
   }
 
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));
 
   rmm::mr::set_current_device_resource(nullptr);
 }
@@ -66,4 +75,107 @@ BENCHMARK(BM_ThrustVectorSizeConstruction)
   ->Range(10'000, 1'000'000'000)  // NOLINT
   ->Unit(benchmark::kMicrosecond);
 
+// simple kernel used to test concurrent execution.
+__global__ void kernel(int const* input, int* output, std::size_t num)
+{
+  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
+    output[i] = input[i] * input[i];
+  }
+}
+
+using thrust_vector = thrust::device_vector<std::int32_t>;
+using rmm_vector    = rmm::device_vector<std::int32_t>;
+using rmm_uvector   = rmm::device_uvector<std::int32_t>;
+
+template <typename Vector>
+Vector make_vector(std::int64_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
+{
+  static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
+                  std::is_same_v<Vector, rmm_uvector>,
+                "unsupported vector type");
+  if constexpr (std::is_same_v<Vector, thrust_vector>) {
+    return Vector(num_elements, 0);
+  } else if constexpr (std::is_same_v<Vector, rmm_vector>) {
+    return Vector(num_elements, 0, rmm::mr::thrust_allocator<std::int32_t>(stream));
+  } else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
+    auto vec = Vector(num_elements, stream);
+    if (zero_init) {
+      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
+    }
+    return vec;
+  }
+}
+
+template <typename Vector>
+int32_t* vector_data(Vector& vec)
+{
+  return thrust::raw_pointer_cast(vec.data());
+}
+
+template <typename Vector>
+void vector_workflow(std::size_t num_elements,
+                     std::int64_t num_blocks,
+                     std::int64_t block_size,
+                     rmm::cuda_stream const& input_stream,
+                     std::vector<rmm::cuda_stream> const& streams)
+{
+  auto input = make_vector<Vector>(num_elements, input_stream, true);
+  input_stream.synchronize();
+  for (rmm::cuda_stream_view stream : streams) {
+    auto output = make_vector<Vector>(num_elements, stream);
+    kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+      vector_data(input), vector_data(output), num_elements);
+  }
+
+  for (rmm::cuda_stream_view stream : streams) {
+    stream.synchronize();
+  }
+}
+
+template <typename Vector>
+void BM_VectorWorkflow(benchmark::State& state)
+{
+  rmm::mr::cuda_async_memory_resource cuda_async_mr{};
+  rmm::mr::set_current_device_resource(&cuda_async_mr);
+
+  rmm::cuda_stream input_stream;
+  std::vector<rmm::cuda_stream> streams(4);
+
+  auto const num_elements   = state.range(0);
+  auto constexpr block_size = 256;
+  auto constexpr num_blocks = 16;
+
+  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
+    cuda_event_timer timer(state, true, input_stream);  // flush_l2_cache = true
+    vector_workflow<Vector>(num_elements, num_blocks, block_size, input_stream, streams);
+  }
+
+  auto constexpr num_accesses = 9;
+  auto const bytes            = num_elements * sizeof(std::int32_t) * num_accesses;
+  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));
+
+  rmm::mr::set_current_device_resource(nullptr);
+}
+
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
+  ->RangeMultiplier(10)                               // NOLINT
+  ->Range(100'000, 100'000'000)                       // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
+// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
+// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
+// `cuda_async_memory_resource`, which is faster.
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector)  // NOLINT
+  ->RangeMultiplier(10)                            // NOLINT
+  ->Range(100'000, 100'000'000)                    // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_uvector)  // NOLINT
+  ->RangeMultiplier(10)                             // NOLINT
+  ->Range(100'000, 100'000'000)                     // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
 BENCHMARK_MAIN();