From cf33a5acad4769b7434d22d9527d8b223761b311 Mon Sep 17 00:00:00 2001
From: Mark Harris
Date: Thu, 17 Feb 2022 19:51:51 +1100
Subject: [PATCH] New benchmark compares concurrent throughput of device_vector and device_uvector (#981)

Adds a new benchmark in `device_uvector_bench.cu` that compares using multiple
streams and concurrent kernels interleaved with vector creation. The benchmark
is parameterized on the type of the vector:

1. `thrust::device_vector` -- uses cudaMalloc allocation
2. `rmm::device_vector` -- uses RMM allocation
3. `rmm::device_uvector` -- uses RMM allocation and an uninitialized vector

The benchmark uses the `cuda_async_memory_resource` so that cudaMallocAsync is
used for allocation of the `rmm::` vector types.

The performance on V100 demonstrates that option 1 is slowest due to allocation
bottlenecks. Option 2 alleviates these by using `cudaMallocFromPoolAsync`, but
there is no concurrency among the kernels because `thrust::device_vector`
synchronizes the default stream. Option 3 is fastest and achieves full
concurrency (verified in `nsight-sys`).

```
------------------------------------------------------------------------------------------------------------------------------------
Benchmark                                                            Time             CPU      Iterations UserCounters...
------------------------------------------------------------------------------------------------------------------------------------
BM_VectorWorkflow<thrust_vector>/100000/manual_time                242 us          267 us            2962 bytes_per_second=13.8375G/s
BM_VectorWorkflow<thrust_vector>/1000000/manual_time              1441 us         1465 us             472 bytes_per_second=23.273G/s
BM_VectorWorkflow<thrust_vector>/10000000/manual_time            10483 us        10498 us              68 bytes_per_second=31.9829G/s
BM_VectorWorkflow<thrust_vector>/100000000/manual_time           63583 us        63567 us              12 bytes_per_second=52.7303G/s
BM_VectorWorkflow<rmm_vector>/100000/manual_time                  82.0 us          105 us            8181 bytes_per_second=40.8661G/s
BM_VectorWorkflow<rmm_vector>/1000000/manual_time                  502 us          527 us            1357 bytes_per_second=66.8029G/s
BM_VectorWorkflow<rmm_vector>/10000000/manual_time                4714 us         4746 us             148 bytes_per_second=71.1222G/s
BM_VectorWorkflow<rmm_vector>/100000000/manual_time              46451 us        46478 us              13 bytes_per_second=72.1784G/s
BM_VectorWorkflow<rmm_uvector>/100000/manual_time                 39.0 us         59.9 us           17970 bytes_per_second=85.8733G/s
BM_VectorWorkflow<rmm_uvector>/1000000/manual_time                 135 us          159 us            5253 bytes_per_second=248.987G/s
BM_VectorWorkflow<rmm_uvector>/10000000/manual_time               1319 us         1351 us             516 bytes_per_second=254.169G/s
BM_VectorWorkflow<rmm_uvector>/100000000/manual_time             12841 us        12865 us              54 bytes_per_second=261.099G/s
```

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jake Hemstad (https://github.com/jrhemstad)
  - Conor Hoekstra (https://github.com/codereport)

URL: https://github.com/rapidsai/rmm/pull/981
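The pattern that lets option 3 overlap its kernels can be distilled to a few lines: allocate each
output on its own stream with `rmm::device_uvector` (stream-ordered allocation, no initialization)
and launch the kernel on that same stream, so no work is issued on the default stream. The
standalone sketch below is illustrative only and is not part of the patch; the kernel name,
element count, and launch dimensions are arbitrary, and it relies on the same RMM APIs the
benchmark uses (`cuda_async_memory_resource`, `cuda_stream`, `device_uvector`).

```
// Illustrative sketch (not part of this patch): per-stream allocation with
// rmm::device_uvector and per-stream kernel launches, the pattern that lets the
// rmm_uvector case of BM_VectorWorkflow below run its kernels concurrently.
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_async_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cuda_runtime_api.h>

#include <cstdint>
#include <vector>

__global__ void square(std::int32_t const* in, std::int32_t* out, std::size_t num)
{
  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
    out[i] = in[i] * in[i];
  }
}

int main()
{
  // Stream-ordered allocation through cudaMallocAsync, as in the benchmark.
  rmm::mr::cuda_async_memory_resource mr{};
  rmm::mr::set_current_device_resource(&mr);

  std::size_t const num_elements = 1 << 20;  // arbitrary size for illustration

  // Zero-initialize the shared input on its own stream, then wait for it.
  rmm::cuda_stream input_stream;
  rmm::device_uvector<std::int32_t> input(num_elements, input_stream);
  cudaMemsetAsync(input.data(), 0, num_elements * sizeof(std::int32_t), input_stream.value());
  input_stream.synchronize();

  // One uninitialized output vector and one kernel launch per stream. Nothing
  // here touches the default stream, so the four kernels are free to overlap.
  std::vector<rmm::cuda_stream> streams(4);
  for (auto& stream : streams) {
    rmm::device_uvector<std::int32_t> output(num_elements, stream);
    square<<<16, 256, 0, stream.value()>>>(input.data(), output.data(), num_elements);
    // `output` is destroyed at the end of this iteration, but its deallocation is
    // enqueued on `stream` after the kernel, so no synchronization is needed here.
  }

  for (auto& stream : streams) {
    stream.synchronize();
  }

  rmm::mr::set_current_device_resource(nullptr);
  return 0;
}
```

Built with nvcc against RMM, running this under a profiler should show the four `square` launches
overlapping, which is the difference the workflow benchmark quantifies.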
---
 .../device_uvector/device_uvector_bench.cu    | 126 +++++++++++++++++-
 1 file changed, 119 insertions(+), 7 deletions(-)

diff --git a/benchmarks/device_uvector/device_uvector_bench.cu b/benchmarks/device_uvector/device_uvector_bench.cu
index 7e73451e6..701e5fceb 100644
--- a/benchmarks/device_uvector/device_uvector_bench.cu
+++ b/benchmarks/device_uvector/device_uvector_bench.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,8 +14,14 @@
  * limitations under the License.
  */
 
+#include "../synchronization/synchronization.hpp"
+#include "thrust/detail/raw_pointer_cast.h"
+
+#include
+#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -24,18 +30,21 @@
 
 #include
 
-static void BM_UvectorSizeConstruction(benchmark::State& state)
+#include
+#include
+
+void BM_UvectorSizeConstruction(benchmark::State& state)
 {
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
   rmm::mr::set_current_device_resource(&mr);
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_uvector<int32_t> vec(state.range(0), rmm::cuda_stream_view{});
+    rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
     cudaDeviceSynchronize();
   }
 
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));
 
   rmm::mr::set_current_device_resource(nullptr);
 }
@@ -45,18 +54,18 @@ BENCHMARK(BM_UvectorSizeConstruction)
   ->Range(10'000, 1'000'000'000)  // NOLINT
   ->Unit(benchmark::kMicrosecond);
 
-static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
+void BM_ThrustVectorSizeConstruction(benchmark::State& state)
 {
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
   rmm::mr::set_current_device_resource(&mr);
 
   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
-    rmm::device_vector<int32_t> vec(state.range(0));
+    rmm::device_vector<std::int32_t> vec(state.range(0));
     cudaDeviceSynchronize();
   }
 
-  state.SetItemsProcessed(static_cast<int64_t>(state.iterations()));
+  state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));
 
   rmm::mr::set_current_device_resource(nullptr);
 }
@@ -66,4 +75,107 @@ BENCHMARK(BM_ThrustVectorSizeConstruction)
   ->Range(10'000, 1'000'000'000)  // NOLINT
   ->Unit(benchmark::kMicrosecond);
 
+// simple kernel used to test concurrent execution.
+__global__ void kernel(int const* input, int* output, std::size_t num)
+{
+  for (auto i = blockDim.x * blockIdx.x + threadIdx.x; i < num; i += gridDim.x * blockDim.x) {
+    output[i] = input[i] * input[i];
+  }
+}
+
+using thrust_vector = thrust::device_vector<std::int32_t>;
+using rmm_vector    = rmm::device_vector<std::int32_t>;
+using rmm_uvector   = rmm::device_uvector<std::int32_t>;
+
+template <typename Vector>
+Vector make_vector(std::int64_t num_elements, rmm::cuda_stream_view stream, bool zero_init = false)
+{
+  static_assert(std::is_same_v<Vector, thrust_vector> or std::is_same_v<Vector, rmm_vector> or
+                  std::is_same_v<Vector, rmm_uvector>,
+                "unsupported vector type");
+  if constexpr (std::is_same_v<Vector, thrust_vector>) {
+    return Vector(num_elements, 0);
+  } else if constexpr (std::is_same_v<Vector, rmm_vector>) {
+    return Vector(num_elements, 0, rmm::mr::thrust_allocator<std::int32_t>(stream));
+  } else if constexpr (std::is_same_v<Vector, rmm_uvector>) {
+    auto vec = Vector(num_elements, stream);
+    if (zero_init) {
+      cudaMemsetAsync(vec.data(), 0, num_elements * sizeof(std::int32_t), stream.value());
+    }
+    return vec;
+  }
+}
+
+template <typename Vector>
+int32_t* vector_data(Vector& vec)
+{
+  return thrust::raw_pointer_cast(vec.data());
+}
+
+template <typename Vector>
+void vector_workflow(std::size_t num_elements,
+                     std::int64_t num_blocks,
+                     std::int64_t block_size,
+                     rmm::cuda_stream const& input_stream,
+                     std::vector<rmm::cuda_stream> const& streams)
+{
+  auto input = make_vector<Vector>(num_elements, input_stream, true);
+  input_stream.synchronize();
+  for (rmm::cuda_stream_view stream : streams) {
+    auto output = make_vector<Vector>(num_elements, stream);
+    kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+      vector_data(input), vector_data(output), num_elements);
+  }
+
+  for (rmm::cuda_stream_view stream : streams) {
+    stream.synchronize();
+  }
+}
+
+template <typename Vector>
+void BM_VectorWorkflow(benchmark::State& state)
+{
+  rmm::mr::cuda_async_memory_resource cuda_async_mr{};
+  rmm::mr::set_current_device_resource(&cuda_async_mr);
+
+  rmm::cuda_stream input_stream;
+  std::vector<rmm::cuda_stream> streams(4);
+
+  auto const num_elements   = state.range(0);
+  auto constexpr block_size = 256;
+  auto constexpr num_blocks = 16;
+
+  for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
+    cuda_event_timer timer(state, true, input_stream);  // flush_l2_cache = true
+    vector_workflow<Vector>(num_elements, num_blocks, block_size, input_stream, streams);
+  }
+
+  auto constexpr num_accesses = 9;
+  auto const bytes            = num_elements * sizeof(std::int32_t) * num_accesses;
+  state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));
+
+  rmm::mr::set_current_device_resource(nullptr);
+}
+
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
+  ->RangeMultiplier(10)                               // NOLINT
+  ->Range(100'000, 100'000'000)                       // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
+// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
+// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
+// `cuda_async_memory_resource`, which is faster.
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector)  // NOLINT
+  ->RangeMultiplier(10)                            // NOLINT
+  ->Range(100'000, 100'000'000)                    // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
+BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_uvector)  // NOLINT
+  ->RangeMultiplier(10)                             // NOLINT
+  ->Range(100'000, 100'000'000)                     // NOLINT
+  ->Unit(benchmark::kMicrosecond)
+  ->UseManualTime();
+
 BENCHMARK_MAIN();