[REVIEW] Deprecate CNMeM #466

Merged Aug 12, 2020 · 30 commits (diff shown from the first 26 commits)

Commits
5d8d39b
Add [[deprecated]] to cnmem resources
harrism Aug 6, 2020
af97cbc
Deprecate cnmem in readme
harrism Aug 6, 2020
a5861d0
Replace cnmem with pool in device_uvector_bench.
harrism Aug 6, 2020
4c29c2b
Add items/sec to uvector benchmark output
harrism Aug 6, 2020
819c974
Remove cnmem from random_allocations benchmark
harrism Aug 6, 2020
bbe9068
Remove cnmem and add pool and binning to replay benchmark
harrism Aug 6, 2020
a3e9731
Remove cnmem from tests
harrism Aug 6, 2020
24d901f
Change Python to use pool_memory_resource instead of cnmem
harrism Aug 6, 2020
6567062
Changelog
harrism Aug 6, 2020
127d977
Python style
harrism Aug 6, 2020
930cb0f
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 7, 2020
67846bb
Deprecate get/set_default_resource and update docs
harrism Aug 7, 2020
4f4cabf
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 7, 2020
1f85a4e
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
29eafb1
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
bdfe579
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
31d238d
Make default_memory_resource a wrapper around per_device_memory_resource
harrism Aug 11, 2020
dc5f7ad
Add devices list to RMM python initialization
harrism Aug 11, 2020
e54e0ec
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 11, 2020
7be1df5
Remove more usage of get/set_default_resource, update readme.
harrism Aug 11, 2020
5e592e1
default --> current_device, python style
harrism Aug 11, 2020
86bad35
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 11, 2020
daef500
Review feedback
harrism Aug 11, 2020
e75fc42
Explicit void return type
harrism Aug 11, 2020
b5d446d
Explicit int return type
harrism Aug 11, 2020
621d294
isort fix
kkraus14 Aug 11, 2020
ad72415
Merge branch 'branch-0.15' into deprecate-cnmem
harrism Aug 11, 2020
3b74cee
Merge branch 'deprecate-cnmem' of github.com:harrism/rmm into depreca…
harrism Aug 11, 2020
48e46c1
Warn when initialized with no devices present.
harrism Aug 12, 2020
50c19d9
Python style!
harrism Aug 12, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@
- PR #457 New `binning_memory_resource` (replaces `hybrid_memory_resource` and
`fixed_multisize_memory_resource`).
- PR #458 Add `get/set_per_device_resource` to better support multi-GPU per process applications
- PR #466 Deprecate CNMeM.

## Improvements

67 changes: 40 additions & 27 deletions README.md
@@ -155,15 +155,16 @@ Allocates and frees device memory using `cudaMalloc` and `cudaFree`.

Allocates and frees device memory using `cudaMallocManaged` and `cudaFree`.

#### `cnmem_(managed_)memory_resource`

Uses the [CNMeM](https://github.com/NVIDIA/cnmem) pool sub-allocator to satisfy (de)allocations.

#### `pool_memory_resource`

A coalescing, best-fit pool sub-allocator.

### `fixed_size_memory_resource`
#### `cnmem_(managed_)memory_resource` [DEPRECATED]

Uses the [CNMeM](https://github.com/NVIDIA/cnmem) pool sub-allocator to satisfy (de)allocations.
These resources are deprecated as of RMM 0.15; use `pool_memory_resource` instead (see the example below).

#### `fixed_size_memory_resource`

A memory resource that can only allocate a single fixed size. Average allocation and deallocation
cost is constant.
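
As a quick illustration, here is a minimal sketch. The constructor parameters shown (an
upstream resource pointer and a block size) are assumptions based on the RMM 0.15 headers;
check `fixed_size_memory_resource.hpp` for the exact signature:

```c++
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/fixed_size_memory_resource.hpp>

rmm::mr::cuda_memory_resource cuda_mr;
// Every allocation from fixed_mr returns a 1 MiB block.
rmm::mr::fixed_size_memory_resource<rmm::mr::cuda_memory_resource> fixed_mr{
  &cuda_mr, 1 << 20};
```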
@@ -184,24 +185,31 @@ To enable this use case, RMM provides the concept of a "default" `device_memory_resource`. This
resource is used when another is not explicitly provided.

Accessing and modifying the default resource is done through two functions:
- `device_memory_resource* get_default_resource()`
  - Returns a pointer to the current default resource
  - The initial default memory resource is an instance of `cuda_memory_resource`
  - This function is thread safe

- `device_memory_resource* set_default_resource(device_memory_resource* new_resource)`
  - Updates the default memory resource pointer to `new_resource`
- `device_memory_resource* get_current_device_resource()`
  - Returns a pointer to the default resource for the current CUDA device.
  - The initial default memory resource is an instance of `cuda_memory_resource`.
  - This function is thread safe with respect to concurrent calls to it and
    `set_current_device_resource()`.
  - For more explicit control, you can use `get_per_device_resource()`, which takes a device ID.
  - Replaces the deprecated `get_default_resource()`

- `device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)`
  - Updates the default memory resource pointer for the current CUDA device to `new_mr`
  - Returns the previous default resource pointer
  - If `new_mr` is `nullptr`, resets the default resource to `cuda_memory_resource`
  - This function is thread safe with respect to concurrent calls to it and
    `get_current_device_resource()`
  - For more explicit control, you can use `set_per_device_resource()`, which takes a device ID.
  - Replaces the deprecated `set_default_resource()`

#### Example

```c++
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(); // Points to `cuda_memory_resource`
rmm::mr::cnmem_memory_resource pool_mr{}; // Construct a resource that uses the CNMeM pool
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); // Points to `cuda_memory_resource`
// Construct a resource that uses a coalescing best-fit pool allocator
rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> pool_mr{mr};
rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device resource pointer to `pool_mr`
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource(); // Points to `pool_mr`
mr = rmm::mr::get_current_device_resource(); // Points to `pool_mr`
```
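
When managing multiple GPUs explicitly, the per-device calls work the same way. A minimal
sketch follows; the device-ID parameter type, written here as `rmm::cuda_device_id`, is an
assumption and has varied across RMM versions:

```c++
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

cudaSetDevice(1);  // construct resources while their target device is active
rmm::mr::cuda_memory_resource cuda_mr;
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};

// Install pool_mr as the resource for device 1; returns the previously set resource.
rmm::mr::set_per_device_resource(rmm::cuda_device_id{1}, &pool_mr);
```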

## Device Data Structures
@@ -363,26 +371,31 @@ array([1., 2., 3.])

### MemoryResources

MemoryResources are used to configure how device memory allocations are made by
RMM.

By default, i.e., if you don't set a MemoryResource explicitly, RMM
uses the `CudaMemoryResource`, which uses `cudaMalloc` for
allocating device memory.

The `rmm.mr.set_default_resource()` function can be used to set a
different MemoryResource. For example, enabling the
`ManagedMemoryResource` tells RMM to use `cudaMallocManaged` instead
of `cudaMalloc` for allocating memory:
`rmm.reinitialize()` provides an easy way to initialize RMM with specific
memory resource options across multiple devices. See `help(rmm.reinitialize)`
for full details.
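
For example, a minimal sketch (the keyword names below — `pool_allocator`,
`initial_pool_size`, and `devices` — are assumptions; confirm them with
`help(rmm.reinitialize)`):

```python
>>> import rmm
>>> rmm.reinitialize(
...     pool_allocator=True,      # use a pooled sub-allocator
...     initial_pool_size=2**30,  # 1 GiB initial pool per device
...     devices=[0, 1],           # initialize devices 0 and 1
... )
```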

For lower-level control, the `rmm.mr.set_current_device_resource()` function can
be used to set a different MemoryResource for the current CUDA device. For
example, enabling the `ManagedMemoryResource` tells RMM to use
`cudaMallocManaged` instead of `cudaMalloc` for allocating memory:

```python
>>> import rmm
>>> rmm.mr.set_default_resource(rmm.mr.ManagedMemoryResource())
>>> rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
```

> :warning: The default resource must be set **before** allocating any
> device memory. Setting or changing the default resource after
> device allocations have been made can lead to unexpected behaviour
> or crashes.
> :warning: The default resource must be set for any device **before**
> allocating any device memory on that device. Setting or changing the
> resource after device allocations have been made can lead to unexpected
> behaviour or crashes.

As another example, `PoolMemoryResource` allows you to allocate a
large "pool" of device memory up-front. Subsequent allocations will
@@ -398,7 +411,7 @@ of 1 GiB and a maximum size of 4 GiB. The pool uses
... initial_pool_size=2**30,
... maximum_pool_size=2**32
... )
>>> rmm.mr.set_default_resource(pool)
>>> rmm.mr.set_current_device_resource(pool)
```
Other MemoryResources include:

27 changes: 20 additions & 7 deletions benchmarks/device_uvector/device_uvector_bench.cu
@@ -19,18 +19,25 @@
#include <cuda_runtime_api.h>
#include <rmm/thrust_rmm_allocator.h>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cnmem_memory_resource.hpp>
#include <rmm/mr/device/default_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include "rmm/mr/device/cuda_memory_resource.hpp"
#include "rmm/mr/device/per_device_resource.hpp"

static void BM_UvectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cnmem_memory_resource mr{};
rmm::mr::set_default_resource(&mr);
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
rmm::mr::set_current_device_resource(&mr);

for (auto _ : state) {
rmm::device_uvector<int32_t>(state.range(0), cudaStream_t{0});
rmm::device_uvector<int32_t> vec(state.range(0), cudaStream_t{0});
cudaDeviceSynchronize();
}

state.SetItemsProcessed(state.iterations());

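// Passing nullptr resets the current device resource to the default cuda_memory_resource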
rmm::mr::set_current_device_resource(nullptr);
}
BENCHMARK(BM_UvectorSizeConstruction)
->RangeMultiplier(10)
@@ -39,12 +46,18 @@ BENCHMARK(BM_UvectorSizeConstruction)

static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cnmem_memory_resource mr{};
rmm::mr::set_default_resource(&mr);
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
rmm::mr::set_current_device_resource(&mr);

for (auto _ : state) {
rmm::device_vector<int32_t>(state.range(0));
rmm::device_vector<int32_t> vec(state.range(0));
cudaDeviceSynchronize();
}

state.SetItemsProcessed(state.iterations());

rmm::mr::set_current_device_resource(nullptr);
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
13 changes: 3 additions & 10 deletions benchmarks/random_allocations/random_allocations.cpp
@@ -17,7 +17,6 @@
#include <benchmarks/utilities/cxxopts.hpp>

#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cnmem_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
@@ -158,8 +157,6 @@ void uniform_random_allocations(rmm::mr::device_memory_resource& mr,
/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_cnmem() { return std::make_shared<rmm::mr::cnmem_memory_resource>(); }

inline auto make_pool()
{
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
@@ -237,8 +234,6 @@ void declare_benchmark(std::string name)
BENCHMARK_CAPTURE(BM_RandomAllocations, binning_mr, &make_binning)->Apply(benchmark_range);
else if (name == "pool")
BENCHMARK_CAPTURE(BM_RandomAllocations, pool_mr, &make_pool)->Apply(benchmark_range);
else if (name == "cnmem")
BENCHMARK_CAPTURE(BM_RandomAllocations, cnmem_mr, &make_cnmem)->Apply(benchmark_range);
else
std::cout << "Error: invalid memory_resource name: " << name << "\n";
}
@@ -283,10 +278,8 @@ int main(int argc, char** argv)
max_size = args["maxsize"].as<int>();

if (args.count("profile") > 0) {
std::map<std::string, MRFactoryFunc> const funcs({{"binning", &make_binning},
{"cnmem", &make_cnmem},
{"cuda", &make_cuda},
{"pool", &make_pool}});
std::map<std::string, MRFactoryFunc> const funcs(
{{"binning", &make_binning}, {"cuda", &make_cuda}, {"pool", &make_pool}});
auto resource = args["resource"].as<std::string>();

std::cout << "Profiling " << resource << " with " << num_allocations << " allocations of max "
@@ -307,7 +300,7 @@
std::string mr_name = args["resource"].as<std::string>();
declare_benchmark(mr_name);
} else {
std::array<std::string, 4> mrs{"pool", "binning", "cnmem", "cuda"};
std::array<std::string, 4> mrs{"pool", "binning", "cuda"};
std::for_each(std::cbegin(mrs), std::cend(mrs), [](auto const& s) { declare_benchmark(s); });
}
::benchmark::RunSpecifiedBenchmarks();
45 changes: 35 additions & 10 deletions benchmarks/replay/replay.cpp
@@ -18,8 +18,10 @@
#include <benchmarks/utilities/log_parser.hpp>

#include <rmm/detail/error.hpp>
#include <rmm/mr/device/cnmem_memory_resource.hpp>
#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
@@ -32,6 +34,27 @@
#include <memory>
#include <numeric>
#include <string>
#include "rmm/mr/device/device_memory_resource.hpp"

/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_pool()
{
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
}

inline auto make_binning()
{
auto pool = make_pool();
auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(pool);
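// Add fixed-size bins for allocation sizes from 1 << 18 (256 KiB) to 1 << 22 (4 MiB)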
for (std::size_t i = 18; i <= 22; i++) {
mr->wrapped().add_bin(1 << i);
}
return mr;
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;

/**
* @brief Represents an allocation made during the replay
@@ -51,9 +74,8 @@ struct allocation {
* @tparam MR The type of the `device_memory_resource` to use for allocation
* replay
*/
template <typename MR>
struct replay_benchmark {
std::unique_ptr<MR> mr_{};
std::shared_ptr<rmm::mr::device_memory_resource> mr_{};
std::vector<std::vector<rmm::detail::event>> const& events_{};

/**
@@ -63,9 +85,9 @@
* @param events The set of allocation events to replay
* @param factory Factory function used to create the `device_memory_resource` for the replay
*/
template <typename... Args>
replay_benchmark(std::vector<std::vector<rmm::detail::event>> const& events, Args&&... args)
: mr_{new MR{std::forward<Args>(args)...}}, events_{events}
replay_benchmark(MRFactoryFunc factory,
std::vector<std::vector<rmm::detail::event>> const& events)
: mr_{factory()}, events_{events}
{
}

@@ -193,13 +215,16 @@ int main(int argc, char** argv)

auto const num_threads = per_thread_events.size();

benchmark::RegisterBenchmark("CUDA Resource",
replay_benchmark<rmm::mr::cuda_memory_resource>{per_thread_events})
benchmark::RegisterBenchmark("CUDA Resource", replay_benchmark{&make_cuda, per_thread_events})
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

benchmark::RegisterBenchmark("Pool Resource", replay_benchmark(&make_pool, per_thread_events))
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

benchmark::RegisterBenchmark(
"CNMEM Resource", replay_benchmark<rmm::mr::cnmem_memory_resource>(per_thread_events, 0u))
benchmark::RegisterBenchmark("Binning Resource",
replay_benchmark(&make_binning, per_thread_events))
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

14 changes: 7 additions & 7 deletions include/rmm/device_buffer.hpp
@@ -31,7 +31,7 @@ namespace rmm {
*
* This class allocates untyped and *uninitialized* device memory using a
* `device_memory_resource`. If not explicitly specified, the memory resource
* returned from `get_default_resource()` is used.
* returned from `get_current_device_resource()` is used.
*
* @note Unlike `std::vector` or `thrust::device_vector`, the device memory
* allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
@@ -79,7 +79,7 @@ class device_buffer {
// context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
// device function. This causes warnings/errors because this ctor invokes host-only functions.
device_buffer()
: _data{nullptr}, _size{}, _capacity{}, _stream{}, _mr{rmm::mr::get_default_resource()}
: _data{nullptr}, _size{}, _capacity{}, _stream{}, _mr{rmm::mr::get_current_device_resource()}
{
}

@@ -95,7 +95,7 @@
*/
explicit device_buffer(std::size_t size,
cudaStream_t stream = 0,
mr::device_memory_resource* mr = mr::get_default_resource())
mr::device_memory_resource* mr = mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
allocate(size);
@@ -118,7 +118,7 @@
device_buffer(void const* source_data,
std::size_t size,
cudaStream_t stream = 0,
mr::device_memory_resource* mr = mr::get_default_resource())
mr::device_memory_resource* mr = mr::get_current_device_resource())
: _stream{stream}, _mr{mr}
{
allocate(size);
@@ -143,7 +143,7 @@
*/
device_buffer(device_buffer const& other,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource())
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
: device_buffer{other.data(), other.size(), stream, mr}
{
}
@@ -388,8 +388,8 @@ class device_buffer {
std::size_t _capacity{}; ///< The actual size of the device memory allocation
cudaStream_t _stream{}; ///< Stream to use for device memory deallocation
mr::device_memory_resource* _mr{
mr::get_default_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory
mr::get_current_device_resource()}; ///< The memory resource used to
///< allocate/deallocate device memory

/**
* @brief Allocates the specified amount of memory and updates the
12 changes: 7 additions & 5 deletions include/rmm/device_scalar.hpp
@@ -50,8 +50,9 @@ class device_scalar {
* @param stream Stream on which to perform asynchronous allocation.
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(cudaStream_t stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
explicit device_scalar(
cudaStream_t stream,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
}
@@ -72,9 +73,10 @@
* @param stream Optional, stream on which to perform allocation and copy.
* @param mr Optional, resource with which to allocate.
*/
explicit device_scalar(T const &initial_value,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource *mr = rmm::mr::get_default_resource())
explicit device_scalar(
T const &initial_value,
cudaStream_t stream = 0,
rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())
: buffer{sizeof(T), stream, mr}
{
set_value(initial_value, stream);