
[REVIEW] Deprecate CNMeM #466

Merged Aug 12, 2020 (30 commits)
Changes from 18 commits

Commits
5d8d39b
Add [[deprecated]] to cnmem resources
harrism Aug 6, 2020
af97cbc
Deprecate cnmem in readme
harrism Aug 6, 2020
a5861d0
Replace cnmem with pool in device_uvector_bench.
harrism Aug 6, 2020
4c29c2b
Add items/sec to uvector benchmark output
harrism Aug 6, 2020
819c974
Remove cnmem from random_allocations benchmark
harrism Aug 6, 2020
bbe9068
Remove cnmem and add pool and binning to replay benchmark
harrism Aug 6, 2020
a3e9731
Remove cnmem from tests
harrism Aug 6, 2020
24d901f
Change Python to use pool_memory_resource instead of cnmem
harrism Aug 6, 2020
6567062
Changelog
harrism Aug 6, 2020
127d977
Python style
harrism Aug 6, 2020
930cb0f
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 7, 2020
67846bb
Deprecate get/set_default_resource and update docs
harrism Aug 7, 2020
4f4cabf
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 7, 2020
1f85a4e
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
29eafb1
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
bdfe579
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 10, 2020
31d238d
Make default_memory_resource a wrapper around per_device_memory_resource
harrism Aug 11, 2020
dc5f7ad
Add devices list to RMM python initialization
harrism Aug 11, 2020
e54e0ec
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 11, 2020
7be1df5
Remove more usage of get/set_default_resource, update readme.
harrism Aug 11, 2020
5e592e1
default --> current_device, python style
harrism Aug 11, 2020
86bad35
Merge branch 'binning-memory-resource' into deprecate-cnmem
harrism Aug 11, 2020
daef500
Review feedback
harrism Aug 11, 2020
e75fc42
Explicit void return type
harrism Aug 11, 2020
b5d446d
Explicit int return type
harrism Aug 11, 2020
621d294
isort fix
kkraus14 Aug 11, 2020
ad72415
Merge branch 'branch-0.15' into deprecate-cnmem
harrism Aug 11, 2020
3b74cee
Merge branch 'deprecate-cnmem' of github.com:harrism/rmm into depreca…
harrism Aug 11, 2020
48e46c1
Warn when initialized with no devices present.
harrism Aug 12, 2020
50c19d9
Python style!
harrism Aug 12, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -14,6 +14,7 @@
- PR #457 New `binning_memory_resource` (replaces `hybrid_memory_resource` and
`fixed_multisize_memory_resource`).
- PR #458 Add `get/set_per_device_resource` to better support multi-GPU per process applications
- PR #466 Deprecate CNMeM.

## Improvements

36 changes: 22 additions & 14 deletions README.md
@@ -155,15 +155,16 @@ Allocates and frees device memory using `cudaMalloc` and `cudaFree`.

Allocates and frees device memory using `cudaMallocManaged` and `cudaFree`.

#### `pool_memory_resource`

A coalescing, best-fit pool sub-allocator.

#### `cnmem_(managed_)memory_resource` [DEPRECATED]

Uses the [CNMeM](https://github.com/NVIDIA/cnmem) pool sub-allocator to satisfy (de)allocations.
These resources are deprecated as of RMM 0.15; use `pool_memory_resource` instead.
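A minimal migration sketch (assuming RMM 0.15 headers; when no size is specified, the pool's initial size is implementation defined):

```c++
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

// Before (deprecated): rmm::mr::cnmem_memory_resource mr{};
// After: an RMM-native pool layered on a CUDA upstream resource
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
```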

#### `fixed_size_memory_resource`

A memory resource that can only allocate a single fixed size. Average allocation and deallocation
cost is constant.
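As a sketch of its use (the constructor parameters shown are assumptions; check `fixed_size_memory_resource.hpp` for the current signature):

```c++
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/fixed_size_memory_resource.hpp>

rmm::mr::cuda_memory_resource cuda_mr{};
// Every allocation is served from fixed 1 MiB blocks drawn from the upstream resource
rmm::mr::fixed_size_memory_resource<rmm::mr::cuda_memory_resource> fixed_mr{&cuda_mr, 1 << 20};
void* p = fixed_mr.allocate(256);  // occupies a full 1 MiB block
fixed_mr.deallocate(p, 256);
```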
@@ -184,22 +185,29 @@ To enable this use case, RMM provides the concept of a "default" `device_memory_
resource is used when another is not explicitly provided.

Accessing and modifying the default resource is done through two functions:
- `device_memory_resource* get_current_device_resource()`
  - Returns a pointer to the default resource for the current CUDA device.
  - The initial default memory resource is an instance of `cuda_memory_resource`.
  - This function is thread safe with respect to concurrent calls to it and
    `set_current_device_resource()`.
  - For more explicit control, use `get_per_device_resource()`, which takes a device ID.
  - Replaces the deprecated `get_default_resource()`.

- `device_memory_resource* set_current_device_resource(device_memory_resource* new_mr)`
  - Updates the default memory resource pointer for the current CUDA device to `new_mr`.
  - Returns the previous default resource pointer.
  - If `new_mr` is `nullptr`, resets the default resource to `cuda_memory_resource`.
  - This function is thread safe with respect to concurrent calls to it and
    `get_current_device_resource()`.
  - For more explicit control, use `set_per_device_resource()`, which takes a device ID.
  - Replaces the deprecated `set_default_resource()`.

#### Example

```c++
rmm::mr::cuda_memory_resource cuda_mr{};
// Construct a resource that uses a coalescing best-fit pool allocator
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
rmm::mr::set_current_device_resource(&pool_mr); // Updates the current device resource to `pool_mr`
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource(); // Points to `pool_mr`
```
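For multi-GPU applications, the per-device API mentioned above can be used directly; a sketch (the `rmm::cuda_device_id` wrapper type is an assumption based on `per_device_resource.hpp`):

```c++
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
// Register the pool for device 0 explicitly rather than for "the current device"
rmm::mr::set_per_device_resource(rmm::cuda_device_id{0}, &pool_mr);
auto* mr0 = rmm::mr::get_per_device_resource(rmm::cuda_device_id{0}); // == &pool_mr
```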
22 changes: 17 additions & 5 deletions benchmarks/device_uvector/device_uvector_bench.cu
@@ -19,18 +19,24 @@
#include <cuda_runtime_api.h>
#include <rmm/thrust_rmm_allocator.h>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/default_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>

static void BM_UvectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
rmm::mr::set_default_resource(&mr);

for (auto _ : state) {
rmm::device_uvector<int32_t> vec(state.range(0), cudaStream_t{0});
cudaDeviceSynchronize();
}

state.SetItemsProcessed(state.iterations());

rmm::mr::set_default_resource(nullptr);
}
BENCHMARK(BM_UvectorSizeConstruction)
->RangeMultiplier(10)
@@ -39,12 +45,18 @@ BENCHMARK(BM_UvectorSizeConstruction)

static void BM_ThrustVectorSizeConstruction(benchmark::State& state)
{
rmm::mr::cuda_memory_resource cuda_mr{};
rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{&cuda_mr};
rmm::mr::set_default_resource(&mr);

for (auto _ : state) {
rmm::device_vector<int32_t> vec(state.range(0));
cudaDeviceSynchronize();
}

state.SetItemsProcessed(state.iterations());

rmm::mr::set_default_resource(nullptr);
}

BENCHMARK(BM_ThrustVectorSizeConstruction)
13 changes: 3 additions & 10 deletions benchmarks/random_allocations/random_allocations.cpp
@@ -17,7 +17,6 @@
#include <benchmarks/utilities/cxxopts.hpp>

#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
@@ -158,8 +157,6 @@ void uniform_random_allocations(rmm::mr::device_memory_resource& mr,
/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_pool()
{
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
@@ -237,8 +234,6 @@ void declare_benchmark(std::string name)
BENCHMARK_CAPTURE(BM_RandomAllocations, binning_mr, &make_binning)->Apply(benchmark_range);
else if (name == "pool")
BENCHMARK_CAPTURE(BM_RandomAllocations, pool_mr, &make_pool)->Apply(benchmark_range);
else
std::cout << "Error: invalid memory_resource name: " << name << "\n";
}
@@ -283,10 +278,8 @@ int main(int argc, char** argv)
max_size = args["maxsize"].as<int>();

if (args.count("profile") > 0) {
std::map<std::string, MRFactoryFunc> const funcs(
{{"binning", &make_binning}, {"cuda", &make_cuda}, {"pool", &make_pool}});
auto resource = args["resource"].as<std::string>();

std::cout << "Profiling " << resource << " with " << num_allocations << " allocations of max "
@@ -307,7 +300,7 @@ int main(int argc, char** argv)
std::string mr_name = args["resource"].as<std::string>();
declare_benchmark(mr_name);
} else {
std::array<std::string, 3> mrs{"pool", "binning", "cuda"};
std::for_each(std::cbegin(mrs), std::cend(mrs), [](auto const& s) { declare_benchmark(s); });
}
::benchmark::RunSpecifiedBenchmarks();
45 changes: 35 additions & 10 deletions benchmarks/replay/replay.cpp
@@ -18,8 +18,10 @@
#include <benchmarks/utilities/log_parser.hpp>

#include <rmm/detail/error.hpp>
#include <rmm/mr/device/binning_memory_resource.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
@@ -32,6 +34,27 @@
#include <memory>
#include <numeric>
#include <string>
#include <rmm/mr/device/device_memory_resource.hpp>

/// MR factory functions
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_pool()
{
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
}

inline auto make_binning()
{
auto pool = make_pool();
auto mr = rmm::mr::make_owning_wrapper<rmm::mr::binning_memory_resource>(pool);
for (std::size_t i = 18; i <= 22; i++) {
mr->wrapped().add_bin(1 << i);
}
return mr;
}

using MRFactoryFunc = std::function<std::shared_ptr<rmm::mr::device_memory_resource>()>;

/**
* @brief Represents an allocation made during the replay
@@ -51,9 +74,8 @@ struct allocation {
* @tparam MR The type of the `device_memory_resource` to use for allocation
* replay
*/
struct replay_benchmark {
std::shared_ptr<rmm::mr::device_memory_resource> mr_{};
std::vector<std::vector<rmm::detail::event>> const& events_{};

/**
@@ -63,9 +85,9 @@
* @param factory Factory function used to create the memory resource to benchmark
* @param events The set of allocation events to replay
*/
replay_benchmark(MRFactoryFunc factory,
std::vector<std::vector<rmm::detail::event>> const& events)
: mr_{factory()}, events_{events}
{
}

@@ -193,13 +215,16 @@ int main(int argc, char** argv)

auto const num_threads = per_thread_events.size();

benchmark::RegisterBenchmark("CUDA Resource", replay_benchmark{&make_cuda, per_thread_events})
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

benchmark::RegisterBenchmark("Pool Resource", replay_benchmark(&make_pool, per_thread_events))
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

benchmark::RegisterBenchmark("Binning Resource",
replay_benchmark(&make_binning, per_thread_events))
->Unit(benchmark::kMillisecond)
->Threads(num_threads);

9 changes: 5 additions & 4 deletions include/rmm/mr/device/cnmem_managed_memory_resource.hpp
@@ -23,8 +23,9 @@ namespace rmm {
namespace mr {
/**
* @brief Memory resource that allocates/deallocates managed device memory
* (CUDA Unified Memory) using the cnmem pool sub-allocator.
*
* @note This class is deprecated as of RMM 0.15. Use pool_memory_resource.
*/
class cnmem_managed_memory_resource final : public cnmem_memory_resource {
public:
@@ -37,8 +38,8 @@ class cnmem_managed_memory_resource final : public cnmem_memory_resource {
* @param initial_pool_size Initial pool size in bytes. When zero, an
* implementation-defined pool size is used.
* @param devices List of GPU device IDs to register with CNMEM
*/
[[deprecated]] explicit cnmem_managed_memory_resource(std::size_t initial_pool_size = 0,
std::vector<int> const& devices = {})
: cnmem_memory_resource(initial_pool_size, devices, memory_kind::MANAGED)
{
}
6 changes: 4 additions & 2 deletions include/rmm/mr/device/cnmem_memory_resource.hpp
@@ -57,6 +57,8 @@ namespace mr {
/**
* @brief Memory resource that allocates/deallocates using the cnmem pool
* sub-allocator.
*
* @note This class is deprecated as of RMM 0.15. Use pool_memory_resource.
*/
class cnmem_memory_resource : public device_memory_resource {
public:
@@ -71,8 +73,8 @@ class cnmem_memory_resource : public device_memory_resource {
* zero, an implementation defined pool size is used.
* @param devices List of GPU device IDs to register with CNMEM
*/
[[deprecated]] explicit cnmem_memory_resource(std::size_t initial_pool_size = 0,
std::vector<int> const& devices = {})
: cnmem_memory_resource(initial_pool_size, devices, memory_kind::CUDA)
{
}
42 changes: 14 additions & 28 deletions include/rmm/mr/device/default_memory_resource.hpp
@@ -16,37 +16,17 @@

#pragma once

#include <rmm/mr/device/per_device_resource.hpp>

namespace rmm {
namespace mr {

/**
* @brief Get the default device memory resource pointer.
*
* Deprecated as of RMM v0.15. Please use get_current_device_resource() or
* get_per_device_resource().
*
* The default device memory resource is used when an explicit memory resource
* is not supplied. The initial default memory resource is a
* `cuda_memory_resource`.
@@ -56,11 +36,17 @@
* @return device_memory_resource* Pointer to the current default memory
* resource
*/
[[deprecated]] inline device_memory_resource* get_default_resource()
{
return get_current_device_resource();
}

/**
* @brief Sets the default device memory resource pointer.
*
* Deprecated as of RMM v0.15. Please use set_current_device_resource() or
* set_per_device_resource().
*
* If `new_resource` is not `nullptr`, sets the default device memory resource
* pointer to `new_resource`. Otherwise, resets the default device memory
* resource to the initial `cuda_memory_resource`.
@@ -74,10 +60,10 @@
* default device memory resource
* @return The previous value of the default device memory resource pointer
*/
inline device_memory_resource* set_default_resource(device_memory_resource* new_resource)
[[deprecated]] inline device_memory_resource* set_default_resource(
device_memory_resource* new_resource)
{
return set_current_device_resource(new_resource);
}

} // namespace mr