From b04128685aad4e22bd5213a920f09f197985dbd7 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 7 Oct 2021 18:46:22 -0700 Subject: [PATCH 01/35] add some tests for arena mr --- include/rmm/detail/aligned.hpp | 36 ++++++++ .../rmm/mr/device/arena_memory_resource.hpp | 8 +- include/rmm/mr/device/detail/arena.hpp | 56 ++++--------- tests/CMakeLists.txt | 3 + tests/mr/device/arena_mr_tests.cpp | 83 +++++++++++++++++++ 5 files changed, 142 insertions(+), 44 deletions(-) create mode 100644 tests/mr/device/arena_mr_tests.cpp diff --git a/include/rmm/detail/aligned.hpp b/include/rmm/detail/aligned.hpp index 321be53b5..19e69344d 100644 --- a/include/rmm/detail/aligned.hpp +++ b/include/rmm/detail/aligned.hpp @@ -62,6 +62,18 @@ constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcep return (value + (alignment - 1)) & ~(alignment - 1); } +/** + * @brief Align up to nearest multiple of the CUDA allocation alignment + * + * @param[in] v value to align + * + * @return Return the aligned value, as one would expect + */ +constexpr std::size_t align_up_cuda(std::size_t value) noexcept +{ + return align_up(value, CUDA_ALLOCATION_ALIGNMENT); +} + /** * @brief Align down to the nearest multiple of specified power of 2 * @@ -76,6 +88,18 @@ constexpr std::size_t align_down(std::size_t value, std::size_t alignment) noexc return value & ~(alignment - 1); } +/** + * @brief Align down to the nearest multiple of the CUDA allocation alignment + * + * @param[in] v value to align + * + * @return Return the aligned value, as one would expect + */ +constexpr std::size_t align_down_cuda(std::size_t value) noexcept +{ + return align_down(value, CUDA_ALLOCATION_ALIGNMENT); +} + /** * @brief Checks whether a value is aligned to a multiple of a specified power of 2 * @@ -90,6 +114,18 @@ constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept return value == align_down(value, alignment); } +/** + * @brief Checks whether a value is aligned to a multiple of the CUDA allocation alignment + * + * @param[in] v value to check for alignment + * + * @return true if aligned + */ +constexpr bool is_cuda_aligned(std::size_t value) noexcept +{ + return is_aligned(value, CUDA_ALLOCATION_ALIGNMENT); +} + inline bool is_pointer_aligned(void* ptr, std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index ce8737225..9b78d9207 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -89,8 +89,8 @@ class arena_memory_resource final : public device_memory_resource { * of the available memory on the current device. 
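   *
   * Example (illustrative; the sizes are arbitrary and `rmm::mr::cuda_memory_resource`
   * is assumed as the upstream resource, not mandated by this patch):
   * @code
   * rmm::mr::cuda_memory_resource upstream;
   * // Start with a 1 GiB arena that may grow to 4 GiB; both are multiples of 256 bytes.
   * rmm::mr::arena_memory_resource<rmm::mr::cuda_memory_resource> mr{
   *   &upstream, std::size_t{1} << 30U, std::size_t{4} << 30U};
   * void* ptr = mr.allocate(4096);
   * mr.deallocate(ptr, 4096);
   * @endcode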
*/ explicit arena_memory_resource(Upstream* upstream_mr, - std::size_t initial_size = global_arena::default_initial_size, - std::size_t maximum_size = global_arena::default_maximum_size, + std::optional initial_size = std::nullopt, + std::optional maximum_size = std::nullopt, bool dump_log_on_failure = false) : global_arena_{upstream_mr, initial_size, maximum_size}, dump_log_on_failure_{dump_log_on_failure} @@ -144,7 +144,7 @@ class arena_memory_resource final : public device_memory_resource { { if (bytes <= 0) { return nullptr; } - bytes = detail::arena::align_up(bytes); + bytes = rmm::detail::align_up_cuda(bytes); auto& arena = get_arena(stream); void* pointer = arena.allocate(bytes); @@ -173,7 +173,7 @@ class arena_memory_resource final : public device_memory_resource { { if (ptr == nullptr || bytes <= 0) { return; } - bytes = detail::arena::align_up(bytes); + bytes = rmm::detail::align_up_cuda(bytes); get_arena(stream).deallocate(ptr, bytes, stream); } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 0d2bb319a..9cb691bd8 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -139,28 +140,6 @@ class block { inline bool block_size_compare(block lhs, block rhs) { return lhs.size() < rhs.size(); } -/** - * @brief Align up to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value - */ -constexpr std::size_t align_up(std::size_t value) noexcept -{ - return rmm::detail::align_up(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); -} - -/** - * @brief Align down to the allocation alignment. - * - * @param[in] v value to align - * @return Return the aligned value - */ -constexpr std::size_t align_down(std::size_t value) noexcept -{ - return rmm::detail::align_down(value, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); -} - /** * @brief Get the first free block of at least `size` bytes. * @@ -253,10 +232,6 @@ inline auto total_block_size(T const& blocks) template class global_arena final { public: - /// The default initial size for the global arena. - static constexpr std::size_t default_initial_size = std::numeric_limits::max(); - /// The default maximum size for the global arena. - static constexpr std::size_t default_maximum_size = std::numeric_limits::max(); /// Reserved memory that should not be allocated (64 MiB). static constexpr std::size_t reserved_size = 1U << 26U; @@ -275,29 +250,30 @@ class global_arena final { * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all * of the available memory on the current device. 
*/ - global_arena(Upstream* upstream_mr, std::size_t initial_size, std::size_t maximum_size) - : upstream_mr_{upstream_mr}, maximum_size_{maximum_size} + global_arena(Upstream* upstream_mr, + std::optional initial_size, + std::optional maximum_size) + : upstream_mr_{upstream_mr}, maximum_size_{maximum_size.value_or(0)} { RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); - RMM_EXPECTS(initial_size == default_initial_size || initial_size == align_up(initial_size), + RMM_EXPECTS(!initial_size || rmm::detail::is_cuda_aligned(initial_size.value()), "Error, Initial arena size required to be a multiple of 256 bytes"); - RMM_EXPECTS(maximum_size_ == default_maximum_size || maximum_size_ == align_up(maximum_size_), + RMM_EXPECTS(!maximum_size || rmm::detail::is_cuda_aligned(maximum_size.value()), "Error, Maximum arena size required to be a multiple of 256 bytes"); - if (initial_size == default_initial_size || maximum_size == default_maximum_size) { - std::size_t free{}; - std::size_t total{}; - RMM_CUDA_TRY(cudaMemGetInfo(&free, &total)); - if (initial_size == default_initial_size) { - initial_size = align_up(std::min(free, total / 2)); + auto init = initial_size.value_or(0); + if (!initial_size || !maximum_size) { + auto const [free, total] = rmm::detail::available_device_memory(); + if (!initial_size) { + init = rmm::detail::align_down_cuda(free) - reserved_size; } - if (maximum_size_ == default_maximum_size) { - maximum_size_ = align_down(free) - reserved_size; + if (!maximum_size) { + maximum_size_ = rmm::detail::align_down_cuda(free) - reserved_size; } } - RMM_EXPECTS(initial_size <= maximum_size_, "Initial arena size exceeds the maximum pool size!"); + RMM_EXPECTS(init <= maximum_size_, "Initial arena size exceeds the maximum pool size!"); - free_blocks_.emplace(expand_arena(initial_size)); + free_blocks_.emplace(expand_arena(init)); } // Disable copy (and move) semantics. diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 78c0c94a6..91e93bccf 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -64,6 +64,9 @@ ConfigureTest(DEVICE_MR_TEST mr/device/mr_tests.cpp mr/device/mr_multithreaded_t # pool mr tests ConfigureTest(POOL_MR_TEST mr/device/pool_mr_tests.cpp) +# arena mr tests +ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp) + # cuda_async mr tests ConfigureTest(CUDA_ASYNC_MR_TEST mr/device/cuda_async_mr_tests.cpp) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp new file mode 100644 index 000000000..e6609dda0 --- /dev/null +++ b/tests/mr/device/arena_mr_tests.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include + +namespace rmm::test { +namespace { +using arena_mr = rmm::mr::arena_memory_resource; + +TEST(ArenaTest, NullUpstream) +{ + EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); +} + +TEST(ArenaTest, UnalignedInitialSize) +{ + EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 255); }(), + rmm::logic_error); +} + +TEST(ArenaTest, UnalignedMaximumSize) +{ + EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 256, 257); }(), + rmm::logic_error); +} + +TEST(ArenaTest, MaxLessThanInitial) +{ + EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 512, 256); }(), + rmm::logic_error); +} + +TEST(ArenaTest, MaxEqualToInitial) +{ + EXPECT_NO_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 512, 512); }()); +} + +TEST(ArenaTest, AllocateNinetyPercent) +{ + EXPECT_NO_THROW([]() { + auto const free = rmm::detail::available_device_memory().first; + auto const ninety_percent = + rmm::detail::align_up_cuda(static_cast(static_cast(free) * 0.9)); + arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent); + }()); +} + +TEST(ArenaTest, SmallMediumLarge) +{ + EXPECT_NO_THROW([]() { + arena_mr mr(rmm::mr::get_current_device_resource()); + auto* small = mr.allocate(256); + auto* medium = mr.allocate(1U << 26U); + auto const free = rmm::detail::available_device_memory().first; + auto* large = mr.allocate(free / 2); + mr.deallocate(small, 256); + mr.deallocate(medium, 1U << 26U); + mr.deallocate(large, free / 4); + }()); +} + +} // namespace +} // namespace rmm::test From 8bda94e7a4451b826a7ab59586063d2a45f762fa Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Mon, 8 Nov 2021 11:11:59 -0800 Subject: [PATCH 02/35] make superblocks persistent between different arenas --- .../rmm/mr/device/arena_memory_resource.hpp | 60 +- include/rmm/mr/device/detail/arena.hpp | 758 +++++++++++------- tests/mr/device/arena_mr_tests.cpp | 23 - 3 files changed, 519 insertions(+), 322 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index a8919def2..9b1073f85 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -77,23 +77,15 @@ class arena_memory_resource final : public device_memory_resource { * @brief Construct an `arena_memory_resource`. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on + * the current device. 
*/ explicit arena_memory_resource(Upstream* upstream_mr, - std::optional initial_size = std::nullopt, - std::optional maximum_size = std::nullopt, - bool dump_log_on_failure = false) - : global_arena_{upstream_mr, initial_size, maximum_size}, - dump_log_on_failure_{dump_log_on_failure} + std::optional arena_size = std::nullopt, + bool dump_log_on_failure = false) + : global_arena_{upstream_mr, arena_size}, dump_log_on_failure_{dump_log_on_failure} { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); @@ -124,8 +116,8 @@ class arena_memory_resource final : public device_memory_resource { bool supports_get_mem_info() const noexcept override { return false; } private: - using global_arena = detail::arena::global_arena; - using arena = detail::arena::arena; + using global_arena = rmm::mr::detail::arena::global_arena; + using arena = rmm::mr::detail::arena::arena; using read_lock = std::shared_lock; using write_lock = std::lock_guard; @@ -174,7 +166,43 @@ class arena_memory_resource final : public device_memory_resource { if (ptr == nullptr || bytes <= 0) { return; } bytes = rmm::detail::align_up_cuda(bytes); - get_arena(stream).deallocate(ptr, bytes, stream); + if (!get_arena(stream).deallocate(ptr, bytes, stream)) { + deallocate_from_other_arena(ptr, bytes, stream); + } + } + + /** + * @brief Deallocate memory pointed to by `ptr` that was allocated in a different arena. + * + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. + */ + void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream) + { + stream.synchronize_no_throw(); + + read_lock lock(mtx_); + + if (use_per_thread_arena(stream)) { + auto const id = std::this_thread::get_id(); + for (auto& kv : thread_arenas_) { + // If the arena does not belong to the current thread, try to deallocate from it, and return + // if successful. + if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; } + } + } else { + for (auto& kv : stream_arenas_) { + // If the arena does not belong to the current stream, try to deallocate from it, and return + // if successful. + if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; } + } + } + + // The thread that originally allocated the block has terminated, deallocate directly in the + // global arena. + global_arena_.deallocate_from_other_arena(ptr, bytes); } /** diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 9cb691bd8..8966cb47f 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -38,82 +38,81 @@ namespace rmm::mr::detail::arena { -/// Minimum size of a superblock (256 KiB). -constexpr std::size_t minimum_superblock_size = 1U << 18U; - /** - * @brief Represents a chunk of memory that can be allocated and deallocated. - * - * A block bigger than a certain size is called a "superblock". + * @brief Represents a contiguous region of memory. */ -class block { +class memory_span { public: /** - * @brief Construct a default block. - */ - block() = default; - - /** - * @brief Construct a block given a pointer and size. - * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @brief Construct a default span. 
*/ - block(char* pointer, std::size_t size) : pointer_(pointer), size_(size) {} + memory_span() = default; /** - * @brief Construct a block given a void pointer and size. + * @brief Construct a span given a pointer and size. * - * @param pointer The address for the beginning of the block. - * @param size The size of the block. + * @param pointer The address for the beginning of the span. + * @param size The size of the span. */ - block(void* pointer, std::size_t size) : pointer_(static_cast(pointer)), size_(size) {} + memory_span(void* pointer, std::size_t size) : pointer_{static_cast(pointer)}, size_{size} + { + } /// Returns the underlying pointer. - [[nodiscard]] void* pointer() const { return pointer_; } + [[nodiscard]] char* pointer() const { return pointer_; } - /// Returns the size of the block. + /// Returns the size of the span. [[nodiscard]] std::size_t size() const { return size_; } - /// Returns true if this block is valid (non-null), false otherwise. + /// Returns true if this span is valid (non-null), false otherwise. [[nodiscard]] bool is_valid() const { return pointer_ != nullptr; } - /// Returns true if this block is a superblock, false otherwise. - [[nodiscard]] bool is_superblock() const { return size_ >= minimum_superblock_size; } + /// Used by std::set to compare spans. + bool operator<(memory_span const& s) const { return pointer_ < s.pointer_; } + + private: + char* pointer_{}; ///< Raw memory pointer. + std::size_t size_{}; ///< Size in bytes. +}; + +/** + * @brief Represents a chunk of memory that can be allocated and deallocated. + */ +class block final : public memory_span { + public: + using memory_span::memory_span; + + /** + * @brief Is this block large enough to fit `sz` bytes? + * + * @param sz The size in bytes to check for fit. + * @return true if this block is at least `sz` bytes. + */ + [[nodiscard]] bool fits(std::size_t sz) const { return size() >= sz; } /** * @brief Verifies whether this block can be merged to the beginning of block b. * * @param b The block to check for contiguity. - * @return true Returns true if this block's `pointer` + `size` == `b.ptr`, and `not b.is_head`, - false otherwise. + * @return true Returns true if this block's `pointer` + `size` == `b.pointer`. */ - [[nodiscard]] bool is_contiguous_before(block const& blk) const + [[nodiscard]] bool is_contiguous_before(block const& b) const { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return pointer_ + size_ == blk.pointer_; + return pointer() + size() == b.pointer(); } - /** - * @brief Is this block large enough to fit `sz` bytes? - * - * @param size The size in bytes to check for fit. - * @return true if this block is at least `sz` bytes. - */ - [[nodiscard]] bool fits(std::size_t size) const { return size_ >= size; } - /** * @brief Split this block into two by the given size. * - * @param size The size in bytes of the first block. + * @param sz The size in bytes of the first block. * @return std::pair A pair of blocks split by sz. */ - [[nodiscard]] std::pair split(std::size_t size) const + [[nodiscard]] std::pair split(std::size_t sz) const { - RMM_LOGGING_ASSERT(size_ >= size); + RMM_LOGGING_ASSERT(size() >= sz); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - if (size_ > size) { return {{pointer_, size}, {pointer_ + size, size_ - size}}; } - return {*this, {}}; + return {{pointer(), sz}, {pointer() + sz, size() - sz}}; } /** @@ -124,102 +123,198 @@ class block { * @param b block to merge. * @return block The merged block. 
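   *
   * For example (illustrative, with `p` some 256-byte-aligned pointer),
   * `block{p, 1024}.split(256)` yields the pair `{p, 256}` and `{p + 256, 768}`,
   * and merging the first of those blocks with the second reconstructs `{p, 1024}`.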
*/ - [[nodiscard]] block merge(block const& blk) const + [[nodiscard]] block merge(block const& b) const { - RMM_LOGGING_ASSERT(is_contiguous_before(blk)); - return {pointer_, size_ + blk.size_}; + RMM_LOGGING_ASSERT(is_contiguous_before(b)); + return {pointer(), size() + b.size()}; } +}; - /// Used by std::set to compare blocks. - bool operator<(block const& blk) const { return pointer_ < blk.pointer_; } - - private: - char* pointer_{}; ///< Raw memory pointer. - std::size_t size_{}; ///< Size in bytes. +/// Comparison function for block sizes. +struct block_size_compare { + bool operator()(block const& lhs, block const& rhs) const { return lhs.size() < rhs.size(); } }; -inline bool block_size_compare(block lhs, block rhs) { return lhs.size() < rhs.size(); } +/// Calculate the total size of a collection of blocks. +template +inline auto total_block_size(T const& blocks) +{ + return std::accumulate( + blocks.cbegin(), blocks.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.size(); + }); +} /** - * @brief Get the first free block of at least `size` bytes. - * - * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to - * memory fragmentation, and slightly cheaper to implement. It is also used by some popular - * allocators such as jemalloc. - * - * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM - * Sigplan Notices, 34(3), 26-36. - * - * @param free_blocks The address-ordered set of free blocks. - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes, or an empty block if not found. + * @brief Represents a large chunk of memory that is exchanged between the global arena and + * per-thread arenas. */ -inline block first_fit(std::set& free_blocks, std::size_t size) -{ - auto const iter = std::find_if( - free_blocks.cbegin(), free_blocks.cend(), [size](auto const& blk) { return blk.fits(size); }); +class superblock final : public memory_span { + public: + /// Minimum size of a superblock (4 MiB). + static constexpr std::size_t minimum_size{1U << 22U}; - if (iter == free_blocks.cend()) { return {}; } - // Remove the block from the free_list. - auto const blk = *iter; - auto const next = free_blocks.erase(iter); + /** + * @brief Construct a default superblock. + */ + superblock() = default; - if (blk.size() > size) { - // Split the block and put the remainder back. - auto const split = blk.split(size); - free_blocks.insert(next, split.second); - return split.first; + /** + * @brief Construct a superblock given a pointer and size. + * + * @param pointer The address for the beginning of the superblock. + * @param size The size of the superblock. + */ + superblock(void* pointer, std::size_t size) : memory_span{pointer, size} + { + free_blocks_.emplace(pointer, size); } - return blk; -} -/** - * @brief Coalesce the given block with other free blocks. - * - * @param free_blocks The address-ordered set of free blocks. - * @param b The block to coalesce. - * @return block The coalesced block. - */ -inline block coalesce_block(std::set& free_blocks, block const& blk) -{ - if (!blk.is_valid()) { return blk; } - - // Find the right place (in ascending address order) to insert the block. - auto const next = free_blocks.lower_bound(blk); - auto const previous = next == free_blocks.cbegin() ? next : std::prev(next); - - // Coalesce with neighboring blocks. 
- bool const merge_prev = previous->is_contiguous_before(blk); - bool const merge_next = next != free_blocks.cend() && blk.is_contiguous_before(*next); - - block merged{}; - if (merge_prev && merge_next) { - merged = previous->merge(blk).merge(*next); - free_blocks.erase(previous); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else if (merge_prev) { - merged = previous->merge(blk); - auto const iter = free_blocks.erase(previous); - free_blocks.insert(iter, merged); - } else if (merge_next) { - merged = blk.merge(*next); - auto const iter = free_blocks.erase(next); - free_blocks.insert(iter, merged); - } else { - free_blocks.emplace(blk); - merged = blk; - } - return merged; -} + // Disable copy semantics. + superblock(superblock const&) = delete; + superblock& operator=(superblock const&) = delete; + // Allow move semantics. + superblock(superblock&& s) noexcept = default; + superblock& operator=(superblock&&) noexcept = default; -template -inline auto total_block_size(T const& blocks) -{ - return std::accumulate(blocks.cbegin(), blocks.cend(), std::size_t{}, [](auto lhs, auto rhs) { - return lhs + rhs.size(); - }); -} + ~superblock() = default; + + /** + * @brief Is this superblock empty? + * + * @return true if this superblock is empty. + */ + [[nodiscard]] bool empty() const + { + return free_blocks_.size() == 1 && free_blocks_.cbegin()->size() == size(); + } + + /** + * @brief Whether this superblock contains the given block. + * + * @param b The block to search for. + * @return true if the given block belongs to this superblock. + */ + [[nodiscard]] bool contains(block const& b) const + { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return pointer() <= b.pointer() && pointer() + size() >= b.pointer() + b.size(); + } + + /** + * @brief Can this superblock fit `sz` bytes? + * + * @param sz The size in bytes to check for fit. + * @return true if this superblock can fit `sz` bytes. + */ + [[nodiscard]] bool fits(std::size_t sz) const + { + return std::any_of( + free_blocks_.cbegin(), free_blocks_.cend(), [sz](auto const& b) { return b.fits(sz); }); + } + + /** + * @brief Verifies whether this superblock can be merged to the beginning of superblock s. + * + * @param s The superblock to check for contiguity. + * @return true Returns true if both superblocks are empty and this superblock's + * `pointer` + `size` == `s.ptr`. + */ + [[nodiscard]] bool is_contiguous_before(superblock const& s) const + { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return empty() && s.empty() && pointer() + size() == s.pointer(); + } + + /** + * @brief Split this superblock into two by the given size. + * + * @param sz The size in bytes of the first block. + * @return superblock_pair A pair of superblocks split by sz. + */ + [[nodiscard]] std::pair split(std::size_t sz) const + { + RMM_LOGGING_ASSERT(empty() && sz >= minimum_size && size() - sz >= minimum_size); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + return {superblock{pointer(), sz}, superblock{pointer() + sz, size() - sz}}; + } + + /** + * @brief Coalesce two contiguous superblocks into one. + * + * `this->is_contiguous_before(s)` must be true. + * + * @param s superblock to merge. + * @return block The merged block. 
+ */ + [[nodiscard]] superblock merge(superblock const& s) const + { + RMM_LOGGING_ASSERT(is_contiguous_before(s)); + return {pointer(), size() + s.size()}; + } + + /** + * @brief Get the first free block of at least `size` bytes. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. + */ + block first_fit(std::size_t size) const + { + auto const iter = std::find_if( + free_blocks_.cbegin(), free_blocks_.cend(), [size](auto const& b) { return b.fits(size); }); + if (iter == free_blocks_.cend()) { return {}; } + + // Remove the block from the free list. + auto const b = *iter; + auto const next = free_blocks_.erase(iter); + + if (b.size() > size) { + // Split the block and put the remainder back. + auto const split = b.split(size); + free_blocks_.insert(next, split.second); + return split.first; + } + return b; + } + + /** + * @brief Coalesce the given block with other free blocks. + * + * @param b The block to coalesce. + */ + void coalesce(block const& b) const + { + // Find the right place (in ascending address order) to insert the block. + auto const next = free_blocks_.lower_bound(b); + auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks. + bool const merge_prev = previous->is_contiguous_before(b); + bool const merge_next = next != free_blocks_.cend() && b.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto const merged = previous->merge(b).merge(*next); + free_blocks_.erase(previous); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else if (merge_prev) { + auto const merged = previous->merge(b); + auto const iter = free_blocks_.erase(previous); + free_blocks_.insert(iter, merged); + } else if (merge_next) { + auto const merged = b.merge(*next); + auto const iter = free_blocks_.erase(next); + free_blocks_.insert(iter, merged); + } else { + free_blocks_.insert(next, b); + } + } + + private: + /// Address-ordered set of free blocks. + mutable std::set free_blocks_{}; +}; /** * @brief The global arena for allocating memory from the upstream memory resource. @@ -232,48 +327,21 @@ inline auto total_block_size(T const& blocks) template class global_arena final { public: - /// Reserved memory that should not be allocated (64 MiB). - static constexpr std::size_t reserved_size = 1U << 26U; - /** * @brief Construct a global arena. * * @throws rmm::logic_error if `upstream_mr == nullptr`. - * @throws rmm::logic_error if `initial_size` is neither the default nor aligned to a multiple of - * 256 bytes. - * @throws rmm::logic_error if `maximum_size` is neither the default nor aligned to a multiple of - * 256 bytes. * * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param initial_size Minimum size, in bytes, of the initial global arena. Defaults to half of - * the available memory on the current device. - * @param maximum_size Maximum size, in bytes, that the global arena can grow to. Defaults to all - * of the available memory on the current device. + * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on + * the current device. 
*/ - global_arena(Upstream* upstream_mr, - std::optional initial_size, - std::optional maximum_size) - : upstream_mr_{upstream_mr}, maximum_size_{maximum_size.value_or(0)} + global_arena(Upstream* upstream_mr, std::optional arena_size) + : upstream_mr_{upstream_mr} { RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); - RMM_EXPECTS(!initial_size || rmm::detail::is_cuda_aligned(initial_size.value()), - "Error, Initial arena size required to be a multiple of 256 bytes"); - RMM_EXPECTS(!maximum_size || rmm::detail::is_cuda_aligned(maximum_size.value()), - "Error, Maximum arena size required to be a multiple of 256 bytes"); - - auto init = initial_size.value_or(0); - if (!initial_size || !maximum_size) { - auto const [free, total] = rmm::detail::available_device_memory(); - if (!initial_size) { - init = rmm::detail::align_down_cuda(free) - reserved_size; - } - if (!maximum_size) { - maximum_size_ = rmm::detail::align_down_cuda(free) - reserved_size; - } - } - RMM_EXPECTS(init <= maximum_size_, "Initial arena size exceeds the maximum pool size!"); - - free_blocks_.emplace(expand_arena(init)); + auto const size = rmm::detail::align_down_cuda(arena_size.value_or(default_size())); + initialize(size); } // Disable copy (and move) semantics. @@ -289,136 +357,237 @@ class global_arena final { ~global_arena() { lock_guard lock(mtx_); - for (auto const& blk : upstream_blocks_) { - upstream_mr_->deallocate(blk.pointer(), blk.size()); - } + upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); } /** - * @brief Allocates memory of size at least `bytes`. - * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @brief Acquire a superblock that can fit a block of the given size. * - * @param bytes The size in bytes of the allocation. - * @return void* Pointer to the newly allocated memory. + * @param size The size in bytes of the allocation. + * @return superblock The acquired superblock. */ - block allocate(std::size_t bytes) + superblock acquire(std::size_t size) { lock_guard lock(mtx_); - return get_block(bytes); + return first_fit(size); } /** - * @brief Deallocate memory pointed to by `blk`. + * @brief Release a superblock. * - * @param blk Block to be deallocated. + * @param s Superblock to be released. */ - void deallocate(block const& blk) + void release(superblock&& s) { lock_guard lock(mtx_); - coalesce_block(free_blocks_, blk); + coalesce(std::move(s)); } /** - * @brief Deallocate a set of free blocks from a dying arena. + * @brief Release a set of superblocks from a dying arena. * - * @param free_blocks The set of free blocks. + * @param superblocks The set of superblocks. */ - void deallocate(std::set const& free_blocks) + void release(std::set& superblocks) { lock_guard lock(mtx_); - for (auto const& blk : free_blocks) { - coalesce_block(free_blocks_, blk); + auto iter = superblocks.cbegin(); + while (iter != superblocks.cend()) { + auto s = std::move(superblocks.extract(iter).value()); + coalesce(std::move(s)); + ++iter; } } /** - * @brief Dump memory to log. + * @brief Allocate a large block directly. * - * @param logger the spdlog logger to use + * @param size The size in bytes of the allocation. + * @return void* Pointer to the newly allocated memory. 
*/ - void dump_memory_log(std::shared_ptr const& logger) const + void* allocate(std::size_t size) { - lock_guard lock(mtx_); + if (handles(size)) { + lock_guard lock(mtx_); + return first_fit(size).pointer(); + } + return nullptr; + } - logger->info(" Maximum size: {}", rmm::detail::bytes{maximum_size_}); - logger->info(" Current size: {}", rmm::detail::bytes{current_size_}); + /** + * @brief Deallocate memory pointed to by `ptr` directly. + * + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `p`. + * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. + */ + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + { + if (handles(size)) { + stream.synchronize_no_throw(); - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); + lock_guard lock(mtx_); + superblock s{ptr, size}; + coalesce(std::move(s)); + return true; } + return false; + } + + /** + * @brief Deallocate memory pointed to by `ptr` that was allocated in a per-thread arena. + * + * @param ptr Pointer to be deallocated. + * @param bytes The size in bytes of the allocation. This must be equal to the + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. + * @param stream Stream on which to perform deallocation. + */ + void deallocate_from_other_arena(void* ptr, std::size_t bytes) + { + lock_guard lock(mtx_); - logger->info(" # upstream blocks={}", upstream_blocks_.size()); - logger->info(" Total size of upstream blocks: {}", - rmm::detail::bytes{total_block_size(upstream_blocks_)}); + block const b{ptr, bytes}; + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [b](auto const& s) { return s.contains(b); }); + if (iter == superblocks_.cend()) { RMM_FAIL("allocation not found"); } + iter->coalesce(b); + } + + /** + * @brief Dump memory to log. + * + * @param logger the spdlog logger to use + */ + void dump_memory_log(std::shared_ptr const& logger) const + { + // lock_guard lock(mtx_); + // + // logger->info(" Maximum size: {}", rmm::detail::bytes{maximum_size_}); + // logger->info(" Current size: {}", rmm::detail::bytes{current_size_}); + // + // logger->info(" # free blocks: {}", free_blocks_.size()); + // if (!free_blocks_.empty()) { + // logger->info(" Total size of free blocks: {}", + // rmm::detail::bytes{total_block_size(free_blocks_)}); + // auto const largest_free = + // *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); + // logger->info(" Size of largest free block: {}", + // rmm::detail::bytes{largest_free.size()}); + // } + // + // logger->info(" # upstream blocks={}", upstream_blocks_.size()); + // logger->info(" Total size of upstream blocks: {}", + // rmm::detail::bytes{total_block_size(upstream_blocks_)}); } private: using lock_guard = std::lock_guard; + /// Reserved memory that should not be allocated (64 MiB). + static constexpr std::size_t reserved_size = 1U << 26U; + /** - * @brief Get an available memory block of at least `size` bytes. 
- * - * @param size The number of bytes to allocate. - * @return block A block of memory of at least `size` bytes. + * @brief Default size of the global arena if unspecified. + * @return the default global arena size. */ - block get_block(std::size_t size) + constexpr std::size_t default_size() const { - // Find the first-fit free block. - auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } + auto const [free, total] = rmm::detail::available_device_memory(); + return free - reserved_size; + } - // No existing larger blocks available, so grow the arena. - auto const upstream_block = expand_arena(size_to_grow(size)); - coalesce_block(free_blocks_, upstream_block); - return first_fit(free_blocks_, size); + /** + * @brief Allocate space from upstream to initialize the arena. + * + * @param size The size to allocate. + */ + void initialize(std::size_t size) + { + RMM_LOGGING_ASSERT(size >= superblock::minimum_size); + upstream_block_ = {upstream_mr_->allocate(size), size}; + superblocks_.emplace(upstream_block_.pointer(), size); } /** - * @brief Get the size to grow the global arena given the requested `size` bytes. + * @brief Should allocation of `size` bytes be handled by the global arena directly? + * + * @param size The size in bytes of the allocation. + * @return bool True if the allocation should be handled by the global arena. + */ + bool handles(std::size_t size) { return size > superblock::minimum_size / 2; } + + /** + * @brief Get the first superblock that can fit a block of at least `size` bytes. + * + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc. * - * This simply grows the global arena to the maximum size. + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. * - * @param size The number of bytes required. - * @return size The size for the arena to grow, or 0 if no more memory. + * @param size The number of bytes to allocate. + * @return superblock A superblock that can fit at least `size` bytes, or empty if not found. */ - constexpr std::size_t size_to_grow(std::size_t size) const + superblock first_fit(std::size_t size) { - if (current_size_ + size > maximum_size_) { return 0; } - return maximum_size_ - current_size_; + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [size](auto const& s) { return s.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto node_handle = superblocks_.extract(iter); + auto s = std::move(node_handle.value()); + auto const sz = std::max(size, superblock::minimum_size); + if (s.empty() && s.size() - sz >= superblock::minimum_size) { + // Split the superblock and put the remainder back. + auto [head, tail] = s.split(sz); + superblocks_.insert(std::move(tail)); + return std::move(head); + } + return s; } /** - * @brief Allocate space from upstream to supply the arena and return a sufficiently sized block. + * @brief Coalesce the given superblock with other empty superblocks. * - * @param size The minimum size to allocate. - * @return block A block of at least `size` bytes. + * @param s The superblock to coalesce. 
*/ - block expand_arena(std::size_t size) + void coalesce(superblock&& s) { - if (size > 0) { - upstream_blocks_.push_back({upstream_mr_->allocate(size), size}); - current_size_ += size; - return upstream_blocks_.back(); + // Find the right place (in ascending address order) to insert the block. + auto const next = superblocks_.lower_bound(s); + auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); + + // Coalesce with neighboring blocks. + bool const merge_prev = previous->is_contiguous_before(s); + bool const merge_next = next != superblocks_.cend() && s.is_contiguous_before(*next); + + if (merge_prev && merge_next) { + auto p = std::move(superblocks_.extract(previous).value()); + auto n = std::move(superblocks_.extract(next).value()); + auto merged = p.merge(std::move(s)).merge(std::move(n)); + superblocks_.insert(std::move(merged)); + } else if (merge_prev) { + auto p = std::move(superblocks_.extract(previous).value()); + auto merged = p.merge(std::move(s)); + superblocks_.insert(std::move(merged)); + } else if (merge_next) { + auto n = std::move(superblocks_.extract(next).value()); + auto merged = s.merge(std::move(n)); + superblocks_.insert(std::move(merged)); + } else { + superblocks_.insert(std::move(s)); } - return {}; } /// The upstream resource to allocate memory from. Upstream* upstream_mr_; - /// The maximum size the global arena can grow to. - std::size_t maximum_size_; - /// The current size of the global arena. - std::size_t current_size_{}; - /// Address-ordered set of free blocks. - std::set free_blocks_; - /// Blocks allocated from upstream so that they can be quickly freed. - std::vector upstream_blocks_; + /// Block allocated from upstream so that it can be quickly freed. + block upstream_block_; + /// Address-ordered set of superblocks. + std::set superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -427,7 +596,7 @@ class global_arena final { * @brief An arena for allocating memory for a thread. * * An arena is a per-thread or per-non-default-stream memory pool. It allocates - * superblocks from the global arena, and return them when the superblocks become empty. + * superblocks from the global arena, and returns them when the superblocks become empty. * * @tparam Upstream Memory resource to use for allocating the global arena. Implements * rmm::mr::device_memory_resource interface. @@ -442,43 +611,44 @@ class arena { */ explicit arena(global_arena& global_arena) : global_arena_{global_arena} {} - ~arena() = default; - // Disable copy (and move) semantics. arena(arena const&) = delete; arena& operator=(arena const&) = delete; arena(arena&&) noexcept = delete; arena& operator=(arena&&) noexcept = delete; + ~arena() = default; + /** - * @brief Allocates memory of size at least `bytes`. - * - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @brief Allocates memory of size at least `size` bytes. * - * @param bytes The size in bytes of the allocation. + * @param size The size in bytes of the allocation. * @return void* Pointer to the newly allocated memory. */ - void* allocate(std::size_t bytes) + void* allocate(std::size_t size) { + auto* ptr = global_arena_.allocate(size); + if (ptr != nullptr) { return ptr; } + lock_guard lock(mtx_); - auto const blk = get_block(bytes); - return blk.pointer(); + return get_block(size).pointer(); } /** * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. * * @param ptr Pointer to be deallocated. 
- * @param bytes The size in bytes of the allocation. This must be equal to the value of `bytes` + * @param size The size in bytes of the allocation. This must be equal to the value of `size` * that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. */ - void deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { + if (global_arena_.deallocate(ptr, size, stream)) { return true; } + lock_guard lock(mtx_); - block const blk{ptr, bytes}; - auto const merged = coalesce_block(free_blocks_, blk); - shrink_arena(merged, stream); + return deallocate_from_superblock({ptr, size}); } /** @@ -487,8 +657,7 @@ class arena { void clean() { lock_guard lock(mtx_); - global_arena_.deallocate(free_blocks_); - free_blocks_.clear(); + global_arena_.release(superblocks_); } /** @@ -498,21 +667,20 @@ class arena { */ void dump_memory_log(std::shared_ptr const& logger) const { - lock_guard lock(mtx_); - logger->info(" # free blocks: {}", free_blocks_.size()); - if (!free_blocks_.empty()) { - logger->info(" Total size of free blocks: {}", - rmm::detail::bytes{total_block_size(free_blocks_)}); - auto const largest_free = - *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{largest_free.size()}); - } + // lock_guard lock(mtx_); + // logger->info(" # free blocks: {}", free_blocks_.size()); + // if (!free_blocks_.empty()) { + // logger->info(" Total size of free blocks: {}", + // rmm::detail::bytes{total_block_size(free_blocks_)}); + // auto const largest_free = + // *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); + // logger->info(" Size of largest free block: {}", + // rmm::detail::bytes{largest_free.size()}); + // } } private: using lock_guard = std::lock_guard; - /// Maximum number of free blocks to keep. - static constexpr int max_free_blocks = 16; /** * @brief Get an available memory block of at least `size` bytes. @@ -522,51 +690,75 @@ class arena { */ block get_block(std::size_t size) { - if (size < minimum_superblock_size) { - // Find the first-fit free block. - auto const blk = first_fit(free_blocks_, size); - if (blk.is_valid()) { return blk; } - } + // Find the first-fit free block. + auto const b = first_fit(size); + if (b.is_valid()) { return b; } // No existing larger blocks available, so grow the arena and obtain a superblock. - auto const superblock = expand_arena(size); - if (superblock.is_valid()) { - coalesce_block(free_blocks_, superblock); - return first_fit(free_blocks_, size); + return expand_arena(size); + } + + /** + * @brief Get the first free block of at least `size` bytes. + * + * Address-ordered first-fit has shown to perform slightly better than best-fit when it comes to + * memory fragmentation, and slightly cheaper to implement. It is also used by some popular + * allocators such as jemalloc. + * + * \see Johnstone, M. S., & Wilson, P. R. (1998). The memory fragmentation problem: Solved?. ACM + * Sigplan Notices, 34(3), 26-36. + * + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes, or an empty block if not found. 
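   *
   * For example (illustrative), if the held superblocks contain free blocks of
   * 1 MiB, 256 KiB and 512 KiB in ascending address order, a 300 KiB request is
   * served by splitting the 1 MiB block (the first block in address order that
   * fits) rather than the 512 KiB block that a best-fit policy would pick.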
+ */ + block first_fit(std::size_t size) + { + for (auto const& s : superblocks_) { + auto const b = s.first_fit(size); + if (b.is_valid()) { return b; } } - return superblock; + return {}; } /** - * @brief Allocate space from upstream to supply the arena and return a superblock. + * @brief Deallocate a block from the superblock it belongs to. * - * @return A superblock. + * @param b The block to deallocate. + * @return true if the block is found. */ - block expand_arena(std::size_t size) + bool deallocate_from_superblock(block b) { - auto const superblock_size = std::max(size, minimum_superblock_size); - return global_arena_.allocate(superblock_size); + auto const iter = std::find_if( + superblocks_.begin(), superblocks_.end(), [b](auto& s) { return s.contains(b); }); + if (iter == superblocks_.end()) { return false; } + + auto const& s = *iter; + s.coalesce(b); + if (s.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).value())); } + return true; } /** - * @brief Shrink this arena by returning free superblocks to upstream. + * @brief Allocate space from upstream to supply the arena and return a block. * - * @param blk The block that can be used to shrink the arena. - * @param stream Stream on which to perform shrinking. + * @param size The number of bytes to allocate. + * @return block A block of memory of at least `size` bytes. */ - void shrink_arena(block const& blk, cuda_stream_view stream) + block expand_arena(std::size_t size) { - if (blk.is_superblock() || free_blocks_.size() > max_free_blocks) { - stream.synchronize_no_throw(); - global_arena_.deallocate(blk); - free_blocks_.erase(blk); + auto s = global_arena_.acquire(size); + if (s.is_valid()) { + auto const b = s.first_fit(size); + superblocks_.insert(std::move(s)); + return b; } + return {}; } /// The global arena to allocate superblocks from. global_arena& global_arena_; - /// Free blocks. - std::set free_blocks_; + /// Acquired superblocks. + std::set superblocks_; /// Mutex for exclusive lock. 
mutable std::mutex mtx_; }; diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index e6609dda0..17b001671 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -32,29 +32,6 @@ TEST(ArenaTest, NullUpstream) EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); } -TEST(ArenaTest, UnalignedInitialSize) -{ - EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 255); }(), - rmm::logic_error); -} - -TEST(ArenaTest, UnalignedMaximumSize) -{ - EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 256, 257); }(), - rmm::logic_error); -} - -TEST(ArenaTest, MaxLessThanInitial) -{ - EXPECT_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 512, 256); }(), - rmm::logic_error); -} - -TEST(ArenaTest, MaxEqualToInitial) -{ - EXPECT_NO_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource(), 512, 512); }()); -} - TEST(ArenaTest, AllocateNinetyPercent) { EXPECT_NO_THROW([]() { From 5da4b59c6e84fd2608a22012c67e16a07b35f56c Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 9 Nov 2021 12:23:21 -0800 Subject: [PATCH 03/35] fix segfault --- .../rmm/mr/device/arena_memory_resource.hpp | 18 ++-- include/rmm/mr/device/detail/arena.hpp | 100 ++++++++---------- 2 files changed, 55 insertions(+), 63 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index bfd4993dd..b099a4c2a 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -78,7 +78,7 @@ class arena_memory_resource final : public device_memory_resource { * * @throws rmm::logic_error if `upstream_mr == nullptr`. * - * @param upstream_mr The memory resource from which to allocate blocks for the pool + * @param upstream_mr The memory resource from which to allocate blocks for the pool. * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on * the current device. */ @@ -118,8 +118,8 @@ class arena_memory_resource final : public device_memory_resource { private: using global_arena = rmm::mr::detail::arena::global_arena; using arena = rmm::mr::detail::arena::arena; - using read_lock = std::shared_lock; - using write_lock = std::lock_guard; + using read_lock = std::shared_lock; + using write_lock = std::unique_lock; /** * @brief Allocates memory of size at least `bytes`. @@ -183,17 +183,17 @@ class arena_memory_resource final : public device_memory_resource { { stream.synchronize_no_throw(); - read_lock lock(mtx_); + write_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); - for (auto& kv : thread_arenas_) { + for (auto&& kv : thread_arenas_) { // If the arena does not belong to the current thread, try to deallocate from it, and return // if successful. if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; } } } else { - for (auto& kv : stream_arenas_) { + for (auto&& kv : stream_arenas_) { // If the arena does not belong to the current stream, try to deallocate from it, and return // if successful. 
if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; } @@ -211,10 +211,10 @@ class arena_memory_resource final : public device_memory_resource { void defragment() { RMM_CUDA_TRY(cudaDeviceSynchronize()); - for (auto& thread_arena : thread_arenas_) { + for (auto&& thread_arena : thread_arenas_) { thread_arena.second->clean(); } - for (auto& stream_arena : stream_arenas_) { + for (auto&& stream_arena : stream_arenas_) { stream_arena.second.clean(); } } @@ -334,7 +334,7 @@ class arena_memory_resource final : public device_memory_resource { /// The logger for memory dump. std::shared_ptr logger_{}; /// Mutex for read and write locks. - mutable std::shared_timed_mutex mtx_; + mutable std::shared_mutex mtx_; }; } // namespace rmm::mr diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 847b9dd77..515d77c16 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -174,7 +174,7 @@ class superblock final : public memory_span { superblock(superblock const&) = delete; superblock& operator=(superblock const&) = delete; // Allow move semantics. - superblock(superblock&& s) noexcept = default; + superblock(superblock&& sb) noexcept = default; superblock& operator=(superblock&&) noexcept = default; ~superblock() = default; @@ -220,10 +220,10 @@ class superblock final : public memory_span { * @return true Returns true if both superblocks are empty and this superblock's * `pointer` + `size` == `s.ptr`. */ - [[nodiscard]] bool is_contiguous_before(superblock const& s) const + [[nodiscard]] bool is_contiguous_before(superblock const& sb) const { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return empty() && s.empty() && pointer() + size() == s.pointer(); + return empty() && sb.empty() && pointer() + size() == sb.pointer(); } /** @@ -247,10 +247,10 @@ class superblock final : public memory_span { * @param s superblock to merge. * @return block The merged block. */ - [[nodiscard]] superblock merge(superblock const& s) const + [[nodiscard]] superblock merge(superblock const& sb) const { - RMM_LOGGING_ASSERT(is_contiguous_before(s)); - return {pointer(), size() + s.size()}; + RMM_LOGGING_ASSERT(is_contiguous_before(sb)); + return {pointer(), size() + sb.size()}; } /** @@ -333,8 +333,8 @@ class global_arena final { * @throws rmm::logic_error if `upstream_mr == nullptr`. * * @param upstream_mr The memory resource from which to allocate blocks for the pool - * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on - * the current device. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. */ global_arena(Upstream* upstream_mr, std::optional arena_size) : upstream_mr_{upstream_mr} @@ -377,10 +377,10 @@ class global_arena final { * * @param s Superblock to be released. 
*/ - void release(superblock&& s) + void release(superblock&& sb) { lock_guard lock(mtx_); - coalesce(std::move(s)); + coalesce(std::move(sb)); } /** @@ -391,11 +391,8 @@ class global_arena final { void release(std::set& superblocks) { lock_guard lock(mtx_); - auto iter = superblocks.cbegin(); - while (iter != superblocks.cend()) { - auto s = std::move(superblocks.extract(iter).value()); - coalesce(std::move(s)); - ++iter; + while (!superblocks.empty()) { + coalesce(std::move(superblocks.extract(superblocks.cbegin()).value())); } } @@ -429,8 +426,7 @@ class global_arena final { stream.synchronize_no_throw(); lock_guard lock(mtx_); - superblock s{ptr, size}; - coalesce(std::move(s)); + coalesce({ptr, size}); return true; } return false; @@ -450,7 +446,7 @@ class global_arena final { block const b{ptr, bytes}; auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [b](auto const& s) { return s.contains(b); }); + superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); if (iter == superblocks_.cend()) { RMM_FAIL("allocation not found"); } iter->coalesce(b); } @@ -485,9 +481,6 @@ class global_arena final { private: using lock_guard = std::lock_guard; - /// Reserved memory that should not be allocated (64 MiB). - static constexpr std::size_t reserved_size = 1U << 26U; - /** * @brief Default size of the global arena if unspecified. * @return the default global arena size. @@ -495,7 +488,7 @@ class global_arena final { constexpr std::size_t default_size() const { auto const [free, total] = rmm::detail::available_device_memory(); - return free - reserved_size; + return free / 2; } /** @@ -534,51 +527,50 @@ class global_arena final { superblock first_fit(std::size_t size) { auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [size](auto const& s) { return s.fits(size); }); + superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); if (iter == superblocks_.cend()) { return {}; } - auto node_handle = superblocks_.extract(iter); - auto s = std::move(node_handle.value()); - auto const sz = std::max(size, superblock::minimum_size); - if (s.empty() && s.size() - sz >= superblock::minimum_size) { + auto sb = std::move(superblocks_.extract(iter).value()); + auto const sz = std::max(size, superblock::minimum_size); + if (sb.empty() && sb.size() - sz >= superblock::minimum_size) { // Split the superblock and put the remainder back. - auto [head, tail] = s.split(sz); + auto [head, tail] = sb.split(sz); superblocks_.insert(std::move(tail)); return std::move(head); } - return s; + return sb; } /** * @brief Coalesce the given superblock with other empty superblocks. * - * @param s The superblock to coalesce. + * @param sb The superblock to coalesce. */ - void coalesce(superblock&& s) + void coalesce(superblock&& sb) { // Find the right place (in ascending address order) to insert the block. - auto const next = superblocks_.lower_bound(s); + auto const next = superblocks_.lower_bound(sb); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); // Coalesce with neighboring blocks. 
- bool const merge_prev = previous->is_contiguous_before(s); - bool const merge_next = next != superblocks_.cend() && s.is_contiguous_before(*next); + bool const merge_prev = previous->is_contiguous_before(sb); + bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(*next); if (merge_prev && merge_next) { - auto p = std::move(superblocks_.extract(previous).value()); - auto n = std::move(superblocks_.extract(next).value()); - auto merged = p.merge(std::move(s)).merge(std::move(n)); - superblocks_.insert(std::move(merged)); + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = prev_sb.merge(sb).merge(next_sb); + superblocks_.emplace(std::move(merged)); } else if (merge_prev) { - auto p = std::move(superblocks_.extract(previous).value()); - auto merged = p.merge(std::move(s)); - superblocks_.insert(std::move(merged)); + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto merged = prev_sb.merge(sb); + superblocks_.emplace(std::move(merged)); } else if (merge_next) { - auto n = std::move(superblocks_.extract(next).value()); - auto merged = s.merge(std::move(n)); - superblocks_.insert(std::move(merged)); + auto next_sb = std::move(superblocks_.extract(next).value()); + auto merged = sb.merge(next_sb); + superblocks_.emplace(std::move(merged)); } else { - superblocks_.insert(std::move(s)); + superblocks_.emplace(std::move(sb)); } } @@ -726,15 +718,15 @@ class arena { * @param b The block to deallocate. * @return true if the block is found. */ - bool deallocate_from_superblock(block b) + bool deallocate_from_superblock(block const& b) { auto const iter = std::find_if( - superblocks_.begin(), superblocks_.end(), [b](auto& s) { return s.contains(b); }); + superblocks_.begin(), superblocks_.end(), [&](auto const& sb) { return sb.contains(b); }); if (iter == superblocks_.end()) { return false; } - auto const& s = *iter; - s.coalesce(b); - if (s.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).value())); } + auto const& sb = *iter; + sb.coalesce(b); + if (sb.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).value())); } return true; } @@ -746,10 +738,10 @@ class arena { */ block expand_arena(std::size_t size) { - auto s = global_arena_.acquire(size); - if (s.is_valid()) { - auto const b = s.first_fit(size); - superblocks_.insert(std::move(s)); + auto sb = global_arena_.acquire(size); + if (sb.is_valid()) { + auto const b = sb.first_fit(size); + superblocks_.emplace(std::move(sb)); return b; } return {}; From 10ed42c666dcbfd4c5dc8165cd4a2ecdf9105ba9 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 9 Nov 2021 13:44:43 -0800 Subject: [PATCH 04/35] add back memory dump --- .../rmm/mr/device/arena_memory_resource.hpp | 45 +++++----- include/rmm/mr/device/detail/arena.hpp | 88 +++++++++++-------- 2 files changed, 71 insertions(+), 62 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b099a4c2a..6bbbbecb5 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -92,6 +92,24 @@ class arena_memory_resource final : public device_memory_resource { } } + /** + * @brief Construct an `arena_memory_resource`. + * + * @throws rmm::logic_error if `upstream_mr == nullptr`. + * + * @param upstream_mr The memory resource from which to allocate blocks for the pool. 
+ * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on + * the current device. + * @param max_size Unused. + * @deprecated Use the version without the max size. + */ + arena_memory_resource(Upstream* upstream_mr, + std::optional arena_size, + std::optional max_size) + : arena_memory_resource{upstream_mr, arena_size, false} + { + } + ~arena_memory_resource() override = default; // Disable copy (and move) semantics. @@ -141,13 +159,8 @@ class arena_memory_resource final : public device_memory_resource { void* pointer = arena.allocate(bytes); if (pointer == nullptr) { - write_lock lock(mtx_); - defragment(); - pointer = arena.allocate(bytes); - if (pointer == nullptr) { - if (dump_log_on_failure_) { dump_memory_log(bytes); } - RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); - } + if (dump_log_on_failure_) { dump_memory_log(bytes); } + RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } return pointer; @@ -183,7 +196,7 @@ class arena_memory_resource final : public device_memory_resource { { stream.synchronize_no_throw(); - write_lock lock(mtx_); + read_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); @@ -205,20 +218,6 @@ class arena_memory_resource final : public device_memory_resource { global_arena_.deallocate_from_other_arena(ptr, bytes); } - /** - * @brief Defragment memory by returning all free blocks to the global arena. - */ - void defragment() - { - RMM_CUDA_TRY(cudaDeviceSynchronize()); - for (auto&& thread_arena : thread_arenas_) { - thread_arena.second->clean(); - } - for (auto&& stream_arena : stream_arenas_) { - stream_arena.second.clean(); - } - } - /** * @brief Get the arena associated with the current thread or the given stream. * @@ -330,7 +329,7 @@ class arena_memory_resource final : public device_memory_resource { /// Implementation note: for small sizes, map is more efficient than unordered_map. std::map stream_arenas_; /// If true, dump memory information to log on allocation failure. - bool dump_log_on_failure_; + bool dump_log_on_failure_{}; /// The logger for memory dump. std::shared_ptr logger_{}; /// Mutex for read and write locks. diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 515d77c16..3c025c50f 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -75,6 +75,16 @@ class memory_span { std::size_t size_{}; ///< Size in bytes. }; +/// Calculate the total size of a collection of memory spans. +template +inline auto total_memory_size(std::set const& spans) +{ + return std::accumulate( + spans.cbegin(), spans.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.size(); + }); +} + /** * @brief Represents a chunk of memory that can be allocated and deallocated. */ @@ -131,18 +141,9 @@ class block final : public memory_span { }; /// Comparison function for block sizes. -struct block_size_compare { - bool operator()(block const& lhs, block const& rhs) const { return lhs.size() < rhs.size(); } -}; - -/// Calculate the total size of a collection of blocks. 
-template -inline auto total_block_size(T const& blocks) +inline bool block_size_compare(block const& lhs, block const& rhs) { - return std::accumulate( - blocks.cbegin(), blocks.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { - return lhs + rhs.size(); - }); + return lhs.size() < rhs.size(); } /** @@ -311,11 +312,30 @@ class superblock final : public memory_span { } } + /** + * @brief Find the max free block. + * @return the max free block. + */ + block max_free() const + { + return *std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare); + } + private: /// Address-ordered set of free blocks. mutable std::set free_blocks_{}; }; +/// Find the max free size from a set of superblocks. +inline auto max_free(std::set const& superblocks) +{ + std::size_t size{}; + for (auto const& sb : superblocks) { + size = std::max(size, sb.max_free().size()); + } + return size; +}; + /** * @brief The global arena for allocating memory from the upstream memory resource. * @@ -458,24 +478,16 @@ class global_arena final { */ void dump_memory_log(std::shared_ptr const& logger) const { - // lock_guard lock(mtx_); - // - // logger->info(" Maximum size: {}", rmm::detail::bytes{maximum_size_}); - // logger->info(" Current size: {}", rmm::detail::bytes{current_size_}); - // - // logger->info(" # free blocks: {}", free_blocks_.size()); - // if (!free_blocks_.empty()) { - // logger->info(" Total size of free blocks: {}", - // rmm::detail::bytes{total_block_size(free_blocks_)}); - // auto const largest_free = - // *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - // logger->info(" Size of largest free block: {}", - // rmm::detail::bytes{largest_free.size()}); - // } - // - // logger->info(" # upstream blocks={}", upstream_blocks_.size()); - // logger->info(" Total size of upstream blocks: {}", - // rmm::detail::bytes{total_block_size(upstream_blocks_)}); + lock_guard lock(mtx_); + + logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); + + logger->info(" # superblocks: {}", superblocks_.size()); + if (!superblocks_.empty()) { + logger->info(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); + } } private: @@ -659,16 +671,14 @@ class arena { */ void dump_memory_log(std::shared_ptr const& logger) const { - // lock_guard lock(mtx_); - // logger->info(" # free blocks: {}", free_blocks_.size()); - // if (!free_blocks_.empty()) { - // logger->info(" Total size of free blocks: {}", - // rmm::detail::bytes{total_block_size(free_blocks_)}); - // auto const largest_free = - // *std::max_element(free_blocks_.begin(), free_blocks_.end(), block_size_compare); - // logger->info(" Size of largest free block: {}", - // rmm::detail::bytes{largest_free.size()}); - // } + lock_guard lock(mtx_); + logger->info(" # superblocks: {}", superblocks_.size()); + if (!superblocks_.empty()) { + logger->info(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + logger->info(" Size of largest free block: {}", + rmm::detail::bytes{max_free(superblocks_)}); + } } private: From 3f5bf1e02e351e736e59221d19beb0644b5e8f11 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 10 Nov 2021 12:25:20 -0800 Subject: [PATCH 05/35] switch to map for superblocks --- include/rmm/mr/device/detail/arena.hpp | 93 ++++++++++++++------------ 1 file changed, 49 insertions(+), 44 deletions(-) diff --git 
a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 3c025c50f..ce7bc9733 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -30,9 +30,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -75,13 +77,13 @@ class memory_span { std::size_t size_{}; ///< Size in bytes. }; -/// Calculate the total size of a collection of memory spans. +/// Calculate the total size of a map of memory spans. template -inline auto total_memory_size(std::set const& spans) +inline auto total_memory_size(std::map const& spans) { return std::accumulate( spans.cbegin(), spans.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { - return lhs + rhs.size(); + return lhs + rhs.second.size(); }); } @@ -260,7 +262,7 @@ class superblock final : public memory_span { * @param size The number of bytes to allocate. * @return block A block of memory of at least `size` bytes, or an empty block if not found. */ - block first_fit(std::size_t size) const + block first_fit(std::size_t size) { auto const iter = std::find_if( free_blocks_.cbegin(), free_blocks_.cend(), [size](auto const& b) { return b.fits(size); }); @@ -284,7 +286,7 @@ class superblock final : public memory_span { * * @param b The block to coalesce. */ - void coalesce(block const& b) const + void coalesce(block const& b) { // Find the right place (in ascending address order) to insert the block. auto const next = free_blocks_.lower_bound(b); @@ -316,22 +318,22 @@ class superblock final : public memory_span { * @brief Find the max free block. * @return the max free block. */ - block max_free() const + [[nodiscard]] block max_free() const { return *std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare); } private: /// Address-ordered set of free blocks. - mutable std::set free_blocks_{}; + std::set free_blocks_{}; }; -/// Find the max free size from a set of superblocks. -inline auto max_free(std::set const& superblocks) +/// Find the max free size from a map of superblocks. +inline auto max_free(std::map const& superblocks) { std::size_t size{}; - for (auto const& sb : superblocks) { - size = std::max(size, sb.max_free().size()); + for (auto const& kv : superblocks) { + size = std::max(size, kv.second.max_free().size()); } return size; }; @@ -408,11 +410,11 @@ class global_arena final { * * @param superblocks The set of superblocks. 
*/ - void release(std::set& superblocks) + void release(std::map& superblocks) { lock_guard lock(mtx_); while (!superblocks.empty()) { - coalesce(std::move(superblocks.extract(superblocks.cbegin()).value())); + coalesce(std::move(superblocks.extract(superblocks.cbegin()).mapped())); } } @@ -465,10 +467,11 @@ class global_arena final { lock_guard lock(mtx_); block const b{ptr, bytes}; - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); - if (iter == superblocks_.cend()) { RMM_FAIL("allocation not found"); } - iter->coalesce(b); + auto iter = std::find_if(superblocks_.begin(), superblocks_.end(), [&](auto const& kv) { + return kv.second.contains(b); + }); + if (iter == superblocks_.end()) { RMM_FAIL("allocation not found"); } + iter->second.coalesce(b); } /** @@ -512,7 +515,7 @@ class global_arena final { { RMM_LOGGING_ASSERT(size >= superblock::minimum_size); upstream_block_ = {upstream_mr_->allocate(size), size}; - superblocks_.emplace(upstream_block_.pointer(), size); + superblocks_.try_emplace(upstream_block_.pointer(), upstream_block_.pointer(), size); } /** @@ -521,7 +524,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) { return size > superblock::minimum_size / 2; } + bool handles(std::size_t size) const { return size > superblock::minimum_size / 2; } /** * @brief Get the first superblock that can fit a block of at least `size` bytes. @@ -538,16 +541,17 @@ class global_arena final { */ superblock first_fit(std::size_t size) { - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [size](auto const& kv) { return kv.second.fits(size); }); if (iter == superblocks_.cend()) { return {}; } - auto sb = std::move(superblocks_.extract(iter).value()); + auto sb = std::move(superblocks_.extract(iter).mapped()); auto const sz = std::max(size, superblock::minimum_size); if (sb.empty() && sb.size() - sz >= superblock::minimum_size) { // Split the superblock and put the remainder back. auto [head, tail] = sb.split(sz); - superblocks_.insert(std::move(tail)); + superblocks_.try_emplace(tail.pointer(), std::move(tail)); return std::move(head); } return sb; @@ -561,28 +565,28 @@ class global_arena final { void coalesce(superblock&& sb) { // Find the right place (in ascending address order) to insert the block. - auto const next = superblocks_.lower_bound(sb); + auto const next = superblocks_.lower_bound(sb.pointer()); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); // Coalesce with neighboring blocks. 
- bool const merge_prev = previous->is_contiguous_before(sb); - bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(*next); + bool const merge_prev = previous->second.is_contiguous_before(sb); + bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(next->second); if (merge_prev && merge_next) { - auto prev_sb = std::move(superblocks_.extract(previous).value()); - auto next_sb = std::move(superblocks_.extract(next).value()); + auto prev_sb = std::move(superblocks_.extract(previous).mapped()); + auto next_sb = std::move(superblocks_.extract(next).mapped()); auto merged = prev_sb.merge(sb).merge(next_sb); - superblocks_.emplace(std::move(merged)); + superblocks_.try_emplace(merged.pointer(), std::move(merged)); } else if (merge_prev) { - auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto prev_sb = std::move(superblocks_.extract(previous).mapped()); auto merged = prev_sb.merge(sb); - superblocks_.emplace(std::move(merged)); + superblocks_.try_emplace(merged.pointer(), std::move(merged)); } else if (merge_next) { - auto next_sb = std::move(superblocks_.extract(next).value()); + auto next_sb = std::move(superblocks_.extract(next).mapped()); auto merged = sb.merge(next_sb); - superblocks_.emplace(std::move(merged)); + superblocks_.try_emplace(merged.pointer(), std::move(merged)); } else { - superblocks_.emplace(std::move(sb)); + superblocks_.try_emplace(sb.pointer(), std::move(sb)); } } @@ -590,8 +594,8 @@ class global_arena final { Upstream* upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; - /// Address-ordered set of superblocks. - std::set superblocks_; + /// Address-ordered map of superblocks. + std::map superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -715,8 +719,8 @@ class arena { */ block first_fit(std::size_t size) { - for (auto const& s : superblocks_) { - auto const b = s.first_fit(size); + for (auto&& kv : superblocks_) { + auto const b = kv.second.first_fit(size); if (b.is_valid()) { return b; } } return {}; @@ -730,13 +734,14 @@ class arena { */ bool deallocate_from_superblock(block const& b) { - auto const iter = std::find_if( - superblocks_.begin(), superblocks_.end(), [&](auto const& sb) { return sb.contains(b); }); + auto iter = std::find_if(superblocks_.begin(), superblocks_.end(), [&](auto const& kv) { + return kv.second.contains(b); + }); if (iter == superblocks_.end()) { return false; } - auto const& sb = *iter; + auto& sb = iter->second; sb.coalesce(b); - if (sb.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).value())); } + if (sb.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).mapped())); } return true; } @@ -751,7 +756,7 @@ class arena { auto sb = global_arena_.acquire(size); if (sb.is_valid()) { auto const b = sb.first_fit(size); - superblocks_.emplace(std::move(sb)); + superblocks_.try_emplace(sb.pointer(), std::move(sb)); return b; } return {}; @@ -760,7 +765,7 @@ class arena { /// The global arena to allocate superblocks from. global_arena& global_arena_; /// Acquired superblocks. - std::set superblocks_; + std::map superblocks_; /// Mutex for exclusive lock. 
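[Editor's sketch, not part of the patch] The coalescing hunks above are easier to follow in isolation. Below is a minimal, self-contained sketch of the same address-ordered scheme: spans are kept in a std::map keyed by their start pointer, neighbours are located with lower_bound, and map node handles (extract().mapped()) splice entries out before the merged result is re-inserted, mirroring how this series handles the move-only superblock. The `span` struct, the free `coalesce` function, and the byte sizes are illustrative stand-ins only.

  #include <cstddef>
  #include <iterator>
  #include <map>

  struct span {
    char* ptr{};
    std::size_t size{};
    bool is_contiguous_before(span const& other) const { return ptr + size == other.ptr; }
    span merge(span const& other) const { return {ptr, size + other.size}; }
  };

  // Insert `sp` into the address-ordered map, merging with adjacent free spans.
  void coalesce(std::map<char*, span>& spans, span sp)
  {
    auto const next     = spans.lower_bound(sp.ptr);
    auto const previous = next == spans.cbegin() ? next : std::prev(next);

    bool const merge_prev = previous != next && previous->second.is_contiguous_before(sp);
    bool const merge_next = next != spans.cend() && sp.is_contiguous_before(next->second);

    // Extract neighbours via node handles and fold them into `sp` before re-inserting.
    if (merge_prev) { sp = spans.extract(previous).mapped().merge(sp); }
    if (merge_next) { sp = sp.merge(spans.extract(next).mapped()); }
    spans.try_emplace(sp.ptr, sp);
  }

  int main()
  {
    std::map<char*, span> spans;
    char backing[192]{};
    coalesce(spans, {backing, 64});        // first span, nothing to merge with
    coalesce(spans, {backing + 128, 64});  // not adjacent, stays separate
    coalesce(spans, {backing + 64, 64});   // bridges both neighbours into one 192-byte span
    return spans.size() == 1 ? 0 : 1;
  }

Keying the map by the start pointer is what makes both neighbour checks O(log n): lower_bound finds the first span at or after the incoming one, and its predecessor is the only candidate for a merge on the left.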
mutable std::mutex mtx_; }; From d33b9a0af8377bcd62c3ecaaee7ba38aefe4b70d Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 11 Nov 2021 09:30:53 -0800 Subject: [PATCH 06/35] add some tests --- include/rmm/mr/device/detail/arena.hpp | 4 +- tests/mr/device/arena_mr_tests.cpp | 132 +++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 2 deletions(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index ce7bc9733..877073d83 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -70,7 +70,7 @@ class memory_span { [[nodiscard]] bool is_valid() const { return pointer_ != nullptr; } /// Used by std::set to compare spans. - bool operator<(memory_span const& s) const { return pointer_ < s.pointer_; } + bool operator<(memory_span const& ms) const { return pointer_ < ms.pointer_; } private: char* pointer_{}; ///< Raw memory pointer. @@ -247,7 +247,7 @@ class superblock final : public memory_span { * * `this->is_contiguous_before(s)` must be true. * - * @param s superblock to merge. + * @param sb superblock to merge. * @return block The merged block. */ [[nodiscard]] superblock merge(superblock const& sb) const diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 17b001671..c7ddf2f6f 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -25,8 +25,140 @@ namespace rmm::test { namespace { + +using memory_span = rmm::mr::detail::arena::memory_span; +using block = rmm::mr::detail::arena::block; +using superblock = rmm::mr::detail::arena::superblock; using arena_mr = rmm::mr::arena_memory_resource; +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +auto const fake_address = reinterpret_cast(1L << 10L); +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +auto const fake_address2 = reinterpret_cast(1L << 11L); +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +auto const fake_address3 = reinterpret_cast(1L << 22L); +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +auto const fake_address4 = reinterpret_cast(1L << 23L); + +TEST(ArenaTest, MemorySpan) +{ + memory_span const ms{}; + EXPECT_FALSE(ms.is_valid()); + memory_span const ms2{fake_address, 256}; + EXPECT_TRUE(ms2.is_valid()); +} + +TEST(ArenaTest, BlockFits) +{ + block const b{fake_address, 1024}; + EXPECT_TRUE(b.fits(1024)); + EXPECT_FALSE(b.fits(1025)); +} + +TEST(ArenaTest, BlockIsContiguousBefore) +{ + block const b{fake_address, 1024}; + block const b2{fake_address2, 256}; + EXPECT_TRUE(b.is_contiguous_before(b2)); + block const b3{fake_address, 512}; + block const b4{fake_address2, 1024}; + EXPECT_FALSE(b3.is_contiguous_before(b4)); +} + +TEST(ArenaTest, BlockSplit) +{ + block const b{fake_address, 2048}; + auto const [head, tail] = b.split(1024); + EXPECT_EQ(head.pointer(), fake_address); + EXPECT_EQ(head.size(), 1024); + EXPECT_EQ(tail.pointer(), fake_address2); + EXPECT_EQ(tail.size(), 1024); +} + +TEST(ArenaTest, BlockMerge) +{ + block const b{fake_address, 1024}; + block const b2{fake_address2, 1024}; + auto const merged = b.merge(b2); + EXPECT_EQ(merged.pointer(), fake_address); + EXPECT_EQ(merged.size(), 2048); +} + +TEST(ArenaTest, SuperblockEmpty) +{ + superblock sb{fake_address3, 4194304}; + EXPECT_TRUE(sb.empty()); + sb.first_fit(256); + EXPECT_FALSE(sb.empty()); +} + +TEST(ArenaTest, SuperblockContains) +{ + superblock const sb{fake_address3, 4194304}; + block const b{fake_address, 2048}; + 
EXPECT_FALSE(sb.contains(b)); + block const b2{fake_address3, 1024}; + EXPECT_TRUE(sb.contains(b2)); + block const b3{fake_address3, 4194305}; + EXPECT_FALSE(sb.contains(b3)); + block const b4{fake_address3, 4194304}; + EXPECT_TRUE(sb.contains(b4)); + block const b5{fake_address4, 256}; + EXPECT_FALSE(sb.contains(b5)); +} + +TEST(ArenaTest, SuperblockFits) +{ + superblock sb{fake_address3, 4194304}; + EXPECT_TRUE(sb.fits(4194304)); + EXPECT_FALSE(sb.fits(4194305)); + + auto const b = sb.first_fit(1048576); + sb.first_fit(1048576); + sb.coalesce(b); + EXPECT_TRUE(sb.fits(2097152)); + EXPECT_FALSE(sb.fits(2097153)); +} + +TEST(ArenaTest, SuperblockIsContiguousBefore) +{ + superblock sb{fake_address3, 4194304}; + superblock sb2{fake_address4, 4194304}; + EXPECT_TRUE(sb.is_contiguous_before(sb2)); + + auto const b = sb.first_fit(256); + EXPECT_FALSE(sb.is_contiguous_before(sb2)); + sb.coalesce(b); + EXPECT_TRUE(sb.is_contiguous_before(sb2)); + + auto const b2 = sb2.first_fit(1024); + EXPECT_FALSE(sb.is_contiguous_before(sb2)); + sb2.coalesce(b2); + EXPECT_TRUE(sb.is_contiguous_before(sb2)); +} + +TEST(ArenaTest, SuperblockSplit) +{ + superblock sb{fake_address3, 8388608}; + auto const [head, tail] = sb.split(4194304); + EXPECT_EQ(head.pointer(), fake_address3); + EXPECT_EQ(head.size(), 4194304); + EXPECT_TRUE(head.empty()); + EXPECT_EQ(tail.pointer(), fake_address4); + EXPECT_EQ(tail.size(), 4194304); + EXPECT_TRUE(tail.empty()); +} + +TEST(ArenaTest, SuperblockMerge) +{ + superblock sb{fake_address3, 4194304}; + superblock sb2{fake_address4, 4194304}; + auto const merged = sb.merge(sb2); + EXPECT_EQ(merged.pointer(), fake_address3); + EXPECT_EQ(merged.size(), 8388608); + EXPECT_TRUE(merged.empty()); +} + TEST(ArenaTest, NullUpstream) { EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); From b4a1d6a9aa597899c6a8403bf6576fbeee94f26c Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 11 Nov 2021 14:58:40 -0800 Subject: [PATCH 07/35] add more tests --- tests/mr/device/arena_mr_tests.cpp | 98 ++++++++++++++++++++++++++---- 1 file changed, 87 insertions(+), 11 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index c7ddf2f6f..2a8d5d5fc 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -27,19 +27,20 @@ namespace rmm::test { namespace { using memory_span = rmm::mr::detail::arena::memory_span; -using block = rmm::mr::detail::arena::block; -using superblock = rmm::mr::detail::arena::superblock; -using arena_mr = rmm::mr::arena_memory_resource; +using block = rmm::mr::detail::arena::block; +using superblock = rmm::mr::detail::arena::superblock; +using arena_mr = rmm::mr::arena_memory_resource; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address = reinterpret_cast(1L << 10L); +auto const fake_address = reinterpret_cast(1024L); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address2 = reinterpret_cast(1L << 11L); +auto const fake_address2 = reinterpret_cast(2048L); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address3 = reinterpret_cast(1L << 22L); +auto const fake_address3 = reinterpret_cast(4194304L); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address4 = reinterpret_cast(1L << 23L); +auto const fake_address4 = reinterpret_cast(8388608L); +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, MemorySpan) { memory_span const ms{}; @@ -48,6 
+49,7 @@ TEST(ArenaTest, MemorySpan) EXPECT_TRUE(ms2.is_valid()); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, BlockFits) { block const b{fake_address, 1024}; @@ -55,6 +57,7 @@ TEST(ArenaTest, BlockFits) EXPECT_FALSE(b.fits(1025)); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, BlockIsContiguousBefore) { block const b{fake_address, 1024}; @@ -65,6 +68,7 @@ TEST(ArenaTest, BlockIsContiguousBefore) EXPECT_FALSE(b3.is_contiguous_before(b4)); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, BlockSplit) { block const b{fake_address, 2048}; @@ -75,6 +79,7 @@ TEST(ArenaTest, BlockSplit) EXPECT_EQ(tail.size(), 1024); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, BlockMerge) { block const b{fake_address, 1024}; @@ -84,6 +89,7 @@ TEST(ArenaTest, BlockMerge) EXPECT_EQ(merged.size(), 2048); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockEmpty) { superblock sb{fake_address3, 4194304}; @@ -92,6 +98,7 @@ TEST(ArenaTest, SuperblockEmpty) EXPECT_FALSE(sb.empty()); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockContains) { superblock const sb{fake_address3, 4194304}; @@ -107,6 +114,7 @@ TEST(ArenaTest, SuperblockContains) EXPECT_FALSE(sb.contains(b5)); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockFits) { superblock sb{fake_address3, 4194304}; @@ -120,6 +128,7 @@ TEST(ArenaTest, SuperblockFits) EXPECT_FALSE(sb.fits(2097153)); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockIsContiguousBefore) { superblock sb{fake_address3, 4194304}; @@ -137,6 +146,7 @@ TEST(ArenaTest, SuperblockIsContiguousBefore) EXPECT_TRUE(sb.is_contiguous_before(sb2)); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockSplit) { superblock sb{fake_address3, 8388608}; @@ -149,6 +159,7 @@ TEST(ArenaTest, SuperblockSplit) EXPECT_TRUE(tail.empty()); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SuperblockMerge) { superblock sb{fake_address3, 4194304}; @@ -159,13 +170,76 @@ TEST(ArenaTest, SuperblockMerge) EXPECT_TRUE(merged.empty()); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +TEST(ArenaTest, SuperblockFirstFit) +{ + superblock sb{fake_address3, 4194304}; + auto const b = sb.first_fit(1024); + EXPECT_EQ(b.pointer(), fake_address3); + EXPECT_EQ(b.size(), 1024); + auto const b2 = sb.first_fit(2048); + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + EXPECT_EQ(b2.pointer(), static_cast(fake_address3) + 1024); + EXPECT_EQ(b2.size(), 2048); + sb.coalesce(b); + auto const b3 = sb.first_fit(512); + EXPECT_EQ(b3.pointer(), fake_address3); + EXPECT_EQ(b3.size(), 512); +} + +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +TEST(ArenaTest, SuperblockCoalesceMergeNext) +{ + superblock sb{fake_address3, 4194304}; + auto const b = sb.first_fit(2097152); + sb.coalesce(b); + EXPECT_TRUE(sb.first_fit(4194304).is_valid()); +} + +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +TEST(ArenaTest, SuperblockCoalesceMergePrevious) +{ + superblock sb{fake_address3, 4194304}; + auto const b = sb.first_fit(1024); + auto const b2 = sb.first_fit(1024); + sb.first_fit(1024); + sb.coalesce(b); + sb.coalesce(b2); + auto const b3 = sb.first_fit(2048); + EXPECT_EQ(b3.pointer(), fake_address3); +} + +// 
NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +TEST(ArenaTest, SuperblockCoalesceMergePreviousAndNext) +{ + superblock sb{fake_address3, 4194304}; + auto const b = sb.first_fit(1024); + auto const b2 = sb.first_fit(1024); + sb.coalesce(b); + sb.coalesce(b2); + EXPECT_TRUE(sb.first_fit(4194304).is_valid()); +} + +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) +TEST(ArenaTest, SuperblockMaxFree) +{ + superblock sb{fake_address3, 4194304}; + sb.first_fit(2097152); + auto const b = sb.max_free(); + EXPECT_EQ(b.size(), 2097152); +} + +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, NullUpstream) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, AllocateNinetyPercent) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_NO_THROW([]() { auto const free = rmm::detail::available_device_memory().first; auto const ninety_percent = @@ -174,17 +248,19 @@ TEST(ArenaTest, AllocateNinetyPercent) }()); } +// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) TEST(ArenaTest, SmallMediumLarge) { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_NO_THROW([]() { arena_mr mr(rmm::mr::get_current_device_resource()); - auto* small = mr.allocate(256); - auto* medium = mr.allocate(1U << 26U); + auto* small = mr.allocate(256); + auto* medium = mr.allocate(1U << 26U); auto const free = rmm::detail::available_device_memory().first; - auto* large = mr.allocate(free / 2); + auto* large = mr.allocate(free / 3); mr.deallocate(small, 256); mr.deallocate(medium, 1U << 26U); - mr.deallocate(large, free / 4); + mr.deallocate(large, free / 3); }()); } From d86d6b19675aac43604ed0d2d52c136f1ec9e09e Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 11 Nov 2021 15:13:35 -0800 Subject: [PATCH 08/35] fix clang tidy warnings in test --- tests/mr/device/arena_mr_tests.cpp | 63 ++++++++++-------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 2a8d5d5fc..96baee083 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -40,8 +40,7 @@ auto const fake_address3 = reinterpret_cast(4194304L); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const fake_address4 = reinterpret_cast(8388608L); -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, MemorySpan) +TEST(ArenaTest, MemorySpan) // NOLINT { memory_span const ms{}; EXPECT_FALSE(ms.is_valid()); @@ -49,16 +48,14 @@ TEST(ArenaTest, MemorySpan) EXPECT_TRUE(ms2.is_valid()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, BlockFits) +TEST(ArenaTest, BlockFits) // NOLINT { block const b{fake_address, 1024}; EXPECT_TRUE(b.fits(1024)); EXPECT_FALSE(b.fits(1025)); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, BlockIsContiguousBefore) +TEST(ArenaTest, BlockIsContiguousBefore) // NOLINT { block const b{fake_address, 1024}; block const b2{fake_address2, 256}; @@ -68,8 +65,7 @@ TEST(ArenaTest, BlockIsContiguousBefore) EXPECT_FALSE(b3.is_contiguous_before(b4)); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, BlockSplit) +TEST(ArenaTest, BlockSplit) // NOLINT { block const b{fake_address, 2048}; auto const [head, tail] = b.split(1024); @@ -79,8 +75,7 @@ TEST(ArenaTest, BlockSplit) 
EXPECT_EQ(tail.size(), 1024); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, BlockMerge) +TEST(ArenaTest, BlockMerge) // NOLINT { block const b{fake_address, 1024}; block const b2{fake_address2, 1024}; @@ -89,8 +84,7 @@ TEST(ArenaTest, BlockMerge) EXPECT_EQ(merged.size(), 2048); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockEmpty) +TEST(ArenaTest, SuperblockEmpty) // NOLINT { superblock sb{fake_address3, 4194304}; EXPECT_TRUE(sb.empty()); @@ -98,8 +92,7 @@ TEST(ArenaTest, SuperblockEmpty) EXPECT_FALSE(sb.empty()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockContains) +TEST(ArenaTest, SuperblockContains) // NOLINT { superblock const sb{fake_address3, 4194304}; block const b{fake_address, 2048}; @@ -114,8 +107,7 @@ TEST(ArenaTest, SuperblockContains) EXPECT_FALSE(sb.contains(b5)); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockFits) +TEST(ArenaTest, SuperblockFits) // NOLINT { superblock sb{fake_address3, 4194304}; EXPECT_TRUE(sb.fits(4194304)); @@ -128,8 +120,7 @@ TEST(ArenaTest, SuperblockFits) EXPECT_FALSE(sb.fits(2097153)); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockIsContiguousBefore) +TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT { superblock sb{fake_address3, 4194304}; superblock sb2{fake_address4, 4194304}; @@ -146,8 +137,7 @@ TEST(ArenaTest, SuperblockIsContiguousBefore) EXPECT_TRUE(sb.is_contiguous_before(sb2)); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockSplit) +TEST(ArenaTest, SuperblockSplit) // NOLINT { superblock sb{fake_address3, 8388608}; auto const [head, tail] = sb.split(4194304); @@ -159,8 +149,7 @@ TEST(ArenaTest, SuperblockSplit) EXPECT_TRUE(tail.empty()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockMerge) +TEST(ArenaTest, SuperblockMerge) // NOLINT { superblock sb{fake_address3, 4194304}; superblock sb2{fake_address4, 4194304}; @@ -170,8 +159,7 @@ TEST(ArenaTest, SuperblockMerge) EXPECT_TRUE(merged.empty()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockFirstFit) +TEST(ArenaTest, SuperblockFirstFit) // NOLINT { superblock sb{fake_address3, 4194304}; auto const b = sb.first_fit(1024); @@ -187,8 +175,7 @@ TEST(ArenaTest, SuperblockFirstFit) EXPECT_EQ(b3.size(), 512); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockCoalesceMergeNext) +TEST(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT { superblock sb{fake_address3, 4194304}; auto const b = sb.first_fit(2097152); @@ -196,8 +183,7 @@ TEST(ArenaTest, SuperblockCoalesceMergeNext) EXPECT_TRUE(sb.first_fit(4194304).is_valid()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockCoalesceMergePrevious) +TEST(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT { superblock sb{fake_address3, 4194304}; auto const b = sb.first_fit(1024); @@ -209,8 +195,7 @@ TEST(ArenaTest, SuperblockCoalesceMergePrevious) EXPECT_EQ(b3.pointer(), fake_address3); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockCoalesceMergePreviousAndNext) +TEST(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT { superblock sb{fake_address3, 4194304}; auto const b = sb.first_fit(1024); @@ -220,8 +205,7 @@ TEST(ArenaTest, 
SuperblockCoalesceMergePreviousAndNext) EXPECT_TRUE(sb.first_fit(4194304).is_valid()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SuperblockMaxFree) +TEST(ArenaTest, SuperblockMaxFree) // NOLINT { superblock sb{fake_address3, 4194304}; sb.first_fit(2097152); @@ -229,18 +213,15 @@ TEST(ArenaTest, SuperblockMaxFree) EXPECT_EQ(b.size(), 2097152); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, NullUpstream) +TEST(ArenaTest, NullUpstream) // NOLINT { // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, AllocateNinetyPercent) +TEST(ArenaTest, AllocateNinetyPercent) // NOLINT { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) - EXPECT_NO_THROW([]() { + EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const free = rmm::detail::available_device_memory().first; auto const ninety_percent = rmm::detail::align_up_cuda(static_cast(static_cast(free) * 0.9)); @@ -248,11 +229,9 @@ TEST(ArenaTest, AllocateNinetyPercent) }()); } -// NOLINTNEXTLINE(cppcoreguidelines-special-member-functions) -TEST(ArenaTest, SmallMediumLarge) +TEST(ArenaTest, SmallMediumLarge) // NOLINT { - // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) - EXPECT_NO_THROW([]() { + EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) arena_mr mr(rmm::mr::get_current_device_resource()); auto* small = mr.allocate(256); auto* medium = mr.allocate(1U << 26U); From f87ba63c7ca7728fe7ef9f101932ca14e8cd5463 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 11 Nov 2021 19:57:45 -0800 Subject: [PATCH 09/35] add some logging asserts --- include/rmm/mr/device/detail/arena.hpp | 49 +++++++++++++++++-- tests/mr/device/arena_mr_tests.cpp | 65 ++++++++++++++++++++++++-- 2 files changed, 105 insertions(+), 9 deletions(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 877073d83..e16af1c5a 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -58,6 +58,8 @@ class memory_span { */ memory_span(void* pointer, std::size_t size) : pointer_{static_cast(pointer)}, size_{size} { + RMM_LOGGING_ASSERT(pointer != nullptr); + RMM_LOGGING_ASSERT(size > 0); } /// Returns the underlying pointer. @@ -67,10 +69,14 @@ class memory_span { [[nodiscard]] std::size_t size() const { return size_; } /// Returns true if this span is valid (non-null), false otherwise. - [[nodiscard]] bool is_valid() const { return pointer_ != nullptr; } + [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } /// Used by std::set to compare spans. - bool operator<(memory_span const& ms) const { return pointer_ < ms.pointer_; } + bool operator<(memory_span const& ms) const + { + RMM_LOGGING_ASSERT(ms.is_valid()); + return pointer_ < ms.pointer_; + } private: char* pointer_{}; ///< Raw memory pointer. @@ -100,7 +106,12 @@ class block final : public memory_span { * @param sz The size in bytes to check for fit. * @return true if this block is at least `sz` bytes. */ - [[nodiscard]] bool fits(std::size_t sz) const { return size() >= sz; } + [[nodiscard]] bool fits(std::size_t sz) const + { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(sz > 0); + return size() >= sz; + } /** * @brief Verifies whether this block can be merged to the beginning of block b. 
@@ -110,6 +121,8 @@ class block final : public memory_span { */ [[nodiscard]] bool is_contiguous_before(block const& b) const { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(b.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return pointer() + size() == b.pointer(); } @@ -122,6 +135,7 @@ class block final : public memory_span { */ [[nodiscard]] std::pair split(std::size_t sz) const { + RMM_LOGGING_ASSERT(is_valid()); RMM_LOGGING_ASSERT(size() >= sz); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return {{pointer(), sz}, {pointer() + sz, size() - sz}}; @@ -137,6 +151,8 @@ class block final : public memory_span { */ [[nodiscard]] block merge(block const& b) const { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(b.is_valid()); RMM_LOGGING_ASSERT(is_contiguous_before(b)); return {pointer(), size() + b.size()}; } @@ -145,6 +161,8 @@ class block final : public memory_span { /// Comparison function for block sizes. inline bool block_size_compare(block const& lhs, block const& rhs) { + RMM_LOGGING_ASSERT(lhs.is_valid()); + RMM_LOGGING_ASSERT(rhs.is_valid()); return lhs.size() < rhs.size(); } @@ -170,6 +188,7 @@ class superblock final : public memory_span { */ superblock(void* pointer, std::size_t size) : memory_span{pointer, size} { + RMM_LOGGING_ASSERT(size >= minimum_size); free_blocks_.emplace(pointer, size); } @@ -189,6 +208,7 @@ class superblock final : public memory_span { */ [[nodiscard]] bool empty() const { + RMM_LOGGING_ASSERT(is_valid()); return free_blocks_.size() == 1 && free_blocks_.cbegin()->size() == size(); } @@ -200,6 +220,8 @@ class superblock final : public memory_span { */ [[nodiscard]] bool contains(block const& b) const { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(b.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return pointer() <= b.pointer() && pointer() + size() >= b.pointer() + b.size(); } @@ -212,6 +234,7 @@ class superblock final : public memory_span { */ [[nodiscard]] bool fits(std::size_t sz) const { + RMM_LOGGING_ASSERT(is_valid()); return std::any_of( free_blocks_.cbegin(), free_blocks_.cend(), [sz](auto const& b) { return b.fits(sz); }); } @@ -225,6 +248,8 @@ class superblock final : public memory_span { */ [[nodiscard]] bool is_contiguous_before(superblock const& sb) const { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(sb.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return empty() && sb.empty() && pointer() + size() == sb.pointer(); } @@ -237,6 +262,7 @@ class superblock final : public memory_span { */ [[nodiscard]] std::pair split(std::size_t sz) const { + RMM_LOGGING_ASSERT(is_valid()); RMM_LOGGING_ASSERT(empty() && sz >= minimum_size && size() - sz >= minimum_size); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return {superblock{pointer(), sz}, superblock{pointer() + sz, size() - sz}}; @@ -264,6 +290,9 @@ class superblock final : public memory_span { */ block first_fit(std::size_t size) { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(size > 0); + auto const iter = std::find_if( free_blocks_.cbegin(), free_blocks_.cend(), [size](auto const& b) { return b.fits(size); }); if (iter == free_blocks_.cend()) { return {}; } @@ -288,6 +317,10 @@ class superblock final : public memory_span { */ void coalesce(block const& b) { + RMM_LOGGING_ASSERT(is_valid()); + RMM_LOGGING_ASSERT(b.is_valid()); + RMM_LOGGING_ASSERT(contains(b)); + // Find the right place (in ascending address order) 
to insert the block. auto const next = free_blocks_.lower_bound(b); auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); @@ -390,6 +423,8 @@ class global_arena final { */ superblock acquire(std::size_t size) { + // Superblocks should only be acquired if the size is not directly handled by the global arena. + RMM_LOGGING_ASSERT(!handles(size)); lock_guard lock(mtx_); return first_fit(size); } @@ -401,6 +436,7 @@ class global_arena final { */ void release(superblock&& sb) { + RMM_LOGGING_ASSERT(sb.is_valid()); lock_guard lock(mtx_); coalesce(std::move(sb)); } @@ -414,7 +450,9 @@ class global_arena final { { lock_guard lock(mtx_); while (!superblocks.empty()) { - coalesce(std::move(superblocks.extract(superblocks.cbegin()).mapped())); + auto&& sb = std::move(superblocks.extract(superblocks.cbegin()).mapped()); + RMM_LOGGING_ASSERT(sb.is_valid()); + coalesce(std::move(sb)); } } @@ -484,7 +522,6 @@ class global_arena final { lock_guard lock(mtx_); logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); - logger->info(" # superblocks: {}", superblocks_.size()); if (!superblocks_.empty()) { logger->info(" Total size of superblocks: {}", @@ -564,6 +601,8 @@ class global_arena final { */ void coalesce(superblock&& sb) { + RMM_LOGGING_ASSERT(sb.is_valid()); + // Find the right place (in ascending address order) to insert the block. auto const next = superblocks_.lower_bound(sb.pointer()); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 96baee083..7e3622bb7 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -21,15 +21,25 @@ #include #include +#include +#include #include namespace rmm::test { namespace { -using memory_span = rmm::mr::detail::arena::memory_span; -using block = rmm::mr::detail::arena::block; -using superblock = rmm::mr::detail::arena::superblock; -using arena_mr = rmm::mr::arena_memory_resource; +class mock_memory_resource { + public: + MOCK_METHOD(void*, allocate, (std::size_t)); + MOCK_METHOD(void, deallocate, (void*, std::size_t)); +}; + +using memory_span = rmm::mr::detail::arena::memory_span; +using block = rmm::mr::detail::arena::block; +using superblock = rmm::mr::detail::arena::superblock; +using global_arena = rmm::mr::detail::arena::global_arena; +using arena_mr = rmm::mr::arena_memory_resource; +using ::testing::Return; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const fake_address = reinterpret_cast(1024L); @@ -40,6 +50,10 @@ auto const fake_address3 = reinterpret_cast(4194304L); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) auto const fake_address4 = reinterpret_cast(8388608L); +/** + * Test memory_span. + */ + TEST(ArenaTest, MemorySpan) // NOLINT { memory_span const ms{}; @@ -48,6 +62,10 @@ TEST(ArenaTest, MemorySpan) // NOLINT EXPECT_TRUE(ms2.is_valid()); } +/** + * Test block. + */ + TEST(ArenaTest, BlockFits) // NOLINT { block const b{fake_address, 1024}; @@ -84,6 +102,10 @@ TEST(ArenaTest, BlockMerge) // NOLINT EXPECT_EQ(merged.size(), 2048); } +/** + * Test superblock. + */ + TEST(ArenaTest, SuperblockEmpty) // NOLINT { superblock sb{fake_address3, 4194304}; @@ -213,6 +235,41 @@ TEST(ArenaTest, SuperblockMaxFree) // NOLINT EXPECT_EQ(b.size(), 2097152); } +/** + * Test global_arena. 
+ */ + +TEST(ArenaTest, GlobalArenaNullUpstream) // NOLINT +{ + auto construct_nullptr = []() { global_arena ga{nullptr, std::nullopt}; }; + EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) +} + +TEST(ArenaTest, GlobalArenaAcquire) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + + global_arena ga{&mock, 8388608}; + + auto const sb = ga.acquire(256); + EXPECT_EQ(sb.pointer(), fake_address3); + EXPECT_EQ(sb.size(), 4194304); + EXPECT_TRUE(sb.empty()); + + auto const sb2 = ga.acquire(1024); + EXPECT_EQ(sb2.pointer(), fake_address4); + EXPECT_EQ(sb2.size(), 4194304); + EXPECT_TRUE(sb2.empty()); + + EXPECT_FALSE(ga.acquire(512).is_valid()); +} + +/** + * Test arena_memory_resource. + */ + TEST(ArenaTest, NullUpstream) // NOLINT { // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) From ce633f282a4de331987de1cb310be2d55a2bd5d3 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Fri, 12 Nov 2021 08:59:33 -0800 Subject: [PATCH 10/35] more tests for global arena --- include/rmm/mr/device/detail/arena.hpp | 2 +- tests/mr/device/arena_mr_tests.cpp | 49 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index e16af1c5a..f16002d83 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -442,7 +442,7 @@ class global_arena final { } /** - * @brief Release a set of superblocks from a dying arena. + * @brief Release a map of superblocks from a dying arena. * * @param superblocks The set of superblocks. */ diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 7e3622bb7..a214fee17 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -266,6 +266,55 @@ TEST(ArenaTest, GlobalArenaAcquire) // NOLINT EXPECT_FALSE(ga.acquire(512).is_valid()); } +TEST(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + + global_arena ga{&mock, 8388608}; + + auto sb = ga.acquire(256); + ga.release(std::move(sb)); + auto* p = ga.allocate(8388608); + EXPECT_EQ(p, fake_address3); +} + +TEST(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + + global_arena ga{&mock, 16777216}; + + auto sb = ga.acquire(256); + auto sb2 = ga.acquire(1024); + ga.acquire(512); + ga.release(std::move(sb)); + ga.release(std::move(sb2)); + auto* p = ga.allocate(8388608); + EXPECT_EQ(p, fake_address3); +} + +TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + + global_arena ga{&mock, 16777216}; + + auto sb = ga.acquire(256); + auto sb2 = ga.acquire(1024); + auto sb3 = ga.acquire(512); + ga.release(std::move(sb)); + ga.release(std::move(sb3)); + ga.release(std::move(sb2)); + auto* p = ga.allocate(16777216); + EXPECT_EQ(p, fake_address3); +} + /** * Test arena_memory_resource. 
*/ From 23f679c3c4247997c4656d83a73e54c33686ee8b Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Fri, 12 Nov 2021 09:04:40 -0800 Subject: [PATCH 11/35] add back defrag --- .../rmm/mr/device/arena_memory_resource.hpp | 23 +++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 6bbbbecb5..b1b65640c 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -159,8 +159,13 @@ class arena_memory_resource final : public device_memory_resource { void* pointer = arena.allocate(bytes); if (pointer == nullptr) { - if (dump_log_on_failure_) { dump_memory_log(bytes); } - RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); + write_lock lock(mtx_); + defragment(); + pointer = arena.allocate(bytes); + if (pointer == nullptr) { + if (dump_log_on_failure_) { dump_memory_log(bytes); } + RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); + } } return pointer; @@ -218,6 +223,20 @@ class arena_memory_resource final : public device_memory_resource { global_arena_.deallocate_from_other_arena(ptr, bytes); } + /** + * @brief Defragment memory by returning all free blocks to the global arena. + */ + void defragment() + { + RMM_CUDA_TRY(cudaDeviceSynchronize()); + for (auto& thread_arena : thread_arenas_) { + thread_arena.second->clean(); + } + for (auto& stream_arena : stream_arenas_) { + stream_arena.second.clean(); + } + } + /** * @brief Get the arena associated with the current thread or the given stream. * From a5a4881a10846359af4e5a17e0cdac96b7d0eee5 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Mon, 15 Nov 2021 18:21:52 -0800 Subject: [PATCH 12/35] more tests --- include/rmm/mr/device/detail/arena.hpp | 66 ++++++++++++-------------- tests/mr/device/arena_mr_tests.cpp | 64 +++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 35 deletions(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index f16002d83..14ce6cbe0 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -108,7 +108,7 @@ class block final : public memory_span { */ [[nodiscard]] bool fits(std::size_t sz) const { - RMM_LOGGING_ASSERT(is_valid()); + if (!is_valid()) { RMM_LOGGING_ASSERT(is_valid()); } RMM_LOGGING_ASSERT(sz > 0); return size() >= sz; } @@ -188,7 +188,7 @@ class superblock final : public memory_span { */ superblock(void* pointer, std::size_t size) : memory_span{pointer, size} { - RMM_LOGGING_ASSERT(size >= minimum_size); + RMM_LOGGING_ASSERT(size >= minimum_size / 2); free_blocks_.emplace(pointer, size); } @@ -413,8 +413,17 @@ class global_arena final { { lock_guard lock(mtx_); upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); + superblocks_.clear(); } + /** + * @brief Should allocation of `size` bytes be handled by the global arena directly? + * + * @param size The size in bytes of the allocation. + * @return bool True if the allocation should be handled by the global arena. + */ + bool handles(std::size_t size) const { return size > superblock::minimum_size / 2; } + /** * @brief Acquire a superblock that can fit a block of the given size. 
* @@ -464,11 +473,9 @@ class global_arena final { */ void* allocate(std::size_t size) { - if (handles(size)) { - lock_guard lock(mtx_); - return first_fit(size).pointer(); - } - return nullptr; + RMM_LOGGING_ASSERT(handles(size)); + lock_guard lock(mtx_); + return first_fit(size).pointer(); } /** @@ -482,14 +489,11 @@ class global_arena final { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (handles(size)) { - stream.synchronize_no_throw(); - - lock_guard lock(mtx_); - coalesce({ptr, size}); - return true; - } - return false; + RMM_LOGGING_ASSERT(handles(size)); + stream.synchronize_no_throw(); + lock_guard lock(mtx_); + coalesce({ptr, size}); + return true; } /** @@ -510,6 +514,7 @@ class global_arena final { }); if (iter == superblocks_.end()) { RMM_FAIL("allocation not found"); } iter->second.coalesce(b); + if (iter->second.empty()) { coalesce(std::move(superblocks_.extract(iter).mapped())); } } /** @@ -552,17 +557,11 @@ class global_arena final { { RMM_LOGGING_ASSERT(size >= superblock::minimum_size); upstream_block_ = {upstream_mr_->allocate(size), size}; - superblocks_.try_emplace(upstream_block_.pointer(), upstream_block_.pointer(), size); + if (!upstream_block_.is_valid()) { RMM_FAIL("Failed to allocate memory from upstream"); } + superblocks_.insert( + std::make_pair(upstream_block_.pointer(), superblock(upstream_block_.pointer(), size))); } - /** - * @brief Should allocation of `size` bytes be handled by the global arena directly? - * - * @param size The size in bytes of the allocation. - * @return bool True if the allocation should be handled by the global arena. - */ - bool handles(std::size_t size) const { return size > superblock::minimum_size / 2; } - /** * @brief Get the first superblock that can fit a block of at least `size` bytes. * @@ -588,7 +587,7 @@ class global_arena final { if (sb.empty() && sb.size() - sz >= superblock::minimum_size) { // Split the superblock and put the remainder back. 
auto [head, tail] = sb.split(sz); - superblocks_.try_emplace(tail.pointer(), std::move(tail)); + superblocks_.insert(std::make_pair(tail.pointer(), std::move(tail))); return std::move(head); } return sb; @@ -615,17 +614,17 @@ class global_arena final { auto prev_sb = std::move(superblocks_.extract(previous).mapped()); auto next_sb = std::move(superblocks_.extract(next).mapped()); auto merged = prev_sb.merge(sb).merge(next_sb); - superblocks_.try_emplace(merged.pointer(), std::move(merged)); + superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); } else if (merge_prev) { auto prev_sb = std::move(superblocks_.extract(previous).mapped()); auto merged = prev_sb.merge(sb); - superblocks_.try_emplace(merged.pointer(), std::move(merged)); + superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); } else if (merge_next) { auto next_sb = std::move(superblocks_.extract(next).mapped()); auto merged = sb.merge(next_sb); - superblocks_.try_emplace(merged.pointer(), std::move(merged)); + superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); } else { - superblocks_.try_emplace(sb.pointer(), std::move(sb)); + superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); } } @@ -674,9 +673,7 @@ class arena { */ void* allocate(std::size_t size) { - auto* ptr = global_arena_.allocate(size); - if (ptr != nullptr) { return ptr; } - + if (global_arena_.handles(size)) { return global_arena_.allocate(size); } lock_guard lock(mtx_); return get_block(size).pointer(); } @@ -692,8 +689,7 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.deallocate(ptr, size, stream)) { return true; } - + if (global_arena_.handles(size)) { return global_arena_.deallocate(ptr, size, stream); } lock_guard lock(mtx_); return deallocate_from_superblock({ptr, size}); } @@ -795,7 +791,7 @@ class arena { auto sb = global_arena_.acquire(size); if (sb.is_valid()) { auto const b = sb.first_fit(size); - superblocks_.try_emplace(sb.pointer(), std::move(sb)); + superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); return b; } return {}; diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index a214fee17..c71aa2bc0 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -315,6 +315,70 @@ TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT EXPECT_EQ(p, fake_address3); } +TEST(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + + global_arena ga{&mock, 16777216}; + + std::map superblocks{}; + auto sb = ga.acquire(256); + superblocks.insert(std::make_pair(sb.pointer(), std::move(sb))); + auto sb2 = ga.acquire(1024); + superblocks.insert(std::make_pair(sb2.pointer(), std::move(sb2))); + auto sb3 = ga.acquire(512); + superblocks.insert(std::make_pair(sb3.pointer(), std::move(sb3))); + ga.release(superblocks); + auto* p = ga.allocate(16777216); + EXPECT_EQ(p, fake_address3); +} + +TEST(ArenaTest, GlobalArenaAllocate) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + + global_arena ga{&mock, 8388608}; + + auto* ptr = ga.allocate(4194304); + EXPECT_EQ(ptr, fake_address3); + auto* ptr2 = ga.allocate(4194304); + EXPECT_EQ(ptr2, fake_address4); +} + +TEST(ArenaTest, 
GlobalArenaDeallocate) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + + global_arena ga{&mock, 8388608}; + + auto* ptr = ga.allocate(4194304); + EXPECT_EQ(ptr, fake_address3); + EXPECT_TRUE(ga.deallocate(ptr, 4194304, {})); + ptr = ga.allocate(4194304); + EXPECT_EQ(ptr, fake_address3); +} + +TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + + global_arena ga{&mock, 8388608}; + + auto sb = ga.acquire(512); + auto const b = sb.first_fit(512); + ga.release(std::move(sb)); + ga.deallocate_from_other_arena(b.pointer(), b.size()); + EXPECT_EQ(ga.allocate(8388608), fake_address3); +} + /** * Test arena_memory_resource. */ From f77fb7e01875f8153b87639917ed6352b1baf7c9 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 16 Nov 2021 10:25:55 -0800 Subject: [PATCH 13/35] add tests for arena --- .../rmm/mr/device/arena_memory_resource.hpp | 2 +- include/rmm/mr/device/detail/arena.hpp | 6 +- tests/mr/device/arena_mr_tests.cpp | 85 ++++++++++++++++++- 3 files changed, 85 insertions(+), 8 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b1b65640c..c6b8c20c2 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -201,7 +201,7 @@ class arena_memory_resource final : public device_memory_resource { { stream.synchronize_no_throw(); - read_lock lock(mtx_); + write_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 14ce6cbe0..707c4eb19 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -108,7 +108,7 @@ class block final : public memory_span { */ [[nodiscard]] bool fits(std::size_t sz) const { - if (!is_valid()) { RMM_LOGGING_ASSERT(is_valid()); } + RMM_LOGGING_ASSERT(is_valid()); RMM_LOGGING_ASSERT(sz > 0); return size() >= sz; } @@ -557,9 +557,7 @@ class global_arena final { { RMM_LOGGING_ASSERT(size >= superblock::minimum_size); upstream_block_ = {upstream_mr_->allocate(size), size}; - if (!upstream_block_.is_valid()) { RMM_FAIL("Failed to allocate memory from upstream"); } - superblocks_.insert( - std::make_pair(upstream_block_.pointer(), superblock(upstream_block_.pointer(), size))); + superblocks_.try_emplace(upstream_block_.pointer(), upstream_block_.pointer(), size); } /** diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index c71aa2bc0..5d18ef747 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -38,6 +38,7 @@ using memory_span = rmm::mr::detail::arena::memory_span; using block = rmm::mr::detail::arena::block; using superblock = rmm::mr::detail::arena::superblock; using global_arena = rmm::mr::detail::arena::global_arena; +using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; @@ -288,7 +289,7 @@ TEST(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT global_arena ga{&mock, 16777216}; - auto sb = ga.acquire(256); + auto sb = ga.acquire(256); auto sb2 = ga.acquire(1024); ga.acquire(512); ga.release(std::move(sb)); @@ -305,7 +306,7 @@ 
TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT global_arena ga{&mock, 16777216}; - auto sb = ga.acquire(256); + auto sb = ga.acquire(256); auto sb2 = ga.acquire(1024); auto sb3 = ga.acquire(512); ga.release(std::move(sb)); @@ -372,13 +373,91 @@ TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT global_arena ga{&mock, 8388608}; - auto sb = ga.acquire(512); + auto sb = ga.acquire(512); auto const b = sb.first_fit(512); ga.release(std::move(sb)); ga.deallocate_from_other_arena(b.pointer(), b.size()); EXPECT_EQ(ga.allocate(8388608), fake_address3); } +/** + * Test arena. + */ + +TEST(ArenaTest, ArenaAllocate) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + global_arena ga{&mock, 8388608}; + arena a{ga}; + + EXPECT_EQ(a.allocate(4194304), fake_address3); + EXPECT_EQ(a.allocate(256), fake_address4); +} + +TEST(ArenaTest, ArenaDeallocate) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + global_arena ga{&mock, 8388608}; + arena a{ga}; + + auto* ptr = a.allocate(4194304); + a.deallocate(ptr, 4194304, {}); + auto* ptr2 = a.allocate(256); + a.deallocate(ptr2, 256, {}); + EXPECT_EQ(a.allocate(8388608), fake_address3); +} + +TEST(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + global_arena ga{&mock, 8388608}; + arena a{ga}; + + auto* ptr = a.allocate(256); + auto* ptr2 = a.allocate(256); + a.allocate(256); + a.deallocate(ptr, 256, {}); + a.deallocate(ptr2, 256, {}); + EXPECT_EQ(a.allocate(512), fake_address3); +} + +TEST(ArenaTest, ArenaDeallocateMergeNext) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + global_arena ga{&mock, 8388608}; + arena a{ga}; + + auto* ptr = a.allocate(256); + auto* ptr2 = a.allocate(256); + a.allocate(256); + a.deallocate(ptr2, 256, {}); + a.deallocate(ptr, 256, {}); + EXPECT_EQ(a.allocate(512), fake_address3); +} + +TEST(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT +{ + mock_memory_resource mock; + EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + global_arena ga{&mock, 8388608}; + arena a{ga}; + + auto* ptr = a.allocate(256); + auto* ptr2 = a.allocate(256); + a.deallocate(ptr, 256, {}); + a.deallocate(ptr2, 256, {}); + EXPECT_EQ(a.allocate(2048), fake_address3); +} + /** * Test arena_memory_resource. 
*/ From dd86082292ac5341163e78c63c9cb040a9124bc8 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 16 Nov 2021 10:39:13 -0800 Subject: [PATCH 14/35] remove alignment changes --- include/rmm/detail/aligned.hpp | 36 ------------------- .../rmm/mr/device/arena_memory_resource.hpp | 4 +-- include/rmm/mr/device/detail/arena.hpp | 3 +- tests/mr/device/arena_mr_tests.cpp | 3 +- 4 files changed, 6 insertions(+), 40 deletions(-) diff --git a/include/rmm/detail/aligned.hpp b/include/rmm/detail/aligned.hpp index 19e69344d..321be53b5 100644 --- a/include/rmm/detail/aligned.hpp +++ b/include/rmm/detail/aligned.hpp @@ -62,18 +62,6 @@ constexpr std::size_t align_up(std::size_t value, std::size_t alignment) noexcep return (value + (alignment - 1)) & ~(alignment - 1); } -/** - * @brief Align up to nearest multiple of the CUDA allocation alignment - * - * @param[in] v value to align - * - * @return Return the aligned value, as one would expect - */ -constexpr std::size_t align_up_cuda(std::size_t value) noexcept -{ - return align_up(value, CUDA_ALLOCATION_ALIGNMENT); -} - /** * @brief Align down to the nearest multiple of specified power of 2 * @@ -88,18 +76,6 @@ constexpr std::size_t align_down(std::size_t value, std::size_t alignment) noexc return value & ~(alignment - 1); } -/** - * @brief Align down to the nearest multiple of the CUDA allocation alignment - * - * @param[in] v value to align - * - * @return Return the aligned value, as one would expect - */ -constexpr std::size_t align_down_cuda(std::size_t value) noexcept -{ - return align_down(value, CUDA_ALLOCATION_ALIGNMENT); -} - /** * @brief Checks whether a value is aligned to a multiple of a specified power of 2 * @@ -114,18 +90,6 @@ constexpr bool is_aligned(std::size_t value, std::size_t alignment) noexcept return value == align_down(value, alignment); } -/** - * @brief Checks whether a value is aligned to a multiple of the CUDA allocation alignment - * - * @param[in] v value to check for alignment - * - * @return true if aligned - */ -constexpr bool is_cuda_aligned(std::size_t value) noexcept -{ - return is_aligned(value, CUDA_ALLOCATION_ALIGNMENT); -} - inline bool is_pointer_aligned(void* ptr, std::size_t alignment = CUDA_ALLOCATION_ALIGNMENT) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index c6b8c20c2..b0fc92139 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -154,7 +154,7 @@ class arena_memory_resource final : public device_memory_resource { { if (bytes <= 0) { return nullptr; } - bytes = rmm::detail::align_up_cuda(bytes); + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); auto& arena = get_arena(stream); void* pointer = arena.allocate(bytes); @@ -183,7 +183,7 @@ class arena_memory_resource final : public device_memory_resource { { if (ptr == nullptr || bytes <= 0) { return; } - bytes = rmm::detail::align_up_cuda(bytes); + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); if (!get_arena(stream).deallocate(ptr, bytes, stream)) { deallocate_from_other_arena(ptr, bytes, stream); } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 707c4eb19..7e1f21238 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -395,7 +395,8 @@ class global_arena final { : upstream_mr_{upstream_mr} { RMM_EXPECTS(nullptr 
!= upstream_mr_, "Unexpected null upstream pointer."); - auto const size = rmm::detail::align_down_cuda(arena_size.value_or(default_size())); + auto const size = rmm::detail::align_down(arena_size.value_or(default_size()), + rmm::detail::CUDA_ALLOCATION_ALIGNMENT); initialize(size); } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 5d18ef747..f39dc5f2e 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -473,7 +473,8 @@ TEST(ArenaTest, AllocateNinetyPercent) // NOLINT EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const free = rmm::detail::available_device_memory().first; auto const ninety_percent = - rmm::detail::align_up_cuda(static_cast(static_cast(free) * 0.9)); + rmm::detail::align_up(static_cast(static_cast(free) * 0.9), + rmm::detail::CUDA_ALLOCATION_ALIGNMENT); arena_mr mr(rmm::mr::get_current_device_resource(), ninety_percent); }()); } From 29ae23b85be3292024c231ef012b51faad5279d5 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 16 Nov 2021 12:46:56 -0800 Subject: [PATCH 15/35] small fixes --- include/rmm/mr/device/detail/arena.hpp | 42 +++++++++++++++++++------- tests/mr/device/arena_mr_tests.cpp | 12 ++++++-- 2 files changed, 41 insertions(+), 13 deletions(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 7e1f21238..07deb49c7 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -188,7 +188,7 @@ class superblock final : public memory_span { */ superblock(void* pointer, std::size_t size) : memory_span{pointer, size} { - RMM_LOGGING_ASSERT(size >= minimum_size / 2); + RMM_LOGGING_ASSERT(size > minimum_size / 2); free_blocks_.emplace(pointer, size); } @@ -321,6 +321,11 @@ class superblock final : public memory_span { RMM_LOGGING_ASSERT(b.is_valid()); RMM_LOGGING_ASSERT(contains(b)); + if (free_blocks_.empty()) { + free_blocks_.insert(b); + return; + } + // Find the right place (in ascending address order) to insert the block. auto const next = free_blocks_.lower_bound(b); auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); @@ -348,12 +353,12 @@ class superblock final : public memory_span { } /** - * @brief Find the max free block. - * @return the max free block. + * @brief Find the max free block size. + * @return the max free block size. 
*/ - [[nodiscard]] block max_free() const + [[nodiscard]] std::size_t max_free() const { - return *std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare); + return std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare)->size(); } private: @@ -366,7 +371,7 @@ inline auto max_free(std::map const& superblocks) { std::size_t size{}; for (auto const& kv : superblocks) { - size = std::max(size, kv.second.max_free().size()); + size = std::max(size, kv.second.max_free()); } return size; }; @@ -460,7 +465,7 @@ class global_arena final { { lock_guard lock(mtx_); while (!superblocks.empty()) { - auto&& sb = std::move(superblocks.extract(superblocks.cbegin()).mapped()); + auto sb = std::move(superblocks.extract(superblocks.cbegin()).mapped()); RMM_LOGGING_ASSERT(sb.is_valid()); coalesce(std::move(sb)); } @@ -514,8 +519,14 @@ class global_arena final { return kv.second.contains(b); }); if (iter == superblocks_.end()) { RMM_FAIL("allocation not found"); } - iter->second.coalesce(b); - if (iter->second.empty()) { coalesce(std::move(superblocks_.extract(iter).mapped())); } + + auto sb = std::move(superblocks_.extract(iter).mapped()); + sb.coalesce(b); + if (sb.empty()) { + coalesce(std::move(sb)); + } else { + superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + } } /** @@ -601,6 +612,11 @@ class global_arena final { { RMM_LOGGING_ASSERT(sb.is_valid()); + if (superblocks_.empty()) { + superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + return; + } + // Find the right place (in ascending address order) to insert the block. auto const next = superblocks_.lower_bound(sb.pointer()); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); @@ -773,9 +789,13 @@ class arena { }); if (iter == superblocks_.end()) { return false; } - auto& sb = iter->second; + auto sb = std::move(superblocks_.extract(iter).mapped()); sb.coalesce(b); - if (sb.empty()) { global_arena_.release(std::move(superblocks_.extract(iter).mapped())); } + if (sb.empty()) { + global_arena_.release(std::move(sb)); + } else { + superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + } return true; } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index f39dc5f2e..049e3a2bd 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -198,6 +198,15 @@ TEST(ArenaTest, SuperblockFirstFit) // NOLINT EXPECT_EQ(b3.size(), 512); } +TEST(ArenaTest, SuperblockCoalesceAfterFull) // NOLINT +{ + superblock sb{fake_address3, 4194304}; + auto const b = sb.first_fit(2097152); + sb.first_fit(2097152); + sb.coalesce(b); + EXPECT_TRUE(sb.first_fit(2097152).is_valid()); +} + TEST(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT { superblock sb{fake_address3, 4194304}; @@ -232,8 +241,7 @@ TEST(ArenaTest, SuperblockMaxFree) // NOLINT { superblock sb{fake_address3, 4194304}; sb.first_fit(2097152); - auto const b = sb.max_free(); - EXPECT_EQ(b.size(), 2097152); + EXPECT_EQ(sb.max_free(), 2097152); } /** From abd72260e56b1e17681a16d5bb4af49be3b01a19 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 16 Nov 2021 18:21:35 -0800 Subject: [PATCH 16/35] switch back to set, fix tests --- .../rmm/mr/device/arena_memory_resource.hpp | 25 +---- include/rmm/mr/device/detail/arena.hpp | 100 +++++++++--------- tests/mr/device/arena_mr_tests.cpp | 8 +- 3 files changed, 56 insertions(+), 77 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp 
b/include/rmm/mr/device/arena_memory_resource.hpp index b0fc92139..03d0b23ce 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -159,13 +159,8 @@ class arena_memory_resource final : public device_memory_resource { void* pointer = arena.allocate(bytes); if (pointer == nullptr) { - write_lock lock(mtx_); - defragment(); - pointer = arena.allocate(bytes); - if (pointer == nullptr) { - if (dump_log_on_failure_) { dump_memory_log(bytes); } - RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); - } + if (dump_log_on_failure_) { dump_memory_log(bytes); } + RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } return pointer; @@ -201,7 +196,7 @@ class arena_memory_resource final : public device_memory_resource { { stream.synchronize_no_throw(); - write_lock lock(mtx_); + read_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); @@ -223,20 +218,6 @@ class arena_memory_resource final : public device_memory_resource { global_arena_.deallocate_from_other_arena(ptr, bytes); } - /** - * @brief Defragment memory by returning all free blocks to the global arena. - */ - void defragment() - { - RMM_CUDA_TRY(cudaDeviceSynchronize()); - for (auto& thread_arena : thread_arenas_) { - thread_arena.second->clean(); - } - for (auto& stream_arena : stream_arenas_) { - stream_arena.second.clean(); - } - } - /** * @brief Get the arena associated with the current thread or the given stream. * diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 07deb49c7..00327ee44 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -30,13 +30,11 @@ #include #include #include -#include #include #include #include #include #include -#include namespace rmm::mr::detail::arena { @@ -83,13 +81,13 @@ class memory_span { std::size_t size_{}; ///< Size in bytes. }; -/// Calculate the total size of a map of memory spans. +/// Calculate the total size of a set of memory spans. template -inline auto total_memory_size(std::map const& spans) +inline auto total_memory_size(std::set const& spans) { return std::accumulate( spans.cbegin(), spans.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { - return lhs + rhs.second.size(); + return lhs + rhs.size(); }); } @@ -366,12 +364,12 @@ class superblock final : public memory_span { std::set free_blocks_{}; }; -/// Find the max free size from a map of superblocks. -inline auto max_free(std::map const& superblocks) +/// Find the max free size from a set of superblocks. +inline auto max_free(std::set const& superblocks) { std::size_t size{}; - for (auto const& kv : superblocks) { - size = std::max(size, kv.second.max_free()); + for (auto const& sb : superblocks) { + size = std::max(size, sb.max_free()); } return size; }; @@ -457,15 +455,15 @@ class global_arena final { } /** - * @brief Release a map of superblocks from a dying arena. + * @brief Release a set of superblocks from a dying arena. * * @param superblocks The set of superblocks. 
*/ - void release(std::map& superblocks) + void release(std::set& superblocks) { lock_guard lock(mtx_); while (!superblocks.empty()) { - auto sb = std::move(superblocks.extract(superblocks.cbegin()).mapped()); + auto sb = std::move(superblocks.extract(superblocks.cbegin()).value()); RMM_LOGGING_ASSERT(sb.is_valid()); coalesce(std::move(sb)); } @@ -515,17 +513,16 @@ class global_arena final { lock_guard lock(mtx_); block const b{ptr, bytes}; - auto iter = std::find_if(superblocks_.begin(), superblocks_.end(), [&](auto const& kv) { - return kv.second.contains(b); - }); - if (iter == superblocks_.end()) { RMM_FAIL("allocation not found"); } + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); + if (iter == superblocks_.cend()) { RMM_FAIL("allocation not found"); } - auto sb = std::move(superblocks_.extract(iter).mapped()); + auto sb = std::move(superblocks_.extract(iter).value()); sb.coalesce(b); if (sb.empty()) { coalesce(std::move(sb)); } else { - superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks_.insert(std::move(sb)); } } @@ -569,7 +566,7 @@ class global_arena final { { RMM_LOGGING_ASSERT(size >= superblock::minimum_size); upstream_block_ = {upstream_mr_->allocate(size), size}; - superblocks_.try_emplace(upstream_block_.pointer(), upstream_block_.pointer(), size); + superblocks_.emplace(upstream_block_.pointer(), size); } /** @@ -587,17 +584,16 @@ class global_arena final { */ superblock first_fit(std::size_t size) { - auto const iter = std::find_if(superblocks_.cbegin(), - superblocks_.cend(), - [size](auto const& kv) { return kv.second.fits(size); }); + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); if (iter == superblocks_.cend()) { return {}; } - auto sb = std::move(superblocks_.extract(iter).mapped()); + auto sb = std::move(superblocks_.extract(iter).value()); auto const sz = std::max(size, superblock::minimum_size); if (sb.empty() && sb.size() - sz >= superblock::minimum_size) { // Split the superblock and put the remainder back. auto [head, tail] = sb.split(sz); - superblocks_.insert(std::make_pair(tail.pointer(), std::move(tail))); + superblocks_.insert(std::move(tail)); return std::move(head); } return sb; @@ -613,33 +609,33 @@ class global_arena final { RMM_LOGGING_ASSERT(sb.is_valid()); if (superblocks_.empty()) { - superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks_.insert(std::move(sb)); return; } // Find the right place (in ascending address order) to insert the block. - auto const next = superblocks_.lower_bound(sb.pointer()); + auto const next = superblocks_.lower_bound(sb); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); // Coalesce with neighboring blocks. 
- bool const merge_prev = previous->second.is_contiguous_before(sb); - bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(next->second); + bool const merge_prev = previous->is_contiguous_before(sb); + bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(*next); if (merge_prev && merge_next) { - auto prev_sb = std::move(superblocks_.extract(previous).mapped()); - auto next_sb = std::move(superblocks_.extract(next).mapped()); + auto prev_sb = std::move(superblocks_.extract(previous).value()); + auto next_sb = std::move(superblocks_.extract(next).value()); auto merged = prev_sb.merge(sb).merge(next_sb); - superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); + superblocks_.insert(std::move(merged)); } else if (merge_prev) { - auto prev_sb = std::move(superblocks_.extract(previous).mapped()); + auto prev_sb = std::move(superblocks_.extract(previous).value()); auto merged = prev_sb.merge(sb); - superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); + superblocks_.insert(std::move(merged)); } else if (merge_next) { - auto next_sb = std::move(superblocks_.extract(next).mapped()); + auto next_sb = std::move(superblocks_.extract(next).value()); auto merged = sb.merge(next_sb); - superblocks_.insert(std::make_pair(merged.pointer(), std::move(merged))); + superblocks_.insert(std::move(merged)); } else { - superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks_.insert(std::move(sb)); } } @@ -647,8 +643,8 @@ class global_arena final { Upstream* upstream_mr_; /// Block allocated from upstream so that it can be quickly freed. block upstream_block_; - /// Address-ordered map of superblocks. - std::map superblocks_; + /// Address-ordered set of superblocks. + std::set superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -769,11 +765,14 @@ class arena { */ block first_fit(std::size_t size) { - for (auto&& kv : superblocks_) { - auto const b = kv.second.first_fit(size); - if (b.is_valid()) { return b; } - } - return {}; + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); + if (iter == superblocks_.cend()) { return {}; } + + auto sb = std::move(superblocks_.extract(iter).value()); + auto const b = sb.first_fit(size); + superblocks_.insert(std::move(sb)); + return b; } /** @@ -784,17 +783,16 @@ class arena { */ bool deallocate_from_superblock(block const& b) { - auto iter = std::find_if(superblocks_.begin(), superblocks_.end(), [&](auto const& kv) { - return kv.second.contains(b); - }); - if (iter == superblocks_.end()) { return false; } + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); + if (iter == superblocks_.cend()) { return false; } - auto sb = std::move(superblocks_.extract(iter).mapped()); + auto sb = std::move(superblocks_.extract(iter).value()); sb.coalesce(b); if (sb.empty()) { global_arena_.release(std::move(sb)); } else { - superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks_.insert(std::move(sb)); } return true; } @@ -810,7 +808,7 @@ class arena { auto sb = global_arena_.acquire(size); if (sb.is_valid()) { auto const b = sb.first_fit(size); - superblocks_.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks_.insert(std::move(sb)); return b; } return {}; @@ -819,7 +817,7 @@ class arena { /// The global arena to allocate superblocks from. 
global_arena& global_arena_; /// Acquired superblocks. - std::map superblocks_; + std::set superblocks_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 049e3a2bd..85bfd5c83 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -332,13 +332,13 @@ TEST(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT global_arena ga{&mock, 16777216}; - std::map superblocks{}; + std::set superblocks{}; auto sb = ga.acquire(256); - superblocks.insert(std::make_pair(sb.pointer(), std::move(sb))); + superblocks.insert(std::move(sb)); auto sb2 = ga.acquire(1024); - superblocks.insert(std::make_pair(sb2.pointer(), std::move(sb2))); + superblocks.insert(std::move(sb2)); auto sb3 = ga.acquire(512); - superblocks.insert(std::make_pair(sb3.pointer(), std::move(sb3))); + superblocks.insert(std::move(sb3)); ga.release(superblocks); auto* p = ga.allocate(16777216); EXPECT_EQ(p, fake_address3); From 10771f59fb54a67c6306dfe991e6b0b5ebf07882 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 17 Nov 2021 16:53:10 -0800 Subject: [PATCH 17/35] stream synchronize before releasing superblock --- .../rmm/mr/device/arena_memory_resource.hpp | 18 ------------------ include/rmm/mr/device/detail/arena.hpp | 11 +++++++---- tests/mr/device/arena_mr_tests.cpp | 14 +++++++------- 3 files changed, 14 insertions(+), 29 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 03d0b23ce..87c2a72db 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -92,24 +92,6 @@ class arena_memory_resource final : public device_memory_resource { } } - /** - * @brief Construct an `arena_memory_resource`. - * - * @throws rmm::logic_error if `upstream_mr == nullptr`. - * - * @param upstream_mr The memory resource from which to allocate blocks for the pool. - * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on - * the current device. - * @param max_size Unused. - * @deprecated Use the version without the max size. - */ - arena_memory_resource(Upstream* upstream_mr, - std::optional arena_size, - std::optional max_size) - : arena_memory_resource{upstream_mr, arena_size, false} - { - } - ~arena_memory_resource() override = default; // Disable copy (and move) semantics. diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 00327ee44..c988efe16 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -446,10 +446,12 @@ class global_arena final { * @brief Release a superblock. * * @param s Superblock to be released. + * @param stream The stream to synchronize on before releasing. */ - void release(superblock&& sb) + void release(superblock&& sb, cuda_stream_view stream) { RMM_LOGGING_ASSERT(sb.is_valid()); + stream.synchronize_no_throw(); lock_guard lock(mtx_); coalesce(std::move(sb)); } @@ -702,7 +704,7 @@ class arena { { if (global_arena_.handles(size)) { return global_arena_.deallocate(ptr, size, stream); } lock_guard lock(mtx_); - return deallocate_from_superblock({ptr, size}); + return deallocate_from_superblock({ptr, size}, stream); } /** @@ -779,9 +781,10 @@ class arena { * @brief Deallocate a block from the superblock it belongs to. * * @param b The block to deallocate. + * @param stream The stream to use for deallocation. 
* @return true if the block is found. */ - bool deallocate_from_superblock(block const& b) + bool deallocate_from_superblock(block const& b, cuda_stream_view stream) { auto const iter = std::find_if( superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); @@ -790,7 +793,7 @@ class arena { auto sb = std::move(superblocks_.extract(iter).value()); sb.coalesce(b); if (sb.empty()) { - global_arena_.release(std::move(sb)); + global_arena_.release(std::move(sb), stream); } else { superblocks_.insert(std::move(sb)); } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 85bfd5c83..87acc2a67 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -284,7 +284,7 @@ TEST(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT global_arena ga{&mock, 8388608}; auto sb = ga.acquire(256); - ga.release(std::move(sb)); + ga.release(std::move(sb), {}); auto* p = ga.allocate(8388608); EXPECT_EQ(p, fake_address3); } @@ -300,8 +300,8 @@ TEST(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT auto sb = ga.acquire(256); auto sb2 = ga.acquire(1024); ga.acquire(512); - ga.release(std::move(sb)); - ga.release(std::move(sb2)); + ga.release(std::move(sb), {}); + ga.release(std::move(sb2), {}); auto* p = ga.allocate(8388608); EXPECT_EQ(p, fake_address3); } @@ -317,9 +317,9 @@ TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT auto sb = ga.acquire(256); auto sb2 = ga.acquire(1024); auto sb3 = ga.acquire(512); - ga.release(std::move(sb)); - ga.release(std::move(sb3)); - ga.release(std::move(sb2)); + ga.release(std::move(sb), {}); + ga.release(std::move(sb3), {}); + ga.release(std::move(sb2), {}); auto* p = ga.allocate(16777216); EXPECT_EQ(p, fake_address3); } @@ -383,7 +383,7 @@ TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT auto sb = ga.acquire(512); auto const b = sb.first_fit(512); - ga.release(std::move(sb)); + ga.release(std::move(sb), {}); ga.deallocate_from_other_arena(b.pointer(), b.size()); EXPECT_EQ(ga.allocate(8388608), fake_address3); } From c16f026a0a3a7c703e9017d002f9a5d3e55addd8 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 17 Nov 2021 18:22:17 -0800 Subject: [PATCH 18/35] update docs --- .../rmm/mr/device/arena_memory_resource.hpp | 18 ++++++++++-------- include/rmm/mr/device/detail/arena.hpp | 16 +++++++--------- tests/mr/device/arena_mr_tests.cpp | 2 +- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 87c2a72db..59ba968ff 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -78,9 +78,9 @@ class arena_memory_resource final : public device_memory_resource { * * @throws rmm::logic_error if `upstream_mr == nullptr`. * - * @param upstream_mr The memory resource from which to allocate blocks for the pool. - * @param arena_size Size in bytes of the global arena. Defaults to all the available memory on - * the current device. + * @param upstream_mr The memory resource from which to allocate blocks for the global arena. + * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory + * on the current device. */ explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, @@ -126,7 +126,7 @@ class arena_memory_resource final : public device_memory_resource { * * The returned pointer has at least 256-byte alignment. 
* - * @throws `std::bad_alloc` if the requested allocation could not be fulfilled. + * @throws `rmm::out_of_memory` if no more memory is available for the requested size. * * @param bytes The size in bytes of the allocation. * @param stream The stream to associate this allocation with. @@ -153,7 +153,7 @@ class arena_memory_resource final : public device_memory_resource { * * @param ptr Pointer to be deallocated. * @param bytes The size in bytes of the allocation. This must be equal to the - * value of `bytes` that was passed to the `allocate` call that returned `p`. + * value of `bytes` that was passed to the `allocate` call that returned `ptr`. * @param stream Stream on which to perform deallocation. */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override @@ -176,19 +176,21 @@ class arena_memory_resource final : public device_memory_resource { */ void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream) { + // Since we are returning this memory to another stream, we need to make sure the current stream + // is caught up. stream.synchronize_no_throw(); - read_lock lock(mtx_); + write_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); - for (auto&& kv : thread_arenas_) { + for (auto const& kv : thread_arenas_) { // If the arena does not belong to the current thread, try to deallocate from it, and return // if successful. if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; } } } else { - for (auto&& kv : stream_arenas_) { + for (auto& kv : stream_arenas_) { // If the arena does not belong to the current stream, try to deallocate from it, and return // if successful. if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index c988efe16..5f2bfc501 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -134,7 +134,7 @@ class block final : public memory_span { [[nodiscard]] std::pair split(std::size_t sz) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(size() >= sz); + RMM_LOGGING_ASSERT(size() > sz); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return {{pointer(), sz}, {pointer() + sz, size() - sz}}; } @@ -149,8 +149,6 @@ class block final : public memory_span { */ [[nodiscard]] block merge(block const& b) const { - RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(b.is_valid()); RMM_LOGGING_ASSERT(is_contiguous_before(b)); return {pointer(), size() + b.size()}; } @@ -313,7 +311,7 @@ class superblock final : public memory_span { * * @param b The block to coalesce. */ - void coalesce(block const& b) + void coalesce(block const& b) // NOLINT(readability-function-cognitive-complexity) { RMM_LOGGING_ASSERT(is_valid()); RMM_LOGGING_ASSERT(b.is_valid()); @@ -417,7 +415,6 @@ class global_arena final { { lock_guard lock(mtx_); upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); - superblocks_.clear(); } /** @@ -491,15 +488,13 @@ class global_arena final { * @param size The size in bytes of the allocation. This must be equal to the value of `size` * that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform deallocation. - * @return bool true if the allocation is found, false otherwise. 
*/ - bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + void deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); lock_guard lock(mtx_); coalesce({ptr, size}); - return true; } /** @@ -702,7 +697,10 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size)) { return global_arena_.deallocate(ptr, size, stream); } + if (global_arena_.handles(size)) { + global_arena_.deallocate(ptr, size, stream); + return true; + } lock_guard lock(mtx_); return deallocate_from_superblock({ptr, size}, stream); } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 87acc2a67..44aec7398 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -368,7 +368,7 @@ TEST(ArenaTest, GlobalArenaDeallocate) // NOLINT auto* ptr = ga.allocate(4194304); EXPECT_EQ(ptr, fake_address3); - EXPECT_TRUE(ga.deallocate(ptr, 4194304, {})); + ga.deallocate(ptr, 4194304, {}); ptr = ga.allocate(4194304); EXPECT_EQ(ptr, fake_address3); } From f3e687515880415aefe37a7e16f1042f7cf12efa Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 18 Nov 2021 09:35:26 -0800 Subject: [PATCH 19/35] use byte literals in tests --- tests/mr/device/arena_mr_tests.cpp | 253 +++++++++++++++-------------- 1 file changed, 127 insertions(+), 126 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 44aec7398..a251fee08 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -20,6 +20,7 @@ #include #include #include +#include "../../byte_literals.hpp" #include #include @@ -43,13 +44,13 @@ using arena_mr = rmm::mr::arena_memory_resource(1024L); +auto const fake_address = reinterpret_cast(1_KiB); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address2 = reinterpret_cast(2048L); +auto const fake_address2 = reinterpret_cast(2_KiB); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address3 = reinterpret_cast(4194304L); +auto const fake_address3 = reinterpret_cast(4_MiB); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address4 = reinterpret_cast(8388608L); +auto const fake_address4 = reinterpret_cast(8_MiB); /** * Test memory_span. 
@@ -69,38 +70,38 @@ TEST(ArenaTest, MemorySpan) // NOLINT TEST(ArenaTest, BlockFits) // NOLINT { - block const b{fake_address, 1024}; - EXPECT_TRUE(b.fits(1024)); - EXPECT_FALSE(b.fits(1025)); + block const b{fake_address, 1_KiB}; + EXPECT_TRUE(b.fits(1_KiB)); + EXPECT_FALSE(b.fits(1_KiB + 1)); } TEST(ArenaTest, BlockIsContiguousBefore) // NOLINT { - block const b{fake_address, 1024}; + block const b{fake_address, 1_KiB}; block const b2{fake_address2, 256}; EXPECT_TRUE(b.is_contiguous_before(b2)); block const b3{fake_address, 512}; - block const b4{fake_address2, 1024}; + block const b4{fake_address2, 1_KiB}; EXPECT_FALSE(b3.is_contiguous_before(b4)); } TEST(ArenaTest, BlockSplit) // NOLINT { - block const b{fake_address, 2048}; - auto const [head, tail] = b.split(1024); + block const b{fake_address, 2_KiB}; + auto const [head, tail] = b.split(1_KiB); EXPECT_EQ(head.pointer(), fake_address); - EXPECT_EQ(head.size(), 1024); + EXPECT_EQ(head.size(), 1_KiB); EXPECT_EQ(tail.pointer(), fake_address2); - EXPECT_EQ(tail.size(), 1024); + EXPECT_EQ(tail.size(), 1_KiB); } TEST(ArenaTest, BlockMerge) // NOLINT { - block const b{fake_address, 1024}; - block const b2{fake_address2, 1024}; + block const b{fake_address, 1_KiB}; + block const b2{fake_address2, 1_KiB}; auto const merged = b.merge(b2); EXPECT_EQ(merged.pointer(), fake_address); - EXPECT_EQ(merged.size(), 2048); + EXPECT_EQ(merged.size(), 2_KiB); } /** @@ -109,7 +110,7 @@ TEST(ArenaTest, BlockMerge) // NOLINT TEST(ArenaTest, SuperblockEmpty) // NOLINT { - superblock sb{fake_address3, 4194304}; + superblock sb{fake_address3, 4_MiB}; EXPECT_TRUE(sb.empty()); sb.first_fit(256); EXPECT_FALSE(sb.empty()); @@ -117,14 +118,14 @@ TEST(ArenaTest, SuperblockEmpty) // NOLINT TEST(ArenaTest, SuperblockContains) // NOLINT { - superblock const sb{fake_address3, 4194304}; - block const b{fake_address, 2048}; + superblock const sb{fake_address3, 4_MiB}; + block const b{fake_address, 2_KiB}; EXPECT_FALSE(sb.contains(b)); - block const b2{fake_address3, 1024}; + block const b2{fake_address3, 1_KiB}; EXPECT_TRUE(sb.contains(b2)); - block const b3{fake_address3, 4194305}; + block const b3{fake_address3, 4_MiB + 1}; EXPECT_FALSE(sb.contains(b3)); - block const b4{fake_address3, 4194304}; + block const b4{fake_address3, 4_MiB}; EXPECT_TRUE(sb.contains(b4)); block const b5{fake_address4, 256}; EXPECT_FALSE(sb.contains(b5)); @@ -132,21 +133,21 @@ TEST(ArenaTest, SuperblockContains) // NOLINT TEST(ArenaTest, SuperblockFits) // NOLINT { - superblock sb{fake_address3, 4194304}; - EXPECT_TRUE(sb.fits(4194304)); - EXPECT_FALSE(sb.fits(4194305)); + superblock sb{fake_address3, 4_MiB}; + EXPECT_TRUE(sb.fits(4_MiB)); + EXPECT_FALSE(sb.fits(4_MiB + 1)); - auto const b = sb.first_fit(1048576); - sb.first_fit(1048576); + auto const b = sb.first_fit(1_MiB); + sb.first_fit(1_MiB); sb.coalesce(b); - EXPECT_TRUE(sb.fits(2097152)); - EXPECT_FALSE(sb.fits(2097153)); + EXPECT_TRUE(sb.fits(2_MiB)); + EXPECT_FALSE(sb.fits(2_MiB + 1)); } TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT { - superblock sb{fake_address3, 4194304}; - superblock sb2{fake_address4, 4194304}; + superblock sb{fake_address3, 4_MiB}; + superblock sb2{fake_address4, 4_MiB}; EXPECT_TRUE(sb.is_contiguous_before(sb2)); auto const b = sb.first_fit(256); @@ -154,7 +155,7 @@ TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT sb.coalesce(b); EXPECT_TRUE(sb.is_contiguous_before(sb2)); - auto const b2 = sb2.first_fit(1024); + auto const b2 = sb2.first_fit(1_KiB); EXPECT_FALSE(sb.is_contiguous_before(sb2)); 
sb2.coalesce(b2); EXPECT_TRUE(sb.is_contiguous_before(sb2)); @@ -162,36 +163,36 @@ TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT TEST(ArenaTest, SuperblockSplit) // NOLINT { - superblock sb{fake_address3, 8388608}; - auto const [head, tail] = sb.split(4194304); + superblock sb{fake_address3, 8_MiB}; + auto const [head, tail] = sb.split(4_MiB); EXPECT_EQ(head.pointer(), fake_address3); - EXPECT_EQ(head.size(), 4194304); + EXPECT_EQ(head.size(), 4_MiB); EXPECT_TRUE(head.empty()); EXPECT_EQ(tail.pointer(), fake_address4); - EXPECT_EQ(tail.size(), 4194304); + EXPECT_EQ(tail.size(), 4_MiB); EXPECT_TRUE(tail.empty()); } TEST(ArenaTest, SuperblockMerge) // NOLINT { - superblock sb{fake_address3, 4194304}; - superblock sb2{fake_address4, 4194304}; + superblock sb{fake_address3, 4_MiB}; + superblock sb2{fake_address4, 4_MiB}; auto const merged = sb.merge(sb2); EXPECT_EQ(merged.pointer(), fake_address3); - EXPECT_EQ(merged.size(), 8388608); + EXPECT_EQ(merged.size(), 8_MiB); EXPECT_TRUE(merged.empty()); } TEST(ArenaTest, SuperblockFirstFit) // NOLINT { - superblock sb{fake_address3, 4194304}; - auto const b = sb.first_fit(1024); + superblock sb{fake_address3, 4_MiB}; + auto const b = sb.first_fit(1_KiB); EXPECT_EQ(b.pointer(), fake_address3); - EXPECT_EQ(b.size(), 1024); - auto const b2 = sb.first_fit(2048); + EXPECT_EQ(b.size(), 1_KiB); + auto const b2 = sb.first_fit(2_KiB); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - EXPECT_EQ(b2.pointer(), static_cast(fake_address3) + 1024); - EXPECT_EQ(b2.size(), 2048); + EXPECT_EQ(b2.pointer(), static_cast(fake_address3) + 1_KiB); + EXPECT_EQ(b2.size(), 2_KiB); sb.coalesce(b); auto const b3 = sb.first_fit(512); EXPECT_EQ(b3.pointer(), fake_address3); @@ -200,48 +201,48 @@ TEST(ArenaTest, SuperblockFirstFit) // NOLINT TEST(ArenaTest, SuperblockCoalesceAfterFull) // NOLINT { - superblock sb{fake_address3, 4194304}; - auto const b = sb.first_fit(2097152); - sb.first_fit(2097152); + superblock sb{fake_address3, 4_MiB}; + auto const b = sb.first_fit(2_MiB); + sb.first_fit(2_MiB); sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(2097152).is_valid()); + EXPECT_TRUE(sb.first_fit(2_MiB).is_valid()); } TEST(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT { - superblock sb{fake_address3, 4194304}; - auto const b = sb.first_fit(2097152); + superblock sb{fake_address3, 4_MiB}; + auto const b = sb.first_fit(2_MiB); sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(4194304).is_valid()); + EXPECT_TRUE(sb.first_fit(4_MiB).is_valid()); } TEST(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT { - superblock sb{fake_address3, 4194304}; - auto const b = sb.first_fit(1024); - auto const b2 = sb.first_fit(1024); - sb.first_fit(1024); + superblock sb{fake_address3, 4_MiB}; + auto const b = sb.first_fit(1_KiB); + auto const b2 = sb.first_fit(1_KiB); + sb.first_fit(1_KiB); sb.coalesce(b); sb.coalesce(b2); - auto const b3 = sb.first_fit(2048); + auto const b3 = sb.first_fit(2_KiB); EXPECT_EQ(b3.pointer(), fake_address3); } TEST(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT { - superblock sb{fake_address3, 4194304}; - auto const b = sb.first_fit(1024); - auto const b2 = sb.first_fit(1024); + superblock sb{fake_address3, 4_MiB}; + auto const b = sb.first_fit(1_KiB); + auto const b2 = sb.first_fit(1_KiB); sb.coalesce(b); sb.coalesce(b2); - EXPECT_TRUE(sb.first_fit(4194304).is_valid()); + EXPECT_TRUE(sb.first_fit(4_MiB).is_valid()); } TEST(ArenaTest, SuperblockMaxFree) // NOLINT { - superblock sb{fake_address3, 4194304}; - sb.first_fit(2097152); - 
EXPECT_EQ(sb.max_free(), 2097152); + superblock sb{fake_address3, 4_MiB}; + sb.first_fit(2_MiB); + EXPECT_EQ(sb.max_free(), 2_MiB); } /** @@ -257,19 +258,19 @@ TEST(ArenaTest, GlobalArenaNullUpstream) // NOLINT TEST(ArenaTest, GlobalArenaAcquire) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8388608}; + global_arena ga{&mock, 8_MiB}; auto const sb = ga.acquire(256); EXPECT_EQ(sb.pointer(), fake_address3); - EXPECT_EQ(sb.size(), 4194304); + EXPECT_EQ(sb.size(), 4_MiB); EXPECT_TRUE(sb.empty()); - auto const sb2 = ga.acquire(1024); + auto const sb2 = ga.acquire(1_KiB); EXPECT_EQ(sb2.pointer(), fake_address4); - EXPECT_EQ(sb2.size(), 4194304); + EXPECT_EQ(sb2.size(), 4_MiB); EXPECT_TRUE(sb2.empty()); EXPECT_FALSE(ga.acquire(512).is_valid()); @@ -278,114 +279,114 @@ TEST(ArenaTest, GlobalArenaAcquire) // NOLINT TEST(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8388608}; + global_arena ga{&mock, 8_MiB}; auto sb = ga.acquire(256); ga.release(std::move(sb), {}); - auto* p = ga.allocate(8388608); + auto* p = ga.allocate(8_MiB); EXPECT_EQ(p, fake_address3); } TEST(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - global_arena ga{&mock, 16777216}; + global_arena ga{&mock, 16_MiB}; auto sb = ga.acquire(256); - auto sb2 = ga.acquire(1024); + auto sb2 = ga.acquire(1_KiB); ga.acquire(512); ga.release(std::move(sb), {}); ga.release(std::move(sb2), {}); - auto* p = ga.allocate(8388608); + auto* p = ga.allocate(8_MiB); EXPECT_EQ(p, fake_address3); } TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - global_arena ga{&mock, 16777216}; + global_arena ga{&mock, 16_MiB}; auto sb = ga.acquire(256); - auto sb2 = ga.acquire(1024); + auto sb2 = ga.acquire(1_KiB); auto sb3 = ga.acquire(512); ga.release(std::move(sb), {}); ga.release(std::move(sb3), {}); ga.release(std::move(sb2), {}); - auto* p = ga.allocate(16777216); + auto* p = ga.allocate(16_MiB); EXPECT_EQ(p, fake_address3); } TEST(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16777216)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16777216)); + EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - global_arena ga{&mock, 16777216}; + global_arena ga{&mock, 16_MiB}; std::set superblocks{}; auto sb = ga.acquire(256); 
superblocks.insert(std::move(sb)); - auto sb2 = ga.acquire(1024); + auto sb2 = ga.acquire(1_KiB); superblocks.insert(std::move(sb2)); auto sb3 = ga.acquire(512); superblocks.insert(std::move(sb3)); ga.release(superblocks); - auto* p = ga.allocate(16777216); + auto* p = ga.allocate(16_MiB); EXPECT_EQ(p, fake_address3); } TEST(ArenaTest, GlobalArenaAllocate) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8388608}; + global_arena ga{&mock, 8_MiB}; - auto* ptr = ga.allocate(4194304); + auto* ptr = ga.allocate(4_MiB); EXPECT_EQ(ptr, fake_address3); - auto* ptr2 = ga.allocate(4194304); + auto* ptr2 = ga.allocate(4_MiB); EXPECT_EQ(ptr2, fake_address4); } TEST(ArenaTest, GlobalArenaDeallocate) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8388608}; + global_arena ga{&mock, 8_MiB}; - auto* ptr = ga.allocate(4194304); + auto* ptr = ga.allocate(4_MiB); EXPECT_EQ(ptr, fake_address3); - ga.deallocate(ptr, 4194304, {}); - ptr = ga.allocate(4194304); + ga.deallocate(ptr, 4_MiB, {}); + ptr = ga.allocate(4_MiB); EXPECT_EQ(ptr, fake_address3); } TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8388608}; + global_arena ga{&mock, 8_MiB}; auto sb = ga.acquire(512); auto const b = sb.first_fit(512); ga.release(std::move(sb), {}); ga.deallocate_from_other_arena(b.pointer(), b.size()); - EXPECT_EQ(ga.allocate(8388608), fake_address3); + EXPECT_EQ(ga.allocate(8_MiB), fake_address3); } /** @@ -395,36 +396,36 @@ TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT TEST(ArenaTest, ArenaAllocate) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); - global_arena ga{&mock, 8388608}; + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + global_arena ga{&mock, 8_MiB}; arena a{ga}; - EXPECT_EQ(a.allocate(4194304), fake_address3); + EXPECT_EQ(a.allocate(4_MiB), fake_address3); EXPECT_EQ(a.allocate(256), fake_address4); } TEST(ArenaTest, ArenaDeallocate) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); - global_arena ga{&mock, 8388608}; + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + global_arena ga{&mock, 8_MiB}; arena a{ga}; - auto* ptr = a.allocate(4194304); - a.deallocate(ptr, 4194304, {}); + auto* ptr = a.allocate(4_MiB); + a.deallocate(ptr, 4_MiB, {}); auto* ptr2 = a.allocate(256); a.deallocate(ptr2, 256, {}); - EXPECT_EQ(a.allocate(8388608), fake_address3); + 
EXPECT_EQ(a.allocate(8_MiB), fake_address3); } TEST(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); - global_arena ga{&mock, 8388608}; + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + global_arena ga{&mock, 8_MiB}; arena a{ga}; auto* ptr = a.allocate(256); @@ -438,9 +439,9 @@ TEST(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT TEST(ArenaTest, ArenaDeallocateMergeNext) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); - global_arena ga{&mock, 8388608}; + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + global_arena ga{&mock, 8_MiB}; arena a{ga}; auto* ptr = a.allocate(256); @@ -454,16 +455,16 @@ TEST(ArenaTest, ArenaDeallocateMergeNext) // NOLINT TEST(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT { mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8388608)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8388608)); - global_arena ga{&mock, 8388608}; + EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + global_arena ga{&mock, 8_MiB}; arena a{ga}; auto* ptr = a.allocate(256); auto* ptr2 = a.allocate(256); a.deallocate(ptr, 256, {}); a.deallocate(ptr2, 256, {}); - EXPECT_EQ(a.allocate(2048), fake_address3); + EXPECT_EQ(a.allocate(2_KiB), fake_address3); } /** @@ -492,11 +493,11 @@ TEST(ArenaTest, SmallMediumLarge) // NOLINT EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) arena_mr mr(rmm::mr::get_current_device_resource()); auto* small = mr.allocate(256); - auto* medium = mr.allocate(1U << 26U); + auto* medium = mr.allocate(64_MiB); auto const free = rmm::detail::available_device_memory().first; auto* large = mr.allocate(free / 3); mr.deallocate(small, 256); - mr.deallocate(medium, 1U << 26U); + mr.deallocate(medium, 64_MiB); mr.deallocate(large, free / 3); }()); } From cb25f74088ed232358607571a1e81eec6957b402 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 18 Nov 2021 10:21:38 -0800 Subject: [PATCH 20/35] fix overflow bug --- include/rmm/mr/device/arena_memory_resource.hpp | 6 +++--- include/rmm/mr/device/detail/arena.hpp | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 59ba968ff..c37f6f19b 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -180,17 +180,17 @@ class arena_memory_resource final : public device_memory_resource { // is caught up. stream.synchronize_no_throw(); - write_lock lock(mtx_); + read_lock lock(mtx_); if (use_per_thread_arena(stream)) { auto const id = std::this_thread::get_id(); - for (auto const& kv : thread_arenas_) { + for (auto&& kv : thread_arenas_) { // If the arena does not belong to the current thread, try to deallocate from it, and return // if successful. 
if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; } } } else { - for (auto& kv : stream_arenas_) { + for (auto&& kv : stream_arenas_) { // If the arena does not belong to the current stream, try to deallocate from it, and return // if successful. if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 5f2bfc501..444eb3102 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -185,6 +185,7 @@ class superblock final : public memory_span { superblock(void* pointer, std::size_t size) : memory_span{pointer, size} { RMM_LOGGING_ASSERT(size > minimum_size / 2); + RMM_LOGGING_ASSERT(size < 1UL << 40UL); free_blocks_.emplace(pointer, size); } @@ -259,7 +260,7 @@ class superblock final : public memory_span { [[nodiscard]] std::pair split(std::size_t sz) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(empty() && sz >= minimum_size && size() - sz >= minimum_size); + RMM_LOGGING_ASSERT(empty() && sz >= minimum_size && size() >= sz + minimum_size); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) return {superblock{pointer(), sz}, superblock{pointer() + sz, size() - sz}}; } @@ -587,7 +588,7 @@ class global_arena final { auto sb = std::move(superblocks_.extract(iter).value()); auto const sz = std::max(size, superblock::minimum_size); - if (sb.empty() && sb.size() - sz >= superblock::minimum_size) { + if (sb.empty() && sb.size() >= sz + superblock::minimum_size) { // Split the superblock and put the remainder back. auto [head, tail] = sb.split(sz); superblocks_.insert(std::move(tail)); From 6eb957f7f45678fc0e276893c6749c5957ff0c86 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Mon, 22 Nov 2021 18:39:55 -0800 Subject: [PATCH 21/35] more fixes --- .../rmm/mr/device/arena_memory_resource.hpp | 24 +- include/rmm/mr/device/detail/arena.hpp | 114 ++++- tests/mr/device/arena_mr_tests.cpp | 429 +++++++++--------- 3 files changed, 334 insertions(+), 233 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index c37f6f19b..440da5a0b 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -141,13 +141,32 @@ class arena_memory_resource final : public device_memory_resource { void* pointer = arena.allocate(bytes); if (pointer == nullptr) { - if (dump_log_on_failure_) { dump_memory_log(bytes); } - RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); + write_lock lock(mtx_); + defragment(); + pointer = arena.allocate(bytes); + if (pointer == nullptr) { + if (dump_log_on_failure_) { dump_memory_log(bytes); } + RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); + } } return pointer; } + /** + * @brief Defragment memory by returning all superblocks to the global arena. + */ + void defragment() + { + RMM_CUDA_TRY(cudaDeviceSynchronize()); + for (auto& thread_arena : thread_arenas_) { + thread_arena.second->defragment(); + } + for (auto& stream_arena : stream_arenas_) { + stream_arena.second.defragment(); + } + } + /** * @brief Deallocate memory pointed to by `ptr`. 
* @@ -291,6 +310,7 @@ class arena_memory_resource final : public device_memory_resource { stream_arena.second.dump_memory_log(logger_); } } + logger_->flush(); } /** diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 444eb3102..802ea269a 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -66,6 +66,12 @@ class memory_span { /// Returns the size of the span. [[nodiscard]] std::size_t size() const { return size_; } + /// Returns the end of the span. + [[nodiscard]] char* end() const + { + return pointer_ + size_; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) + } + /// Returns true if this span is valid (non-null), false otherwise. [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } @@ -168,8 +174,8 @@ inline bool block_size_compare(block const& lhs, block const& rhs) */ class superblock final : public memory_span { public: - /// Minimum size of a superblock (4 MiB). - static constexpr std::size_t minimum_size{1U << 22U}; + /// Minimum size of a superblock (64 MiB). + static constexpr std::size_t minimum_size{1U << 26U}; /** * @brief Construct a default superblock. @@ -184,7 +190,7 @@ class superblock final : public memory_span { */ superblock(void* pointer, std::size_t size) : memory_span{pointer, size} { - RMM_LOGGING_ASSERT(size > minimum_size / 2); + RMM_LOGGING_ASSERT(size >= minimum_size); RMM_LOGGING_ASSERT(size < 1UL << 40UL); free_blocks_.emplace(pointer, size); } @@ -209,6 +215,17 @@ class superblock final : public memory_span { return free_blocks_.size() == 1 && free_blocks_.cbegin()->size() == size(); } + /** + * @brief Return the number of free blocks. + * + * @return the number of free blocks. + */ + [[nodiscard]] std::size_t free_blocks() const + { + RMM_LOGGING_ASSERT(is_valid()); + return free_blocks_.size(); + } + /** * @brief Whether this superblock contains the given block. * @@ -355,6 +372,7 @@ class superblock final : public memory_span { */ [[nodiscard]] std::size_t max_free() const { + if (free_blocks_.empty()) { return 0; } return std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare)->size(); } @@ -424,7 +442,7 @@ class global_arena final { * @param size The size in bytes of the allocation. * @return bool True if the allocation should be handled by the global arena. */ - bool handles(std::size_t size) const { return size > superblock::minimum_size / 2; } + bool handles(std::size_t size) const { return size > superblock::minimum_size; } /** * @brief Acquire a superblock that can fit a block of the given size. @@ -437,19 +455,17 @@ class global_arena final { // Superblocks should only be acquired if the size is not directly handled by the global arena. RMM_LOGGING_ASSERT(!handles(size)); lock_guard lock(mtx_); - return first_fit(size); + return first_fit(size, superblock::minimum_size); } /** * @brief Release a superblock. * * @param s Superblock to be released. - * @param stream The stream to synchronize on before releasing. 
*/ - void release(superblock&& sb, cuda_stream_view stream) + void release(superblock&& sb) { RMM_LOGGING_ASSERT(sb.is_valid()); - stream.synchronize_no_throw(); lock_guard lock(mtx_); coalesce(std::move(sb)); } @@ -479,7 +495,13 @@ class global_arena final { { RMM_LOGGING_ASSERT(handles(size)); lock_guard lock(mtx_); - return first_fit(size).pointer(); + auto const aligned = rmm::detail::align_up(size, superblock::minimum_size); + auto sb = first_fit(aligned, aligned); + if (sb.is_valid()) { + RMM_LOGGING_ASSERT(large_allocations_.find(sb.pointer()) == large_allocations_.cend()); + large_allocations_.emplace(sb.pointer(), sb.size()); + } + return sb.pointer(); } /** @@ -495,7 +517,9 @@ class global_arena final { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); lock_guard lock(mtx_); - coalesce({ptr, size}); + auto const allocated_size = large_allocations_.at(ptr); + large_allocations_.erase(ptr); + coalesce({ptr, allocated_size}); } /** @@ -539,6 +563,23 @@ class global_arena final { logger->info(" Total size of superblocks: {}", rmm::detail::bytes{total_memory_size(superblocks_)}); logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); + logger->info(" # of outstanding large allocations: {}", large_allocations_.size()); + auto i = 0; + char* prev_end{}; + for (auto const& sb : superblocks_) { + if (prev_end == nullptr) { prev_end = sb.pointer(); } + logger->info( + " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, gap={}", + i, + fmt::ptr(sb.pointer()), + fmt::ptr(sb.end()), + rmm::detail::bytes{sb.size()}, + sb.empty(), + sb.free_blocks(), + rmm::detail::bytes{static_cast(sb.pointer() - prev_end)}); + prev_end = sb.end(); + i++; + } } } @@ -578,19 +619,20 @@ class global_arena final { * Sigplan Notices, 34(3), 26-36. * * @param size The number of bytes to allocate. + * @param minimum_size The minimum size of the superblock required. * @return superblock A superblock that can fit at least `size` bytes, or empty if not found. */ - superblock first_fit(std::size_t size) + superblock first_fit(std::size_t size, std::size_t minimum_size) { - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); + auto const iter = std::find_if(superblocks_.cbegin(), superblocks_.cend(), [=](auto const& sb) { + return sb.fits(size) && sb.size() >= minimum_size; + }); if (iter == superblocks_.cend()) { return {}; } - auto sb = std::move(superblocks_.extract(iter).value()); - auto const sz = std::max(size, superblock::minimum_size); - if (sb.empty() && sb.size() >= sz + superblock::minimum_size) { + auto sb = std::move(superblocks_.extract(iter).value()); + if (sb.empty() && sb.size() >= minimum_size + superblock::minimum_size) { // Split the superblock and put the remainder back. - auto [head, tail] = sb.split(sz); + auto [head, tail] = sb.split(minimum_size); superblocks_.insert(std::move(tail)); return std::move(head); } @@ -643,6 +685,8 @@ class global_arena final { block upstream_block_; /// Address-ordered set of superblocks. std::set superblocks_; + /// Large allocations. + std::unordered_map large_allocations_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -707,7 +751,7 @@ class arena { } /** - * @brief Clean the arena and deallocate free blocks from the global arena. + * @brief Clean the arena and release all superblocks to the global arena. 
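+   *
+   * This is what the thread-local `arena_cleaner` held by `arena_memory_resource::get_arena` is
+   * assumed to call when its owning thread exits, so a finished thread's superblocks flow back to
+   * the global arena instead of being stranded (an inference about intent, not part of this diff).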
*/ void clean() { @@ -715,6 +759,20 @@ class arena { global_arena_.release(superblocks_); } + /** + * @brief Defragment the arena and release empty superblock to the global arena. + */ + void defragment() + { + lock_guard lock(mtx_); + while (true) { + auto const iter = std::find_if( + superblocks_.cbegin(), superblocks_.cend(), [](auto const& sb) { return sb.empty(); }); + if (iter == superblocks_.cend()) { return; } + global_arena_.release(std::move(superblocks_.extract(iter).value())); + } + } + /** * Dump memory to log. * @@ -729,6 +787,19 @@ class arena { rmm::detail::bytes{total_memory_size(superblocks_)}); logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); + auto i = 0; + for (auto const& sb : superblocks_) { + logger->info( + " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}", + i, + fmt::ptr(sb.pointer()), + fmt::ptr(sb.end()), + rmm::detail::bytes{sb.size()}, + sb.empty(), + sb.free_blocks(), + rmm::detail::bytes{sb.max_free()}); + i++; + } } } @@ -791,11 +862,7 @@ class arena { auto sb = std::move(superblocks_.extract(iter).value()); sb.coalesce(b); - if (sb.empty()) { - global_arena_.release(std::move(sb), stream); - } else { - superblocks_.insert(std::move(sb)); - } + superblocks_.insert(std::move(sb)); return true; } @@ -809,6 +876,7 @@ class arena { { auto sb = global_arena_.acquire(size); if (sb.is_valid()) { + RMM_LOGGING_ASSERT(sb.size() >= superblock::minimum_size); auto const b = sb.first_fit(size); superblocks_.insert(std::move(sb)); return b; diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index a251fee08..15d8faef6 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include #include #include #include @@ -26,6 +27,8 @@ #include #include +#include + namespace rmm::test { namespace { @@ -43,20 +46,40 @@ using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) auto const fake_address = reinterpret_cast(1_KiB); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) auto const fake_address2 = reinterpret_cast(2_KiB); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address3 = reinterpret_cast(4_MiB); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) -auto const fake_address4 = reinterpret_cast(8_MiB); +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +auto const fake_address3 = reinterpret_cast(superblock::minimum_size); +// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2); + +class ArenaTest : public ::testing::Test { + protected: + void SetUp() override + { + EXPECT_CALL(mock_, allocate(arena_size_)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_, deallocate(fake_address3, arena_size_)); + ga_ = std::make_unique(&mock_, arena_size_); + a_ = std::make_unique(*ga_); + } + + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + std::size_t arena_size_{superblock::minimum_size * 4}; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + mock_memory_resource mock_{}; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + std::unique_ptr ga_{}; + // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + std::unique_ptr a_{}; +}; /** * Test memory_span. */ -TEST(ArenaTest, MemorySpan) // NOLINT +TEST_F(ArenaTest, MemorySpan) // NOLINT { memory_span const ms{}; EXPECT_FALSE(ms.is_valid()); @@ -68,14 +91,14 @@ TEST(ArenaTest, MemorySpan) // NOLINT * Test block. */ -TEST(ArenaTest, BlockFits) // NOLINT +TEST_F(ArenaTest, BlockFits) // NOLINT { block const b{fake_address, 1_KiB}; EXPECT_TRUE(b.fits(1_KiB)); EXPECT_FALSE(b.fits(1_KiB + 1)); } -TEST(ArenaTest, BlockIsContiguousBefore) // NOLINT +TEST_F(ArenaTest, BlockIsContiguousBefore) // NOLINT { block const b{fake_address, 1_KiB}; block const b2{fake_address2, 256}; @@ -85,7 +108,7 @@ TEST(ArenaTest, BlockIsContiguousBefore) // NOLINT EXPECT_FALSE(b3.is_contiguous_before(b4)); } -TEST(ArenaTest, BlockSplit) // NOLINT +TEST_F(ArenaTest, BlockSplit) // NOLINT { block const b{fake_address, 2_KiB}; auto const [head, tail] = b.split(1_KiB); @@ -95,7 +118,7 @@ TEST(ArenaTest, BlockSplit) // NOLINT EXPECT_EQ(tail.size(), 1_KiB); } -TEST(ArenaTest, BlockMerge) // NOLINT +TEST_F(ArenaTest, BlockMerge) // NOLINT { block const b{fake_address, 1_KiB}; block const b2{fake_address2, 1_KiB}; @@ -108,46 +131,46 @@ TEST(ArenaTest, BlockMerge) // NOLINT * Test superblock. 
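 *
 * A rough sketch of the lifecycle these cases exercise (for orientation only): a superblock hands
 * out blocks with first_fit() and takes them back with coalesce(), which merges neighbouring free
 * blocks so the space can be handed out again.
 *
 *   superblock sb{fake_address3, superblock::minimum_size};
 *   auto const blk = sb.first_fit(1_KiB);   // carve 1 KiB off the front of the superblock
 *   sb.coalesce(blk);                       // hand it back; adjacent free blocks are merged
 *   // sb.empty() is true again once everything carved out has been coalesced back.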
*/ -TEST(ArenaTest, SuperblockEmpty) // NOLINT +TEST_F(ArenaTest, SuperblockEmpty) // NOLINT { - superblock sb{fake_address3, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; EXPECT_TRUE(sb.empty()); sb.first_fit(256); EXPECT_FALSE(sb.empty()); } -TEST(ArenaTest, SuperblockContains) // NOLINT +TEST_F(ArenaTest, SuperblockContains) // NOLINT { - superblock const sb{fake_address3, 4_MiB}; + superblock const sb{fake_address3, superblock::minimum_size}; block const b{fake_address, 2_KiB}; EXPECT_FALSE(sb.contains(b)); block const b2{fake_address3, 1_KiB}; EXPECT_TRUE(sb.contains(b2)); - block const b3{fake_address3, 4_MiB + 1}; + block const b3{fake_address3, superblock::minimum_size + 1}; EXPECT_FALSE(sb.contains(b3)); - block const b4{fake_address3, 4_MiB}; + block const b4{fake_address3, superblock::minimum_size}; EXPECT_TRUE(sb.contains(b4)); block const b5{fake_address4, 256}; EXPECT_FALSE(sb.contains(b5)); } -TEST(ArenaTest, SuperblockFits) // NOLINT +TEST_F(ArenaTest, SuperblockFits) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - EXPECT_TRUE(sb.fits(4_MiB)); - EXPECT_FALSE(sb.fits(4_MiB + 1)); + superblock sb{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sb.fits(superblock::minimum_size)); + EXPECT_FALSE(sb.fits(superblock::minimum_size + 1)); - auto const b = sb.first_fit(1_MiB); - sb.first_fit(1_MiB); + auto const b = sb.first_fit(superblock::minimum_size / 4); + sb.first_fit(superblock::minimum_size / 4); sb.coalesce(b); - EXPECT_TRUE(sb.fits(2_MiB)); - EXPECT_FALSE(sb.fits(2_MiB + 1)); + EXPECT_TRUE(sb.fits(superblock::minimum_size / 2)); + EXPECT_FALSE(sb.fits(superblock::minimum_size / 2 + 1)); } -TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT +TEST_F(ArenaTest, SuperblockIsContiguousBefore) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - superblock sb2{fake_address4, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; EXPECT_TRUE(sb.is_contiguous_before(sb2)); auto const b = sb.first_fit(256); @@ -161,31 +184,31 @@ TEST(ArenaTest, SuperblockIsContiguousBefore) // NOLINT EXPECT_TRUE(sb.is_contiguous_before(sb2)); } -TEST(ArenaTest, SuperblockSplit) // NOLINT +TEST_F(ArenaTest, SuperblockSplit) // NOLINT { - superblock sb{fake_address3, 8_MiB}; - auto const [head, tail] = sb.split(4_MiB); + superblock sb{fake_address3, superblock::minimum_size * 2}; + auto const [head, tail] = sb.split(superblock::minimum_size); EXPECT_EQ(head.pointer(), fake_address3); - EXPECT_EQ(head.size(), 4_MiB); + EXPECT_EQ(head.size(), superblock::minimum_size); EXPECT_TRUE(head.empty()); EXPECT_EQ(tail.pointer(), fake_address4); - EXPECT_EQ(tail.size(), 4_MiB); + EXPECT_EQ(tail.size(), superblock::minimum_size); EXPECT_TRUE(tail.empty()); } -TEST(ArenaTest, SuperblockMerge) // NOLINT +TEST_F(ArenaTest, SuperblockMerge) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - superblock sb2{fake_address4, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; + superblock sb2{fake_address4, superblock::minimum_size}; auto const merged = sb.merge(sb2); EXPECT_EQ(merged.pointer(), fake_address3); - EXPECT_EQ(merged.size(), 8_MiB); + EXPECT_EQ(merged.size(), superblock::minimum_size * 2); EXPECT_TRUE(merged.empty()); } -TEST(ArenaTest, SuperblockFirstFit) // NOLINT +TEST_F(ArenaTest, SuperblockFirstFit) // NOLINT { - superblock sb{fake_address3, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; auto const b = sb.first_fit(1_KiB); EXPECT_EQ(b.pointer(), fake_address3); 
EXPECT_EQ(b.size(), 1_KiB); @@ -199,26 +222,26 @@ TEST(ArenaTest, SuperblockFirstFit) // NOLINT EXPECT_EQ(b3.size(), 512); } -TEST(ArenaTest, SuperblockCoalesceAfterFull) // NOLINT +TEST_F(ArenaTest, SuperblockCoalesceAfterFull) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - auto const b = sb.first_fit(2_MiB); - sb.first_fit(2_MiB); + superblock sb{fake_address3, superblock::minimum_size}; + auto const b = sb.first_fit(superblock::minimum_size / 2); + sb.first_fit(superblock::minimum_size / 2); sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(2_MiB).is_valid()); + EXPECT_TRUE(sb.first_fit(superblock::minimum_size / 2).is_valid()); } -TEST(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT +TEST_F(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - auto const b = sb.first_fit(2_MiB); + superblock sb{fake_address3, superblock::minimum_size}; + auto const b = sb.first_fit(superblock::minimum_size / 2); sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(4_MiB).is_valid()); + EXPECT_TRUE(sb.first_fit(superblock::minimum_size).is_valid()); } -TEST(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT +TEST_F(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT { - superblock sb{fake_address3, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; auto const b = sb.first_fit(1_KiB); auto const b2 = sb.first_fit(1_KiB); sb.first_fit(1_KiB); @@ -228,256 +251,222 @@ TEST(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT EXPECT_EQ(b3.pointer(), fake_address3); } -TEST(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT +TEST_F(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT { - superblock sb{fake_address3, 4_MiB}; + superblock sb{fake_address3, superblock::minimum_size}; auto const b = sb.first_fit(1_KiB); auto const b2 = sb.first_fit(1_KiB); sb.coalesce(b); sb.coalesce(b2); - EXPECT_TRUE(sb.first_fit(4_MiB).is_valid()); + EXPECT_TRUE(sb.first_fit(superblock::minimum_size).is_valid()); +} + +TEST_F(ArenaTest, SuperblockMaxFree) // NOLINT +{ + superblock sb{fake_address3, superblock::minimum_size}; + sb.first_fit(superblock::minimum_size / 2); + EXPECT_EQ(sb.max_free(), superblock::minimum_size / 2); } -TEST(ArenaTest, SuperblockMaxFree) // NOLINT +TEST_F(ArenaTest, SuperblockMaxFreeWhenFull) // NOLINT { - superblock sb{fake_address3, 4_MiB}; - sb.first_fit(2_MiB); - EXPECT_EQ(sb.max_free(), 2_MiB); + superblock sb{fake_address3, superblock::minimum_size}; + sb.first_fit(superblock::minimum_size); + EXPECT_EQ(sb.max_free(), 0); } /** * Test global_arena. 
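 *
 * For orientation, roughly the flow these cases assume (fixture: a mock upstream and an
 * arena_size_ of four superblocks): the global arena carves superblocks out of a single upstream
 * allocation, hands them to arenas via acquire(), coalesces them back via release(), and serves
 * anything larger than a superblock directly through allocate()/deallocate().
 *
 *   auto sb = ga_->acquire(256);                               // superblock for a small request
 *   ga_->release(std::move(sb));                               // returned and coalesced
 *   void* big = ga_->allocate(superblock::minimum_size * 2);   // large request, served directly
 *   ga_->deallocate(big, superblock::minimum_size * 2, {});    // synchronized, then coalesced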
*/ -TEST(ArenaTest, GlobalArenaNullUpstream) // NOLINT +TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT { auto construct_nullptr = []() { global_arena ga{nullptr, std::nullopt}; }; EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) } -TEST(ArenaTest, GlobalArenaAcquire) // NOLINT +TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - - global_arena ga{&mock, 8_MiB}; - - auto const sb = ga.acquire(256); + auto const sb = ga_->acquire(256); EXPECT_EQ(sb.pointer(), fake_address3); - EXPECT_EQ(sb.size(), 4_MiB); + EXPECT_EQ(sb.size(), superblock::minimum_size); EXPECT_TRUE(sb.empty()); - auto const sb2 = ga.acquire(1_KiB); + auto const sb2 = ga_->acquire(1_KiB); EXPECT_EQ(sb2.pointer(), fake_address4); - EXPECT_EQ(sb2.size(), 4_MiB); + EXPECT_EQ(sb2.size(), superblock::minimum_size); EXPECT_TRUE(sb2.empty()); - EXPECT_FALSE(ga.acquire(512).is_valid()); + ga_->acquire(512); + ga_->acquire(512); + EXPECT_FALSE(ga_->acquire(512).is_valid()); } -TEST(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT +TEST_F(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - - global_arena ga{&mock, 8_MiB}; - - auto sb = ga.acquire(256); - ga.release(std::move(sb), {}); - auto* p = ga.allocate(8_MiB); + auto sb = ga_->acquire(256); + ga_->release(std::move(sb)); + auto* p = ga_->allocate(arena_size_); EXPECT_EQ(p, fake_address3); } -TEST(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT +TEST_F(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - - global_arena ga{&mock, 16_MiB}; - - auto sb = ga.acquire(256); - auto sb2 = ga.acquire(1_KiB); - ga.acquire(512); - ga.release(std::move(sb), {}); - ga.release(std::move(sb2), {}); - auto* p = ga.allocate(8_MiB); + auto sb = ga_->acquire(256); + auto sb2 = ga_->acquire(1_KiB); + ga_->acquire(512); + ga_->release(std::move(sb)); + ga_->release(std::move(sb2)); + auto* p = ga_->allocate(superblock::minimum_size * 2); EXPECT_EQ(p, fake_address3); } -TEST(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT +TEST_F(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - - global_arena ga{&mock, 16_MiB}; - - auto sb = ga.acquire(256); - auto sb2 = ga.acquire(1_KiB); - auto sb3 = ga.acquire(512); - ga.release(std::move(sb), {}); - ga.release(std::move(sb3), {}); - ga.release(std::move(sb2), {}); - auto* p = ga.allocate(16_MiB); + auto sb = ga_->acquire(256); + auto sb2 = ga_->acquire(1_KiB); + auto sb3 = ga_->acquire(512); + ga_->release(std::move(sb)); + ga_->release(std::move(sb3)); + ga_->release(std::move(sb2)); + auto* p = ga_->allocate(arena_size_); EXPECT_EQ(p, fake_address3); } -TEST(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT +TEST_F(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(16_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 16_MiB)); - - global_arena ga{&mock, 16_MiB}; - std::set 
superblocks{}; - auto sb = ga.acquire(256); + auto sb = ga_->acquire(256); superblocks.insert(std::move(sb)); - auto sb2 = ga.acquire(1_KiB); + auto sb2 = ga_->acquire(1_KiB); superblocks.insert(std::move(sb2)); - auto sb3 = ga.acquire(512); + auto sb3 = ga_->acquire(512); superblocks.insert(std::move(sb3)); - ga.release(superblocks); - auto* p = ga.allocate(16_MiB); + ga_->release(superblocks); + auto* p = ga_->allocate(arena_size_); EXPECT_EQ(p, fake_address3); } -TEST(ArenaTest, GlobalArenaAllocate) // NOLINT +TEST_F(ArenaTest, GlobalArenaAllocate) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - - global_arena ga{&mock, 8_MiB}; - - auto* ptr = ga.allocate(4_MiB); + auto* ptr = ga_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - auto* ptr2 = ga.allocate(4_MiB); - EXPECT_EQ(ptr2, fake_address4); } -TEST(ArenaTest, GlobalArenaDeallocate) // NOLINT +TEST_F(ArenaTest, GlobalArenaAllocateExtraLarge) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); + EXPECT_EQ(ga_->allocate(1_PiB), nullptr); + EXPECT_EQ(ga_->allocate(1_PiB), nullptr); +} - global_arena ga{&mock, 8_MiB}; +TEST_F(ArenaTest, GlobalArenaAllocateAlignUp) // NOLINT +{ + ga_->allocate(superblock::minimum_size + 256); + ga_->allocate(superblock::minimum_size + 256); + EXPECT_EQ(ga_->allocate(superblock::minimum_size + 256), nullptr); +} - auto* ptr = ga.allocate(4_MiB); +TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT +{ + auto* ptr = ga_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - ga.deallocate(ptr, 4_MiB, {}); - ptr = ga.allocate(4_MiB); + ga_->deallocate(ptr, superblock::minimum_size * 2, {}); + ptr = ga_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } -TEST(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT +TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - - global_arena ga{&mock, 8_MiB}; + auto* ptr = ga_->allocate(superblock::minimum_size + 256); + auto* ptr2 = ga_->allocate(superblock::minimum_size + 512); + ga_->deallocate(ptr, superblock::minimum_size + 256, {}); + ga_->deallocate(ptr2, superblock::minimum_size + 512, {}); + EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); +} - auto sb = ga.acquire(512); +TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT +{ + auto sb = ga_->acquire(512); auto const b = sb.first_fit(512); - ga.release(std::move(sb), {}); - ga.deallocate_from_other_arena(b.pointer(), b.size()); - EXPECT_EQ(ga.allocate(8_MiB), fake_address3); + ga_->release(std::move(sb)); + ga_->deallocate_from_other_arena(b.pointer(), b.size()); + EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); } /** * Test arena. 
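 *
 * Sketch of the per-arena behaviour covered below: an arena serves small requests by carving
 * blocks out of superblocks it acquires from the global arena, coalesces blocks on deallocate,
 * and hands empty superblocks back when defragmented.
 *
 *   auto* ptr = a_->allocate(256);    // acquires a superblock, then first fit within it
 *   a_->deallocate(ptr, 256, {});     // the block is coalesced back into its superblock
 *   a_->defragment();                 // empty superblocks are released to ga_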
*/ -TEST(ArenaTest, ArenaAllocate) // NOLINT +TEST_F(ArenaTest, ArenaAllocate) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8_MiB}; - arena a{ga}; - - EXPECT_EQ(a.allocate(4_MiB), fake_address3); - EXPECT_EQ(a.allocate(256), fake_address4); + EXPECT_EQ(a_->allocate(superblock::minimum_size), fake_address3); + EXPECT_EQ(a_->allocate(256), fake_address4); } -TEST(ArenaTest, ArenaDeallocate) // NOLINT +TEST_F(ArenaTest, ArenaDeallocate) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8_MiB}; - arena a{ga}; - - auto* ptr = a.allocate(4_MiB); - a.deallocate(ptr, 4_MiB, {}); - auto* ptr2 = a.allocate(256); - a.deallocate(ptr2, 256, {}); - EXPECT_EQ(a.allocate(8_MiB), fake_address3); + auto* ptr = a_->allocate(superblock::minimum_size); + a_->deallocate(ptr, superblock::minimum_size, {}); + auto* ptr2 = a_->allocate(256); + a_->deallocate(ptr2, 256, {}); + EXPECT_EQ(a_->allocate(superblock::minimum_size), fake_address3); } -TEST(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT +TEST_F(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8_MiB}; - arena a{ga}; - - auto* ptr = a.allocate(256); - auto* ptr2 = a.allocate(256); - a.allocate(256); - a.deallocate(ptr, 256, {}); - a.deallocate(ptr2, 256, {}); - EXPECT_EQ(a.allocate(512), fake_address3); + auto* ptr = a_->allocate(256); + auto* ptr2 = a_->allocate(256); + a_->allocate(256); + a_->deallocate(ptr, 256, {}); + a_->deallocate(ptr2, 256, {}); + EXPECT_EQ(a_->allocate(512), fake_address3); } -TEST(ArenaTest, ArenaDeallocateMergeNext) // NOLINT +TEST_F(ArenaTest, ArenaDeallocateMergeNext) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8_MiB}; - arena a{ga}; - - auto* ptr = a.allocate(256); - auto* ptr2 = a.allocate(256); - a.allocate(256); - a.deallocate(ptr2, 256, {}); - a.deallocate(ptr, 256, {}); - EXPECT_EQ(a.allocate(512), fake_address3); + auto* ptr = a_->allocate(256); + auto* ptr2 = a_->allocate(256); + a_->allocate(256); + a_->deallocate(ptr2, 256, {}); + a_->deallocate(ptr, 256, {}); + EXPECT_EQ(a_->allocate(512), fake_address3); } -TEST(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT +TEST_F(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT { - mock_memory_resource mock; - EXPECT_CALL(mock, allocate(8_MiB)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock, deallocate(fake_address3, 8_MiB)); - global_arena ga{&mock, 8_MiB}; - arena a{ga}; + auto* ptr = a_->allocate(256); + auto* ptr2 = a_->allocate(256); + a_->deallocate(ptr, 256, {}); + a_->deallocate(ptr2, 256, {}); + EXPECT_EQ(a_->allocate(2_KiB), fake_address3); +} - auto* ptr = a.allocate(256); - auto* ptr2 = a.allocate(256); - a.deallocate(ptr, 256, {}); - a.deallocate(ptr2, 256, {}); - EXPECT_EQ(a.allocate(2_KiB), fake_address3); +TEST_F(ArenaTest, ArenaDefragment) // NOLINT +{ + std::vector pointers; + std::size_t num_pointers{4}; + for (std::size_t i = 0; i < num_pointers; i++) { + pointers.push_back(a_->allocate(superblock::minimum_size)); + } + for 
(auto* ptr : pointers) { + a_->deallocate(ptr, superblock::minimum_size, {}); + } + EXPECT_EQ(ga_->allocate(arena_size_), nullptr); + a_->defragment(); + EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); } /** * Test arena_memory_resource. */ -TEST(ArenaTest, NullUpstream) // NOLINT +TEST_F(ArenaTest, NullUpstream) // NOLINT { // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) EXPECT_THROW([]() { arena_mr mr{nullptr}; }(), rmm::logic_error); } -TEST(ArenaTest, AllocateNinetyPercent) // NOLINT +TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) auto const free = rmm::detail::available_device_memory().first; @@ -488,7 +477,7 @@ TEST(ArenaTest, AllocateNinetyPercent) // NOLINT }()); } -TEST(ArenaTest, SmallMediumLarge) // NOLINT +TEST_F(ArenaTest, SmallMediumLarge) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) arena_mr mr(rmm::mr::get_current_device_resource()); @@ -502,5 +491,29 @@ TEST(ArenaTest, SmallMediumLarge) // NOLINT }()); } +TEST_F(ArenaTest, Defragment) // NOLINT +{ + EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) + auto const arena_size = superblock::minimum_size * 4; + arena_mr mr(rmm::mr::get_current_device_resource(), arena_size); + std::vector threads; + std::size_t num_threads{4}; + threads.reserve(num_threads); + for (std::size_t i = 0; i < num_threads; ++i) { + threads.emplace_back(std::thread([&] { + cuda_stream stream{}; + void* ptr = mr.allocate(32_KiB, stream); + mr.deallocate(ptr, 32_KiB, stream); + })); + } + for (auto& thread : threads) { + thread.join(); + } + + auto* ptr = mr.allocate(arena_size); + mr.deallocate(ptr, arena_size); + }()); +} + } // namespace } // namespace rmm::test From 9a2e917b9107f2f56382b2a4d0438822b95d8d2f Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 30 Nov 2021 13:34:59 -0800 Subject: [PATCH 22/35] clean instead of defragment individual arenas --- include/rmm/mr/device/arena_memory_resource.hpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 440da5a0b..342180927 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -160,10 +160,10 @@ class arena_memory_resource final : public device_memory_resource { { RMM_CUDA_TRY(cudaDeviceSynchronize()); for (auto& thread_arena : thread_arenas_) { - thread_arena.second->defragment(); + thread_arena.second->clean(); } for (auto& stream_arena : stream_arenas_) { - stream_arena.second.defragment(); + stream_arena.second.clean(); } } @@ -199,20 +199,15 @@ class arena_memory_resource final : public device_memory_resource { // is caught up. stream.synchronize_no_throw(); - read_lock lock(mtx_); + write_lock lock(mtx_); if (use_per_thread_arena(stream)) { - auto const id = std::this_thread::get_id(); for (auto&& kv : thread_arenas_) { - // If the arena does not belong to the current thread, try to deallocate from it, and return - // if successful. - if (kv.first != id && kv.second->deallocate(ptr, bytes, stream)) { return; } + if (kv.second->deallocate(ptr, bytes, stream)) { return; } } } else { for (auto&& kv : stream_arenas_) { - // If the arena does not belong to the current stream, try to deallocate from it, and return - // if successful. 
- if (stream.value() != kv.first && kv.second.deallocate(ptr, bytes, stream)) { return; } + if (kv.second.deallocate(ptr, bytes, stream)) { return; } } } From fb1f193bf9cb98487548f3b291f3340775cd58f3 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 30 Nov 2021 13:37:02 -0800 Subject: [PATCH 23/35] lower superblock size to 1MB --- include/rmm/mr/device/detail/arena.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 802ea269a..86d9d3079 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -174,8 +174,8 @@ inline bool block_size_compare(block const& lhs, block const& rhs) */ class superblock final : public memory_span { public: - /// Minimum size of a superblock (64 MiB). - static constexpr std::size_t minimum_size{1U << 26U}; + /// Minimum size of a superblock (1 MiB). + static constexpr std::size_t minimum_size{1U << 20U}; /** * @brief Construct a default superblock. From 5148c51a8d7dad186014b3ec3b2215bef7ad3144 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Fri, 3 Dec 2021 18:26:50 -0800 Subject: [PATCH 24/35] align to size classes --- .../random_allocations/random_allocations.cpp | 4 +- .../rmm/mr/device/arena_memory_resource.hpp | 62 +++++----- include/rmm/mr/device/detail/arena.hpp | 114 +++++++++++++++--- tests/mr/device/arena_mr_tests.cpp | 38 +++--- 4 files changed, 154 insertions(+), 64 deletions(-) diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index 828561dd1..c236ed7bb 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -170,7 +170,9 @@ inline auto make_pool() inline auto make_arena() { - return rmm::mr::make_owning_wrapper(make_cuda()); + auto free = rmm::detail::available_device_memory().first; + auto reserve = 1UL << 26; + return rmm::mr::make_owning_wrapper(make_cuda(), free - reserve); } inline auto make_binning() diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 342180927..c55052c3b 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -118,8 +118,6 @@ class arena_memory_resource final : public device_memory_resource { private: using global_arena = rmm::mr::detail::arena::global_arena; using arena = rmm::mr::detail::arena::arena; - using read_lock = std::shared_lock; - using write_lock = std::unique_lock; /** * @brief Allocates memory of size at least `bytes`. 
@@ -135,22 +133,25 @@ class arena_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (bytes <= 0) { return nullptr; } + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); + auto& arena = get_arena(stream); - bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); - auto& arena = get_arena(stream); - void* pointer = arena.allocate(bytes); + { + std::shared_lock lock(mtx_); + void* pointer = arena.allocate(bytes); + if (pointer != nullptr) { return pointer; } + } - if (pointer == nullptr) { - write_lock lock(mtx_); + { + std::unique_lock lock(mtx_); defragment(); - pointer = arena.allocate(bytes); + void* pointer = arena.allocate(bytes); if (pointer == nullptr) { if (dump_log_on_failure_) { dump_memory_log(bytes); } RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory); } + return pointer; } - - return pointer; } /** @@ -178,9 +179,20 @@ class arena_memory_resource final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (ptr == nullptr || bytes <= 0) { return; } + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); + auto& arena = get_arena(stream); - bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); - if (!get_arena(stream).deallocate(ptr, bytes, stream)) { + { + std::shared_lock lock(mtx_); + if (arena.deallocate(ptr, bytes, stream)) { return; } + } + + { + // Since we are returning this memory to another stream, we need to make sure the current + // stream is caught up. + stream.synchronize_no_throw(); + + std::unique_lock lock(mtx_); deallocate_from_other_arena(ptr, bytes, stream); } } @@ -195,19 +207,13 @@ class arena_memory_resource final : public device_memory_resource { */ void deallocate_from_other_arena(void* ptr, std::size_t bytes, cuda_stream_view stream) { - // Since we are returning this memory to another stream, we need to make sure the current stream - // is caught up. 
- stream.synchronize_no_throw(); - - write_lock lock(mtx_); - if (use_per_thread_arena(stream)) { - for (auto&& kv : thread_arenas_) { - if (kv.second->deallocate(ptr, bytes, stream)) { return; } + for (auto const& thread_arena : thread_arenas_) { + if (thread_arena.second->deallocate(ptr, bytes, stream)) { return; } } } else { - for (auto&& kv : stream_arenas_) { - if (kv.second.deallocate(ptr, bytes, stream)) { return; } + for (auto& stream_arena : stream_arenas_) { + if (stream_arena.second.deallocate(ptr, bytes, stream)) { return; } } } @@ -237,12 +243,12 @@ class arena_memory_resource final : public device_memory_resource { { auto const thread_id = std::this_thread::get_id(); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = thread_arenas_.find(thread_id); if (iter != thread_arenas_.end()) { return *iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); auto thread_arena = std::make_shared(global_arena_); thread_arenas_.emplace(thread_id, thread_arena); thread_local detail::arena::arena_cleaner cleaner{thread_arena}; @@ -259,12 +265,12 @@ class arena_memory_resource final : public device_memory_resource { { RMM_LOGGING_ASSERT(!use_per_thread_arena(stream)); { - read_lock lock(mtx_); + std::shared_lock lock(map_mtx_); auto const iter = stream_arenas_.find(stream.value()); if (iter != stream_arenas_.end()) { return iter->second; } } { - write_lock lock(mtx_); + std::unique_lock lock(map_mtx_); stream_arenas_.emplace(stream.value(), global_arena_); return stream_arenas_.at(stream.value()); } @@ -331,7 +337,9 @@ class arena_memory_resource final : public device_memory_resource { bool dump_log_on_failure_{}; /// The logger for memory dump. std::shared_ptr logger_{}; - /// Mutex for read and write locks. + /// Mutex for read and write locks on arena maps. + mutable std::shared_mutex map_mtx_; + /// Mutex for shared and unique locks on the mr. mutable std::shared_mutex mtx_; }; diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 0423a4242..8e43a661d 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -38,6 +38,85 @@ namespace rmm::mr::detail::arena { +/** + * @brief Align up to nearest size class. + * + * @param[in] value value to align. + * @return Return the aligned value. + */ +inline std::size_t align_to_size_class(std::size_t value) noexcept +{ + // See http://jemalloc.net/jemalloc.3.html. 
+ // NOLINTBEGIN(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + static std::array size_classes{ + // clang-format off + // Spacing 256: + 256UL, 512UL, 768UL, 1024UL, 1280UL, 1536UL, 1792UL, 2048UL, + // Spacing 512: + 2560UL, 3072UL, 3584UL, 4096UL, + // Spacing 1 KiB: + 5UL << 10, 6UL << 10, 7UL << 10, 8UL << 10, + // Spacing 2 KiB: + 10UL << 10, 12UL << 10, 14UL << 10, 16UL << 10, + // Spacing 4 KiB: + 20UL << 10, 24UL << 10, 28UL << 10, 32UL << 10, + // Spacing 8 KiB: + 40UL << 10, 48UL << 10, 54UL << 10, 64UL << 10, + // Spacing 16 KiB: + 80UL << 10, 96UL << 10, 112UL << 10, 128UL << 10, + // Spacing 32 KiB: + 160UL << 10, 192UL << 10, 224UL << 10, 256UL << 10, + // Spacing 64 KiB: + 320UL << 10, 384UL << 10, 448UL << 10, 512UL << 10, + // Spacing 128 KiB: + 640UL << 10, 768UL << 10, 896UL << 10, 1UL << 20, + // Spacing 256 KiB: + 1280UL << 10, 1536UL << 10, 1792UL << 10, 2UL << 20, + // Spacing 512 KiB: + 2560UL << 10, 3UL << 20, 3584UL << 10, 4UL << 20, + // Spacing 1 MiB: + 5UL << 20, 6UL << 20, 7UL << 20, 8UL << 20, + // Spacing 2 MiB: + 10UL << 20, 12UL << 20, 14UL << 20, 16UL << 20, + // Spacing 4 MiB: + 20UL << 20, 24UL << 20, 28UL << 20, 32UL << 20, + // Spacing 8 MiB: + 40UL << 20, 48UL << 20, 56UL << 20, 64UL << 20, + // Spacing 16 MiB: + 80UL << 20, 96UL << 20, 112UL << 20, 128UL << 20, + // Spacing 32 MiB: + 160UL << 20, 192UL << 20, 224UL << 20, 256UL << 20, + // Spacing 64 MiB: + 320UL << 20, 384UL << 20, 448UL << 20, 512UL << 20, + // Spacing 128 MiB: + 640UL << 20, 768UL << 20, 896UL << 20, 1UL << 30, + // Spacing 256 MiB: + 1280UL << 20, 1536UL << 20, 1792UL << 20, 2UL << 30, + // Spacing 512 MiB: + 2560UL << 20, 3UL << 30, 3584UL << 20, 4UL << 30, + // Spacing 1 GiB: + 5UL << 30, 6UL << 30, 7UL << 30, 8UL << 30, + // Spacing 2 GiB: + 10UL << 30, 12UL << 30, 14UL << 30, 16UL << 30, + // Spacing 4 GiB: + 20UL << 30, 24UL << 30, 28UL << 30, 32UL << 30, + // Spacing 8 GiB: + 40UL << 30, 48UL << 30, 56UL << 30, 64UL << 30, + // Spacing 16 GiB: + 80UL << 30, 96UL << 30, 112UL << 30, 128UL << 30, + // Spacing 32 Gib: + 160UL << 30, 192UL << 30, 224UL << 30, 256UL << 30, + // Catch all: + std::numeric_limits::max() + // clang-format on + }; + // NOLINTEND(readability-magic-numbers,cppcoreguidelines-avoid-magic-numbers) + + auto* bound = std::lower_bound(size_classes.begin(), size_classes.end(), value); + RMM_LOGGING_ASSERT(bound != size_classes.end()); + return *bound; +} + /** * @brief Represents a contiguous region of memory. */ @@ -307,7 +386,7 @@ class superblock final : public memory_span { RMM_LOGGING_ASSERT(is_valid()); RMM_LOGGING_ASSERT(size > 0); - auto fits = [size](auto const& blk) { return blk.fits(size); }; + auto fits = [size](auto const& blk) { return blk.fits(size); }; auto const iter = std::find_if(free_blocks_.cbegin(), free_blocks_.cend(), fits); if (iter == free_blocks_.cend()) { return {}; } @@ -432,7 +511,7 @@ class global_arena final { */ ~global_arena() { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); upstream_mr_->deallocate(upstream_block_.pointer(), upstream_block_.size()); } @@ -454,7 +533,7 @@ class global_arena final { { // Superblocks should only be acquired if the size is not directly handled by the global arena. 
RMM_LOGGING_ASSERT(!handles(size)); - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); return first_fit(size, superblock::minimum_size); } @@ -466,7 +545,7 @@ class global_arena final { void release(superblock&& sb) { RMM_LOGGING_ASSERT(sb.is_valid()); - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); coalesce(std::move(sb)); } @@ -477,7 +556,7 @@ class global_arena final { */ void release(std::set& superblocks) { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); while (!superblocks.empty()) { auto sb = std::move(superblocks.extract(superblocks.cbegin()).value()); RMM_LOGGING_ASSERT(sb.is_valid()); @@ -494,9 +573,8 @@ class global_arena final { void* allocate(std::size_t size) { RMM_LOGGING_ASSERT(handles(size)); - lock_guard lock(mtx_); - auto const aligned = rmm::detail::align_up(size, superblock::minimum_size); - auto sb = first_fit(aligned, aligned); + std::lock_guard lock(mtx_); + auto sb = first_fit(size, size); if (sb.is_valid()) { RMM_LOGGING_ASSERT(large_allocations_.find(sb.pointer()) == large_allocations_.cend()); large_allocations_.emplace(sb.pointer(), sb.size()); @@ -516,7 +594,7 @@ class global_arena final { { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); auto const allocated_size = large_allocations_.at(ptr); large_allocations_.erase(ptr); coalesce({ptr, allocated_size}); @@ -532,7 +610,7 @@ class global_arena final { */ void deallocate_from_other_arena(void* ptr, std::size_t bytes) { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); block const b{ptr, bytes}; auto const iter = std::find_if( @@ -555,7 +633,7 @@ class global_arena final { */ void dump_memory_log(std::shared_ptr const& logger) const { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); logger->info(" # superblocks: {}", superblocks_.size()); @@ -584,8 +662,6 @@ class global_arena final { } private: - using lock_guard = std::lock_guard; - /** * @brief Default size of the global arena if unspecified. * @return the default global arena size. @@ -727,7 +803,7 @@ class arena { void* allocate(std::size_t size) { if (global_arena_.handles(size)) { return global_arena_.allocate(size); } - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); return get_block(size).pointer(); } @@ -746,7 +822,7 @@ class arena { global_arena_.deallocate(ptr, size, stream); return true; } - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); return deallocate_from_superblock({ptr, size}, stream); } @@ -755,7 +831,7 @@ class arena { */ void clean() { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); global_arena_.release(superblocks_); } @@ -764,7 +840,7 @@ class arena { */ void defragment() { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); while (true) { auto const iter = std::find_if( superblocks_.cbegin(), superblocks_.cend(), [](auto const& sb) { return sb.empty(); }); @@ -780,7 +856,7 @@ class arena { */ void dump_memory_log(std::shared_ptr const& logger) const { - lock_guard lock(mtx_); + std::lock_guard lock(mtx_); logger->info(" # superblocks: {}", superblocks_.size()); if (!superblocks_.empty()) { logger->info(" Total size of superblocks: {}", @@ -804,8 +880,6 @@ class arena { } private: - using lock_guard = std::lock_guard; - /** * @brief Get an available memory block of at least `size` bytes. 
* diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 0890544be..97db29748 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -46,14 +46,12 @@ using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; using ::testing::Return; -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) -auto const fake_address = reinterpret_cast(1_KiB); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +// NOLINTBEGIN(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) +auto const fake_address = reinterpret_cast(1_KiB); auto const fake_address2 = reinterpret_cast(2_KiB); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) auto const fake_address3 = reinterpret_cast(superblock::minimum_size); -// NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2); +// NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) class ArenaTest : public ::testing::Test { protected: @@ -65,16 +63,31 @@ class ArenaTest : public ::testing::Test { a_ = std::make_unique(*ga_); } - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) + // NOLINTBEGIN(cppcoreguidelines-non-private-member-variables-in-classes) std::size_t arena_size_{superblock::minimum_size * 4}; - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) mock_memory_resource mock_{}; - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) std::unique_ptr ga_{}; - // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) std::unique_ptr a_{}; + // NOLINTEND(cppcoreguidelines-non-private-member-variables-in-classes) }; +/** + * Test align_to_size_class. + */ +TEST_F(ArenaTest, AlignToSizeClass) // NOLINT +{ + using rmm::mr::detail::arena::align_to_size_class; + EXPECT_EQ(align_to_size_class(8), 256); + EXPECT_EQ(align_to_size_class(256), 256); + EXPECT_EQ(align_to_size_class(264), 512); + EXPECT_EQ(align_to_size_class(512), 512); + EXPECT_EQ(align_to_size_class(17_KiB), 20_KiB); + EXPECT_EQ(align_to_size_class(13_MiB), 14_MiB); + EXPECT_EQ(align_to_size_class(2500_MiB), 2560_MiB); + EXPECT_EQ(align_to_size_class(128_GiB), 128_GiB); + EXPECT_EQ(align_to_size_class(1_PiB), std::numeric_limits::max()); +} + /** * Test memory_span. 
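 *
 * memory_span is the pointer/size base shared by block and superblock; spans are ordered by
 * address so they can be kept in a std::set. A default-constructed span is invalid, while a span
 * built from a pointer and a non-zero size, e.g. memory_span{fake_address, 256}, reports
 * pointer() == fake_address and size() == 256.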
*/ @@ -359,13 +372,6 @@ TEST_F(ArenaTest, GlobalArenaAllocateExtraLarge) // NOLINT EXPECT_EQ(ga_->allocate(1_PiB), nullptr); } -TEST_F(ArenaTest, GlobalArenaAllocateAlignUp) // NOLINT -{ - ga_->allocate(superblock::minimum_size + 256); - ga_->allocate(superblock::minimum_size + 256); - EXPECT_EQ(ga_->allocate(superblock::minimum_size + 256), nullptr); -} - TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { auto* ptr = ga_->allocate(superblock::minimum_size * 2); From a13e8adac9275ce9cf450292ed9cf8cc05b9711a Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 7 Dec 2021 10:12:53 -0800 Subject: [PATCH 25/35] keep track of large allocations in superblocks --- .../rmm/mr/device/arena_memory_resource.hpp | 22 +- include/rmm/mr/device/detail/arena.hpp | 351 +++++++++--------- tests/mr/device/arena_mr_tests.cpp | 2 +- 3 files changed, 193 insertions(+), 182 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index c55052c3b..fd6874705 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -133,7 +133,11 @@ class arena_memory_resource final : public device_memory_resource { void* do_allocate(std::size_t bytes, cuda_stream_view stream) override { if (bytes <= 0) { return nullptr; } - bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif auto& arena = get_arena(stream); { @@ -179,7 +183,11 @@ class arena_memory_resource final : public device_memory_resource { void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override { if (ptr == nullptr || bytes <= 0) { return; } - bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#ifdef RMM_ARENA_USE_SIZE_CLASSES + bytes = rmm::mr::detail::arena::align_to_size_class(bytes); +#else + bytes = rmm::detail::align_up(bytes, rmm::detail::CUDA_ALLOCATION_ALIGNMENT); +#endif auto& arena = get_arena(stream); { @@ -209,17 +217,17 @@ class arena_memory_resource final : public device_memory_resource { { if (use_per_thread_arena(stream)) { for (auto const& thread_arena : thread_arenas_) { - if (thread_arena.second->deallocate(ptr, bytes, stream)) { return; } + if (thread_arena.second->deallocate(ptr, bytes)) { return; } } } else { for (auto& stream_arena : stream_arenas_) { - if (stream_arena.second.deallocate(ptr, bytes, stream)) { return; } + if (stream_arena.second.deallocate(ptr, bytes)) { return; } } } - // The thread that originally allocated the block has terminated, deallocate directly in the - // global arena. - global_arena_.deallocate_from_other_arena(ptr, bytes); + if (!global_arena_.deallocate(ptr, bytes)) { + RMM_FAIL("allocation not found"); + } } /** diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 8e43a661d..a2579471d 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -155,10 +155,10 @@ class memory_span { [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } /// Used by std::set to compare spans. 
- bool operator<(memory_span const& ms) const + bool operator<(memory_span const& mem_span) const { - RMM_LOGGING_ASSERT(ms.is_valid()); - return pointer_ < ms.pointer_; + RMM_LOGGING_ASSERT(mem_span.is_valid()); + return pointer_ < mem_span.pointer_; } private: @@ -184,58 +184,58 @@ class block final : public memory_span { using memory_span::memory_span; /** - * @brief Is this block large enough to fit `sz` bytes? + * @brief Is this block large enough to fit `bytes` bytes? * - * @param sz The size in bytes to check for fit. - * @return true if this block is at least `sz` bytes. + * @param bytes The size in bytes to check for fit. + * @return true if this block is at least `bytes` bytes. */ - [[nodiscard]] bool fits(std::size_t sz) const + [[nodiscard]] bool fits(std::size_t bytes) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(sz > 0); - return size() >= sz; + RMM_LOGGING_ASSERT(bytes > 0); + return size() >= bytes; } /** - * @brief Verifies whether this block can be merged to the beginning of block b. + * @brief Verifies whether this block can be merged to the beginning of block blk. * - * @param b The block to check for contiguity. - * @return true Returns true if this block's `pointer` + `size` == `b.pointer`. + * @param blk The block to check for contiguity. + * @return true Returns true if this block's `pointer` + `size` == `blk.pointer`. */ - [[nodiscard]] bool is_contiguous_before(block const& b) const + [[nodiscard]] bool is_contiguous_before(block const& blk) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(b.is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return pointer() + size() == b.pointer(); + return pointer() + size() == blk.pointer(); } /** * @brief Split this block into two by the given size. * - * @param sz The size in bytes of the first block. - * @return std::pair A pair of blocks split by sz. + * @param bytes The size in bytes of the first block. + * @return std::pair A pair of blocks split by bytes. */ - [[nodiscard]] std::pair split(std::size_t sz) const + [[nodiscard]] std::pair split(std::size_t bytes) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(size() > sz); + RMM_LOGGING_ASSERT(size() > bytes); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return {{pointer(), sz}, {pointer() + sz, size() - sz}}; + return {{pointer(), bytes}, {pointer() + bytes, size() - bytes}}; } /** * @brief Coalesce two contiguous blocks into one. * - * `this->is_contiguous_before(b)` must be true. + * `this->is_contiguous_before(blk)` must be true. * - * @param b block to merge. + * @param blk block to merge. * @return block The merged block. */ - [[nodiscard]] block merge(block const& b) const + [[nodiscard]] block merge(block const& blk) const { - RMM_LOGGING_ASSERT(is_contiguous_before(b)); - return {pointer(), size() + b.size()}; + RMM_LOGGING_ASSERT(is_contiguous_before(blk)); + return {pointer(), size() + blk.size()}; } }; @@ -278,7 +278,7 @@ class superblock final : public memory_span { superblock(superblock const&) = delete; superblock& operator=(superblock const&) = delete; // Allow move semantics. - superblock(superblock&& sb) noexcept = default; + superblock(superblock&&) noexcept = default; superblock& operator=(superblock&&) noexcept = default; ~superblock() = default; @@ -308,28 +308,29 @@ class superblock final : public memory_span { /** * @brief Whether this superblock contains the given block. * - * @param b The block to search for. 
+ * @param blk The block to search for. * @return true if the given block belongs to this superblock. */ - [[nodiscard]] bool contains(block const& b) const + [[nodiscard]] bool contains(block const& blk) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(b.is_valid()); + RMM_LOGGING_ASSERT(blk.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return pointer() <= b.pointer() && pointer() + size() >= b.pointer() + b.size(); + return pointer() <= blk.pointer() && pointer() + size() >= blk.pointer() + blk.size(); } /** - * @brief Can this superblock fit `sz` bytes? + * @brief Can this superblock fit `bytes` bytes? * - * @param sz The size in bytes to check for fit. - * @return true if this superblock can fit `sz` bytes. + * @param bytes The size in bytes to check for fit. + * @return true if this superblock can fit `bytes` bytes. */ - [[nodiscard]] bool fits(std::size_t sz) const + [[nodiscard]] bool fits(std::size_t bytes) const { RMM_LOGGING_ASSERT(is_valid()); - return std::any_of( - free_blocks_.cbegin(), free_blocks_.cend(), [sz](auto const& b) { return b.fits(sz); }); + return std::any_of(free_blocks_.cbegin(), free_blocks_.cend(), [bytes](auto const& blk) { + return blk.fits(bytes); + }); } /** @@ -339,26 +340,26 @@ class superblock final : public memory_span { * @return true Returns true if both superblocks are empty and this superblock's * `pointer` + `size` == `s.ptr`. */ - [[nodiscard]] bool is_contiguous_before(superblock const& sb) const + [[nodiscard]] bool is_contiguous_before(superblock const& sblk) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(sb.is_valid()); + RMM_LOGGING_ASSERT(sblk.is_valid()); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return empty() && sb.empty() && pointer() + size() == sb.pointer(); + return empty() && sblk.empty() && pointer() + size() == sblk.pointer(); } /** * @brief Split this superblock into two by the given size. * - * @param sz The size in bytes of the first block. - * @return superblock_pair A pair of superblocks split by sz. + * @param bytes The size in bytes of the first block. + * @return superblock_pair A pair of superblocks split by bytes. */ - [[nodiscard]] std::pair split(std::size_t sz) const + [[nodiscard]] std::pair split(std::size_t bytes) const { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(empty() && sz >= minimum_size && size() >= sz + minimum_size); + RMM_LOGGING_ASSERT(empty() && bytes >= minimum_size && size() >= bytes + minimum_size); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - return {superblock{pointer(), sz}, superblock{pointer() + sz, size() - sz}}; + return {superblock{pointer(), bytes}, superblock{pointer() + bytes, size() - bytes}}; } /** @@ -366,13 +367,13 @@ class superblock final : public memory_span { * * `this->is_contiguous_before(s)` must be true. * - * @param sb superblock to merge. + * @param sblk superblock to merge. * @return block The merged block. */ - [[nodiscard]] superblock merge(superblock const& sb) const + [[nodiscard]] superblock merge(superblock const& sblk) const { - RMM_LOGGING_ASSERT(is_contiguous_before(sb)); - return {pointer(), size() + sb.size()}; + RMM_LOGGING_ASSERT(is_contiguous_before(sblk)); + return {pointer(), size() + sblk.size()}; } /** @@ -391,57 +392,52 @@ class superblock final : public memory_span { if (iter == free_blocks_.cend()) { return {}; } // Remove the block from the free list. 
- auto const b = *iter; + auto const blk = *iter; auto const next = free_blocks_.erase(iter); - if (b.size() > size) { + if (blk.size() > size) { // Split the block and put the remainder back. - auto const split = b.split(size); + auto const split = blk.split(size); free_blocks_.insert(next, split.second); return split.first; } - return b; + return blk; } /** * @brief Coalesce the given block with other free blocks. * - * @param b The block to coalesce. + * @param blk The block to coalesce. */ - void coalesce(block const& b) // NOLINT(readability-function-cognitive-complexity) + void coalesce(block const& blk) // NOLINT(readability-function-cognitive-complexity) { RMM_LOGGING_ASSERT(is_valid()); - RMM_LOGGING_ASSERT(b.is_valid()); - RMM_LOGGING_ASSERT(contains(b)); - - if (free_blocks_.empty()) { - free_blocks_.insert(b); - return; - } + RMM_LOGGING_ASSERT(blk.is_valid()); + RMM_LOGGING_ASSERT(contains(blk)); // Find the right place (in ascending address order) to insert the block. - auto const next = free_blocks_.lower_bound(b); + auto const next = free_blocks_.lower_bound(blk); auto const previous = next == free_blocks_.cbegin() ? next : std::prev(next); // Coalesce with neighboring blocks. - bool const merge_prev = previous->is_contiguous_before(b); - bool const merge_next = next != free_blocks_.cend() && b.is_contiguous_before(*next); + bool const merge_prev = previous != free_blocks_.cend() && previous->is_contiguous_before(blk); + bool const merge_next = next != free_blocks_.cend() && blk.is_contiguous_before(*next); if (merge_prev && merge_next) { - auto const merged = previous->merge(b).merge(*next); + auto const merged = previous->merge(blk).merge(*next); free_blocks_.erase(previous); auto const iter = free_blocks_.erase(next); free_blocks_.insert(iter, merged); } else if (merge_prev) { - auto const merged = previous->merge(b); + auto const merged = previous->merge(blk); auto const iter = free_blocks_.erase(previous); free_blocks_.insert(iter, merged); } else if (merge_next) { - auto const merged = b.merge(*next); + auto const merged = blk.merge(*next); auto const iter = free_blocks_.erase(next); free_blocks_.insert(iter, merged); } else { - free_blocks_.insert(next, b); + free_blocks_.insert(next, blk); } } @@ -464,8 +460,8 @@ class superblock final : public memory_span { inline auto max_free(std::set const& superblocks) { std::size_t size{}; - for (auto const& sb : superblocks) { - size = std::max(size, sb.max_free()); + for (auto const& sblk : superblocks) { + size = std::max(size, sblk.max_free()); } return size; }; @@ -534,7 +530,7 @@ class global_arena final { // Superblocks should only be acquired if the size is not directly handled by the global arena. RMM_LOGGING_ASSERT(!handles(size)); std::lock_guard lock(mtx_); - return first_fit(size, superblock::minimum_size); + return first_fit(size); } /** @@ -542,11 +538,11 @@ class global_arena final { * * @param s Superblock to be released. 
*/ - void release(superblock&& sb) + void release(superblock&& sblk) { - RMM_LOGGING_ASSERT(sb.is_valid()); + RMM_LOGGING_ASSERT(sblk.is_valid()); std::lock_guard lock(mtx_); - coalesce(std::move(sb)); + coalesce(std::move(sblk)); } /** @@ -558,9 +554,9 @@ class global_arena final { { std::lock_guard lock(mtx_); while (!superblocks.empty()) { - auto sb = std::move(superblocks.extract(superblocks.cbegin()).value()); - RMM_LOGGING_ASSERT(sb.is_valid()); - coalesce(std::move(sb)); + auto sblk = std::move(superblocks.extract(superblocks.cbegin()).value()); + RMM_LOGGING_ASSERT(sblk.is_valid()); + coalesce(std::move(sblk)); } } @@ -574,56 +570,57 @@ class global_arena final { { RMM_LOGGING_ASSERT(handles(size)); std::lock_guard lock(mtx_); - auto sb = first_fit(size, size); - if (sb.is_valid()) { - RMM_LOGGING_ASSERT(large_allocations_.find(sb.pointer()) == large_allocations_.cend()); - large_allocations_.emplace(sb.pointer(), sb.size()); + auto sblk = first_fit(size); + if (sblk.is_valid()) { + auto blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk.pointer(); } - return sb.pointer(); + return nullptr; } /** - * @brief Deallocate memory pointed to by `ptr` directly. + * @brief Deallocate memory pointed to by `ptr`. * * @param ptr Pointer to be deallocated. * @param size The size in bytes of the allocation. This must be equal to the value of `size` * that was passed to the `allocate` call that returned `p`. * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. */ - void deallocate(void* ptr, std::size_t size, cuda_stream_view stream) + bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { RMM_LOGGING_ASSERT(handles(size)); stream.synchronize_no_throw(); - std::lock_guard lock(mtx_); - auto const allocated_size = large_allocations_.at(ptr); - large_allocations_.erase(ptr); - coalesce({ptr, allocated_size}); + return deallocate(ptr, size); } /** - * @brief Deallocate memory pointed to by `ptr` that was allocated in a per-thread arena. + * @brief Deallocate memory pointed to by `ptr`. * * @param ptr Pointer to be deallocated. * @param bytes The size in bytes of the allocation. This must be equal to the * value of `bytes` that was passed to the `allocate` call that returned `ptr`. - * @param stream Stream on which to perform deallocation. + * @return bool true if the allocation is found, false otherwise. 
*/ - void deallocate_from_other_arena(void* ptr, std::size_t bytes) + bool deallocate(void* ptr, std::size_t bytes) { std::lock_guard lock(mtx_); - block const b{ptr, bytes}; - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); - if (iter == superblocks_.cend()) { RMM_FAIL("allocation not found"); } + block const blk{ptr, bytes}; + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); + if (iter == superblocks_.cend()) { return false; } - auto sb = std::move(superblocks_.extract(iter).value()); - sb.coalesce(b); - if (sb.empty()) { - coalesce(std::move(sb)); + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + if (sblk.empty()) { + coalesce(std::move(sblk)); } else { - superblocks_.insert(std::move(sb)); + superblocks_.insert(std::move(sblk)); } + return true; } /** @@ -641,22 +638,21 @@ class global_arena final { logger->info(" Total size of superblocks: {}", rmm::detail::bytes{total_memory_size(superblocks_)}); logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); - logger->info(" # of outstanding large allocations: {}", large_allocations_.size()); - auto i = 0; + auto index = 0; char* prev_end{}; - for (auto const& sb : superblocks_) { - if (prev_end == nullptr) { prev_end = sb.pointer(); } + for (auto const& sblk : superblocks_) { + if (prev_end == nullptr) { prev_end = sblk.pointer(); } logger->info( " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, gap={}", - i, - fmt::ptr(sb.pointer()), - fmt::ptr(sb.end()), - rmm::detail::bytes{sb.size()}, - sb.empty(), - sb.free_blocks(), - rmm::detail::bytes{static_cast(sb.pointer() - prev_end)}); - prev_end = sb.end(); - i++; + index, + fmt::ptr(sblk.pointer()), + fmt::ptr(sblk.end()), + rmm::detail::bytes{sblk.size()}, + sblk.empty(), + sblk.free_blocks(), + rmm::detail::bytes{static_cast(sblk.pointer() - prev_end)}); + prev_end = sblk.end(); + index++; } } } @@ -698,60 +694,56 @@ class global_arena final { * @param minimum_size The minimum size of the superblock required. * @return superblock A superblock that can fit at least `size` bytes, or empty if not found. */ - superblock first_fit(std::size_t size, std::size_t minimum_size) + superblock first_fit(std::size_t size) { - auto const iter = std::find_if(superblocks_.cbegin(), superblocks_.cend(), [=](auto const& sb) { - return sb.fits(size) && sb.size() >= minimum_size; + auto iter = std::find_if(superblocks_.cbegin(), superblocks_.cend(), [=](auto const& sblk) { + return sblk.fits(size); }); if (iter == superblocks_.cend()) { return {}; } - auto sb = std::move(superblocks_.extract(iter).value()); - if (sb.empty() && sb.size() >= minimum_size + superblock::minimum_size) { + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const min_size = std::max(superblock::minimum_size, size); + if (sblk.empty() && sblk.size() >= min_size + superblock::minimum_size) { // Split the superblock and put the remainder back. - auto [head, tail] = sb.split(minimum_size); + auto [head, tail] = sblk.split(min_size); superblocks_.insert(std::move(tail)); return std::move(head); } - return sb; + return sblk; } /** * @brief Coalesce the given superblock with other empty superblocks. * - * @param sb The superblock to coalesce. + * @param sblk The superblock to coalesce. 
*/ - void coalesce(superblock&& sb) + void coalesce(superblock&& sblk) { - RMM_LOGGING_ASSERT(sb.is_valid()); - - if (superblocks_.empty()) { - superblocks_.insert(std::move(sb)); - return; - } + RMM_LOGGING_ASSERT(sblk.is_valid()); // Find the right place (in ascending address order) to insert the block. - auto const next = superblocks_.lower_bound(sb); + auto const next = superblocks_.lower_bound(sblk); auto const previous = next == superblocks_.cbegin() ? next : std::prev(next); // Coalesce with neighboring blocks. - bool const merge_prev = previous->is_contiguous_before(sb); - bool const merge_next = next != superblocks_.cend() && sb.is_contiguous_before(*next); + bool const merge_prev = previous != superblocks_.cend() && previous->is_contiguous_before(sblk); + bool const merge_next = next != superblocks_.cend() && sblk.is_contiguous_before(*next); if (merge_prev && merge_next) { auto prev_sb = std::move(superblocks_.extract(previous).value()); auto next_sb = std::move(superblocks_.extract(next).value()); - auto merged = prev_sb.merge(sb).merge(next_sb); + auto merged = prev_sb.merge(sblk).merge(next_sb); superblocks_.insert(std::move(merged)); } else if (merge_prev) { auto prev_sb = std::move(superblocks_.extract(previous).value()); - auto merged = prev_sb.merge(sb); + auto merged = prev_sb.merge(sblk); superblocks_.insert(std::move(merged)); } else if (merge_next) { auto next_sb = std::move(superblocks_.extract(next).value()); - auto merged = sb.merge(next_sb); + auto merged = sblk.merge(next_sb); superblocks_.insert(std::move(merged)); } else { - superblocks_.insert(std::move(sb)); + superblocks_.insert(std::move(sblk)); } } @@ -761,8 +753,6 @@ class global_arena final { block upstream_block_; /// Address-ordered set of superblocks. std::set superblocks_; - /// Large allocations. - std::unordered_map large_allocations_; /// Mutex for exclusive lock. mutable std::mutex mtx_; }; @@ -818,12 +808,22 @@ class arena { */ bool deallocate(void* ptr, std::size_t size, cuda_stream_view stream) { - if (global_arena_.handles(size)) { - global_arena_.deallocate(ptr, size, stream); - return true; - } + if (global_arena_.handles(size) && global_arena_.deallocate(ptr, size, stream)) { return true; } + return deallocate(ptr, size); + } + + /** + * @brief Deallocate memory pointed to by `ptr`, and possibly return superblocks to upstream. + * + * @param ptr Pointer to be deallocated. + * @param size The size in bytes of the allocation. This must be equal to the value of `size` + * that was passed to the `allocate` call that returned `p`. + * @return bool true if the allocation is found, false otherwise. 
+ */ + bool deallocate(void* ptr, std::size_t size) + { std::lock_guard lock(mtx_); - return deallocate_from_superblock({ptr, size}, stream); + return deallocate_from_superblock({ptr, size}); } /** @@ -833,6 +833,7 @@ class arena { { std::lock_guard lock(mtx_); global_arena_.release(superblocks_); + superblocks_.clear(); } /** @@ -843,7 +844,7 @@ class arena { std::lock_guard lock(mtx_); while (true) { auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [](auto const& sb) { return sb.empty(); }); + superblocks_.cbegin(), superblocks_.cend(), [](auto const& sblk) { return sblk.empty(); }); if (iter == superblocks_.cend()) { return; } global_arena_.release(std::move(superblocks_.extract(iter).value())); } @@ -863,18 +864,18 @@ class arena { rmm::detail::bytes{total_memory_size(superblocks_)}); logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); - auto i = 0; - for (auto const& sb : superblocks_) { + auto index = 0; + for (auto const& sblk : superblocks_) { logger->info( " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}", - i, - fmt::ptr(sb.pointer()), - fmt::ptr(sb.end()), - rmm::detail::bytes{sb.size()}, - sb.empty(), - sb.free_blocks(), - rmm::detail::bytes{sb.max_free()}); - i++; + index, + fmt::ptr(sblk.pointer()), + fmt::ptr(sblk.end()), + rmm::detail::bytes{sblk.size()}, + sblk.empty(), + sblk.free_blocks(), + rmm::detail::bytes{sblk.max_free()}); + index++; } } } @@ -889,8 +890,8 @@ class arena { block get_block(std::size_t size) { // Find the first-fit free block. - auto const b = first_fit(size); - if (b.is_valid()) { return b; } + auto const blk = first_fit(size); + if (blk.is_valid()) { return blk; } // No existing larger blocks available, so grow the arena and obtain a superblock. return expand_arena(size); @@ -911,32 +912,34 @@ class arena { */ block first_fit(std::size_t size) { - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [size](auto const& sb) { return sb.fits(size); }); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [size](auto const& sblk) { return sblk.fits(size); }); if (iter == superblocks_.cend()) { return {}; } - auto sb = std::move(superblocks_.extract(iter).value()); - auto const b = sb.first_fit(size); - superblocks_.insert(std::move(sb)); - return b; + auto sblk = std::move(superblocks_.extract(iter).value()); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; } /** * @brief Deallocate a block from the superblock it belongs to. * - * @param b The block to deallocate. + * @param blk The block to deallocate. * @param stream The stream to use for deallocation. * @return true if the block is found. 
*/ - bool deallocate_from_superblock(block const& b, cuda_stream_view stream) + bool deallocate_from_superblock(block const& blk) { - auto const iter = std::find_if( - superblocks_.cbegin(), superblocks_.cend(), [&](auto const& sb) { return sb.contains(b); }); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [&](auto const& sblk) { return sblk.contains(blk); }); if (iter == superblocks_.cend()) { return false; } - auto sb = std::move(superblocks_.extract(iter).value()); - sb.coalesce(b); - superblocks_.insert(std::move(sb)); + auto sblk = std::move(superblocks_.extract(iter).value()); + sblk.coalesce(blk); + superblocks_.insert(std::move(sblk)); return true; } @@ -948,12 +951,12 @@ class arena { */ block expand_arena(std::size_t size) { - auto sb = global_arena_.acquire(size); - if (sb.is_valid()) { - RMM_LOGGING_ASSERT(sb.size() >= superblock::minimum_size); - auto const b = sb.first_fit(size); - superblocks_.insert(std::move(sb)); - return b; + auto sblk = global_arena_.acquire(size); + if (sblk.is_valid()) { + RMM_LOGGING_ASSERT(sblk.size() >= superblock::minimum_size); + auto const blk = sblk.first_fit(size); + superblocks_.insert(std::move(sblk)); + return blk; } return {}; } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 97db29748..7ed63ec2e 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -395,7 +395,7 @@ TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT auto sb = ga_->acquire(512); auto const b = sb.first_fit(512); ga_->release(std::move(sb)); - ga_->deallocate_from_other_arena(b.pointer(), b.size()); + ga_->deallocate(b.pointer(), b.size()); EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); } From fb9ce95066850e5152c77cb3dff1f778567c98d2 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 7 Dec 2021 12:01:54 -0800 Subject: [PATCH 26/35] log max free in superblock --- include/rmm/mr/device/detail/arena.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index a2579471d..9b1f6869e 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -643,13 +643,15 @@ class global_arena final { for (auto const& sblk : superblocks_) { if (prev_end == nullptr) { prev_end = sblk.pointer(); } logger->info( - " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, gap={}", + " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " + "gap={}", index, fmt::ptr(sblk.pointer()), fmt::ptr(sblk.end()), rmm::detail::bytes{sblk.size()}, sblk.empty(), sblk.free_blocks(), + rmm::detail::bytes{sblk.max_free()}, rmm::detail::bytes{static_cast(sblk.pointer() - prev_end)}); prev_end = sblk.end(); index++; From 65742cbffd5d82a6825a7da185a729f1b477b2ab Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 7 Dec 2021 18:41:25 -0800 Subject: [PATCH 27/35] log fragmentation percentage --- .../rmm/mr/device/arena_memory_resource.hpp | 12 +- include/rmm/mr/device/detail/arena.hpp | 51 ++- tests/mr/device/arena_mr_tests.cpp | 358 +++++++++--------- 3 files changed, 220 insertions(+), 201 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index fd6874705..84e1dd73b 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -225,9 +225,7 @@ class arena_memory_resource final : 
public device_memory_resource { } } - if (!global_arena_.deallocate(ptr, bytes)) { - RMM_FAIL("allocation not found"); - } + if (!global_arena_.deallocate(ptr, bytes)) { RMM_FAIL("allocation not found"); } } /** @@ -307,15 +305,15 @@ class arena_memory_resource final : public device_memory_resource { logger_->info("**************************************************"); logger_->info("Global arena:"); global_arena_.dump_memory_log(logger_); - logger_->info("Per-thread arenas:"); + logger_->debug("Per-thread arenas:"); for (auto const& thread_arena : thread_arenas_) { - logger_->info(" Thread {}:", thread_arena.first); + logger_->debug(" Thread {}:", thread_arena.first); thread_arena.second->dump_memory_log(logger_); } if (!stream_arenas_.empty()) { - logger_->info("Per-stream arenas:"); + logger_->debug("Per-stream arenas:"); for (auto const& stream_arena : stream_arenas_) { - logger_->info(" Stream {}:", static_cast(stream_arena.first)); + logger_->debug(" Stream {}:", static_cast(stream_arena.first)); stream_arena.second.dump_memory_log(logger_); } } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 9b1f6869e..2c823becf 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -441,11 +441,17 @@ class superblock final : public memory_span { } } + /** + * @brief Find the total free block size. + * @return the total free block size. + */ + [[nodiscard]] std::size_t total_free_size() const { return total_memory_size(free_blocks_); } + /** * @brief Find the max free block size. * @return the max free block size. */ - [[nodiscard]] std::size_t max_free() const + [[nodiscard]] std::size_t max_free_size() const { if (free_blocks_.empty()) { return 0; } return std::max_element(free_blocks_.cbegin(), free_blocks_.cend(), block_size_compare)->size(); @@ -456,12 +462,21 @@ class superblock final : public memory_span { std::set free_blocks_{}; }; +/// Calculate the total free size of a set of superblocks. +inline auto total_free_size(std::set const& superblocks) +{ + return std::accumulate( + superblocks.cbegin(), superblocks.cend(), std::size_t{}, [](auto const& lhs, auto const& rhs) { + return lhs + rhs.total_free_size(); + }); +} + /// Find the max free size from a set of superblocks. 
-inline auto max_free(std::set const& superblocks) +inline auto max_free_size(std::set const& superblocks) { std::size_t size{}; for (auto const& sblk : superblocks) { - size = std::max(size, sblk.max_free()); + size = std::max(size, sblk.max_free_size()); } return size; }; @@ -635,14 +650,20 @@ class global_arena final { logger->info(" Arena size: {}", rmm::detail::bytes{upstream_block_.size()}); logger->info(" # superblocks: {}", superblocks_.size()); if (!superblocks_.empty()) { - logger->info(" Total size of superblocks: {}", - rmm::detail::bytes{total_memory_size(superblocks_)}); - logger->info(" Size of largest free block: {}", rmm::detail::bytes{max_free(superblocks_)}); + logger->debug(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + auto const total_free = total_free_size(superblocks_); + auto const max_free = max_free_size(superblocks_); + auto const fragmentation = (1 - max_free / static_cast(total_free)) * 100; + logger->info(" Total free memory: {}", rmm::detail::bytes{total_free}); + logger->info(" Largest block of free memory: {}", rmm::detail::bytes{max_free}); + logger->info(" Fragmentation: {:.2f}%", fragmentation); + auto index = 0; char* prev_end{}; for (auto const& sblk : superblocks_) { if (prev_end == nullptr) { prev_end = sblk.pointer(); } - logger->info( + logger->debug( " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}, " "gap={}", index, @@ -651,7 +672,7 @@ class global_arena final { rmm::detail::bytes{sblk.size()}, sblk.empty(), sblk.free_blocks(), - rmm::detail::bytes{sblk.max_free()}, + rmm::detail::bytes{sblk.max_free_size()}, rmm::detail::bytes{static_cast(sblk.pointer() - prev_end)}); prev_end = sblk.end(); index++; @@ -860,15 +881,15 @@ class arena { void dump_memory_log(std::shared_ptr const& logger) const { std::lock_guard lock(mtx_); - logger->info(" # superblocks: {}", superblocks_.size()); + logger->debug(" # superblocks: {}", superblocks_.size()); if (!superblocks_.empty()) { - logger->info(" Total size of superblocks: {}", - rmm::detail::bytes{total_memory_size(superblocks_)}); - logger->info(" Size of largest free block: {}", - rmm::detail::bytes{max_free(superblocks_)}); + logger->debug(" Total size of superblocks: {}", + rmm::detail::bytes{total_memory_size(superblocks_)}); + logger->debug(" Size of largest free block: {}", + rmm::detail::bytes{max_free_size(superblocks_)}); auto index = 0; for (auto const& sblk : superblocks_) { - logger->info( + logger->debug( " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}", index, fmt::ptr(sblk.pointer()), @@ -876,7 +897,7 @@ class arena { rmm::detail::bytes{sblk.size()}, sblk.empty(), sblk.free_blocks(), - rmm::detail::bytes{sblk.max_free()}); + rmm::detail::bytes{sblk.max_free_size()}); index++; } } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 7ed63ec2e..bd6d81f2c 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -59,15 +59,15 @@ class ArenaTest : public ::testing::Test { { EXPECT_CALL(mock_, allocate(arena_size_)).WillOnce(Return(fake_address3)); EXPECT_CALL(mock_, deallocate(fake_address3, arena_size_)); - ga_ = std::make_unique(&mock_, arena_size_); - a_ = std::make_unique(*ga_); + global_arena_ = std::make_unique(&mock_, arena_size_); + arena_ = std::make_unique(*global_arena_); } // NOLINTBEGIN(cppcoreguidelines-non-private-member-variables-in-classes) std::size_t arena_size_{superblock::minimum_size * 
4}; mock_memory_resource mock_{}; - std::unique_ptr ga_{}; - std::unique_ptr a_{}; + std::unique_ptr global_arena_{}; + std::unique_ptr arena_{}; // NOLINTEND(cppcoreguidelines-non-private-member-variables-in-classes) }; @@ -94,8 +94,8 @@ TEST_F(ArenaTest, AlignToSizeClass) // NOLINT TEST_F(ArenaTest, MemorySpan) // NOLINT { - memory_span const ms{}; - EXPECT_FALSE(ms.is_valid()); + memory_span const mem_span{}; + EXPECT_FALSE(mem_span.is_valid()); memory_span const ms2{fake_address, 256}; EXPECT_TRUE(ms2.is_valid()); } @@ -106,25 +106,25 @@ TEST_F(ArenaTest, MemorySpan) // NOLINT TEST_F(ArenaTest, BlockFits) // NOLINT { - block const b{fake_address, 1_KiB}; - EXPECT_TRUE(b.fits(1_KiB)); - EXPECT_FALSE(b.fits(1_KiB + 1)); + block const blk{fake_address, 1_KiB}; + EXPECT_TRUE(blk.fits(1_KiB)); + EXPECT_FALSE(blk.fits(1_KiB + 1)); } TEST_F(ArenaTest, BlockIsContiguousBefore) // NOLINT { - block const b{fake_address, 1_KiB}; - block const b2{fake_address2, 256}; - EXPECT_TRUE(b.is_contiguous_before(b2)); - block const b3{fake_address, 512}; - block const b4{fake_address2, 1_KiB}; - EXPECT_FALSE(b3.is_contiguous_before(b4)); + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 256}; + EXPECT_TRUE(blk.is_contiguous_before(blk2)); + block const blk3{fake_address, 512}; + block const blk4{fake_address2, 1_KiB}; + EXPECT_FALSE(blk3.is_contiguous_before(blk4)); } TEST_F(ArenaTest, BlockSplit) // NOLINT { - block const b{fake_address, 2_KiB}; - auto const [head, tail] = b.split(1_KiB); + block const blk{fake_address, 2_KiB}; + auto const [head, tail] = blk.split(1_KiB); EXPECT_EQ(head.pointer(), fake_address); EXPECT_EQ(head.size(), 1_KiB); EXPECT_EQ(tail.pointer(), fake_address2); @@ -133,9 +133,9 @@ TEST_F(ArenaTest, BlockSplit) // NOLINT TEST_F(ArenaTest, BlockMerge) // NOLINT { - block const b{fake_address, 1_KiB}; - block const b2{fake_address2, 1_KiB}; - auto const merged = b.merge(b2); + block const blk{fake_address, 1_KiB}; + block const blk2{fake_address2, 1_KiB}; + auto const merged = blk.merge(blk2); EXPECT_EQ(merged.pointer(), fake_address); EXPECT_EQ(merged.size(), 2_KiB); } @@ -146,61 +146,61 @@ TEST_F(ArenaTest, BlockMerge) // NOLINT TEST_F(ArenaTest, SuperblockEmpty) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - EXPECT_TRUE(sb.empty()); - sb.first_fit(256); - EXPECT_FALSE(sb.empty()); + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.empty()); + sblk.first_fit(256); + EXPECT_FALSE(sblk.empty()); } TEST_F(ArenaTest, SuperblockContains) // NOLINT { - superblock const sb{fake_address3, superblock::minimum_size}; - block const b{fake_address, 2_KiB}; - EXPECT_FALSE(sb.contains(b)); - block const b2{fake_address3, 1_KiB}; - EXPECT_TRUE(sb.contains(b2)); - block const b3{fake_address3, superblock::minimum_size + 1}; - EXPECT_FALSE(sb.contains(b3)); - block const b4{fake_address3, superblock::minimum_size}; - EXPECT_TRUE(sb.contains(b4)); - block const b5{fake_address4, 256}; - EXPECT_FALSE(sb.contains(b5)); + superblock const sblk{fake_address3, superblock::minimum_size}; + block const blk{fake_address, 2_KiB}; + EXPECT_FALSE(sblk.contains(blk)); + block const blk2{fake_address3, 1_KiB}; + EXPECT_TRUE(sblk.contains(blk2)); + block const blk3{fake_address3, superblock::minimum_size + 1}; + EXPECT_FALSE(sblk.contains(blk3)); + block const blk4{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.contains(blk4)); + block const blk5{fake_address4, 256}; + EXPECT_FALSE(sblk.contains(blk5)); } TEST_F(ArenaTest, 
SuperblockFits) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - EXPECT_TRUE(sb.fits(superblock::minimum_size)); - EXPECT_FALSE(sb.fits(superblock::minimum_size + 1)); + superblock sblk{fake_address3, superblock::minimum_size}; + EXPECT_TRUE(sblk.fits(superblock::minimum_size)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size + 1)); - auto const b = sb.first_fit(superblock::minimum_size / 4); - sb.first_fit(superblock::minimum_size / 4); - sb.coalesce(b); - EXPECT_TRUE(sb.fits(superblock::minimum_size / 2)); - EXPECT_FALSE(sb.fits(superblock::minimum_size / 2 + 1)); + auto const blk = sblk.first_fit(superblock::minimum_size / 4); + sblk.first_fit(superblock::minimum_size / 4); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.fits(superblock::minimum_size / 2)); + EXPECT_FALSE(sblk.fits(superblock::minimum_size / 2 + 1)); } TEST_F(ArenaTest, SuperblockIsContiguousBefore) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; + superblock sblk{fake_address3, superblock::minimum_size}; superblock sb2{fake_address4, superblock::minimum_size}; - EXPECT_TRUE(sb.is_contiguous_before(sb2)); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); - auto const b = sb.first_fit(256); - EXPECT_FALSE(sb.is_contiguous_before(sb2)); - sb.coalesce(b); - EXPECT_TRUE(sb.is_contiguous_before(sb2)); + auto const blk = sblk.first_fit(256); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); - auto const b2 = sb2.first_fit(1_KiB); - EXPECT_FALSE(sb.is_contiguous_before(sb2)); - sb2.coalesce(b2); - EXPECT_TRUE(sb.is_contiguous_before(sb2)); + auto const blk2 = sb2.first_fit(1_KiB); + EXPECT_FALSE(sblk.is_contiguous_before(sb2)); + sb2.coalesce(blk2); + EXPECT_TRUE(sblk.is_contiguous_before(sb2)); } TEST_F(ArenaTest, SuperblockSplit) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size * 2}; - auto const [head, tail] = sb.split(superblock::minimum_size); + superblock sblk{fake_address3, superblock::minimum_size * 2}; + auto const [head, tail] = sblk.split(superblock::minimum_size); EXPECT_EQ(head.pointer(), fake_address3); EXPECT_EQ(head.size(), superblock::minimum_size); EXPECT_TRUE(head.empty()); @@ -211,9 +211,9 @@ TEST_F(ArenaTest, SuperblockSplit) // NOLINT TEST_F(ArenaTest, SuperblockMerge) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; + superblock sblk{fake_address3, superblock::minimum_size}; superblock sb2{fake_address4, superblock::minimum_size}; - auto const merged = sb.merge(sb2); + auto const merged = sblk.merge(sb2); EXPECT_EQ(merged.pointer(), fake_address3); EXPECT_EQ(merged.size(), superblock::minimum_size * 2); EXPECT_TRUE(merged.empty()); @@ -221,71 +221,71 @@ TEST_F(ArenaTest, SuperblockMerge) // NOLINT TEST_F(ArenaTest, SuperblockFirstFit) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - auto const b = sb.first_fit(1_KiB); - EXPECT_EQ(b.pointer(), fake_address3); - EXPECT_EQ(b.size(), 1_KiB); - auto const b2 = sb.first_fit(2_KiB); + superblock sblk{fake_address3, superblock::minimum_size}; + auto const blk = sblk.first_fit(1_KiB); + EXPECT_EQ(blk.pointer(), fake_address3); + EXPECT_EQ(blk.size(), 1_KiB); + auto const blk2 = sblk.first_fit(2_KiB); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - EXPECT_EQ(b2.pointer(), static_cast(fake_address3) + 1_KiB); - EXPECT_EQ(b2.size(), 2_KiB); - sb.coalesce(b); - auto const b3 = sb.first_fit(512); - EXPECT_EQ(b3.pointer(), fake_address3); - EXPECT_EQ(b3.size(), 512); + 
EXPECT_EQ(blk2.pointer(), static_cast(fake_address3) + 1_KiB); + EXPECT_EQ(blk2.size(), 2_KiB); + sblk.coalesce(blk); + auto const blk3 = sblk.first_fit(512); + EXPECT_EQ(blk3.pointer(), fake_address3); + EXPECT_EQ(blk3.size(), 512); } TEST_F(ArenaTest, SuperblockCoalesceAfterFull) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - auto const b = sb.first_fit(superblock::minimum_size / 2); - sb.first_fit(superblock::minimum_size / 2); - sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(superblock::minimum_size / 2).is_valid()); + superblock sblk{fake_address3, superblock::minimum_size}; + auto const blk = sblk.first_fit(superblock::minimum_size / 2); + sblk.first_fit(superblock::minimum_size / 2); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.first_fit(superblock::minimum_size / 2).is_valid()); } TEST_F(ArenaTest, SuperblockCoalesceMergeNext) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - auto const b = sb.first_fit(superblock::minimum_size / 2); - sb.coalesce(b); - EXPECT_TRUE(sb.first_fit(superblock::minimum_size).is_valid()); + superblock sblk{fake_address3, superblock::minimum_size}; + auto const blk = sblk.first_fit(superblock::minimum_size / 2); + sblk.coalesce(blk); + EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid()); } TEST_F(ArenaTest, SuperblockCoalesceMergePrevious) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - auto const b = sb.first_fit(1_KiB); - auto const b2 = sb.first_fit(1_KiB); - sb.first_fit(1_KiB); - sb.coalesce(b); - sb.coalesce(b2); - auto const b3 = sb.first_fit(2_KiB); - EXPECT_EQ(b3.pointer(), fake_address3); + superblock sblk{fake_address3, superblock::minimum_size}; + auto const blk = sblk.first_fit(1_KiB); + auto const blk2 = sblk.first_fit(1_KiB); + sblk.first_fit(1_KiB); + sblk.coalesce(blk); + sblk.coalesce(blk2); + auto const blk3 = sblk.first_fit(2_KiB); + EXPECT_EQ(blk3.pointer(), fake_address3); } TEST_F(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - auto const b = sb.first_fit(1_KiB); - auto const b2 = sb.first_fit(1_KiB); - sb.coalesce(b); - sb.coalesce(b2); - EXPECT_TRUE(sb.first_fit(superblock::minimum_size).is_valid()); + superblock sblk{fake_address3, superblock::minimum_size}; + auto const blk = sblk.first_fit(1_KiB); + auto const blk2 = sblk.first_fit(1_KiB); + sblk.coalesce(blk); + sblk.coalesce(blk2); + EXPECT_TRUE(sblk.first_fit(superblock::minimum_size).is_valid()); } -TEST_F(ArenaTest, SuperblockMaxFree) // NOLINT +TEST_F(ArenaTest, SuperblockMaxFreeSize) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - sb.first_fit(superblock::minimum_size / 2); - EXPECT_EQ(sb.max_free(), superblock::minimum_size / 2); + superblock sblk{fake_address3, superblock::minimum_size}; + sblk.first_fit(superblock::minimum_size / 2); + EXPECT_EQ(sblk.max_free_size(), superblock::minimum_size / 2); } -TEST_F(ArenaTest, SuperblockMaxFreeWhenFull) // NOLINT +TEST_F(ArenaTest, SuperblockMaxFreeSizeWhenFull) // NOLINT { - superblock sb{fake_address3, superblock::minimum_size}; - sb.first_fit(superblock::minimum_size); - EXPECT_EQ(sb.max_free(), 0); + superblock sblk{fake_address3, superblock::minimum_size}; + sblk.first_fit(superblock::minimum_size); + EXPECT_EQ(sblk.max_free_size(), 0); } /** @@ -294,109 +294,109 @@ TEST_F(ArenaTest, SuperblockMaxFreeWhenFull) // NOLINT TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT { - auto construct_nullptr = []() { global_arena ga{nullptr, std::nullopt}; }; 
+ auto construct_nullptr = []() { global_arena global{nullptr, std::nullopt}; }; EXPECT_THROW(construct_nullptr(), rmm::logic_error); // NOLINT(cppcoreguidelines-avoid-goto) } TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { - auto const sb = ga_->acquire(256); - EXPECT_EQ(sb.pointer(), fake_address3); - EXPECT_EQ(sb.size(), superblock::minimum_size); - EXPECT_TRUE(sb.empty()); + auto const sblk = global_arena_->acquire(256); + EXPECT_EQ(sblk.pointer(), fake_address3); + EXPECT_EQ(sblk.size(), superblock::minimum_size); + EXPECT_TRUE(sblk.empty()); - auto const sb2 = ga_->acquire(1_KiB); + auto const sb2 = global_arena_->acquire(1_KiB); EXPECT_EQ(sb2.pointer(), fake_address4); EXPECT_EQ(sb2.size(), superblock::minimum_size); EXPECT_TRUE(sb2.empty()); - ga_->acquire(512); - ga_->acquire(512); - EXPECT_FALSE(ga_->acquire(512).is_valid()); + global_arena_->acquire(512); + global_arena_->acquire(512); + EXPECT_FALSE(global_arena_->acquire(512).is_valid()); } TEST_F(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT { - auto sb = ga_->acquire(256); - ga_->release(std::move(sb)); - auto* p = ga_->allocate(arena_size_); - EXPECT_EQ(p, fake_address3); + auto sblk = global_arena_->acquire(256); + global_arena_->release(std::move(sblk)); + auto* ptr = global_arena_->allocate(arena_size_); + EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT { - auto sb = ga_->acquire(256); - auto sb2 = ga_->acquire(1_KiB); - ga_->acquire(512); - ga_->release(std::move(sb)); - ga_->release(std::move(sb2)); - auto* p = ga_->allocate(superblock::minimum_size * 2); - EXPECT_EQ(p, fake_address3); + auto sblk = global_arena_->acquire(256); + auto sb2 = global_arena_->acquire(1_KiB); + global_arena_->acquire(512); + global_arena_->release(std::move(sblk)); + global_arena_->release(std::move(sb2)); + auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); + EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT { - auto sb = ga_->acquire(256); - auto sb2 = ga_->acquire(1_KiB); - auto sb3 = ga_->acquire(512); - ga_->release(std::move(sb)); - ga_->release(std::move(sb3)); - ga_->release(std::move(sb2)); - auto* p = ga_->allocate(arena_size_); - EXPECT_EQ(p, fake_address3); + auto sblk = global_arena_->acquire(256); + auto sb2 = global_arena_->acquire(1_KiB); + auto sb3 = global_arena_->acquire(512); + global_arena_->release(std::move(sblk)); + global_arena_->release(std::move(sb3)); + global_arena_->release(std::move(sb2)); + auto* ptr = global_arena_->allocate(arena_size_); + EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT { std::set superblocks{}; - auto sb = ga_->acquire(256); - superblocks.insert(std::move(sb)); - auto sb2 = ga_->acquire(1_KiB); + auto sblk = global_arena_->acquire(256); + superblocks.insert(std::move(sblk)); + auto sb2 = global_arena_->acquire(1_KiB); superblocks.insert(std::move(sb2)); - auto sb3 = ga_->acquire(512); + auto sb3 = global_arena_->acquire(512); superblocks.insert(std::move(sb3)); - ga_->release(superblocks); - auto* p = ga_->allocate(arena_size_); - EXPECT_EQ(p, fake_address3); + global_arena_->release(superblocks); + auto* ptr = global_arena_->allocate(arena_size_); + EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaAllocate) // NOLINT { - auto* ptr = ga_->allocate(superblock::minimum_size * 2); + auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, 
GlobalArenaAllocateExtraLarge) // NOLINT { - EXPECT_EQ(ga_->allocate(1_PiB), nullptr); - EXPECT_EQ(ga_->allocate(1_PiB), nullptr); + EXPECT_EQ(global_arena_->allocate(1_PiB), nullptr); + EXPECT_EQ(global_arena_->allocate(1_PiB), nullptr); } TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { - auto* ptr = ga_->allocate(superblock::minimum_size * 2); + auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - ga_->deallocate(ptr, superblock::minimum_size * 2, {}); - ptr = ga_->allocate(superblock::minimum_size * 2); + global_arena_->deallocate(ptr, superblock::minimum_size * 2, {}); + ptr = global_arena_->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { - auto* ptr = ga_->allocate(superblock::minimum_size + 256); - auto* ptr2 = ga_->allocate(superblock::minimum_size + 512); - ga_->deallocate(ptr, superblock::minimum_size + 256, {}); - ga_->deallocate(ptr2, superblock::minimum_size + 512, {}); - EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); + auto* ptr = global_arena_->allocate(superblock::minimum_size + 256); + auto* ptr2 = global_arena_->allocate(superblock::minimum_size + 512); + global_arena_->deallocate(ptr, superblock::minimum_size + 256, {}); + global_arena_->deallocate(ptr2, superblock::minimum_size + 512, {}); + EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); } TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT { - auto sb = ga_->acquire(512); - auto const b = sb.first_fit(512); - ga_->release(std::move(sb)); - ga_->deallocate(b.pointer(), b.size()); - EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); + auto sblk = global_arena_->acquire(512); + auto const blk = sblk.first_fit(512); + global_arena_->release(std::move(sblk)); + global_arena_->deallocate(blk.pointer(), blk.size()); + EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); } /** @@ -405,46 +405,46 @@ TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT TEST_F(ArenaTest, ArenaAllocate) // NOLINT { - EXPECT_EQ(a_->allocate(superblock::minimum_size), fake_address3); - EXPECT_EQ(a_->allocate(256), fake_address4); + EXPECT_EQ(arena_->allocate(superblock::minimum_size), fake_address3); + EXPECT_EQ(arena_->allocate(256), fake_address4); } TEST_F(ArenaTest, ArenaDeallocate) // NOLINT { - auto* ptr = a_->allocate(superblock::minimum_size); - a_->deallocate(ptr, superblock::minimum_size, {}); - auto* ptr2 = a_->allocate(256); - a_->deallocate(ptr2, 256, {}); - EXPECT_EQ(a_->allocate(superblock::minimum_size), fake_address3); + auto* ptr = arena_->allocate(superblock::minimum_size); + arena_->deallocate(ptr, superblock::minimum_size, {}); + auto* ptr2 = arena_->allocate(256); + arena_->deallocate(ptr2, 256, {}); + EXPECT_EQ(arena_->allocate(superblock::minimum_size), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT { - auto* ptr = a_->allocate(256); - auto* ptr2 = a_->allocate(256); - a_->allocate(256); - a_->deallocate(ptr, 256, {}); - a_->deallocate(ptr2, 256, {}); - EXPECT_EQ(a_->allocate(512), fake_address3); + auto* ptr = arena_->allocate(256); + auto* ptr2 = arena_->allocate(256); + arena_->allocate(256); + arena_->deallocate(ptr, 256, {}); + arena_->deallocate(ptr2, 256, {}); + EXPECT_EQ(arena_->allocate(512), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergeNext) // NOLINT { - auto* ptr = a_->allocate(256); - auto* ptr2 = a_->allocate(256); - a_->allocate(256); - a_->deallocate(ptr2, 256, {}); - 
a_->deallocate(ptr, 256, {}); - EXPECT_EQ(a_->allocate(512), fake_address3); + auto* ptr = arena_->allocate(256); + auto* ptr2 = arena_->allocate(256); + arena_->allocate(256); + arena_->deallocate(ptr2, 256, {}); + arena_->deallocate(ptr, 256, {}); + EXPECT_EQ(arena_->allocate(512), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT { - auto* ptr = a_->allocate(256); - auto* ptr2 = a_->allocate(256); - a_->deallocate(ptr, 256, {}); - a_->deallocate(ptr2, 256, {}); - EXPECT_EQ(a_->allocate(2_KiB), fake_address3); + auto* ptr = arena_->allocate(256); + auto* ptr2 = arena_->allocate(256); + arena_->deallocate(ptr, 256, {}); + arena_->deallocate(ptr2, 256, {}); + EXPECT_EQ(arena_->allocate(2_KiB), fake_address3); } TEST_F(ArenaTest, ArenaDefragment) // NOLINT @@ -452,14 +452,14 @@ TEST_F(ArenaTest, ArenaDefragment) // NOLINT std::vector pointers; std::size_t num_pointers{4}; for (std::size_t i = 0; i < num_pointers; i++) { - pointers.push_back(a_->allocate(superblock::minimum_size)); + pointers.push_back(arena_->allocate(superblock::minimum_size)); } for (auto* ptr : pointers) { - a_->deallocate(ptr, superblock::minimum_size, {}); + arena_->deallocate(ptr, superblock::minimum_size, {}); } - EXPECT_EQ(ga_->allocate(arena_size_), nullptr); - a_->defragment(); - EXPECT_EQ(ga_->allocate(arena_size_), fake_address3); + EXPECT_EQ(global_arena_->allocate(arena_size_), nullptr); + arena_->defragment(); + EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); } /** From b92c9eb6418bb109bc5f40361edad6dcf899e25a Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 8 Dec 2021 15:21:02 -0800 Subject: [PATCH 28/35] minor fix --- include/rmm/mr/device/arena_memory_resource.hpp | 2 ++ include/rmm/mr/device/detail/arena.hpp | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 84e1dd73b..b01dec56d 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -89,6 +89,8 @@ class arena_memory_resource final : public device_memory_resource { { if (dump_log_on_failure_) { logger_ = spdlog::basic_logger_mt("arena_memory_dump", "rmm_arena_memory_dump.log"); + // Set the level to `debug` for more detailed output. 
+ logger_->set_level(spdlog::level::info); } } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 2c823becf..61611e091 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -719,9 +719,9 @@ class global_arena final { */ superblock first_fit(std::size_t size) { - auto iter = std::find_if(superblocks_.cbegin(), superblocks_.cend(), [=](auto const& sblk) { - return sblk.fits(size); - }); + auto const iter = std::find_if(superblocks_.cbegin(), + superblocks_.cend(), + [=](auto const& sblk) { return sblk.fits(size); }); if (iter == superblocks_.cend()) { return {}; } auto sblk = std::move(superblocks_.extract(iter).value()); From 5452b82557ad51552b3df970682bb68ce0f2cc25 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Dec 2021 09:17:22 -0800 Subject: [PATCH 29/35] clang format --- tests/mr/device/arena_mr_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index bd6d81f2c..f2369d189 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "../../byte_literals.hpp" #include #include #include @@ -22,12 +21,13 @@ #include #include #include +#include "../../byte_literals.hpp" #include #include -#include #include +#include namespace rmm::test { namespace { From c782893763421a8f3dd3f3e7cd3067c2ed59e1dc Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Thu, 9 Dec 2021 15:06:39 -0800 Subject: [PATCH 30/35] address review comments --- .../rmm/mr/device/arena_memory_resource.hpp | 1 + include/rmm/mr/device/detail/arena.hpp | 28 ++++++++++--------- tests/mr/device/arena_mr_tests.cpp | 18 ++++++------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index b01dec56d..4d20f7cfa 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -81,6 +81,7 @@ class arena_memory_resource final : public device_memory_resource { * @param upstream_mr The memory resource from which to allocate blocks for the global arena. * @param arena_size Size in bytes of the global arena. Defaults to half of the available memory * on the current device. + * @param dump_log_on_failure If true, dump memory log when running out of memory. */ explicit arena_memory_resource(Upstream* upstream_mr, std::optional arena_size = std::nullopt, diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 61611e091..c22834474 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -120,12 +120,12 @@ inline std::size_t align_to_size_class(std::size_t value) noexcept /** * @brief Represents a contiguous region of memory. */ -class memory_span { +class byte_span { public: /** * @brief Construct a default span. */ - memory_span() = default; + byte_span() = default; /** * @brief Construct a span given a pointer and size. @@ -133,7 +133,7 @@ class memory_span { * @param pointer The address for the beginning of the span. * @param size The size of the span. 
*/ - memory_span(void* pointer, std::size_t size) : pointer_{static_cast(pointer)}, size_{size} + byte_span(void* pointer, std::size_t size) : pointer_{static_cast(pointer)}, size_{size} { RMM_LOGGING_ASSERT(pointer != nullptr); RMM_LOGGING_ASSERT(size > 0); @@ -155,10 +155,10 @@ class memory_span { [[nodiscard]] bool is_valid() const { return pointer_ != nullptr && size_ > 0; } /// Used by std::set to compare spans. - bool operator<(memory_span const& mem_span) const + bool operator<(byte_span const& span) const { - RMM_LOGGING_ASSERT(mem_span.is_valid()); - return pointer_ < mem_span.pointer_; + RMM_LOGGING_ASSERT(span.is_valid()); + return pointer_ < span.pointer_; } private: @@ -166,7 +166,7 @@ class memory_span { std::size_t size_{}; ///< Size in bytes. }; -/// Calculate the total size of a set of memory spans. +/// Calculate the total size of a set of spans. template inline auto total_memory_size(std::set const& spans) { @@ -179,9 +179,9 @@ inline auto total_memory_size(std::set const& spans) /** * @brief Represents a chunk of memory that can be allocated and deallocated. */ -class block final : public memory_span { +class block final : public byte_span { public: - using memory_span::memory_span; + using byte_span::byte_span; /** * @brief Is this block large enough to fit `bytes` bytes? @@ -251,10 +251,12 @@ inline bool block_size_compare(block const& lhs, block const& rhs) * @brief Represents a large chunk of memory that is exchanged between the global arena and * per-thread arenas. */ -class superblock final : public memory_span { +class superblock final : public byte_span { public: /// Minimum size of a superblock (1 MiB). - static constexpr std::size_t minimum_size{1U << 20U}; + static constexpr std::size_t minimum_size{1UL << 20}; + /// Maximum size of a superblock (1 TiB), as a sanity check. + static constexpr std::size_t maximum_size{1UL << 40}; /** * @brief Construct a default superblock. @@ -267,10 +269,10 @@ class superblock final : public memory_span { * @param pointer The address for the beginning of the superblock. * @param size The size of the superblock. */ - superblock(void* pointer, std::size_t size) : memory_span{pointer, size} + superblock(void* pointer, std::size_t size) : byte_span{pointer, size} { RMM_LOGGING_ASSERT(size >= minimum_size); - RMM_LOGGING_ASSERT(size < 1UL << 40UL); + RMM_LOGGING_ASSERT(size <= maximum_size); free_blocks_.emplace(pointer, size); } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index f2369d189..b856050bb 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -38,9 +38,9 @@ class mock_memory_resource { MOCK_METHOD(void, deallocate, (void*, std::size_t)); }; -using memory_span = rmm::mr::detail::arena::memory_span; -using block = rmm::mr::detail::arena::block; -using superblock = rmm::mr::detail::arena::superblock; +using rmm::mr::detail::arena::block; +using rmm::mr::detail::arena::byte_span; +using rmm::mr::detail::arena::superblock; using global_arena = rmm::mr::detail::arena::global_arena; using arena = rmm::mr::detail::arena::arena; using arena_mr = rmm::mr::arena_memory_resource; @@ -89,15 +89,15 @@ TEST_F(ArenaTest, AlignToSizeClass) // NOLINT } /** - * Test memory_span. + * Test byte_span. 
*/ -TEST_F(ArenaTest, MemorySpan) // NOLINT +TEST_F(ArenaTest, ByteSpan) // NOLINT { - memory_span const mem_span{}; - EXPECT_FALSE(mem_span.is_valid()); - memory_span const ms2{fake_address, 256}; - EXPECT_TRUE(ms2.is_valid()); + byte_span const span{}; + EXPECT_FALSE(span.is_valid()); + byte_span const span2{fake_address, 256}; + EXPECT_TRUE(span2.is_valid()); } /** From 0fd715e42b1e28e370de18dc6bacb2debc4c721f Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Wed, 15 Dec 2021 09:37:35 -0800 Subject: [PATCH 31/35] clang format --- tests/mr/device/arena_mr_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index b856050bb..b6e69ce0d 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "../../byte_literals.hpp" #include #include #include @@ -21,13 +22,12 @@ #include #include #include -#include "../../byte_literals.hpp" #include #include -#include #include +#include namespace rmm::test { namespace { From c42a4d45139f0f1d4767b2be1d7063813c3b6bee Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 11 Jan 2022 16:56:36 -0800 Subject: [PATCH 32/35] review feedback --- .../random_allocations/random_allocations.cpp | 6 +- .../rmm/mr/device/arena_memory_resource.hpp | 3 +- include/rmm/mr/device/detail/arena.hpp | 5 +- tests/mr/device/arena_mr_tests.cpp | 170 +++++++++--------- 4 files changed, 95 insertions(+), 89 deletions(-) diff --git a/benchmarks/random_allocations/random_allocations.cpp b/benchmarks/random_allocations/random_allocations.cpp index c236ed7bb..470442830 100644 --- a/benchmarks/random_allocations/random_allocations.cpp +++ b/benchmarks/random_allocations/random_allocations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -170,8 +170,8 @@ inline auto make_pool() inline auto make_arena() { - auto free = rmm::detail::available_device_memory().first; - auto reserve = 1UL << 26; + auto free = rmm::detail::available_device_memory().first; + constexpr auto reserve{64UL << 20}; // Leave some space for CUDA overhead. return rmm::mr::make_owning_wrapper(make_cuda(), free - reserve); } diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 0039d57f6..0fa77b896 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021, NVIDIA CORPORATION. + * Copyright (c) 2020-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -196,6 +196,7 @@ class arena_memory_resource final : public device_memory_resource { { std::shared_lock lock(mtx_); + // If the memory being freed does not belong to the arena, the following will return false. if (arena.deallocate(ptr, bytes, stream)) { return; } } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index c22834474..1dd1fbc6d 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021, NVIDIA CORPORATION. + * Copyright (c) 2019-2022, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -509,6 +509,8 @@ class global_arena final { RMM_EXPECTS(nullptr != upstream_mr_, "Unexpected null upstream pointer."); auto const size = rmm::detail::align_down(arena_size.value_or(default_size()), rmm::detail::CUDA_ALLOCATION_ALIGNMENT); + RMM_EXPECTS(size >= superblock::minimum_size, + "Arena size smaller than minimum superblock size."); initialize(size); } @@ -700,7 +702,6 @@ class global_arena final { */ void initialize(std::size_t size) { - RMM_LOGGING_ASSERT(size >= superblock::minimum_size); upstream_block_ = {upstream_mr_->allocate(size), size}; superblocks_.emplace(upstream_block_.pointer(), size); } diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index c6b281cab..c9e9e5e37 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -56,22 +56,19 @@ auto const fake_address3 = reinterpret_cast(superblock::minimum_size); auto const fake_address4 = reinterpret_cast(superblock::minimum_size * 2); // NOLINTEND(cppcoreguidelines-pro-type-reinterpret-cast,performance-no-int-to-ptr) -class ArenaTest : public ::testing::Test { - protected: +struct ArenaTest : public ::testing::Test { void SetUp() override { - EXPECT_CALL(mock_, allocate(arena_size_)).WillOnce(Return(fake_address3)); - EXPECT_CALL(mock_, deallocate(fake_address3, arena_size_)); - global_arena_ = std::make_unique(&mock_, arena_size_); - arena_ = std::make_unique(*global_arena_); + EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); + EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); + global = std::make_unique(&mock_mr, arena_size); + per_thread = std::make_unique(*global); } - // NOLINTBEGIN(cppcoreguidelines-non-private-member-variables-in-classes) - std::size_t arena_size_{superblock::minimum_size * 4}; - mock_memory_resource mock_{}; - std::unique_ptr global_arena_{}; - std::unique_ptr arena_{}; - // NOLINTEND(cppcoreguidelines-non-private-member-variables-in-classes) + std::size_t arena_size{superblock::minimum_size * 4}; + mock_memory_resource mock_mr{}; + std::unique_ptr global{}; + std::unique_ptr per_thread{}; }; /** @@ -303,103 +300,103 @@ TEST_F(ArenaTest, GlobalArenaNullUpstream) // NOLINT TEST_F(ArenaTest, GlobalArenaAcquire) // NOLINT { - auto const sblk = global_arena_->acquire(256); + auto const sblk = global->acquire(256); EXPECT_EQ(sblk.pointer(), fake_address3); EXPECT_EQ(sblk.size(), superblock::minimum_size); EXPECT_TRUE(sblk.empty()); - auto const sb2 = global_arena_->acquire(1_KiB); + auto const sb2 = global->acquire(1_KiB); EXPECT_EQ(sb2.pointer(), fake_address4); EXPECT_EQ(sb2.size(), superblock::minimum_size); EXPECT_TRUE(sb2.empty()); - global_arena_->acquire(512); - global_arena_->acquire(512); - EXPECT_FALSE(global_arena_->acquire(512).is_valid()); + global->acquire(512); + global->acquire(512); + EXPECT_FALSE(global->acquire(512).is_valid()); } TEST_F(ArenaTest, GlobalArenaReleaseMergeNext) // NOLINT { - auto sblk = global_arena_->acquire(256); - global_arena_->release(std::move(sblk)); - auto* ptr = global_arena_->allocate(arena_size_); + auto sblk = global->acquire(256); + global->release(std::move(sblk)); + auto* 
ptr = global->allocate(arena_size); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMergePrevious) // NOLINT { - auto sblk = global_arena_->acquire(256); - auto sb2 = global_arena_->acquire(1_KiB); - global_arena_->acquire(512); - global_arena_->release(std::move(sblk)); - global_arena_->release(std::move(sb2)); - auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); + auto sblk = global->acquire(256); + auto sb2 = global->acquire(1_KiB); + global->acquire(512); + global->release(std::move(sblk)); + global->release(std::move(sb2)); + auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMergePreviousAndNext) // NOLINT { - auto sblk = global_arena_->acquire(256); - auto sb2 = global_arena_->acquire(1_KiB); - auto sb3 = global_arena_->acquire(512); - global_arena_->release(std::move(sblk)); - global_arena_->release(std::move(sb3)); - global_arena_->release(std::move(sb2)); - auto* ptr = global_arena_->allocate(arena_size_); + auto sblk = global->acquire(256); + auto sb2 = global->acquire(1_KiB); + auto sb3 = global->acquire(512); + global->release(std::move(sblk)); + global->release(std::move(sb3)); + global->release(std::move(sb2)); + auto* ptr = global->allocate(arena_size); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaReleaseMultiple) // NOLINT { std::set superblocks{}; - auto sblk = global_arena_->acquire(256); + auto sblk = global->acquire(256); superblocks.insert(std::move(sblk)); - auto sb2 = global_arena_->acquire(1_KiB); + auto sb2 = global->acquire(1_KiB); superblocks.insert(std::move(sb2)); - auto sb3 = global_arena_->acquire(512); + auto sb3 = global->acquire(512); superblocks.insert(std::move(sb3)); - global_arena_->release(superblocks); - auto* ptr = global_arena_->allocate(arena_size_); + global->release(superblocks); + auto* ptr = global->allocate(arena_size); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaAllocate) // NOLINT { - auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); + auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaAllocateExtraLarge) // NOLINT { - EXPECT_EQ(global_arena_->allocate(1_PiB), nullptr); - EXPECT_EQ(global_arena_->allocate(1_PiB), nullptr); + EXPECT_EQ(global->allocate(1_PiB), nullptr); + EXPECT_EQ(global->allocate(1_PiB), nullptr); } TEST_F(ArenaTest, GlobalArenaDeallocate) // NOLINT { - auto* ptr = global_arena_->allocate(superblock::minimum_size * 2); + auto* ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); - global_arena_->deallocate(ptr, superblock::minimum_size * 2, {}); - ptr = global_arena_->allocate(superblock::minimum_size * 2); + global->deallocate(ptr, superblock::minimum_size * 2, {}); + ptr = global->allocate(superblock::minimum_size * 2); EXPECT_EQ(ptr, fake_address3); } TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT { - auto* ptr = global_arena_->allocate(superblock::minimum_size + 256); - auto* ptr2 = global_arena_->allocate(superblock::minimum_size + 512); - global_arena_->deallocate(ptr, superblock::minimum_size + 256, {}); - global_arena_->deallocate(ptr2, superblock::minimum_size + 512, {}); - EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); + auto* ptr = global->allocate(superblock::minimum_size + 256); + auto* ptr2 = global->allocate(superblock::minimum_size + 512); + global->deallocate(ptr, superblock::minimum_size + 256, {}); + 
global->deallocate(ptr2, superblock::minimum_size + 512, {}); + EXPECT_EQ(global->allocate(arena_size), fake_address3); } TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT { - auto sblk = global_arena_->acquire(512); + auto sblk = global->acquire(512); auto const blk = sblk.first_fit(512); - global_arena_->release(std::move(sblk)); - global_arena_->deallocate(blk.pointer(), blk.size()); - EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); + global->release(std::move(sblk)); + global->deallocate(blk.pointer(), blk.size()); + EXPECT_EQ(global->allocate(arena_size), fake_address3); } /** @@ -408,46 +405,46 @@ TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT TEST_F(ArenaTest, ArenaAllocate) // NOLINT { - EXPECT_EQ(arena_->allocate(superblock::minimum_size), fake_address3); - EXPECT_EQ(arena_->allocate(256), fake_address4); + EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3); + EXPECT_EQ(per_thread->allocate(256), fake_address4); } TEST_F(ArenaTest, ArenaDeallocate) // NOLINT { - auto* ptr = arena_->allocate(superblock::minimum_size); - arena_->deallocate(ptr, superblock::minimum_size, {}); - auto* ptr2 = arena_->allocate(256); - arena_->deallocate(ptr2, 256, {}); - EXPECT_EQ(arena_->allocate(superblock::minimum_size), fake_address3); + auto* ptr = per_thread->allocate(superblock::minimum_size); + per_thread->deallocate(ptr, superblock::minimum_size, {}); + auto* ptr2 = per_thread->allocate(256); + per_thread->deallocate(ptr2, 256, {}); + EXPECT_EQ(per_thread->allocate(superblock::minimum_size), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergePrevious) // NOLINT { - auto* ptr = arena_->allocate(256); - auto* ptr2 = arena_->allocate(256); - arena_->allocate(256); - arena_->deallocate(ptr, 256, {}); - arena_->deallocate(ptr2, 256, {}); - EXPECT_EQ(arena_->allocate(512), fake_address3); + auto* ptr = per_thread->allocate(256); + auto* ptr2 = per_thread->allocate(256); + per_thread->allocate(256); + per_thread->deallocate(ptr, 256, {}); + per_thread->deallocate(ptr2, 256, {}); + EXPECT_EQ(per_thread->allocate(512), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergeNext) // NOLINT { - auto* ptr = arena_->allocate(256); - auto* ptr2 = arena_->allocate(256); - arena_->allocate(256); - arena_->deallocate(ptr2, 256, {}); - arena_->deallocate(ptr, 256, {}); - EXPECT_EQ(arena_->allocate(512), fake_address3); + auto* ptr = per_thread->allocate(256); + auto* ptr2 = per_thread->allocate(256); + per_thread->allocate(256); + per_thread->deallocate(ptr2, 256, {}); + per_thread->deallocate(ptr, 256, {}); + EXPECT_EQ(per_thread->allocate(512), fake_address3); } TEST_F(ArenaTest, ArenaDeallocateMergePreviousAndNext) // NOLINT { - auto* ptr = arena_->allocate(256); - auto* ptr2 = arena_->allocate(256); - arena_->deallocate(ptr, 256, {}); - arena_->deallocate(ptr2, 256, {}); - EXPECT_EQ(arena_->allocate(2_KiB), fake_address3); + auto* ptr = per_thread->allocate(256); + auto* ptr2 = per_thread->allocate(256); + per_thread->deallocate(ptr, 256, {}); + per_thread->deallocate(ptr2, 256, {}); + EXPECT_EQ(per_thread->allocate(2_KiB), fake_address3); } TEST_F(ArenaTest, ArenaDefragment) // NOLINT @@ -455,14 +452,14 @@ TEST_F(ArenaTest, ArenaDefragment) // NOLINT std::vector pointers; std::size_t num_pointers{4}; for (std::size_t i = 0; i < num_pointers; i++) { - pointers.push_back(arena_->allocate(superblock::minimum_size)); + pointers.push_back(per_thread->allocate(superblock::minimum_size)); } for (auto* ptr : pointers) { - 
arena_->deallocate(ptr, superblock::minimum_size, {}); + per_thread->deallocate(ptr, superblock::minimum_size, {}); } - EXPECT_EQ(global_arena_->allocate(arena_size_), nullptr); - arena_->defragment(); - EXPECT_EQ(global_arena_->allocate(arena_size_), fake_address3); + EXPECT_EQ(global->allocate(arena_size), nullptr); + per_thread->defragment(); + EXPECT_EQ(global->allocate(arena_size), fake_address3); } /** @@ -476,6 +473,13 @@ TEST_F(ArenaTest, ThrowOnNullUpstream) // NOLINT EXPECT_THROW(construct_nullptr(), rmm::logic_error); } +TEST_F(ArenaTest, SizeSmallerThanSuperblockSize) // NOLINT +{ + auto construct_small = []() { arena_mr mr{rmm::mr::get_current_device_resource(), 256}; }; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto) + EXPECT_THROW(construct_small(), rmm::logic_error); +} + TEST_F(ArenaTest, AllocateNinetyPercent) // NOLINT { EXPECT_NO_THROW([]() { // NOLINT(cppcoreguidelines-avoid-goto) From 96c976b795e45ed680dbd0156ca926333cfa3689 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 11 Jan 2022 17:15:06 -0800 Subject: [PATCH 33/35] clang format --- tests/mr/device/arena_mr_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index c9e9e5e37..3541ffbfc 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -61,8 +61,8 @@ struct ArenaTest : public ::testing::Test { { EXPECT_CALL(mock_mr, allocate(arena_size)).WillOnce(Return(fake_address3)); EXPECT_CALL(mock_mr, deallocate(fake_address3, arena_size)); - global = std::make_unique(&mock_mr, arena_size); - per_thread = std::make_unique(*global); + global = std::make_unique(&mock_mr, arena_size); + per_thread = std::make_unique(*global); } std::size_t arena_size{superblock::minimum_size * 4}; From a97565dd9efd6a27eb2d20328c5f3f4d56d1fd32 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 11 Jan 2022 19:02:37 -0800 Subject: [PATCH 34/35] increase test coverage --- .../rmm/mr/device/arena_memory_resource.hpp | 12 -------- include/rmm/mr/device/detail/arena.hpp | 30 ------------------- tests/mr/device/arena_mr_tests.cpp | 6 +++- 3 files changed, 5 insertions(+), 43 deletions(-) diff --git a/include/rmm/mr/device/arena_memory_resource.hpp b/include/rmm/mr/device/arena_memory_resource.hpp index 0fa77b896..f1b4e40c4 100644 --- a/include/rmm/mr/device/arena_memory_resource.hpp +++ b/include/rmm/mr/device/arena_memory_resource.hpp @@ -310,18 +310,6 @@ class arena_memory_resource final : public device_memory_resource { logger_->info("**************************************************"); logger_->info("Global arena:"); global_arena_.dump_memory_log(logger_); - logger_->debug("Per-thread arenas:"); - for (auto const& thread_arena : thread_arenas_) { - logger_->debug(" Thread {}:", thread_arena.first); - thread_arena.second->dump_memory_log(logger_); - } - if (!stream_arenas_.empty()) { - logger_->debug("Per-stream arenas:"); - for (auto const& stream_arena : stream_arenas_) { - logger_->debug(" Stream {}:", static_cast(stream_arena.first)); - stream_arena.second.dump_memory_log(logger_); - } - } logger_->flush(); } diff --git a/include/rmm/mr/device/detail/arena.hpp b/include/rmm/mr/device/detail/arena.hpp index 1dd1fbc6d..c0e5df377 100644 --- a/include/rmm/mr/device/detail/arena.hpp +++ b/include/rmm/mr/device/detail/arena.hpp @@ -876,36 +876,6 @@ class arena { } } - /** - * Dump memory to log. 
- * - * @param logger the spdlog logger to use - */ - void dump_memory_log(std::shared_ptr const& logger) const - { - std::lock_guard lock(mtx_); - logger->debug(" # superblocks: {}", superblocks_.size()); - if (!superblocks_.empty()) { - logger->debug(" Total size of superblocks: {}", - rmm::detail::bytes{total_memory_size(superblocks_)}); - logger->debug(" Size of largest free block: {}", - rmm::detail::bytes{max_free_size(superblocks_)}); - auto index = 0; - for (auto const& sblk : superblocks_) { - logger->debug( - " Superblock {}: start={}, end={}, size={}, empty={}, # free blocks={}, max free={}", - index, - fmt::ptr(sblk.pointer()), - fmt::ptr(sblk.end()), - rmm::detail::bytes{sblk.size()}, - sblk.empty(), - sblk.free_blocks(), - rmm::detail::bytes{sblk.max_free_size()}); - index++; - } - } - } - private: /** * @brief Get an available memory block of at least `size` bytes. diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index 3541ffbfc..c7c7f578c 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -277,7 +277,9 @@ TEST_F(ArenaTest, SuperblockCoalesceMergePreviousAndNext) // NOLINT TEST_F(ArenaTest, SuperblockMaxFreeSize) // NOLINT { superblock sblk{fake_address3, superblock::minimum_size}; - sblk.first_fit(superblock::minimum_size / 2); + auto const blk = sblk.first_fit(superblock::minimum_size / 4); + sblk.first_fit(superblock::minimum_size / 4); + sblk.coalesce(blk); EXPECT_EQ(sblk.max_free_size(), superblock::minimum_size / 2); } @@ -394,8 +396,10 @@ TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT { auto sblk = global->acquire(512); auto const blk = sblk.first_fit(512); + auto const blk2 = sblk.first_fit(1024); global->release(std::move(sblk)); global->deallocate(blk.pointer(), blk.size()); + global->deallocate(blk2.pointer(), blk2.size()); EXPECT_EQ(global->allocate(arena_size), fake_address3); } From 5cf9360b68700c8713853146289538eeb7cd7a23 Mon Sep 17 00:00:00 2001 From: Rong Ou Date: Tue, 11 Jan 2022 19:04:41 -0800 Subject: [PATCH 35/35] clang format --- tests/mr/device/arena_mr_tests.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mr/device/arena_mr_tests.cpp b/tests/mr/device/arena_mr_tests.cpp index c7c7f578c..b86e2457c 100644 --- a/tests/mr/device/arena_mr_tests.cpp +++ b/tests/mr/device/arena_mr_tests.cpp @@ -394,8 +394,8 @@ TEST_F(ArenaTest, GlobalArenaDeallocateAlignUp) // NOLINT TEST_F(ArenaTest, GlobalArenaDeallocateFromOtherArena) // NOLINT { - auto sblk = global->acquire(512); - auto const blk = sblk.first_fit(512); + auto sblk = global->acquire(512); + auto const blk = sblk.first_fit(512); auto const blk2 = sblk.first_fit(1024); global->release(std::move(sblk)); global->deallocate(blk.pointer(), blk.size());
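Taken together, the RMM_EXPECTS check added to global_arena's constructor and the new SizeSmallerThanSuperblockSize test change the failure mode for undersized arenas from a debug-only RMM_LOGGING_ASSERT to an exception that release builds also see. A minimal sketch of how that surfaces to callers of arena_memory_resource, assuming an arena_mr alias like the one in arena_mr_tests.cpp (the alias and the error handling shown here are illustrative, not part of these patches):

#include <rmm/mr/device/arena_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <iostream>
#include <stdexcept>

// Illustrative alias; arena_mr_tests.cpp defines a similar one.
using arena_mr = rmm::mr::arena_memory_resource<rmm::mr::device_memory_resource>;

int main()
{
  try {
    // 256 bytes is below superblock::minimum_size, so construction is
    // rejected up front instead of tripping a logging assert later.
    arena_mr mr{rmm::mr::get_current_device_resource(), 256};
  } catch (std::logic_error const& err) {  // rmm::logic_error derives from std::logic_error
    std::cerr << "arena construction rejected: " << err.what() << '\n';
  }
  return 0;
}

Catching std::logic_error is sufficient here because the rmm::logic_error thrown by RMM_EXPECTS inherits from it; the tests in this series catch the derived type directly via EXPECT_THROW.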