From ba99ff403de8285dda7139fda00ba9acdb2ed709 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Wed, 15 Nov 2023 15:32:14 +1100
Subject: [PATCH] Store and set the correct CUDA device in device_buffer
 (#1370)

This changes `device_buffer` to store the active CUDA device ID on creation, and to (possibly temporarily) set the active device to that ID before allocating or freeing memory. It also adds tests for the containers built on `device_buffer` (`device_buffer`, `device_uvector`, and `device_scalar`) to ensure they operate correctly when the active device is changed before an operation that allocates or deallocates memory for those containers.

This fixes #1342.

HOWEVER, there is an important question yet to answer: `rmm::device_vector` is just an alias for `thrust::device_vector`, which does not use `rmm::device_buffer` for storage. Users may therefore be surprised that, after this PR, the multidevice semantics of RMM containers differ from those of `thrust::device_vector` (and therefore `rmm::device_vector`).

Update: opinion is that it's probably OK to diverge from `device_vector`, and some think we should remove `rmm::device_vector`. ~While we discuss this I have set the DO NOT MERGE label.~

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Jake Hemstad (https://github.com/jrhemstad)

URL: https://github.com/rapidsai/rmm/pull/1370
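To illustrate the behavior this change guarantees, here is a minimal sketch (not part of the patch; `example` is a hypothetical function and a system with at least two GPUs is assumed):

```cpp
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>

#include <cuda_runtime_api.h>

// Hypothetical example function; assumes at least two visible GPUs.
void example()
{
  cudaSetDevice(0);
  rmm::device_buffer buf{128, rmm::cuda_stream_view{}};  // allocated on device 0

  cudaSetDevice(1);  // the active device changes before `buf` is destroyed

  // Before this change, `buf`'s destructor would free device-0 memory while
  // device 1 was active. With this change, the destructor temporarily
  // reactivates device 0, frees the allocation, and then restores device 1.
}
```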
---
 include/rmm/cuda_device.hpp             |  38 +++++-
 include/rmm/device_buffer.hpp           |  18 ++-
 tests/CMakeLists.txt                    |   3 +
 tests/container_multidevice_tests.cu    | 149 ++++++++++++++++++++++++
 tests/device_buffer_tests.cu            |   7 +-
 tests/device_check_resource_adaptor.hpp |  77 ++++++++++++
 6 files changed, 283 insertions(+), 9 deletions(-)
 create mode 100644 tests/container_multidevice_tests.cu
 create mode 100644 tests/device_check_resource_adaptor.hpp

diff --git a/include/rmm/cuda_device.hpp b/include/rmm/cuda_device.hpp
index e4a35ee16..054bbb920 100644
--- a/include/rmm/cuda_device.hpp
+++ b/include/rmm/cuda_device.hpp
@@ -34,7 +34,7 @@ struct cuda_device_id {
   using value_type = int;  ///< Integer type used for device identifier
 
   /**
-   * @brief Construct a `cuda_device_id` from the specified integer value
+   * @brief Construct a `cuda_device_id` from the specified integer value.
    *
    * @param dev_id The device's integer identifier
    */
@@ -43,6 +43,35 @@ struct cuda_device_id {
   /// @briefreturn{The wrapped integer value}
   [[nodiscard]] constexpr value_type value() const noexcept { return id_; }
 
+  // TODO re-add doxygen comment specifier /** for these hidden friend operators once this Breathe
+  // bug is fixed: https://github.com/breathe-doc/breathe/issues/916
+  //! @cond Doxygen_Suppress
+  /**
+   * @brief Compare two `cuda_device_id`s for equality.
+   *
+   * @param lhs The first `cuda_device_id` to compare.
+   * @param rhs The second `cuda_device_id` to compare.
+   * @return true if the two `cuda_device_id`s wrap the same integer value, false otherwise.
+   */
+  [[nodiscard]] constexpr friend bool operator==(cuda_device_id const& lhs,
+                                                 cuda_device_id const& rhs) noexcept
+  {
+    return lhs.value() == rhs.value();
+  }
+
+  /**
+   * @brief Compare two `cuda_device_id`s for inequality.
+   *
+   * @param lhs The first `cuda_device_id` to compare.
+   * @param rhs The second `cuda_device_id` to compare.
+   * @return true if the two `cuda_device_id`s wrap different integer values, false otherwise.
+   */
+  [[nodiscard]] constexpr friend bool operator!=(cuda_device_id const& lhs,
+                                                 cuda_device_id const& rhs) noexcept
+  {
+    return lhs.value() != rhs.value();
+  }
+  //! @endcond
  private:
   value_type id_;
 };
@@ -84,16 +113,17 @@ struct cuda_set_device_raii {
    * @param dev_id The device to set as the current CUDA device
    */
   explicit cuda_set_device_raii(cuda_device_id dev_id)
-    : old_device_{get_current_cuda_device()}, needs_reset_{old_device_.value() != dev_id.value()}
+    : old_device_{get_current_cuda_device()},
+      needs_reset_{dev_id.value() >= 0 && old_device_ != dev_id}
   {
-    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value()));
+    if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(dev_id.value())); }
   }
   /**
    * @brief Reactivates the previous CUDA device
    */
   ~cuda_set_device_raii() noexcept
   {
-    if (needs_reset_) RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value()));
+    if (needs_reset_) { RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(old_device_.value())); }
   }
 
   cuda_set_device_raii(cuda_set_device_raii const&) = delete;
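The RAII guard above is the core mechanism: it switches to a target device for the lifetime of a scope, restores the previous device on exit, and (new in this PR) skips the switch entirely for an invalid, negative ID. A minimal sketch of the pattern, using a hypothetical `free_on_owner` helper that is not part of RMM:

```cpp
#include <rmm/cuda_device.hpp>

// Hypothetical helper, not in RMM: frees `ptr` on the device that owns it.
void free_on_owner(rmm::cuda_device_id owner, [[maybe_unused]] void* ptr)
{
  // Switches to `owner` only if it differs from the active device; with this
  // PR, an invalid (negative) ID, e.g. from a moved-from buffer, is a no-op.
  rmm::cuda_set_device_raii dev{owner};

  // ... deallocate `ptr` here; the previous device is restored automatically
  // when `dev` goes out of scope ...
}
```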
diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp
index c69b9206b..20fa4f36e 100644
--- a/include/rmm/device_buffer.hpp
+++ b/include/rmm/device_buffer.hpp
@@ -15,6 +15,7 @@
  */
 #pragma once
 
+#include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/detail/error.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
@@ -109,6 +110,7 @@ class device_buffer {
                 mr::device_memory_resource* mr = mr::get_current_device_resource())
     : _stream{stream}, _mr{mr}
   {
+    cuda_set_device_raii dev{_device};
     allocate_async(size);
   }
 
@@ -137,6 +139,7 @@ class device_buffer {
                 mr::device_memory_resource* mr = mr::get_current_device_resource())
     : _stream{stream}, _mr{mr}
   {
+    cuda_set_device_raii dev{_device};
     allocate_async(size);
     copy_async(source_data, size);
   }
@@ -185,12 +188,14 @@ class device_buffer {
       _size{other._size},
       _capacity{other._capacity},
       _stream{other.stream()},
-      _mr{other._mr}
+      _mr{other._mr},
+      _device{other._device}
   {
     other._data     = nullptr;
     other._size     = 0;
     other._capacity = 0;
     other.set_stream(cuda_stream_view{});
+    other._device = cuda_device_id{-1};
   }
 
   /**
@@ -210,18 +215,21 @@ class device_buffer {
   device_buffer& operator=(device_buffer&& other) noexcept
   {
     if (&other != this) {
+      cuda_set_device_raii dev{_device};
       deallocate_async();
 
       _data     = other._data;
       _size     = other._size;
       _capacity = other._capacity;
       set_stream(other.stream());
-      _mr = other._mr;
+      _mr     = other._mr;
+      _device = other._device;
 
       other._data     = nullptr;
       other._size     = 0;
       other._capacity = 0;
       other.set_stream(cuda_stream_view{});
+      other._device = cuda_device_id{-1};
     }
     return *this;
   }
@@ -235,6 +243,7 @@ class device_buffer {
    */
   ~device_buffer() noexcept
   {
+    cuda_set_device_raii dev{_device};
     deallocate_async();
     _mr     = nullptr;
     _stream = cuda_stream_view{};
@@ -262,6 +271,7 @@ class device_buffer {
   {
     set_stream(stream);
     if (new_capacity > capacity()) {
+      cuda_set_device_raii dev{_device};
       auto tmp            = device_buffer{new_capacity, stream, _mr};
       auto const old_size = size();
       RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
@@ -303,6 +313,7 @@ class device_buffer {
     if (new_size <= capacity()) {
       _size = new_size;
     } else {
+      cuda_set_device_raii dev{_device};
       auto tmp = device_buffer{new_size, stream, _mr};
       RMM_CUDA_TRY(cudaMemcpyAsync(tmp.data(), data(), size(), cudaMemcpyDefault, stream.value()));
       *this = std::move(tmp);
@@ -326,6 +337,7 @@ class device_buffer {
   {
     set_stream(stream);
     if (size() != capacity()) {
+      cuda_set_device_raii dev{_device};
       // Invoke copy ctor on self which only copies `[0, size())` and swap it
       // with self. The temporary `device_buffer` will hold the old contents
       // which will then be destroyed
@@ -407,6 +419,7 @@ class device_buffer {
   mr::device_memory_resource* _mr{
     mr::get_current_device_resource()};  ///< The memory resource used to
                                          ///< allocate/deallocate device memory
+  cuda_device_id _device{get_current_cuda_device()};
 
   /**
    * @brief Allocates the specified amount of memory and updates the size/capacity accordingly.
@@ -457,6 +470,7 @@ class device_buffer {
   {
     if (bytes > 0) {
       RMM_EXPECTS(nullptr != source, "Invalid copy from nullptr.");
+      RMM_EXPECTS(nullptr != _data, "Invalid copy to nullptr.");
 
       RMM_CUDA_TRY(cudaMemcpyAsync(_data, source, bytes, cudaMemcpyDefault, stream().value()));
     }
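Note the move paths above: a moved-from buffer's device is set to the sentinel `cuda_device_id{-1}`, so its destructor's guard performs no device switch. A sketch with a hypothetical `make_buffer` helper (not part of the patch):

```cpp
#include <rmm/device_buffer.hpp>

#include <utility>

// Hypothetical helper illustrating the move semantics; not part of the patch.
rmm::device_buffer make_buffer()
{
  rmm::device_buffer src{256, rmm::cuda_stream_view{}};
  rmm::device_buffer dst = std::move(src);  // dst takes src's data, stream, mr, and device ID

  // `src` now holds cuda_device_id{-1}. When it is destroyed at the end of
  // this scope, its destructor's cuda_set_device_raii guard sees the invalid
  // ID and performs no device switch; there is no allocation left to free.
  return dst;
}
```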
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 36c3aa043..752496279 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -186,4 +186,7 @@ ConfigureTest(BINNING_MR_TEST mr/device/binning_mr_tests.cpp)
 # callback memory resource tests
 ConfigureTest(CALLBACK_MR_TEST mr/device/callback_mr_tests.cpp)
 
+# container multidevice tests
+ConfigureTest(CONTAINER_MULTIDEVICE_TEST container_multidevice_tests.cu)
+
 rapids_test_install_relocatable(INSTALL_COMPONENT_SET testing DESTINATION bin/gtests/librmm)
diff --git a/tests/container_multidevice_tests.cu b/tests/container_multidevice_tests.cu
new file mode 100644
index 000000000..9de9ddf40
--- /dev/null
+++ b/tests/container_multidevice_tests.cu
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device_check_resource_adaptor.hpp"
+#include "rmm/mr/device/per_device_resource.hpp"
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+
+#include <gtest/gtest.h>
+
+#include <type_traits>
+
+template <typename ContainerType>
+struct ContainerMultiDeviceTest : public ::testing::Test {};
+
+using containers =
+  ::testing::Types<rmm::device_buffer, rmm::device_uvector<int>, rmm::device_scalar<int>>;
+
+TYPED_TEST_CASE(ContainerMultiDeviceTest, containers);
+
+TYPED_TEST(ContainerMultiDeviceTest, CreateDestroyDifferentActiveDevice)
+{
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
+    auto* orig_mr = rmm::mr::get_current_device_resource();
+    auto check_mr = device_check_resource_adaptor{orig_mr};
+    rmm::mr::set_current_device_resource(&check_mr);
+
+    {
+      if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
+        auto buf = TypeParam(rmm::cuda_stream_view{});
+        RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1));  // force dtor with different active device
+      } else {
+        auto buf = TypeParam(128, rmm::cuda_stream_view{});
+        RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1));  // force dtor with different active device
+      }
+    }
+
+    RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
+    rmm::mr::set_current_device_resource(orig_mr);
+  }
+}
+
+TYPED_TEST(ContainerMultiDeviceTest, CreateMoveDestroyDifferentActiveDevice)
+{
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
+    auto* orig_mr = rmm::mr::get_current_device_resource();
+    auto check_mr = device_check_resource_adaptor{orig_mr};
+    rmm::mr::set_current_device_resource(&check_mr);
+
+    {
+      auto buf_1 = []() {
+        if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
+          return TypeParam(rmm::cuda_stream_view{});
+        } else {
+          return TypeParam(128, rmm::cuda_stream_view{});
+        }
+      }();
+
+      {
+        if constexpr (std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
+          // device_scalar's constructor takes only a stream, no size
+          auto buf_0 = TypeParam(rmm::cuda_stream_view{});
+          buf_1      = std::move(buf_0);
+        } else {
+          auto buf_0 = TypeParam(128, rmm::cuda_stream_view{});
+          buf_1      = std::move(buf_0);
+        }
+      }
+
+      RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1));  // force dtor with different active device
+    }
+
+    RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
+    rmm::mr::set_current_device_resource(orig_mr);
+  }
+}
+
+TYPED_TEST(ContainerMultiDeviceTest, ResizeDifferentActiveDevice)
+{
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
+    auto* orig_mr = rmm::mr::get_current_device_resource();
+    auto check_mr = device_check_resource_adaptor{orig_mr};
+    rmm::mr::set_current_device_resource(&check_mr);
+
+    if constexpr (not std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
+      auto buf = TypeParam(128, rmm::cuda_stream_view{});
+      RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1));  // force resize with different active device
+      buf.resize(1024, rmm::cuda_stream_view{});
+    }
+
+    RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
+    rmm::mr::set_current_device_resource(orig_mr);
+  }
+}
+
+TYPED_TEST(ContainerMultiDeviceTest, ShrinkDifferentActiveDevice)
+{
+  // Get the number of cuda devices
+  int num_devices = rmm::get_num_cuda_devices();
+
+  // only run on multidevice systems
+  if (num_devices >= 2) {
+    rmm::cuda_set_device_raii dev{rmm::cuda_device_id{0}};
+    auto* orig_mr = rmm::mr::get_current_device_resource();
+    auto check_mr = device_check_resource_adaptor{orig_mr};
+    rmm::mr::set_current_device_resource(&check_mr);
+
+    if constexpr (not std::is_same_v<TypeParam, rmm::device_scalar<int>>) {
+      auto buf = TypeParam(128, rmm::cuda_stream_view{});
+      RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(1));  // force resize/shrink with different active device
+      buf.resize(64, rmm::cuda_stream_view{});
+      buf.shrink_to_fit(rmm::cuda_stream_view{});
+    }
+
+    RMM_ASSERT_CUDA_SUCCESS(cudaSetDevice(0));
+    rmm::mr::set_current_device_resource(orig_mr);
+  }
+}
diff --git a/tests/device_buffer_tests.cu b/tests/device_buffer_tests.cu
index d4c34385e..e0d8e5555 100644
--- a/tests/device_buffer_tests.cu
+++ b/tests/device_buffer_tests.cu
@@ -14,8 +14,6 @@
  * limitations under the License.
  */
 
-#include <gtest/gtest.h>
-
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -29,9 +27,12 @@
 #include <thrust/equal.h>
 #include <thrust/sequence.h>
+
+#include <gtest/gtest.h>
+
 namespace testing {
 namespace thrust = THRUST_NS_QUALIFIER;
-}
+}  // namespace testing
 using namespace testing;
 
 #include <cstddef>
diff --git a/tests/device_check_resource_adaptor.hpp b/tests/device_check_resource_adaptor.hpp
new file mode 100644
index 000000000..f9ad4cf70
--- /dev/null
+++ b/tests/device_check_resource_adaptor.hpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <gtest/gtest.h>
+
+class device_check_resource_adaptor final : public rmm::mr::device_memory_resource {
+ public:
+  device_check_resource_adaptor(rmm::mr::device_memory_resource* upstream)
+    : device_id{rmm::get_current_cuda_device()}, upstream_(upstream)
+  {
+  }
+
+  [[nodiscard]] bool supports_streams() const noexcept override
+  {
+    return upstream_->supports_streams();
+  }
+
+  [[nodiscard]] bool supports_get_mem_info() const noexcept override
+  {
+    return upstream_->supports_get_mem_info();
+  }
+
+  [[nodiscard]] device_memory_resource* get_upstream() const noexcept { return upstream_; }
+
+ private:
+  [[nodiscard]] bool check_device_id() const { return device_id == rmm::get_current_cuda_device(); }
+
+  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    bool const is_correct_device = check_device_id();
+    EXPECT_TRUE(is_correct_device);
+    if (is_correct_device) { return upstream_->allocate(bytes, stream); }
+    return nullptr;
+  }
+
+  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    bool const is_correct_device = check_device_id();
+    EXPECT_TRUE(is_correct_device);
+    if (is_correct_device) { upstream_->deallocate(ptr, bytes, stream); }
+  }
+
+  [[nodiscard]] bool do_is_equal(
+    rmm::mr::device_memory_resource const& other) const noexcept override
+  {
+    if (this == &other) { return true; }
+    auto const* cast = dynamic_cast<device_check_resource_adaptor const*>(&other);
+    if (cast != nullptr) { return upstream_->is_equal(*cast->get_upstream()); }
+    return upstream_->is_equal(other);
+  }
+
+  [[nodiscard]] std::pair<std::size_t, std::size_t> do_get_mem_info(
+    rmm::cuda_stream_view stream) const override
+  {
+    return upstream_->get_mem_info(stream);
+  }
+
+  rmm::cuda_device_id device_id;
+  rmm::mr::device_memory_resource* upstream_{};
+};
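The adaptor above records the device that is active at construction and checks it on every allocation and deallocation. A sketch of the intended usage in a test (hypothetical `check_alloc_on_recorded_device` function, mirroring the pattern in container_multidevice_tests.cu):

```cpp
#include "device_check_resource_adaptor.hpp"

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

// Hypothetical test body mirroring the new multidevice tests.
void check_alloc_on_recorded_device()
{
  auto* orig_mr = rmm::mr::get_current_device_resource();
  auto check_mr = device_check_resource_adaptor{orig_mr};  // records the active device
  rmm::mr::set_current_device_resource(&check_mr);

  {
    // Allocation and deallocation both go through check_mr, which EXPECTs
    // that the device recorded at construction is active at that moment.
    rmm::device_buffer buf{128, rmm::cuda_stream_view{}};
  }

  rmm::mr::set_current_device_resource(orig_mr);  // restore for subsequent tests
}
```

Using `EXPECT_TRUE` inside the adaptor, rather than throwing, lets the check fail the test even when it fires on a deallocation path reached from a `noexcept` destructor.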