
Add rmm::prefetch() and DeviceBuffer.prefetch() #1573

Merged: 32 commits from fea-prefetch-device_buffer into branch-24.08, merged Jun 12, 2024.
The diff below shows changes from the first 18 commits.

Commits:
212e777  Add prefetch() method to device_buffer/DeviceBuffer (harrism, May 30, 2024)
40bc935  Revert addition of device_buffer::prefetch (harrism, Jun 4, 2024)
99f341c  Add new `rmm::prefetch()` functions and tests. (harrism, Jun 4, 2024)
62ecd78  Update Python DeviceBuffer.prefetch() and add pytest. (harrism, Jun 4, 2024)
414cb64  doc (harrism, Jun 4, 2024)
4778b27  Support prefetch on device_scalar (harrism, Jun 4, 2024)
81dc818  Merge branch 'branch-24.08' into fea-prefetch-device_buffer (harrism, Jun 4, 2024)
bc3746b  Add `device_scalar::size()` function so we can implicitly convert to … (harrism, Jun 4, 2024)
73a1b0d  Add conversion operators to `cuda::std::span` (harrism, Jun 4, 2024)
4d29f84  Replace various prefetch() functions with a single that takes a span (harrism, Jun 4, 2024)
3661f62  Remove smelly span conversion from `device_buffer`. (harrism, Jun 5, 2024)
84d77d6  Add to doxygen utilities group (harrism, Jun 5, 2024)
407682f  Improve Python documentation (harrism, Jun 5, 2024)
e79b58c  Revert changes to device_buffer includes and copyright. (harrism, Jun 5, 2024)
1acb374  doc (harrism, Jun 5, 2024)
3ef521b  whitespace (harrism, Jun 5, 2024)
9275976  Line length (harrism, Jun 5, 2024)
450c7e9  Fix self.stream default argument (harrism, Jun 5, 2024)
17310a5  Apply suggestions from code review (harrism, Jun 5, 2024)
125cc67  Improve docs. (harrism, Jun 6, 2024)
4537cb0  Add validation of prefetching using cudaMemRangeGetAttribute (harrism, Jun 6, 2024)
5b57c05  Add nullary ctor to aid in Cython usage (harrism, Jun 7, 2024)
9b15eaa  Remove c_prefetch, consolidate cython code (harrism, Jun 7, 2024)
6ac4576  Remove template, add const (harrism, Jun 7, 2024)
d9cbeaa  Update python/rmm/rmm/_lib/device_buffer.pyx (harrism, Jun 7, 2024)
92c74d7  Test buffer prefetching using cuda Python (harrism, Jun 7, 2024)
1fbcb48  Style (harrism, Jun 7, 2024)
d20541b  Merge branch 'fea-prefetch-device_buffer' of github.com:harrism/rmm i… (harrism, Jun 7, 2024)
2bccc74  Update Cython for prefetch() to match C++ changes. (harrism, Jun 7, 2024)
60aed37  Merge branch 'branch-24.08' into fea-prefetch-device_buffer (harrism, Jun 11, 2024)
c43942a  Work around CCCL regression in test (harrism, Jun 12, 2024)
5165889  Improve documentation of device parameter. Coauthored by @leofang (harrism, Jun 12, 2024)
6 changes: 6 additions & 0 deletions include/rmm/device_scalar.hpp
@@ -43,6 +43,7 @@ class device_scalar {
static_assert(std::is_trivially_copyable<T>::value, "Scalar type must be trivially copyable");

using value_type = typename device_uvector<T>::value_type; ///< T, the type of the scalar element
using size_type = typename device_uvector<T>::size_type; ///< The type used for the size
using reference = typename device_uvector<T>::reference; ///< value_type&
using const_reference = typename device_uvector<T>::const_reference; ///< const value_type&
using pointer =
@@ -254,6 +255,11 @@ class device_scalar {
return static_cast<const_pointer>(_storage.data());
}

/**
* @briefreturn{The size of the scalar: always 1}
*/
[[nodiscard]] constexpr size_type size() const noexcept { return 1; }

/**
* @briefreturn{Stream associated with the device memory allocation}
*/
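Context for this change: `device_scalar` gains a `size()` accessor (always 1) so that, together with `data()`, it can participate in the span-based `rmm::prefetch()` API introduced below. A minimal sketch of the resulting usage, mirroring the typed tests later in this PR; the managed memory resource is an assumption so the prefetch is not a no-op:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;  // managed (unified) memory, so prefetch is meaningful
  rmm::cuda_stream stream;
  rmm::device_scalar<int> scalar(stream, &mr);

  // data() plus the new size() let the scalar convert to a one-element
  // cuda::std::span, which the span overload of rmm::prefetch() accepts.
  rmm::prefetch<int>(scalar, rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```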
74 changes: 74 additions & 0 deletions include/rmm/prefetch.hpp
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/error.hpp>

#include <cuda/std/span>

namespace rmm {

/**
* @addtogroup utilities
* @{
* @file
*/

/**
* @brief Prefetch memory to the specified device on the specified stream.
*
* This function is a no-op if the pointer is not to CUDA managed memory.
*
* @throw rmm::cuda_error if the prefetch fails.
*
* @tparam T The type of the elements pointed to by `ptr`.
* @param ptr The pointer to the memory to prefetch
* @param size The number of bytes to prefetch
* @param device The device to prefetch to
* @param stream The stream to use for the prefetch
*/
template <typename T>
void prefetch(T* ptr, std::size_t size, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
Contributor: If `ptr` is typed, then `size` shouldn't be bytes, it should be elements.

Contributor: Agreed. If this takes `T* ptr`, it should use `size * sizeof(T)` to compute the bytes. Or, if this is really designed for the case of device buffers, it could just use `void* ptr` and accept size in bytes.

Member (author): I'm switching it back to `void const*` because then we can use `span::size_bytes()` in the span function. Someone suggested the `T*` version during discussions but I can't remember who or why. If there is a good reason, I'm all ears.

{
auto result = cudaMemPrefetchAsync(ptr, size, device.value(), stream.value());
// InvalidValue error is raised when non-managed memory is passed to cudaMemPrefetchAsync
// We should treat this as a no-op
if (result != cudaErrorInvalidValue && result != cudaSuccess) { RMM_CUDA_TRY(result); }
}
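For reference alongside the thread above, this is how the pointer-and-size overload is exercised. A sketch mirroring the PointerAndSize test added later in this PR; the managed memory resource is an assumption so the prefetch is not a no-op:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_buffer buf(1024, stream, &mr);  // 1024 bytes of managed memory

  // device_buffer::data() returns void*, and size() is in bytes, which is
  // why the discussion above leans toward a void const* / byte-count API.
  rmm::prefetch(buf.data(), buf.size(), rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```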

/**
* @brief Prefetch a span of memory to the specified device on the specified stream.
*
* This function is a no-op if the buffer is not backed by CUDA managed memory.
*
* @throw rmm::cuda_error if the prefetch fails.
*
* @param data The span to prefetch
* @param device The device to prefetch to
* @param stream The stream to use for the prefetch
*/
template <typename T>
void prefetch(cuda::std::span<T> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
Contributor: This function shouldn't be mutating any of the data.
Suggested change:
- void prefetch(cuda::std::span<T> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
+ void prefetch(cuda::std::span<T const> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)

{
prefetch(data.data(), data.size_bytes(), device, stream);
}
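Because RMM containers such as `device_uvector` gain conversion operators to `cuda::std::span` in this PR, the span overload can be called without spelling out a pointer and byte count. A short sketch, under the same managed-memory assumption as above:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(1000, stream, &mr);

  // The uvector converts implicitly to cuda::std::span<int>; the overload
  // then computes the byte count itself via span::size_bytes().
  rmm::prefetch<int>(vec, rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```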

/** @} */ // end of group

} // namespace rmm
32 changes: 32 additions & 0 deletions python/rmm/docs/guide.md
@@ -67,6 +67,38 @@ host:
array([1., 2., 3.])
```

## Prefetching DeviceBuffers Backed by Managed Memory

CUDA Unified Memory, also known as managed memory, can be allocated using
`rmm.ManagedMemoryResource` explicitly, or by calling `rmm.reinitialize` with
`managed_memory=True`.

A DeviceBuffer backed by managed memory may be prefetched to a specified
device:

```python
>>> import rmm
>>> rmm.reinitialize(managed_memory=True)
>>> buf = rmm.DeviceBuffer(size=100)
>>> buf.prefetch()
```

The above example prefetches the DeviceBuffer memory to the current CUDA device
on the stream that the DeviceBuffer last used (e.g. at construction). The
destination device ID and stream are optional parameters.

```python
>>> import rmm
>>> rmm.reinitialize(managed_memory=True)
>>> from rmm._cuda.stream import Stream
>>> stream = Stream()
>>> buf = rmm.DeviceBuffer(size=100, stream=stream)
>>> buf.prefetch(device=3, stream=stream)  # prefetch to device 3 on the given stream
```

`DeviceBuffer.prefetch()` is a no-op if the DeviceBuffer is not backed
by managed memory.

### MemoryResource objects

`MemoryResource` objects are used to configure how device memory allocations are made by
17 changes: 17 additions & 0 deletions python/rmm/rmm/_lib/device_buffer.pxd
@@ -23,6 +23,22 @@ from rmm._lib.memory_resource cimport (
)


cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:
cdef cppclass cuda_device_id:
ctypedef int value_type

cuda_device_id(value_type id)

value_type value()

cdef cuda_device_id get_current_cuda_device()

cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil:
cdef void prefetch[T](const T* ptr,
size_t bytes,
cuda_device_id device,
cuda_stream_view stream) except +

cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
cdef cppclass device_buffer:
device_buffer()
@@ -82,6 +98,7 @@ cdef class DeviceBuffer:
cpdef void resize(self, size_t new_size, Stream stream=*) except *
cpdef size_t capacity(self) except *
cdef void* c_data(self) except *
cdef void c_prefetch(self, cuda_device_id device, Stream stream=*) except *

cdef device_buffer c_release(self) except *

28 changes: 28 additions & 0 deletions python/rmm/rmm/_lib/device_buffer.pyx
@@ -138,6 +138,26 @@ cdef class DeviceBuffer:
}
return intf

def prefetch(self, device=None, stream=None):
"""Prefetch buffer data to the specified device on the specified stream.

Assumes the storage for this DeviceBuffer is CUDA managed memory
(unified memory). If it is not, this function is a no-op.

Parameters
----------
device : optional
The CUDA device to which to prefetch the memory for this buffer.
Defaults to the current CUDA device.
stream : optional
CUDA stream to use for prefetching. Defaults to self.stream
"""
stream = self.stream if stream is None else stream
if device is None:
self.c_prefetch(get_current_cuda_device(), stream)
else:
self.c_prefetch(cuda_device_id(device), stream)

def copy(self):
"""Returns a copy of DeviceBuffer.

@@ -345,6 +365,14 @@ cdef class DeviceBuffer:
cdef void* c_data(self) except *:
return self.c_obj.get()[0].data()

cdef void c_prefetch(self,
cuda_device_id device,
Stream stream=DEFAULT_STREAM) except *:
prefetch(self.c_obj.get()[0].data(),
self.c_obj.get()[0].size(),
device,
stream.view())

cdef device_buffer c_release(self) except *:
"""
Releases ownership of the data held by this DeviceBuffer.
9 changes: 9 additions & 0 deletions python/rmm/rmm/tests/test_rmm.py
@@ -284,6 +284,15 @@ def test_rmm_device_buffer_pickle_roundtrip(hb):
assert hb3 == hb


@pytest.mark.parametrize(
"managed, pool", list(product([False, True], [False, True]))
)
def test_rmm_device_buffer_prefetch(pool, managed):
rmm.reinitialize(pool_allocator=pool, managed_memory=managed)
db = rmm.DeviceBuffer.to_device(np.zeros(256, dtype="u1"))
db.prefetch() # just test that it doesn't throw
Contributor: You might be able to test that a prefetch call was issued with
cudaMemRangeGetAttribute(void* data, size_t dataSize, cudaMemRangeAttribute attribute, const void* devPtr, size_t count),
with attribute cudaMemRangeAttributeLastPrefetchLocation. It ought to be possible to call this
via cuda-python (API docs). Also, note that this simply returns the last location that the
application requested to prefetch the memory range to; it gives no indication as to whether the
prefetch operation to that location has completed or even begun. You wouldn't know if the
prefetch completed or not, but you could verify that the prefetch request was issued.

Contributor: ...of course, as soon as I scrolled down, I see you did this exact thing in the
C++ tests, at Vyas's request. It would be nice to have a corresponding Python API test, since
it should be quick to write with cuda-python.

Member (author): OK...

Member (author): CUDA Python results in very ugly code due to its non-pythonic error handling. But I've done what you asked...

@pytest.mark.parametrize("stream", [cuda.default_stream(), cuda.stream()])
def test_rmm_pool_numba_stream(stream):
rmm.reinitialize(pool_allocator=True)
Expand Down
11 changes: 7 additions & 4 deletions tests/CMakeLists.txt
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -174,12 +174,15 @@ ConfigureTest(DEVICE_BUFFER_TEST device_buffer_tests.cu)
# device scalar tests
ConfigureTest(DEVICE_SCALAR_TEST device_scalar_tests.cpp)

# logger tests
ConfigureTest(LOGGER_TEST logger_tests.cpp)

# uvector tests
ConfigureTest(DEVICE_UVECTOR_TEST device_uvector_tests.cpp GPUS 1 PERCENT 60)

# prefetch tests
ConfigureTest(PREFETCH_TEST prefetch_tests.cpp)

# logger tests
ConfigureTest(LOGGER_TEST logger_tests.cpp)

# arena MR tests
ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp GPUS 1 PERCENT 60)

75 changes: 75 additions & 0 deletions tests/prefetch_tests.cpp
@@ -0,0 +1,75 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

#include <gtest/gtest.h>

#include <cstddef>
#include <random>

template <typename MemoryResourceType>
struct PrefetchTest : public ::testing::Test {
rmm::cuda_stream stream{};
std::size_t size{};
MemoryResourceType mr{};

PrefetchTest()
{
std::default_random_engine generator;

auto constexpr range_min{1000};
auto constexpr range_max{100000};
std::uniform_int_distribution<std::size_t> distribution(range_min, range_max);
size = distribution(generator);
}
};

using resources = ::testing::Types<rmm::mr::cuda_memory_resource, rmm::mr::managed_memory_resource>;

TYPED_TEST_CASE(PrefetchTest, resources);

// The following tests simply test compilation and that there are no exceptions thrown
// due to prefetching non-managed memory.

TYPED_TEST(PrefetchTest, PointerAndSize)
{
rmm::device_buffer buff(this->size, this->stream, &this->mr);
rmm::prefetch(buff.data(), buff.size(), rmm::get_current_cuda_device(), this->stream);
}

TYPED_TEST(PrefetchTest, DeviceUVector)
{
rmm::device_uvector<int> uvec(this->size, this->stream, &this->mr);
rmm::prefetch<int>(uvec, rmm::get_current_cuda_device(), this->stream);

// test iterator range of part of the vector (implicitly constructs a span)
rmm::prefetch<int>({uvec.begin(), std::next(uvec.begin(), this->size / 2)},
rmm::get_current_cuda_device(),
this->stream);
}

TYPED_TEST(PrefetchTest, DeviceScalar)
{
rmm::device_scalar<int> scalar(this->stream, &this->mr);
rmm::prefetch<int>(scalar, rmm::get_current_cuda_device(), this->stream);
}
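The tests above only verify that prefetching compiles and throws no exception. Per the review thread on the Python tests (and commit 4537cb0, "Add validation of prefetching using cudaMemRangeGetAttribute"), one can also verify that the prefetch request was actually issued. A hedged sketch of such a check, not the PR's exact test code:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstddef>

// Query the last location a managed range was requested to be prefetched to.
// Note: this only confirms the request was issued, not that it completed.
int last_prefetch_location(void const* ptr, std::size_t bytes)
{
  int location{-1};
  auto err = cudaMemRangeGetAttribute(&location,
                                      sizeof(location),
                                      cudaMemRangeAttributeLastPrefetchLocation,
                                      ptr,
                                      bytes);
  assert(err == cudaSuccess);
  return location;
}

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(1000, stream, &mr);

  rmm::prefetch<int>(vec, rmm::get_current_cuda_device(), stream);
  assert(last_prefetch_location(vec.data(), vec.size() * sizeof(int)) ==
         rmm::get_current_cuda_device().value());
  return 0;
}
```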