
Add rmm::prefetch() and DeviceBuffer.prefetch() #1573

Merged: 32 commits from fea-prefetch-device_buffer into branch-24.08, merged Jun 12, 2024.
The diff below shows changes from the first 18 commits.

Commits:
212e777  Add prefetch() method to device_buffer/DeviceBuffer (harrism, May 30, 2024)
40bc935  Revert addition of device_buffer::prefetch (harrism, Jun 4, 2024)
99f341c  Add new `rmm::prefetch()` functions and tests. (harrism, Jun 4, 2024)
62ecd78  Update Python DeviceBuffer.prefetch() and add pytest. (harrism, Jun 4, 2024)
414cb64  doc (harrism, Jun 4, 2024)
4778b27  Support prefetch on device_scalar (harrism, Jun 4, 2024)
81dc818  Merge branch 'branch-24.08' into fea-prefetch-device_buffer (harrism, Jun 4, 2024)
bc3746b  Add `device_scalar::size()` function so we can implicitly convert to … (harrism, Jun 4, 2024)
73a1b0d  Add conversion operators to `cuda::std::span` (harrism, Jun 4, 2024)
4d29f84  Replace various prefetch() functions with a single that takes a span (harrism, Jun 4, 2024)
3661f62  Remove smelly span conversion from `device_buffer`. (harrism, Jun 5, 2024)
84d77d6  Add to doxygen utilities group (harrism, Jun 5, 2024)
407682f  Improve Python documentation (harrism, Jun 5, 2024)
e79b58c  Revert changes to device_buffer includes and copyright. (harrism, Jun 5, 2024)
1acb374  doc (harrism, Jun 5, 2024)
3ef521b  whitespace (harrism, Jun 5, 2024)
9275976  Line length (harrism, Jun 5, 2024)
450c7e9  Fix self.stream default argument (harrism, Jun 5, 2024)
17310a5  Apply suggestions from code review (harrism, Jun 5, 2024)
125cc67  Improve docs. (harrism, Jun 6, 2024)
4537cb0  Add validation of prefetching using cudaMemRangeGetAttribute (harrism, Jun 6, 2024)
5b57c05  Add nullary ctor to aid in Cython usage (harrism, Jun 7, 2024)
9b15eaa  Remove c_prefetch, consolidate cython code (harrism, Jun 7, 2024)
6ac4576  Remove template, add const (harrism, Jun 7, 2024)
d9cbeaa  Update python/rmm/rmm/_lib/device_buffer.pyx (harrism, Jun 7, 2024)
92c74d7  Test buffer prefetching using cuda Python (harrism, Jun 7, 2024)
1fbcb48  Style (harrism, Jun 7, 2024)
d20541b  Merge branch 'fea-prefetch-device_buffer' of github.com:harrism/rmm i… (harrism, Jun 7, 2024)
2bccc74  Update Cython for prefetch() to match C++ changes. (harrism, Jun 7, 2024)
60aed37  Merge branch 'branch-24.08' into fea-prefetch-device_buffer (harrism, Jun 11, 2024)
c43942a  Work around CCCL regression in test (harrism, Jun 12, 2024)
5165889  Improve documentation of device parameter. Coauthored by @leofang (harrism, Jun 12, 2024)
6 changes: 6 additions & 0 deletions include/rmm/device_scalar.hpp
@@ -43,6 +43,7 @@ class device_scalar {
static_assert(std::is_trivially_copyable<T>::value, "Scalar type must be trivially copyable");

using value_type = typename device_uvector<T>::value_type; ///< T, the type of the scalar element
using size_type = typename device_uvector<T>::size_type; ///< The type used for the size
using reference = typename device_uvector<T>::reference; ///< value_type&
using const_reference = typename device_uvector<T>::const_reference; ///< const value_type&
using pointer =
@@ -254,6 +255,11 @@ class device_scalar {
return static_cast<const_pointer>(_storage.data());
}

/**
* @briefreturn{The size of the scalar: always 1}
*/
[[nodiscard]] constexpr size_type size() const noexcept { return 1; }

/**
* @briefreturn{Stream associated with the device memory allocation}
*/
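Context for this change: `device_scalar` gains a `size()` accessor (always 1) so that, together with `data()`, it can participate in the span-based `rmm::prefetch()` API introduced below. A minimal sketch of the resulting usage, mirroring the typed tests later in this PR; the managed memory resource is an assumption so the prefetch is not a no-op:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;  // managed (unified) memory, so prefetch is meaningful
  rmm::cuda_stream stream;
  rmm::device_scalar<int> scalar(stream, &mr);

  // data() plus the new size() let the scalar convert to a one-element
  // cuda::std::span, which the span overload of rmm::prefetch() accepts.
  rmm::prefetch<int>(scalar, rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```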
74 changes: 74 additions & 0 deletions include/rmm/prefetch.hpp
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/error.hpp>

#include <cuda/std/span>

namespace rmm {

/**
* @addtogroup utilities
* @{
* @file
*/

/**
* @brief Prefetch memory to the specified device on the specified stream.
*
* This function is a no-op if the pointer is not to CUDA managed memory.
*
* @throw rmm::cuda_error if the prefetch fails.
*
* @tparam T The type of the elements pointed to by `ptr`.
* @param ptr The pointer to the memory to prefetch
* @param size The number of bytes to prefetch
* @param device The device to prefetch to
* @param stream The stream to use for the prefetch
*/
template <typename T>
void prefetch(T* ptr, std::size_t size, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
Contributor: If `ptr` is typed, then `size` shouldn't be bytes, it should be elements.

Contributor: Agreed. If this takes `T* ptr`, it should use `size * sizeof(T)` to compute the bytes. Or, if this is really designed for the case of device buffers, it could just use `void* ptr` and accept size in bytes.

Member (author): I'm switching it back to `void const*` because then we can use `span::size_bytes()` in the span function. Someone suggested the `T*` version during discussions but I can't remember who or why. If there is a good reason, I'm all ears.

{
auto result = cudaMemPrefetchAsync(ptr, size, device.value(), stream.value());
// InvalidValue error is raised when non-managed memory is passed to cudaMemPrefetchAsync
// We should treat this as a no-op
if (result != cudaErrorInvalidValue && result != cudaSuccess) { RMM_CUDA_TRY(result); }
}
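For reference alongside the thread above, this is how the pointer-and-size overload is exercised. A sketch mirroring the PointerAndSize test added later in this PR; the managed memory resource is an assumption so the prefetch is not a no-op:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_buffer buf(1024, stream, &mr);  // 1024 bytes of managed memory

  // device_buffer::data() returns void*, and size() is in bytes, which is
  // why the discussion above leans toward a void const* / byte-count API.
  rmm::prefetch(buf.data(), buf.size(), rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```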

/**
* @brief Prefetch a span of memory to the specified device on the specified stream.
*
* This function is a no-op if the buffer is not backed by CUDA managed memory.
*
* @throw rmm::cuda_error if the prefetch fails.
*
* @param data The span to prefetch
* @param device The device to prefetch to
* @param stream The stream to use for the prefetch
*/
template <typename T>
void prefetch(cuda::std::span<T> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
Contributor: This function shouldn't be mutating any of the data.
Suggested change:
- void prefetch(cuda::std::span<T> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
+ void prefetch(cuda::std::span<T const> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)

{
prefetch(data.data(), data.size_bytes(), device, stream);
}
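Because RMM containers such as `device_uvector` gain conversion operators to `cuda::std::span` in this PR, the span overload can be called without spelling out a pointer and byte count. A short sketch, under the same managed-memory assumption as above:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(1000, stream, &mr);

  // The uvector converts implicitly to cuda::std::span<int>; the overload
  // then computes the byte count itself via span::size_bytes().
  rmm::prefetch<int>(vec, rmm::get_current_cuda_device(), stream);

  stream.synchronize();
  return 0;
}
```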

/** @} */ // end of group

} // namespace rmm
32 changes: 32 additions & 0 deletions python/rmm/docs/guide.md
@@ -67,6 +67,38 @@ host:
array([1., 2., 3.])
```

## Prefetching DeviceBuffers Backed by Managed Memory

CUDA Unified Memory, also known as managed memory, can be allocated using
`rmm.ManagedMemoryResource` explicitly, or by calling `rmm.reinitialize` with
`managed_memory=True`.

A DeviceBuffer backed by managed memory may be prefetched to a specified
device:

```python
>>> import rmm
>>> rmm.reinitialize(managed_memory=True)
>>> buf = rmm.DeviceBuffer(size=100)
>>> buf.prefetch()
```

The above example prefetches the DeviceBuffer memory to the current CUDA device
on the stream that the DeviceBuffer last used (e.g. at construction). The
destination device ID and stream are optional parameters.

```python
>>> import rmm
>>> rmm.reinitialize(managed_memory=True)
>>> from rmm._cuda.stream import Stream
>>> stream = Stream()
>>> buf = rmm.DeviceBuffer(size=100, stream=stream)
>>> buf.prefetch(device=3, stream=stream)  # prefetch to device 3 on the given stream
```

`DeviceBuffer.prefetch()` is a no-op if the DeviceBuffer is not backed
by managed memory.

### MemoryResource objects

`MemoryResource` objects are used to configure how device memory allocations are made by
17 changes: 17 additions & 0 deletions python/rmm/rmm/_lib/device_buffer.pxd
@@ -23,6 +23,22 @@ from rmm._lib.memory_resource cimport (
)


cdef extern from "rmm/mr/device/per_device_resource.hpp" namespace "rmm" nogil:
cdef cppclass cuda_device_id:
ctypedef int value_type

cuda_device_id(value_type id)

value_type value()

cdef cuda_device_id get_current_cuda_device()

cdef extern from "rmm/prefetch.hpp" namespace "rmm" nogil:
cdef void prefetch[T](const T* ptr,
size_t bytes,
cuda_device_id device,
cuda_stream_view stream) except +

cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil:
cdef cppclass device_buffer:
device_buffer()
@@ -82,6 +98,7 @@ cdef class DeviceBuffer:
cpdef void resize(self, size_t new_size, Stream stream=*) except *
cpdef size_t capacity(self) except *
cdef void* c_data(self) except *
cdef void c_prefetch(self, cuda_device_id device, Stream stream=*) except *

cdef device_buffer c_release(self) except *

28 changes: 28 additions & 0 deletions python/rmm/rmm/_lib/device_buffer.pyx
@@ -138,6 +138,26 @@ cdef class DeviceBuffer:
}
return intf

def prefetch(self, device=None, stream=None):
"""Prefetch buffer data to the specified device on the specified stream.

Assumes the storage for this DeviceBuffer is CUDA managed memory
(unified memory). If it is not, this function is a no-op.

Parameters
----------
device : optional
The CUDA device to which to prefetch the memory for this buffer.
Defaults to the current CUDA device.
stream : optional
CUDA stream to use for prefetching. Defaults to self.stream
"""
stream = self.stream if stream is None else stream
if device is None:
self.c_prefetch(get_current_cuda_device(), stream)
else:
self.c_prefetch(cuda_device_id(device), stream)

def copy(self):
"""Returns a copy of DeviceBuffer.

@@ -345,6 +365,14 @@ cdef class DeviceBuffer:
cdef void* c_data(self) except *:
return self.c_obj.get()[0].data()

cdef void c_prefetch(self,
cuda_device_id device,
Stream stream=DEFAULT_STREAM) except *:
prefetch(self.c_obj.get()[0].data(),
self.c_obj.get()[0].size(),
device,
stream.view())

cdef device_buffer c_release(self) except *:
"""
Releases ownership of the data held by this DeviceBuffer.
9 changes: 9 additions & 0 deletions python/rmm/rmm/tests/test_rmm.py
@@ -284,6 +284,15 @@ def test_rmm_device_buffer_pickle_roundtrip(hb):
assert hb3 == hb


@pytest.mark.parametrize(
"managed, pool", list(product([False, True], [False, True]))
)
def test_rmm_device_buffer_prefetch(pool, managed):
rmm.reinitialize(pool_allocator=pool, managed_memory=managed)
db = rmm.DeviceBuffer.to_device(np.zeros(256, dtype="u1"))
db.prefetch() # just test that it doesn't throw
Contributor: You might be able to test that a prefetch call was issued with
cudaMemRangeGetAttribute(void* data, size_t dataSize, cudaMemRangeAttribute attribute, const void* devPtr, size_t count),
with attribute cudaMemRangeAttributeLastPrefetchLocation. It ought to be possible to call this
via cuda-python (API docs). Also, note that this simply returns the last location that the
application requested to prefetch the memory range to; it gives no indication as to whether the
prefetch operation to that location has completed or even begun. You wouldn't know if the
prefetch completed or not, but you could verify that the prefetch request was issued.

Contributor: ...of course, as soon as I scrolled down, I see you did this exact thing in the
C++ tests, at Vyas's request. It would be nice to have a corresponding Python API test, since
it should be quick to write with cuda-python.

Member (author): OK...

Member (author): CUDA Python results in very ugly code due to its non-pythonic error handling. But I've done what you asked...

@pytest.mark.parametrize("stream", [cuda.default_stream(), cuda.stream()])
def test_rmm_pool_numba_stream(stream):
rmm.reinitialize(pool_allocator=True)
Expand Down
11 changes: 7 additions & 4 deletions tests/CMakeLists.txt
@@ -1,5 +1,5 @@
# =============================================================================
# Copyright (c) 2018-2023, NVIDIA CORPORATION.
# Copyright (c) 2018-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
@@ -174,12 +174,15 @@ ConfigureTest(DEVICE_BUFFER_TEST device_buffer_tests.cu)
# device scalar tests
ConfigureTest(DEVICE_SCALAR_TEST device_scalar_tests.cpp)

# logger tests
ConfigureTest(LOGGER_TEST logger_tests.cpp)

# uvector tests
ConfigureTest(DEVICE_UVECTOR_TEST device_uvector_tests.cpp GPUS 1 PERCENT 60)

# prefetch tests
ConfigureTest(PREFETCH_TEST prefetch_tests.cpp)

# logger tests
ConfigureTest(LOGGER_TEST logger_tests.cpp)

# arena MR tests
ConfigureTest(ARENA_MR_TEST mr/device/arena_mr_tests.cpp GPUS 1 PERCENT 60)

75 changes: 75 additions & 0 deletions tests/prefetch_tests.cpp
@@ -0,0 +1,75 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

#include <gtest/gtest.h>

#include <cstddef>
#include <random>

template <typename MemoryResourceType>
struct PrefetchTest : public ::testing::Test {
rmm::cuda_stream stream{};
std::size_t size{};
MemoryResourceType mr{};

PrefetchTest()
{
std::default_random_engine generator;

auto constexpr range_min{1000};
auto constexpr range_max{100000};
std::uniform_int_distribution<std::size_t> distribution(range_min, range_max);
size = distribution(generator);
}
};

using resources = ::testing::Types<rmm::mr::cuda_memory_resource, rmm::mr::managed_memory_resource>;

TYPED_TEST_CASE(PrefetchTest, resources);

// The following tests simply test compilation and that there are no exceptions thrown
// due to prefetching non-managed memory.

TYPED_TEST(PrefetchTest, PointerAndSize)
{
rmm::device_buffer buff(this->size, this->stream, &this->mr);
rmm::prefetch(buff.data(), buff.size(), rmm::get_current_cuda_device(), this->stream);
}

TYPED_TEST(PrefetchTest, DeviceUVector)
{
rmm::device_uvector<int> uvec(this->size, this->stream, &this->mr);
rmm::prefetch<int>(uvec, rmm::get_current_cuda_device(), this->stream);

// test iterator range of part of the vector (implicitly constructs a span)
rmm::prefetch<int>({uvec.begin(), std::next(uvec.begin(), this->size / 2)},
rmm::get_current_cuda_device(),
this->stream);
}

TYPED_TEST(PrefetchTest, DeviceScalar)
{
rmm::device_scalar<int> scalar(this->stream, &this->mr);
rmm::prefetch<int>(scalar, rmm::get_current_cuda_device(), this->stream);
}
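The tests above only verify that prefetching compiles and throws no exception. Per the review thread on the Python tests (and commit 4537cb0, "Add validation of prefetching using cudaMemRangeGetAttribute"), one can also verify that the prefetch request was actually issued. A hedged sketch of such a check, not the PR's exact test code:

```cpp
#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

#include <cuda_runtime_api.h>

#include <cassert>
#include <cstddef>

// Query the last location a managed range was requested to be prefetched to.
// Note: this only confirms the request was issued, not that it completed.
int last_prefetch_location(void const* ptr, std::size_t bytes)
{
  int location{-1};
  auto err = cudaMemRangeGetAttribute(&location,
                                      sizeof(location),
                                      cudaMemRangeAttributeLastPrefetchLocation,
                                      ptr,
                                      bytes);
  assert(err == cudaSuccess);
  return location;
}

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_uvector<int> vec(1000, stream, &mr);

  rmm::prefetch<int>(vec, rmm::get_current_cuda_device(), stream);
  assert(last_prefetch_location(vec.data(), vec.size() * sizeof(int)) ==
         rmm::get_current_cuda_device().value());
  return 0;
}
```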