Add rmm::prefetch() and DeviceBuffer.prefetch() #1573
@@ -0,0 +1,74 @@
/*
 * Copyright (c) 2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <rmm/cuda_device.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/error.hpp>

#include <cuda/std/span>

namespace rmm {

/**
 * @addtogroup utilities
 * @{
 * @file
 */

/**
 * @brief Prefetch memory to the specified device on the specified stream.
 *
 * This function is a no-op if the pointer is not to CUDA managed memory.
 *
 * @throw rmm::cuda_error if the prefetch fails.
 *
 * @tparam T The type of the elements pointed to by `ptr`.
 * @param ptr The pointer to the memory to prefetch
 * @param size The number of bytes to prefetch
 * @param device The device to prefetch to
 * @param stream The stream to use for the prefetch
 */
template <typename T>
void prefetch(T* ptr, std::size_t size, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
{
  auto result = cudaMemPrefetchAsync(ptr, size, device.value(), stream.value());
  // InvalidValue error is raised when non-managed memory is passed to cudaMemPrefetchAsync.
  // We should treat this as a no-op.
  if (result != cudaErrorInvalidValue && result != cudaSuccess) { RMM_CUDA_TRY(result); }
}

/**
 * @brief Prefetch a span of memory to the specified device on the specified stream.
 *
 * This function is a no-op if the buffer is not backed by CUDA managed memory.
 *
 * @throw rmm::cuda_error if the prefetch fails.
 *
 * @param data The span to prefetch
 * @param device The device to prefetch to
 * @param stream The stream to use for the prefetch
 */
template <typename T>
void prefetch(cuda::std::span<T> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
{
  prefetch(data.data(), data.size_bytes(), device, stream);
}

Review comment: This function shouldn't be mutating any of the data. (A suggested change was attached here.)
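The attached suggestion isn't rendered in this view, so here is a hedged sketch of one const-correct reading of that comment; the actual suggested change may differ:

// Hedged sketch (not part of the diff): accepting a span of const elements
// makes it explicit that prefetching only reads the data.
template <typename T>
void prefetch(cuda::std::span<T const> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
{
  prefetch(data.data(), data.size_bytes(), device, stream);
}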
/** @} */  // end of group

}  // namespace rmm
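To make the new API concrete, here is a minimal usage sketch (not from the PR itself); it assumes a managed memory resource so the prefetch is not a no-op:

// Minimal usage sketch for rmm::prefetch(); assumes managed memory so the
// call is not a no-op.
#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

int main()
{
  rmm::mr::managed_memory_resource mr;
  rmm::cuda_stream stream;
  rmm::device_buffer buf(1024, stream, &mr);
  // Migrate the buffer to the current device ahead of first access.
  rmm::prefetch(buf.data(), buf.size(), rmm::get_current_cuda_device(), stream);
  stream.synchronize();
  return 0;
}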
@@ -284,6 +284,15 @@ def test_rmm_device_buffer_pickle_roundtrip(hb):
    assert hb3 == hb


@pytest.mark.parametrize(
    "managed, pool", list(product([False, True], [False, True]))
)
def test_rmm_device_buffer_prefetch(pool, managed):
    rmm.reinitialize(pool_allocator=pool, managed_memory=managed)
    db = rmm.DeviceBuffer.to_device(np.zeros(256, dtype="u1"))
    db.prefetch()  # just test that it doesn't throw
Review comment: You might be able to test that a prefetch call was issued. You wouldn't know if the prefetch completed or not, but you could verify that the prefetch request was issued.

Review comment: ...of course, as soon as I scrolled down, I see you did this exact thing in the C++ tests, at Vyas's request. It would be nice to have a corresponding Python API test, since it should be quick to write with cuda-python.

Review comment: OK...

Review comment: CUDA Python results in very ugly code due to its non-pythonic error handling. But I've done what you asked...
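For reference, a hedged C++ sketch of the kind of issuance check described above (the comments say the C++ tests already do this; the exact attribute queried is an assumption here, since it isn't shown in this view):

// Hedged sketch: check whether a prefetch of a managed range to `device`
// was issued, via cudaMemRangeGetAttribute. Returns -1 if the query fails
// (e.g. the range is not managed memory).
#include <cuda_runtime_api.h>
#include <cstddef>

int last_prefetch_matches(void const* ptr, std::size_t bytes, int device)
{
  int last_location{};  // device id recorded by the last prefetch request
  cudaError_t err = cudaMemRangeGetAttribute(&last_location,
                                             sizeof(last_location),
                                             cudaMemRangeAttributeLastPrefetchLocation,
                                             ptr,
                                             bytes);
  if (err != cudaSuccess) { return -1; }
  return last_location == device ? 1 : 0;  // issued, not necessarily completed
}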
@pytest.mark.parametrize("stream", [cuda.default_stream(), cuda.stream()])
def test_rmm_pool_numba_stream(stream):
    rmm.reinitialize(pool_allocator=True)
@@ -0,0 +1,75 @@
/*
 * Copyright (c) 2019-2024, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <rmm/cuda_stream.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
#include <rmm/prefetch.hpp>

#include <gtest/gtest.h>

#include <cstddef>
#include <random>

template <typename MemoryResourceType>
struct PrefetchTest : public ::testing::Test {
  rmm::cuda_stream stream{};
  std::size_t size{};
  MemoryResourceType mr{};

  PrefetchTest()
  {
    std::default_random_engine generator;

    auto constexpr range_min{1000};
    auto constexpr range_max{100000};
    std::uniform_int_distribution<std::size_t> distribution(range_min, range_max);
    size = distribution(generator);
  }
};

using resources = ::testing::Types<rmm::mr::cuda_memory_resource, rmm::mr::managed_memory_resource>;

TYPED_TEST_CASE(PrefetchTest, resources);

// The following tests simply test compilation and that there are no exceptions thrown
// due to prefetching non-managed memory.

TYPED_TEST(PrefetchTest, PointerAndSize)
{
  rmm::device_buffer buff(this->size, this->stream, &this->mr);
  rmm::prefetch(buff.data(), buff.size(), rmm::get_current_cuda_device(), this->stream);
}

TYPED_TEST(PrefetchTest, DeviceUVector)
{
  rmm::device_uvector<int> uvec(this->size, this->stream, &this->mr);
  rmm::prefetch<int>(uvec, rmm::get_current_cuda_device(), this->stream);

  // test iterator range of part of the vector (implicitly constructs a span)
  rmm::prefetch<int>({uvec.begin(), std::next(uvec.begin(), this->size / 2)},
                     rmm::get_current_cuda_device(),
                     this->stream);
}

TYPED_TEST(PrefetchTest, DeviceScalar)
{
  rmm::device_scalar<int> scalar(this->stream, &this->mr);
  rmm::prefetch<int>(scalar, rmm::get_current_cuda_device(), this->stream);
}
Review comment: If `ptr` is typed, then `size` shouldn't be bytes, it should be elements.

Review comment: Agreed. If this takes `T* ptr`, it should use `size * sizeof(T)` to compute the bytes. Or, if this is really designed for the case of device buffers, it could just use `void* ptr` and accept `size` in bytes.

Review comment: I'm switching it back to `void const*` because then we can use `span::size_bytes()` in the span function. Someone suggested the `T*` version during discussions, but I can't remember who or why. If there is a good reason, I'm all ears.
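For illustration, a hedged sketch of what the `void const*` form described above might look like; the final merged signature isn't shown in this diff, so treat the details as assumptions:

// Hedged sketch of the `void const*` variant the author describes switching to.
void prefetch(void const* ptr,
              std::size_t bytes,
              rmm::cuda_device_id device,
              rmm::cuda_stream_view stream)
{
  auto result = cudaMemPrefetchAsync(ptr, bytes, device.value(), stream.value());
  // Prefetching non-managed memory returns cudaErrorInvalidValue; treat it as a no-op.
  if (result != cudaErrorInvalidValue && result != cudaSuccess) { RMM_CUDA_TRY(result); }
}

// The span overload can then forward byte extents directly via size_bytes().
template <typename T>
void prefetch(cuda::std::span<T const> data, rmm::cuda_device_id device, rmm::cuda_stream_view stream)
{
  prefetch(data.data(), data.size_bytes(), device, stream);
}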