From 76a18c4ec48da054bcc5bdd7b735314282f9033b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:13:29 -0800 Subject: [PATCH 01/35] Add `copy_to_host` method to `rmm::device_buffer` This allows users to copy an `rmm::device_buffer` back to host memory given a pointer to the destination. It is up to the user to ensure they have allocated enough space to perform the copy and that the pointer is in fact on the host. --- include/rmm/device_buffer.hpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index b2bcc4936..400c28db1 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -383,6 +383,19 @@ class device_buffer { *-------------------------------------------------------------------------**/ mr::device_memory_resource* memory_resource() const noexcept { return _mr; } + /**--------------------------------------------------------------------------* + * @brief Copies rmm::device_buffer to a preallocated host buffer. + *-------------------------------------------------------------------------**/ + void copy_to_host(void* host_buffer) const { + cudaError_t err = cudaMemcpy(host_buf, + this->data(), + this->size(), + cudaMemcpyDeviceToHost); + if (status != cudaSuccess) { + throw std::runtime_error{"Failed to copy to host."}; + } + } + private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation From 695aabf6c8ac8309fe61b054975e1df0fcd92695 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:13:30 -0800 Subject: [PATCH 02/35] Add `copy_to_host` to Cython binding interface Make sure that we are able to use the `copy_to_host` method from Cython. --- python/rmm/_lib/device_buffer.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index b8df7c8b5..019094276 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -13,6 +13,7 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: void* data() size_t size() size_t capacity() + void copy_to_host(void* host_buffer) except * cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj From 88051e47898f345d8e24d9404865ca51f5cfd43b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:13:31 -0800 Subject: [PATCH 03/35] Define `tobytes` method in `DeviceBuffer` --- python/rmm/_lib/device_buffer.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 019094276..a46f144b8 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -21,6 +21,8 @@ cdef class DeviceBuffer: @staticmethod cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr) + cpdef bytes tobytes(self) + cdef size_t c_size(self) cpdef void resize(self, size_t new_size) cpdef size_t capacity(self) From 3d29c3404265f8e4bb21c6a2f4b6c43e9f4424d3 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:19:50 -0800 Subject: [PATCH 04/35] Implement `tobytes` using `copy_to_host` Allocate a `bytes` object that is uninitialized and then copy the data from the underlying `rmm::device_buffer` into the pointer held within `bytes`. --- python/rmm/_lib/device_buffer.pyx | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 80cfea66f..2425bbbc3 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -1,6 +1,8 @@ from libcpp.memory cimport unique_ptr from libc.stdint cimport uintptr_t +from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING + cdef class DeviceBuffer: @@ -50,6 +52,14 @@ cdef class DeviceBuffer: buf.c_obj = move(ptr) return buf + cpdef bytes tobytes(self): + cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) + cdef void* p = PyBytes_AS_STRING(b) + + self.c_obj.get()[0].copy_to_host(p) + + return b + cdef size_t c_size(self): return self.c_obj.get()[0].size() From 4d0b5fcdbab00ea4b6de6c4543f47b37104950dc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:33:13 -0800 Subject: [PATCH 05/35] Fix incorrect variable name --- include/rmm/device_buffer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 400c28db1..26de23ad3 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -387,7 +387,7 @@ class device_buffer { * @brief Copies rmm::device_buffer to a preallocated host buffer. *-------------------------------------------------------------------------**/ void copy_to_host(void* host_buffer) const { - cudaError_t err = cudaMemcpy(host_buf, + cudaError_t err = cudaMemcpy(host_buffer, this->data(), this->size(), cudaMemcpyDeviceToHost); From 105b6c043048e090f36f417faf6b1bd0be898a47 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:33:17 -0800 Subject: [PATCH 06/35] Use attributes directly in `copy_to_host` --- include/rmm/device_buffer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 26de23ad3..19b49f835 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -388,8 +388,8 @@ class device_buffer { *-------------------------------------------------------------------------**/ void copy_to_host(void* host_buffer) const { cudaError_t err = cudaMemcpy(host_buffer, - this->data(), - this->size(), + _data, + _size, cudaMemcpyDeviceToHost); if (status != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; From aaf38a399a8a596f94099e79a84e9bdc557774e8 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 18:35:26 -0800 Subject: [PATCH 07/35] Check that `host_buffer` is not `nullptr` --- include/rmm/device_buffer.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 19b49f835..470f38244 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -387,6 +387,9 @@ class device_buffer { * @brief Copies rmm::device_buffer to a preallocated host buffer. *-------------------------------------------------------------------------**/ void copy_to_host(void* host_buffer) const { + if (host_buffer == nullptr) { + throw std::runtime_error{"Cannot copy to `nullptr`."}; + } cudaError_t err = cudaMemcpy(host_buffer, _data, _size, From edbd381ef579860fe47889892ef75e5ebba4ceaf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Thu, 19 Dec 2019 19:07:59 -0800 Subject: [PATCH 08/35] Add CUDA stream and use `cudaMemcpyAsync` --- include/rmm/device_buffer.hpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 470f38244..843ceaed6 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -386,14 +386,16 @@ class device_buffer { /**--------------------------------------------------------------------------* * @brief Copies rmm::device_buffer to a preallocated host buffer. *-------------------------------------------------------------------------**/ - void copy_to_host(void* host_buffer) const { + void copy_to_host(void* host_buffer, cudaStream_t stream = 0) const { if (host_buffer == nullptr) { throw std::runtime_error{"Cannot copy to `nullptr`."}; } - cudaError_t err = cudaMemcpy(host_buffer, - _data, - _size, - cudaMemcpyDeviceToHost); + set_stream(stream); + cudaError_t err = cudaMemcpyAsync(host_buffer, + _data, + _size, + cudaMemcpyDeviceToHost, + this->stream()); if (status != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; } From f30a301dab33f3ae9476edb861efe0b34822fc42 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 20 Dec 2019 12:28:41 -0800 Subject: [PATCH 09/35] Fix incorrect variable name --- include/rmm/device_buffer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 843ceaed6..b93ca632e 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -396,7 +396,7 @@ class device_buffer { _size, cudaMemcpyDeviceToHost, this->stream()); - if (status != cudaSuccess) { + if (err != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; } } From 8edffa1155b1f12d75985d7e5bb42c83c9dcbe89 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Fri, 20 Dec 2019 12:33:19 -0800 Subject: [PATCH 10/35] Convert `copy_to_host` to a function --- include/rmm/device_buffer.hpp | 36 +++++++++++++++---------------- python/rmm/_lib/device_buffer.pxd | 3 ++- python/rmm/_lib/device_buffer.pyx | 2 +- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index b93ca632e..e25a9a6d2 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -383,24 +383,6 @@ class device_buffer { *-------------------------------------------------------------------------**/ mr::device_memory_resource* memory_resource() const noexcept { return _mr; } - /**--------------------------------------------------------------------------* - * @brief Copies rmm::device_buffer to a preallocated host buffer. - *-------------------------------------------------------------------------**/ - void copy_to_host(void* host_buffer, cudaStream_t stream = 0) const { - if (host_buffer == nullptr) { - throw std::runtime_error{"Cannot copy to `nullptr`."}; - } - set_stream(stream); - cudaError_t err = cudaMemcpyAsync(host_buffer, - _data, - _size, - cudaMemcpyDeviceToHost, - this->stream()); - if (err != cudaSuccess) { - throw std::runtime_error{"Failed to copy to host."}; - } - } - private: void* _data{nullptr}; ///< Pointer to device memory allocation std::size_t _size{}; ///< Requested size of the device memory allocation @@ -410,4 +392,22 @@ class device_buffer { mr::get_default_resource()}; ///< The memory resource used to ///< allocate/deallocate device memory }; + +/**--------------------------------------------------------------------------* + * @brief Copies rmm::device_buffer to a preallocated host buffer. + *-------------------------------------------------------------------------**/ +void copy_to_host(device_buffer& db, void* hb, cudaStream_t stream = 0) { + if (hb == nullptr) { + throw std::runtime_error{"Cannot copy to `nullptr`."}; + } + db.set_stream(stream); + cudaError_t err = cudaMemcpyAsync(hb, + db.data(), + db.size(), + cudaMemcpyDeviceToHost, + db.stream()); + if (err != cudaSuccess) { + throw std::runtime_error{"Failed to copy to host."}; + } +} } // namespace rmm diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index a46f144b8..22b1f222a 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -13,7 +13,8 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: void* data() size_t size() size_t capacity() - void copy_to_host(void* host_buffer) except * + + void copy_to_host(device_buffer& db, void* hb) except * cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 2425bbbc3..730a3f7fc 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -56,7 +56,7 @@ cdef class DeviceBuffer: cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef void* p = PyBytes_AS_STRING(b) - self.c_obj.get()[0].copy_to_host(p) + copy_to_host(self.c_obj.get()[0], p) return b From 60980387089911001b721e2c33ee8cf1de1569b1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 13:48:17 -0800 Subject: [PATCH 11/35] Use stream provided and don't change device_buffer --- include/rmm/device_buffer.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index e25a9a6d2..b93ecedac 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -396,16 +396,15 @@ class device_buffer { /**--------------------------------------------------------------------------* * @brief Copies rmm::device_buffer to a preallocated host buffer. *-------------------------------------------------------------------------**/ -void copy_to_host(device_buffer& db, void* hb, cudaStream_t stream = 0) { +void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (hb == nullptr) { throw std::runtime_error{"Cannot copy to `nullptr`."}; } - db.set_stream(stream); cudaError_t err = cudaMemcpyAsync(hb, db.data(), db.size(), cudaMemcpyDeviceToHost, - db.stream()); + stream); if (err != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; } From 93eda0825ce3d0e77945fec32321fde99756ec1f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 13:49:20 -0800 Subject: [PATCH 12/35] Add Mark's doc suggestion --- include/rmm/device_buffer.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index b93ecedac..2289752eb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -395,6 +395,8 @@ class device_buffer { /**--------------------------------------------------------------------------* * @brief Copies rmm::device_buffer to a preallocated host buffer. + * + * Copies device memory asynchronously on the specified stream *-------------------------------------------------------------------------**/ void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (hb == nullptr) { From 327bcadf7e2495b75121d5a57feb4265cc14a53b Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 13:51:36 -0800 Subject: [PATCH 13/35] Include changelog entry for this feature --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5557b80fa..c2f0b3003 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## New Features +- PR #219 Add method to copy `device_buffer` back to host memory + ## Improvements - PR #214 Add codeowners From b886251c23d771d696c5e4982d19ed75842f34c2 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 15:48:20 -0800 Subject: [PATCH 14/35] Add `const` in `copy_to_host` forward declaration --- python/rmm/_lib/device_buffer.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 22b1f222a..5238f18c3 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -14,7 +14,7 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: size_t size() size_t capacity() - void copy_to_host(device_buffer& db, void* hb) except * + void copy_to_host(const device_buffer& db, void* hb) except * cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj From 46407728d485576c8590f9e54317709ba0b3afcf Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 15:53:40 -0800 Subject: [PATCH 15/35] Add `stream` with default value --- python/rmm/_lib/device_buffer.pxd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 5238f18c3..0bcbfce4a 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -14,7 +14,9 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: size_t size() size_t capacity() - void copy_to_host(const device_buffer& db, void* hb) except * + void copy_to_host(const device_buffer& db, + void* hb, + cudaStream_t stream=*) except * cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj From 1d1af062a83e337f7ff1051ac161e6f762e194ea Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 16:03:04 -0800 Subject: [PATCH 16/35] Declare `copy_to_host` w/ & w/o `cudaStream_t` arg --- python/rmm/_lib/device_buffer.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 0bcbfce4a..591921fc5 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -14,9 +14,10 @@ cdef extern from "rmm/device_buffer.hpp" namespace "rmm" nogil: size_t size() size_t capacity() + void copy_to_host(const device_buffer& db, void* hb) except * void copy_to_host(const device_buffer& db, void* hb, - cudaStream_t stream=*) except * + cudaStream_t stream) except * cdef class DeviceBuffer: cdef unique_ptr[device_buffer] c_obj From 8f9662f9ac7f09405f18f4e39a175aa1d01a5515 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 16:52:58 -0800 Subject: [PATCH 17/35] Test copying to raw host pointer in C++ --- tests/device_buffer_tests.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/device_buffer_tests.cpp b/tests/device_buffer_tests.cpp index 45ebd9de8..0b32e70c0 100644 --- a/tests/device_buffer_tests.cpp +++ b/tests/device_buffer_tests.cpp @@ -111,6 +111,15 @@ TYPED_TEST(DeviceBufferTest, CopyFromRawDevicePointer) { EXPECT_EQ(cudaSuccess, cudaFree(device_memory)); } +TYPED_TEST(DeviceBufferTest, CopyToRawHostPointer) { + rmm::device_buffer buff(this->size); + std::vector host_data(this->size); + uint8_t* host_data_ptr = host_data.data(); + rmm::copy_to_host(buff, host_data_ptr); + EXPECT_EQ(0, buff.stream()); + // TODO check for equality between the contents of the two allocations +} + TYPED_TEST(DeviceBufferTest, CopyFromRawHostPointer) { std::vector host_data(this->size); rmm::device_buffer buff(static_cast(host_data.data()), this->size); From 1a2989dd925d898bc4d148f6b5211155c081dc7c Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 17:38:44 -0800 Subject: [PATCH 18/35] Define `p` as a `char*` and cast later --- python/rmm/_lib/device_buffer.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 730a3f7fc..0c9ba76ff 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -54,9 +54,9 @@ cdef class DeviceBuffer: cpdef bytes tobytes(self): cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) - cdef void* p = PyBytes_AS_STRING(b) + cdef char* p = PyBytes_AS_STRING(b) - copy_to_host(self.c_obj.get()[0], p) + copy_to_host(self.c_obj.get()[0], p) return b From b4f786df766697d1ccb9a3c0633a310685aebb31 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 18:27:56 -0800 Subject: [PATCH 19/35] Assign `const device_buffer*` beforehand --- python/rmm/_lib/device_buffer.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 0c9ba76ff..89289bd39 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -53,10 +53,11 @@ cdef class DeviceBuffer: return buf cpdef bytes tobytes(self): + cdef const device_buffer* dbp = self.c_obj.get() cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef char* p = PyBytes_AS_STRING(b) - copy_to_host(self.c_obj.get()[0], p) + copy_to_host(dbp[0], p) return b From 7fe5c1e1d81876eb188a5c16a42d425b826afe3d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 18:28:01 -0800 Subject: [PATCH 20/35] Release GIL when copying to host As it may take a bit to copy the data from device back to host, make sure to release the GIL so Python can do other things while this happens. After all we have already allocated the Python objects we care to have. This step is merely filling them up with data. --- python/rmm/_lib/device_buffer.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 89289bd39..fd0c6cd0a 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -57,7 +57,8 @@ cdef class DeviceBuffer: cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef char* p = PyBytes_AS_STRING(b) - copy_to_host(dbp[0], p) + with nogil: + copy_to_host(dbp[0], p) return b From 2f550ab82a1872b6748fdd4e00207da487f7c1e9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 18:40:44 -0800 Subject: [PATCH 21/35] Synchronize on CUDA stream before returning To make sure that the data is copied over to host before returning, make sure to call `cudaStreamSynchronize` last. If it fails, make sure to throw a `std::runtime_error` indicating this failure. --- include/rmm/device_buffer.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 2289752eb..4bb0b7fde 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -410,5 +410,8 @@ void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (err != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; } + if (cudaSuccess != cudaStreamSynchronize(stream)) { + throw std::runtime_error{"Stream sync failed."}; + } } } // namespace rmm From eec351adf2b8d99b529fb19f51e2b62b05fc322d Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Mon, 6 Jan 2020 19:04:55 -0800 Subject: [PATCH 22/35] Add Python test to convert `DeviceBuffer` to bytes --- python/rmm/tests/test_rmm.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/rmm/tests/test_rmm.py b/python/rmm/tests/test_rmm.py index 2c21384c1..f516fcedd 100644 --- a/python/rmm/tests/test_rmm.py +++ b/python/rmm/tests/test_rmm.py @@ -105,6 +105,11 @@ def test_rmm_device_buffer(size): assert b.__cuda_array_interface__["typestr"] == "|u1" assert b.__cuda_array_interface__["version"] == 0 + # Test conversion to bytes + s = b.tobytes() + assert isinstance(s, bytes) + assert len(s) == len(b) + # Test resizing b.resize(2) assert b.size == 2 From 65bd2d654059ca6706f9c712abd7c2ac8a0f82c9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 10:51:23 -0800 Subject: [PATCH 23/35] Allow passing `stream` into `tobytes` Defaults to `0` as it would have been otherwise if not specified. --- python/rmm/_lib/device_buffer.pxd | 2 +- python/rmm/_lib/device_buffer.pyx | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 591921fc5..638bc9e2f 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -25,7 +25,7 @@ cdef class DeviceBuffer: @staticmethod cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr) - cpdef bytes tobytes(self) + cpdef bytes tobytes(self, cudaStream_t stream=*) cdef size_t c_size(self) cpdef void resize(self, size_t new_size) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index fd0c6cd0a..42f778243 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -3,6 +3,8 @@ from libc.stdint cimport uintptr_t from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING +from rmm._lib.lib cimport cudaStream_t + cdef class DeviceBuffer: @@ -52,13 +54,13 @@ cdef class DeviceBuffer: buf.c_obj = move(ptr) return buf - cpdef bytes tobytes(self): + cpdef bytes tobytes(self, cudaStream_t stream=0): cdef const device_buffer* dbp = self.c_obj.get() cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef char* p = PyBytes_AS_STRING(b) with nogil: - copy_to_host(dbp[0], p) + copy_to_host(dbp[0], p, stream) return b From 1634ef2b4c4b1938249776d6200f0b9bb4d07d74 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 10:51:24 -0800 Subject: [PATCH 24/35] Drop stream synchronization from `copy_to_host` Since we want to keep the C++ asynchronous, drop the explicit synchronization step from `copy_to_host`. This will be moved to the Cython level instead. After all we still need to ensure that the `bytes` object we return in Python actually has valid data before returning. --- include/rmm/device_buffer.hpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 4bb0b7fde..2289752eb 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -410,8 +410,5 @@ void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (err != cudaSuccess) { throw std::runtime_error{"Failed to copy to host."}; } - if (cudaSuccess != cudaStreamSynchronize(stream)) { - throw std::runtime_error{"Stream sync failed."}; - } } } // namespace rmm From 28e91167d9d67cc6518cb2f820999f9e7aa20ec9 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 10:51:25 -0800 Subject: [PATCH 25/35] Extern `cudaError_t` in Cython As this is an `enum` with 200+ values that we are mostly uninterested in, skip defining all of these and just define `cudaSuccess` as `0`. If we find we need more, we can also define those later. After all this is still defined at the C++ level. We are just making Cython aware of it. --- python/rmm/_lib/lib.pxd | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/_lib/lib.pxd index f67483fee..0a2bf2ab6 100644 --- a/python/rmm/_lib/lib.pxd +++ b/python/rmm/_lib/lib.pxd @@ -28,6 +28,9 @@ ctypedef pair[size_t, size_t] memory_pair cdef extern from * nogil: + ctypedef enum cudaError_t "cudaError_t": + cudaSuccess = 0 + ctypedef void* cudaStream_t "cudaStream_t" From b098935fa0be5604ebe28a62e7fcae5ed211bd36 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 10:51:25 -0800 Subject: [PATCH 26/35] Extern `cudaStreamSynchronize` As we need to perform synchronization at the Cython level, ensure we can call `cudaStreamSynchronize` by externing it. --- python/rmm/_lib/lib.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/_lib/lib.pxd index 0a2bf2ab6..4c831da3d 100644 --- a/python/rmm/_lib/lib.pxd +++ b/python/rmm/_lib/lib.pxd @@ -33,6 +33,8 @@ cdef extern from * nogil: ctypedef void* cudaStream_t "cudaStream_t" + ctypedef cudaError_t cudaStreamSynchronize(cudaStream_t stream) + cdef uintptr_t c_alloc( size_t size, From 527814694d7149d23b3830cc01a051f7efdc4c5f Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:14:23 -0800 Subject: [PATCH 27/35] Sync stream in Cython To ensure we have a valid `bytes` object with data in it, synchronize at the Cython level. Besides we no longer synchronize in C++. So we cannot rely on that any more. --- python/rmm/_lib/device_buffer.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index 42f778243..bd70bd293 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -3,7 +3,7 @@ from libc.stdint cimport uintptr_t from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING -from rmm._lib.lib cimport cudaStream_t +from rmm._lib.lib cimport cudaError_t, cudaStream_t, cudaStreamSynchronize cdef class DeviceBuffer: @@ -58,9 +58,13 @@ cdef class DeviceBuffer: cdef const device_buffer* dbp = self.c_obj.get() cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef char* p = PyBytes_AS_STRING(b) + cdef cudaError_t err with nogil: copy_to_host(dbp[0], p, stream) + err = cudaStreamSynchronize(stream) + if err != cudaSuccess: + raise RuntimeError("Stream sync failed.") return b From 8559ecb25ff64a46d1182bda993eedaaf8271bb7 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:33:06 -0800 Subject: [PATCH 28/35] Take `uintptr_t` in `tobytes` call Instead of taking a `cudaStream_t` typed `stream` value in `tobytes`, take a `uintptr_t` and cast it to `cudaStream_t` later. This way we can leverage Cython's knowledge of how to coerce Python objects to `uintptr_t`. Then it is a simple matter to cast the `uintptr_t` to `cudaStream_t` later. --- python/rmm/_lib/device_buffer.pxd | 2 +- python/rmm/_lib/device_buffer.pyx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 638bc9e2f..0f8a4ea95 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -25,7 +25,7 @@ cdef class DeviceBuffer: @staticmethod cdef DeviceBuffer c_from_unique_ptr(unique_ptr[device_buffer] ptr) - cpdef bytes tobytes(self, cudaStream_t stream=*) + cpdef bytes tobytes(self, uintptr_t stream=*) cdef size_t c_size(self) cpdef void resize(self, size_t new_size) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index bd70bd293..bb6f185ec 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -54,15 +54,15 @@ cdef class DeviceBuffer: buf.c_obj = move(ptr) return buf - cpdef bytes tobytes(self, cudaStream_t stream=0): + cpdef bytes tobytes(self, uintptr_t stream=0): cdef const device_buffer* dbp = self.c_obj.get() cdef bytes b = PyBytes_FromStringAndSize(NULL, self.c_size()) cdef char* p = PyBytes_AS_STRING(b) cdef cudaError_t err with nogil: - copy_to_host(dbp[0], p, stream) - err = cudaStreamSynchronize(stream) + copy_to_host(dbp[0], p, stream) + err = cudaStreamSynchronize(stream) if err != cudaSuccess: raise RuntimeError("Stream sync failed.") From 10601c99eec02b4b64a8c042f911603229baf94e Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:33:25 -0800 Subject: [PATCH 29/35] Import `cudaSuccess` in `device_buffer` --- python/rmm/_lib/device_buffer.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index bb6f185ec..cc32c8dcb 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -3,7 +3,8 @@ from libc.stdint cimport uintptr_t from cpython.bytes cimport PyBytes_FromStringAndSize, PyBytes_AS_STRING -from rmm._lib.lib cimport cudaError_t, cudaStream_t, cudaStreamSynchronize +from rmm._lib.lib cimport (cudaError_t, cudaSuccess, + cudaStream_t, cudaStreamSynchronize) cdef class DeviceBuffer: From bcf359266e319cafbaceb808cc47207400aa05cc Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:36:43 -0800 Subject: [PATCH 30/35] Drop unneeded `ctypedef` --- python/rmm/_lib/lib.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/rmm/_lib/lib.pxd b/python/rmm/_lib/lib.pxd index 4c831da3d..9605dfb66 100644 --- a/python/rmm/_lib/lib.pxd +++ b/python/rmm/_lib/lib.pxd @@ -33,7 +33,7 @@ cdef extern from * nogil: ctypedef void* cudaStream_t "cudaStream_t" - ctypedef cudaError_t cudaStreamSynchronize(cudaStream_t stream) + cudaError_t cudaStreamSynchronize(cudaStream_t stream) cdef uintptr_t c_alloc( From 9e43bafcc23da2ae46e8df7457a324f76aad61b8 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:41:24 -0800 Subject: [PATCH 31/35] Document C++ parameters of `copy_to_host` --- include/rmm/device_buffer.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 2289752eb..20fcd92f3 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -397,6 +397,10 @@ class device_buffer { * @brief Copies rmm::device_buffer to a preallocated host buffer. * * Copies device memory asynchronously on the specified stream + * + * @param db `rmm::device_buffer` to copy to host + * @param hb host allocated buffer to copy data to + * @param stream CUDA stream on which memory may be allocated if the memory *-------------------------------------------------------------------------**/ void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (hb == nullptr) { From 8261367f41b3ff424100325a2af444886afb0d92 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:43:26 -0800 Subject: [PATCH 32/35] Document exceptions thrown by `copy_to_host` --- include/rmm/device_buffer.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 20fcd92f3..6f1f887b3 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -398,6 +398,8 @@ class device_buffer { * * Copies device memory asynchronously on the specified stream * + * @throws std::runtime_error if `hb` is `nullptr` or copy fails + * * @param db `rmm::device_buffer` to copy to host * @param hb host allocated buffer to copy data to * @param stream CUDA stream on which memory may be allocated if the memory From 37b48a22a0394d26e7c87d07e5918add2cf990d1 Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 11:46:18 -0800 Subject: [PATCH 33/35] Also import `uintptr_t` in `device_buffer.pxd` --- python/rmm/_lib/device_buffer.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/python/rmm/_lib/device_buffer.pxd b/python/rmm/_lib/device_buffer.pxd index 0f8a4ea95..d1b23c509 100644 --- a/python/rmm/_lib/device_buffer.pxd +++ b/python/rmm/_lib/device_buffer.pxd @@ -1,4 +1,5 @@ from libcpp.memory cimport unique_ptr +from libc.stdint cimport uintptr_t from rmm._lib.lib cimport cudaStream_t From 48b7158f5abab998345f2f51d16ddcdedb9a7b57 Mon Sep 17 00:00:00 2001 From: jakirkham Date: Tue, 7 Jan 2020 11:58:03 -0800 Subject: [PATCH 34/35] Update include/rmm/device_buffer.hpp Co-Authored-By: Jake Hemstad --- include/rmm/device_buffer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rmm/device_buffer.hpp b/include/rmm/device_buffer.hpp index 6f1f887b3..30698eec5 100644 --- a/include/rmm/device_buffer.hpp +++ b/include/rmm/device_buffer.hpp @@ -402,7 +402,7 @@ class device_buffer { * * @param db `rmm::device_buffer` to copy to host * @param hb host allocated buffer to copy data to - * @param stream CUDA stream on which memory may be allocated if the memory + * @param stream CUDA stream on which the device to host copy will be performed *-------------------------------------------------------------------------**/ void copy_to_host(const device_buffer& db, void* hb, cudaStream_t stream = 0) { if (hb == nullptr) { From f8ea21a75f46e230eb5856e960ee96853d9847fb Mon Sep 17 00:00:00 2001 From: John Kirkham Date: Tue, 7 Jan 2020 14:43:57 -0800 Subject: [PATCH 35/35] Add the CUDA error code in the Python exception --- python/rmm/_lib/device_buffer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/rmm/_lib/device_buffer.pyx b/python/rmm/_lib/device_buffer.pyx index cc32c8dcb..c1469af4f 100644 --- a/python/rmm/_lib/device_buffer.pyx +++ b/python/rmm/_lib/device_buffer.pyx @@ -65,7 +65,7 @@ cdef class DeviceBuffer: copy_to_host(dbp[0], p, stream) err = cudaStreamSynchronize(stream) if err != cudaSuccess: - raise RuntimeError("Stream sync failed.") + raise RuntimeError(f"Stream sync failed with error: {err}") return b