From eb39019d55615160766af37ff6b36d51c23522a6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 16:24:30 -0700 Subject: [PATCH 01/75] remove pinned_host_vector --- cpp/benchmarks/io/text/multibyte_split.cpp | 8 +- .../detail/utilities/pinned_host_vector.hpp | 216 ------------------ .../cudf/detail/utilities/rmm_host_vector.hpp | 8 +- .../detail/utilities/vector_factories.hpp | 38 ++- cpp/src/io/csv/reader_impl.cu | 1 + cpp/src/io/orc/reader_impl_chunking.cu | 1 + cpp/src/io/orc/writer_impl.cu | 5 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 + cpp/src/io/parquet/writer_impl.cu | 3 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 16 +- .../io/text/data_chunk_source_factories.cpp | 23 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 12 files changed, 77 insertions(+), 248 deletions(-) delete mode 100644 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index b5d855d8881..172182c3607 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -132,9 +131,10 @@ static void bench_multibyte_split(nvbench::state& state, auto const delim_factor = static_cast(delim_percent) / 100; std::unique_ptr datasource; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); - auto host_input = std::vector{}; - auto host_pinned_input = cudf::detail::pinned_host_vector{}; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = std::vector{}; + auto host_pinned_input = + cudf::detail::make_pinned_vector_async(0, cudf::get_default_stream()); if (source_type != data_chunk_source_type::device && source_type != data_chunk_source_type::host_pinned) { diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp deleted file mode 100644 index c22b6a6ba15..00000000000 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2008-2024, NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include // for bad_alloc - -namespace cudf::detail { - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. 
- * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template <> -class pinned_allocator { - public: - using value_type = void; ///< The type of the elements in the allocator - using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = void const*; ///< The type returned by address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; -}; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator { - public: - using value_type = T; ///< The type of the elements in the allocator - using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = T const*; ///< The type returned by address() - using reference = T&; ///< The parameter type for address() - using const_reference = T const&; ///< The parameter type for address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; - - /** - * @brief pinned_allocator's null constructor does nothing. - */ - __host__ __device__ inline pinned_allocator() {} - - /** - * @brief pinned_allocator's null destructor does nothing. - */ - __host__ __device__ inline ~pinned_allocator() {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - */ - __host__ __device__ inline pinned_allocator(pinned_allocator const&) {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - * - * This version of pinned_allocator's copy constructor - * is templated on the \c value_type of the pinned_allocator - * to copy from. It is provided merely for convenience; it - * does nothing. - */ - template - __host__ __device__ inline pinned_allocator(pinned_allocator const&) - { - } - - /** - * @brief This method returns the address of a \c reference of - * interest. - * - * @param r The \c reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline pointer address(reference r) { return &r; } - - /** - * @brief This method returns the address of a \c const_reference - * of interest. - * - * @param r The \c const_reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline const_pointer address(const_reference r) { return &r; } - - /** - * @brief This method allocates storage for objects in pinned host - * memory. - * - * @param cnt The number of objects to allocate. - * @return a \c pointer to the newly allocated objects. - * @note The second parameter to this function is meant as a - * hint pointer to a nearby memory location, but is - * not used by this allocator. - * @note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. 
- */ - __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - - pointer result(0); - CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); - return result; - } - - /** - * @brief This method deallocates pinned host memory previously allocated - * with this \c pinned_allocator. - * - * @param p A \c pointer to the previously allocated memory. - * @note The second parameter is the number of objects previously allocated - * but is ignored by this allocator. - * @note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) - { - auto dealloc_worked = cudaFreeHost(p); - (void)dealloc_worked; - assert(dealloc_worked == cudaSuccess); - } - - /** - * @brief This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * @return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } - - /** - * @brief This method tests this \p pinned_allocator for equality to - * another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c true. - */ - __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; } - - /** - * @brief This method tests this \p pinned_allocator for inequality - * to another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c false. - */ - __host__ __device__ inline bool operator!=(pinned_allocator const& x) const - { - return !operator==(x); - } -}; - -/** - * @brief A vector class with pinned host memory allocator - */ -template -using pinned_host_vector = thrust::host_vector>; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp index 6901a19473e..6604020c224 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -32,8 +33,6 @@ namespace cudf::detail { /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c a `rmm::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template @@ -42,8 +41,6 @@ class rmm_host_allocator; /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c an `cudf::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template <> @@ -70,8 +67,7 @@ class rmm_host_allocator { * The \p rmm_host_allocator provides an interface for host memory allocation through the user * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of * this reference and therefore it is the user's responsibility to ensure its lifetime for the - * duration of the lifetime of the \p rmm_host_allocator. This implementation is ported from - * pinned_host_vector in cudf. + * duration of the lifetime of the \p rmm_host_allocator. 
* * \see https://en.cppreference.com/w/cpp/memory/allocator */ diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 293a4096c57..6f2287fc1c8 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -380,7 +382,7 @@ thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_ * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function does not synchronize `stream`. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -439,6 +441,40 @@ thrust::host_vector make_host_vector_sync( return make_host_vector_sync(device_span{c}, stream); } +/** + * @brief Asynchronously construct a `rmm_host_vector` of the given size + * + * @note This function does not synchronize `stream`. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A rmm_host_vector of the given size + */ +template +rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +{ + return rmm_host_vector(size, {cudf::io::get_host_memory_resource(), stream}); +} + +/** + * @brief Synchronously construct a `rmm_host_vector` of the given size + * + * @note This function synchronizes `stream`. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A rmm_host_vector of the given size + */ +template +rmm_host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +{ + auto result = make_pinned_vector_async(size, stream); + stream.synchronize(); + return result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 5dee0c17a33..05faded651d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,6 +27,7 @@ #include "io/utilities/parsing_utils.cuh" #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5034aa14a95..43301826003 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 344e216cdc8..e9e031a407a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2339,7 +2338,7 @@ auto convert_table_to_orc_data(table_view const& input, std::move(streams), std::move(stripes), std::move(stripe_dicts.views), - cudf::detail::pinned_host_vector()}; + cudf::detail::make_pinned_vector_async(0, stream)}; } // Allocate intermediate output stream buffer @@ -2407,7 +2406,7 @@ auto convert_table_to_orc_data(table_view const& input, return max_stream_size; }(); - cudf::detail::pinned_host_vector bounce_buffer(max_out_stream_size); + auto bounce_buffer = cudf::detail::make_pinned_vector_async(max_out_stream_size, stream); auto intermediate_stats = gather_statistic_blobs(stats_freq, 
orc_table, segmentation, stream); diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index eb653c6b9ac..9de8a9e2719 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,6 +23,8 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include + #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1dfced94f5b..6d466748c17 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -2278,7 +2277,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } auto bounce_buffer = - cudf::detail::pinned_host_vector(all_device_write ? 0 : max_write_size); + cudf::detail::make_pinned_vector_async(all_device_write ? 0 : max_write_size, stream); return std::tuple{std::move(agg_meta), std::move(pages), diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index faa09e586ab..190015686df 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include "cudf/detail/utilities/vector_factories.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" #include "io/utilities/config_utils.hpp" #include #include -#include +#include #include #include #include @@ -66,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::pinned_host_vector const& host, + static void copy_to_device(cudf::detail::rmm_host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -84,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::pinned_host_vector h_compressed_blocks; - cudf::detail::pinned_host_vector h_compressed_offsets; - cudf::detail::pinned_host_vector h_decompressed_offsets; + cudf::detail::rmm_host_vector h_compressed_blocks; + cudf::detail::rmm_host_vector h_compressed_offsets; + cudf::detail::rmm_host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; @@ -103,7 +104,10 @@ class bgzip_data_chunk_reader : public data_chunk_reader { bool is_decompressed{}; decompression_blocks(rmm::cuda_stream_view init_stream) - : d_compressed_blocks(0, init_stream), + : h_compressed_blocks{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_compressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_decompressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + d_compressed_blocks(0, init_stream), d_decompressed_blocks(0, init_stream), d_compressed_offsets(0, init_stream), d_decompressed_offsets(0, init_stream), diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 9d1d0498ace..8278b2c25cb 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "cudf/detail/utilities/vector_factories.hpp" #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -32,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - cudf::detail::pinned_host_vector buffer; + std::unique_ptr> buffer; }; /** @@ -84,13 +85,16 @@ class datasource_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { + h_ticket.buffer = std::make_unique>( + cudf::detail::make_pinned_vector_sync(read_size, stream)); + } - _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer->data())); // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -148,10 +152,13 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { + h_ticket.buffer = std::make_unique>( + cudf::detail::make_pinned_vector_sync(read_size, stream)); + } // read data from the host istream in to the pinned host memory buffer - _datastream->read(h_ticket.buffer.data(), read_size); + _datastream->read(h_ticket.buffer->data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -161,7 +168,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. 
CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 0883ac3609f..492084bd5bc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,10 +16,10 @@ #pragma once -#include "config_utils.hpp" #include "hostdevice_span.hpp" #include +#include #include #include #include @@ -53,7 +53,7 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(max_size, stream) + : h_data{make_pinned_vector_async(0, stream)}, d_data(max_size, stream) { CUDF_EXPECTS(initial_size <= max_size, "initial_size cannot be larger than max_size"); From 24b12451ffea31766d01c9cbe8d4e10bbe3734be Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 16:54:36 -0700 Subject: [PATCH 02/75] switch to host_device resource ref --- cpp/benchmarks/fixture/nvbench_fixture.hpp | 5 +++-- cpp/include/cudf/io/memory_resource.hpp | 7 +++---- cpp/src/io/utilities/config_utils.cpp | 21 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 - 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index ebcbcb17e98..3a5a9bfd2fa 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -81,14 +81,15 @@ struct nvbench_base_fixture { "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); } - inline rmm::host_async_resource_ref make_cuio_host_pinned() + inline rmm::host_device_async_resource_ref make_cuio_host_pinned() { static std::shared_ptr mr = std::make_shared(); return *mr; } - inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode) + inline rmm::host_device_async_resource_ref create_cuio_host_memory_resource( + std::string const& mode) { if (mode == "pinned") return make_cuio_host_pinned(); if (mode == "pinned_pool") return cudf::io::get_host_memory_resource(); diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp index a36e220ae7b..2af5755f824 100644 --- a/cpp/include/cudf/io/memory_resource.hpp +++ b/cpp/include/cudf/io/memory_resource.hpp @@ -33,7 +33,8 @@ namespace cudf::io { * @param mr The rmm resource to be used for host-side allocations * @return The previous resource that was in use */ -rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr); +rmm::host_device_async_resource_ref set_host_memory_resource( + rmm::host_device_async_resource_ref mr); /** * @brief Get the rmm resource being used for host memory allocations by @@ -41,7 +42,7 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r * * @return The rmm resource used for host-side allocations */ -rmm::host_async_resource_ref get_host_memory_resource(); +rmm::host_device_async_resource_ref get_host_memory_resource(); /** * @brief Options to configure the default host memory resource @@ -54,8 +55,6 @@ struct host_mr_options { /** * @brief Configure the size of the default host memory resource. 
* - * @throws cudf::logic_error if called after the default host memory resource has been created - * * @param opts Options to configure the default host memory resource * @return True if this call successfully configured the host memory resource, false if a * a resource was already configured. diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index dad1135e766..25649d17c76 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -209,7 +209,8 @@ static_assert(cuda::mr::resource_with config_size) +CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( + std::optional config_size) { static fixed_pinned_pool_memory_resource mr = [config_size]() { auto const size = [&config_size]() -> size_t { @@ -233,7 +234,7 @@ CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional const& opts, - bool* did_configure = nullptr) +CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( + std::optional const& opts, bool* did_configure = nullptr) { - static rmm::host_async_resource_ref* mr_ref = nullptr; - bool configured = false; + static rmm::host_device_async_resource_ref* mr_ref = nullptr; + bool configured = false; if (mr_ref == nullptr) { configured = true; mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); @@ -262,13 +263,13 @@ CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional #include -#include #include #include #include From 6c896f6ebf2d6177f62903dd719cebf88da08565 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 18:07:00 -0700 Subject: [PATCH 03/75] rebrand host memory resource --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/fixture/nvbench_fixture.hpp | 8 +- .../io/parquet/parquet_reader_multithread.cpp | 2 +- .../detail/utilities/vector_factories.hpp | 4 +- cpp/include/cudf/io/memory_resource.hpp | 64 ----- cpp/include/cudf/utilities/pinned_memory.hpp | 58 +++++ cpp/src/io/utilities/config_utils.cpp | 215 +--------------- cpp/src/utilities/pinned_memory.cpp | 230 ++++++++++++++++++ cpp/tests/io/json_test.cpp | 6 +- .../utilities_tests/io_utilities_tests.cpp | 8 +- 10 files changed, 305 insertions(+), 291 deletions(-) delete mode 100644 cpp/include/cudf/io/memory_resource.hpp create mode 100644 cpp/include/cudf/utilities/pinned_memory.hpp create mode 100644 cpp/src/utilities/pinned_memory.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f637db66c2c..8a0f56aec53 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -665,6 +665,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp + src/utilities/pinned_memory.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 3a5a9bfd2fa..df1492690bb 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include #include #include @@ -92,7 +92,7 @@ struct nvbench_base_fixture { std::string const& mode) { if (mode == "pinned") return make_cuio_host_pinned(); - if (mode == "pinned_pool") return cudf::io::get_host_memory_resource(); + if (mode == "pinned_pool") return cudf::get_pinned_memory_resource(); CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool"); } @@ -113,14 +113,14 @@ struct nvbench_base_fixture { 
rmm::mr::set_current_device_resource(mr.get()); std::cout << "RMM memory resource = " << rmm_mode << "\n"; - cudf::io::set_host_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); + cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n"; } ~nvbench_base_fixture() { // Ensure the the pool is freed before the CUDA context is destroyed: - cudf::io::set_host_memory_resource(this->make_cuio_host_pinned()); + cudf::set_pinned_memory_resource(this->make_cuio_host_pinned()); } std::shared_ptr mr; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index bd80c4e0e88..f340b4aeb7a 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -20,9 +20,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 6f2287fc1c8..7343ee25c8f 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -22,9 +22,9 @@ */ #include -#include #include #include +#include #include #include @@ -454,7 +454,7 @@ thrust::host_vector make_host_vector_sync( template rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return rmm_host_vector(size, {cudf::io::get_host_memory_resource(), stream}); + return rmm_host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp deleted file mode 100644 index 2af5755f824..00000000000 --- a/cpp/include/cudf/io/memory_resource.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace cudf::io { - -/** - * @brief Set the rmm resource to be used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * hostdevice_vector is a utility class that uses a pair of host and device-side buffers for - * bouncing state between the cpu and the gpu. The resource set with this function (typically a - * pinned memory allocator) is what it uses to allocate space for it's host-side buffer. 
- * - * @param mr The rmm resource to be used for host-side allocations - * @return The previous resource that was in use - */ -rmm::host_device_async_resource_ref set_host_memory_resource( - rmm::host_device_async_resource_ref mr); - -/** - * @brief Get the rmm resource being used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * @return The rmm resource used for host-side allocations - */ -rmm::host_device_async_resource_ref get_host_memory_resource(); - -/** - * @brief Options to configure the default host memory resource - */ -struct host_mr_options { - std::optional pool_size; ///< The size of the pool to use for the default host memory - ///< resource. If not set, the default pool size is used. -}; - -/** - * @brief Configure the size of the default host memory resource. - * - * @param opts Options to configure the default host memory resource - * @return True if this call successfully configured the host memory resource, false if a - * a resource was already configured. - */ -bool config_default_host_memory_resource(host_mr_options const& opts); - -} // namespace cudf::io diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp new file mode 100644 index 00000000000..b423eab6d38 --- /dev/null +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +/** + * @brief Set the rmm resource to be used for pinned memory allocations. + * + * @param mr The rmm resource to be used for pinned allocations + * @return The previous resource that was in use + */ +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr); + +/** + * @brief Get the rmm resource being used for pinned memory allocations. + * + * @return The rmm resource used for pinned allocations + */ +rmm::host_device_async_resource_ref get_pinned_memory_resource(); + +/** + * @brief Options to configure the default pinned memory resource + */ +struct pinned_mr_options { + std::optional pool_size; ///< The size of the pool to use for the default pinned memory + ///< resource. If not set, the default pool size is used. +}; + +/** + * @brief Configure the size of the default pinned memory resource. + * + * @param opts Options to configure the default pinned memory resource + * @return True if this call successfully configured the pinned memory resource, false if a + * a resource was already configured. 
+ */ +bool config_default_pinned_memory_resource(pinned_mr_options const& opts); + +} // namespace cudf diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 25649d17c76..20ac89b4d53 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,22 +16,12 @@ #include "config_utils.hpp" -#include -#include #include -#include - -#include -#include -#include -#include #include #include -namespace cudf::io { - -namespace detail { +namespace cudf::io::detail { namespace cufile_integration { @@ -90,205 +80,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace detail - -namespace { -class fixed_pinned_pool_memory_resource { - using upstream_mr = rmm::mr::pinned_host_memory_resource; - using host_pooled_mr = rmm::mr::pool_memory_resource; - - private: - upstream_mr upstream_mr_{}; - size_t pool_size_{0}; - // Raw pointer to avoid a segfault when the pool is destroyed on exit - host_pooled_mr* pool_{nullptr}; - void* pool_begin_{nullptr}; - void* pool_end_{nullptr}; - cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; - - public: - fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} - { - if (pool_size_ == 0) { return; } - - // Allocate full size from the pinned pool to figure out the beginning and end address - pool_begin_ = pool_->allocate_async(pool_size_, stream_); - pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); - pool_->deallocate_async(pool_begin_, pool_size_, stream_); - } - - void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - if (bytes <= pool_size_) { - try { - return pool_->allocate_async(bytes, alignment, stream); - } catch (...) 
{ - // If the pool is exhausted, fall back to the upstream memory resource - } - } - - return upstream_mr_.allocate_async(bytes, alignment, stream); - } - - void do_deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { - pool_->deallocate_async(ptr, bytes, alignment, stream); - } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); - } - } - - void* allocate_async(std::size_t bytes, cuda::stream_ref stream) - { - return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - return do_allocate_async(bytes, alignment, stream); - } - - void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) - { - auto const result = do_allocate_async(bytes, alignment, stream_); - stream_.wait(); - return result; - } - - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, alignment, stream); - } - - void deallocate(void* ptr, - std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept - { - deallocate_async(ptr, bytes, alignment, stream_); - stream_.wait(); - } - - bool operator==(fixed_pinned_pool_memory_resource const& other) const - { - return pool_ == other.pool_ and stream_ == other.stream_; - } - - bool operator!=(fixed_pinned_pool_memory_resource const& other) const - { - return !operator==(other); - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::device_accessible) noexcept - { - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::host_accessible) noexcept - { - } -}; - -static_assert(cuda::mr::resource_with, - ""); - -} // namespace - -CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( - std::optional config_size) -{ - static fixed_pinned_pool_memory_resource mr = [config_size]() { - auto const size = [&config_size]() -> size_t { - if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { - return std::atol(env_val); - } - - if (config_size.has_value()) { return *config_size; } - - size_t free{}, total{}; - CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); - // 0.5% of the total device memory, capped at 100MB - return std::min(total / 200, size_t{100} * 1024 * 1024); - }(); - - // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = (size + 255) & ~255; - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; - }(); - - static rmm::host_device_async_resource_ref mr_ref{mr}; - return mr_ref; -} - -CUDF_EXPORT std::mutex& host_mr_mutex() -{ - static std::mutex map_lock; - return map_lock; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( - std::optional const& opts, bool* did_configure = nullptr) -{ - static rmm::host_device_async_resource_ref* mr_ref = nullptr; - bool configured = false; - if (mr_ref == nullptr) { - configured = true; - 
mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); - } - - // If the user passed an out param to detect whether this call configured a resource - // set the result - if (did_configure != nullptr) { *did_configure = configured; } - - return *mr_ref; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() -{ - static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt); - return mr_ref; -} - -rmm::host_device_async_resource_ref set_host_memory_resource(rmm::host_device_async_resource_ref mr) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto last_mr = host_mr(); - host_mr() = mr; - return last_mr; -} - -rmm::host_device_async_resource_ref get_host_memory_resource() -{ - std::scoped_lock lock{host_mr_mutex()}; - return host_mr(); -} - -bool config_default_host_memory_resource(host_mr_options const& opts) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto did_configure = false; - make_host_mr(opts, &did_configure); - return did_configure; -} - -} // namespace cudf::io +} // namespace cudf::io::detail diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp new file mode 100644 index 00000000000..23d673a1382 --- /dev/null +++ b/cpp/src/utilities/pinned_memory.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf { + +namespace { +class fixed_pinned_pool_memory_resource { + using upstream_mr = rmm::mr::pinned_host_memory_resource; + using host_pooled_mr = rmm::mr::pool_memory_resource; + + private: + upstream_mr upstream_mr_{}; + size_t pool_size_{0}; + // Raw pointer to avoid a segfault when the pool is destroyed on exit + host_pooled_mr* pool_{nullptr}; + void* pool_begin_{nullptr}; + void* pool_end_{nullptr}; + cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + + public: + fixed_pinned_pool_memory_resource(size_t size) + : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + { + if (pool_size_ == 0) { return; } + + // Allocate full size from the pinned pool to figure out the beginning and end address + pool_begin_ = pool_->allocate_async(pool_size_, stream_); + pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); + pool_->deallocate_async(pool_begin_, pool_size_, stream_); + } + + void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + { + if (bytes <= pool_size_) { + try { + return pool_->allocate_async(bytes, alignment, stream); + } catch (...) 
{ + // If the pool is exhausted, fall back to the upstream memory resource + } + } + + return upstream_mr_.allocate_async(bytes, alignment, stream); + } + + void do_deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + pool_->deallocate_async(ptr, bytes, alignment, stream); + } else { + upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + } + } + + void* allocate_async(std::size_t bytes, cuda::stream_ref stream) + { + return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + { + return do_allocate_async(bytes, alignment, stream); + } + + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + auto const result = do_allocate_async(bytes, alignment, stream_); + stream_.wait(); + return result; + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + return do_deallocate_async(ptr, bytes, alignment, stream); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + { + deallocate_async(ptr, bytes, alignment, stream_); + stream_.wait(); + } + + bool operator==(fixed_pinned_pool_memory_resource const& other) const + { + return pool_ == other.pool_ and stream_ == other.stream_; + } + + bool operator!=(fixed_pinned_pool_memory_resource const& other) const + { + return !operator==(other); + } + + [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + } + + [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::host_accessible) noexcept + { + } +}; + +static_assert(cuda::mr::resource_with, + ""); + +} // namespace + +CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( + std::optional config_size) +{ + static fixed_pinned_pool_memory_resource mr = [config_size]() { + auto const size = [&config_size]() -> size_t { + if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { + return std::atol(env_val); + } + + if (config_size.has_value()) { return *config_size; } + + size_t free{}, total{}; + CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); + // 0.5% of the total device memory, capped at 100MB + return std::min(total / 200, size_t{100} * 1024 * 1024); + }(); + + // rmm requires the pool size to be a multiple of 256 bytes + auto const aligned_size = (size + 255) & ~255; + CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); + + // make the pool with max size equal to the initial size + return fixed_pinned_pool_memory_resource{aligned_size}; + }(); + + static rmm::host_device_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +CUDF_EXPORT std::mutex& host_mr_mutex() +{ + static std::mutex map_lock; + return map_lock; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( + std::optional const& opts, bool* did_configure = nullptr) +{ + static rmm::host_device_async_resource_ref* mr_ref = nullptr; + bool configured = false; + if (mr_ref == nullptr) { + configured = true; + 
mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); + } + + // If the user passed an out param to detect whether this call configured a resource + // set the result + if (did_configure != nullptr) { *did_configure = configured; } + + return *mr_ref; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() +{ + static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt); + return mr_ref; +} + +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto last_mr = host_mr(); + host_mr() = mr; + return last_mr; +} + +rmm::host_device_async_resource_ref get_pinned_memory_resource() +{ + std::scoped_lock lock{host_mr_mutex()}; + return host_mr(); +} + +bool config_default_pinned_memory_resource(pinned_mr_options const& opts) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto did_configure = false; + make_host_mr(opts, &did_configure); + return did_configure; +} + +} // namespace cudf diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9d766e80094..4b3793b22f6 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -28,13 +28,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include @@ -2068,7 +2068,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) size_t{128} * 1024 * 1024}; // Set new resource - auto last_mr = cudf::io::set_host_memory_resource(mr); + auto last_mr = cudf::set_pinned_memory_resource(mr); /** * @brief Spark has the specific need to ignore extra characters that come after the first record @@ -2158,7 +2158,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()}); // Restore original memory source - cudf::io::set_host_memory_resource(last_mr); + cudf::set_pinned_memory_resource(last_mr); } TEST_F(JsonReaderTest, MixedTypes) diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp index e5a153bf781..c00f1ab8863 100644 --- a/cpp/tests/utilities_tests/io_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -44,8 +44,8 @@ TEST(IoUtilitiesTest, HostMemoryGetAndSet) size_t{128} * 1024 * 1024); // set new resource - auto last_mr = cudf::io::get_host_memory_resource(); - cudf::io::set_host_memory_resource(mr); + auto last_mr = cudf::get_pinned_memory_resource(); + cudf::set_pinned_memory_resource(mr); constexpr int num_rows = 32 * 1024; auto valids = @@ -66,7 +66,7 @@ TEST(IoUtilitiesTest, HostMemoryGetAndSet) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); // reset memory resource back - cudf::io::set_host_memory_resource(last_mr); + cudf::set_pinned_memory_resource(last_mr); } TEST(IoUtilitiesTest, Base64EncodeAndDecode) From 0048c5951171d931b676dee3ab40ca312a7eb560 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 19:09:32 -0700 Subject: [PATCH 04/75] style --- cpp/benchmarks/io/text/multibyte_split.cpp | 2 +- cpp/src/io/text/data_chunk_source_factories.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 172182c3607..67705863d41 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ 
b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 8278b2c25cb..46149db929f 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 1964523554cd86763be1a0a6b1580f7d12016270 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 10:39:55 -0700 Subject: [PATCH 05/75] java update because breaking --- java/src/main/native/src/RmmJni.cpp | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index fa78f6ca4e2..e1cb7845b77 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include @@ -395,15 +395,17 @@ class java_debug_event_handler_memory_resource final : public java_event_handler } }; -inline auto& prior_cuio_host_mr() +inline auto& prior_cudf_pinned_mr() { - static rmm::host_async_resource_ref _prior_cuio_host_mr = cudf::io::get_host_memory_resource(); - return _prior_cuio_host_mr; + static rmm::host_device_async_resource_ref _prior_cudf_pinned_mr = + cudf::get_pinned_memory_resource(); + return _prior_cudf_pinned_mr; } /** * This is a pinned fallback memory resource that will try to allocate `pool` - * and if that fails, attempt to allocate from the prior resource used by cuIO `prior_cuio_host_mr`. + * and if that fails, attempt to allocate from the prior resource used by cuDF + * `prior_cudf_pinned_mr`. * * We detect whether a pointer to free is inside of the pool by checking its address (see * constructor) @@ -433,7 +435,7 @@ class pinned_fallback_host_memory_resource { /** * @brief Allocates pinned host memory of size at least \p bytes bytes from either the - * _pool argument provided, or prior_cuio_host_mr. + * _pool argument provided, or prior_cudf_pinned_mr. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled due to any other * reason. @@ -450,7 +452,7 @@ class pinned_fallback_host_memory_resource { return _pool->allocate(bytes, alignment); } catch (const std::exception& unused) { // try to allocate using the underlying pinned resource - return prior_cuio_host_mr().allocate(bytes, alignment); + return prior_cudf_pinned_mr().allocate(bytes, alignment); } // we should not reached here return nullptr; @@ -459,7 +461,7 @@ class pinned_fallback_host_memory_resource { /** * @brief Deallocate memory pointed to by \p ptr of size \p bytes bytes. We attempt * to deallocate from _pool, if ptr is detected to be in the pool address range, - * otherwise we deallocate from `prior_cuio_host_mr`. + * otherwise we deallocate from `prior_cudf_pinned_mr`. * * @param ptr Pointer to be deallocated. * @param bytes Size of the allocation. 
@@ -472,7 +474,7 @@ class pinned_fallback_host_memory_resource { if (ptr >= pool_begin_ && ptr <= pool_end_) { _pool->deallocate(ptr, bytes, alignment); } else { - prior_cuio_host_mr().deallocate(ptr, bytes, alignment); + prior_cudf_pinned_mr().deallocate(ptr, bytes, alignment); } } @@ -1025,7 +1027,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIE CATCH_STD(env, 0) } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv* env, +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCudfPinnedPoolMemoryResource(JNIEnv* env, jclass clazz, jlong pool_ptr) { @@ -1035,7 +1037,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J // create a pinned fallback pool that will allocate pinned memory // if the regular pinned pool is exhausted pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool)); - prior_cuio_host_mr() = cudf::io::set_host_memory_resource(*pinned_fallback_mr); + prior_cudf_pinned_mr() = cudf::set_pinned_memory_resource(*pinned_fallback_mr); } CATCH_STD(env, ) } @@ -1048,7 +1050,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J cudf::jni::auto_set_device(env); // set the cuio host memory resource to what it was before, or the same // if we didn't overwrite it with setCuioPinnedPoolMemoryResource - cudf::io::set_host_memory_resource(prior_cuio_host_mr()); + cudf::set_pinned_memory_resource(prior_cudf_pinned_mr()); pinned_fallback_mr.reset(); delete reinterpret_cast(pool_ptr); } @@ -1088,7 +1090,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIE jlong size) { cudf::jni::auto_set_device(env); - void* ret = cudf::io::get_host_memory_resource().allocate(size); + void* ret = cudf::get_pinned_memory_resource().allocate(size); return reinterpret_cast(ret); } @@ -1101,7 +1103,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv try { cudf::jni::auto_set_device(env); void* cptr = reinterpret_cast(ptr); - cudf::io::get_host_memory_resource().deallocate(cptr, size); + cudf::get_pinned_memory_resource().deallocate(cptr, size); } CATCH_STD(env, ) } @@ -1112,7 +1114,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoo { try { cudf::jni::auto_set_device(env); - return cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size}); + return cudf::config_default_pinned_memory_resource(cudf::pinned_mr_options{size}); } CATCH_STD(env, false) } From ac0ce9c0ef5c2b3c6c4dfe1e1b4ee5330100f999 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 12:04:05 -0700 Subject: [PATCH 06/75] java fix --- java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 2 +- java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 83b801db7fb..5050834303d 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -224,7 +224,7 @@ private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryReso } this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize); if (setCuioHostMemoryResource) { - Rmm.setCuioPinnedPoolMemoryResource(this.poolHandle); + Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle); } this.poolSize = poolSize; } diff 
--git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 4dee1b7aa24..ed029c918e4 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -597,7 +597,7 @@ static native long newEventHandlerResourceAdaptor(long handle, long trackerHandl public static native long newPinnedPoolMemoryResource(long initSize, long maxSize); - public static native long setCuioPinnedPoolMemoryResource(long poolPtr); + public static native long setCudfPinnedPoolMemoryResource(long poolPtr); public static native void releasePinnedPoolMemoryResource(long poolPtr); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index e1cb7845b77..8bd0f7793b4 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1049,7 +1049,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J try { cudf::jni::auto_set_device(env); // set the cuio host memory resource to what it was before, or the same - // if we didn't overwrite it with setCuioPinnedPoolMemoryResource + // if we didn't overwrite it with setCudfPinnedPoolMemoryResource cudf::set_pinned_memory_resource(prior_cudf_pinned_mr()); pinned_fallback_mr.reset(); delete reinterpret_cast(pool_ptr); From ab36162ac32cd0be5fe69ef3d92e421f4e5ea798 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 15:48:45 -0700 Subject: [PATCH 07/75] move test out of io util --- cpp/tests/CMakeLists.txt | 5 +- .../utilities_tests/io_utilities_tests.cpp | 45 ------------- .../utilities_tests/pinned_memory_tests.cpp | 65 +++++++++++++++++++ 3 files changed, 68 insertions(+), 47 deletions(-) create mode 100644 cpp/tests/utilities_tests/pinned_memory_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2f2c12f265c..19c87facb51 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -378,15 +378,16 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST - utilities_tests/type_list_tests.cpp utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp + utilities_tests/default_stream_tests.cpp utilities_tests/io_utilities_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp utilities_tests/logger_tests.cpp - utilities_tests/default_stream_tests.cpp + utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp + utilities_tests/type_list_tests.cpp ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp index c00f1ab8863..9ed8f18f5cc 100644 --- a/cpp/tests/utilities_tests/io_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp @@ -16,14 +16,6 @@ #include #include -#include - -#include -#include - -#include -#include -#include #include @@ -32,43 +24,6 @@ using cudf::io::detail::base64_encode; class IoUtilitiesTest : public cudf::test::BaseFixture {}; -TEST(IoUtilitiesTest, HostMemoryGetAndSet) -{ - // Global environment for temporary files - auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - - // pinned/pooled host memory resource - using host_pooled_mr = rmm::mr::pool_memory_resource; - host_pooled_mr mr(std::make_shared().get(), - 
size_t{128} * 1024 * 1024); - - // set new resource - auto last_mr = cudf::get_pinned_memory_resource(); - cudf::set_pinned_memory_resource(mr); - - constexpr int num_rows = 32 * 1024; - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); - auto values = thrust::make_counting_iterator(0); - - cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - - cudf::table_view expected({col}); - auto filepath = temp_env->get_temp_filepath("IoUtilsMemTest.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_args); - - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); - - // reset memory resource back - cudf::set_pinned_memory_resource(last_mr); -} - TEST(IoUtilitiesTest, Base64EncodeAndDecode) { // a vector of lorem ipsum strings diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp new file mode 100644 index 00000000000..df9103640f4 --- /dev/null +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +class PinnedMemoryTest : public cudf::test::BaseFixture {}; + +TEST(PinnedMemoryTest, MemoryResourceGetAndSet) +{ + // Global environment for temporary files + auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + + // pinned/pooled host memory resource + using host_pooled_mr = rmm::mr::pool_memory_resource; + host_pooled_mr mr(std::make_shared().get(), + 4 * 1024 * 1024); + + // set new resource + auto last_mr = cudf::get_pinned_memory_resource(); + cudf::set_pinned_memory_resource(mr); + + constexpr int num_rows = 32 * 1024; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); + auto values = thrust::make_counting_iterator(0); + + cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); + + cudf::table_view expected({col}); + auto filepath = temp_env->get_temp_filepath("MemoryResourceGetAndSetTest.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_args); + + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(read_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); + + // reset memory resource back + cudf::set_pinned_memory_resource(last_mr); +} From 83f665a15f4aba3040079b58306fd964621a91c8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Jun 2024 15:01:38 -0700 Subject: [PATCH 08/75] missed rename --- .../src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 5050834303d..5ca5bc0db68 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -128,9 +128,9 @@ public static synchronized void initialize(long poolSize, int gpuId) { * * @param poolSize size of the pool to initialize. 
* @param gpuId gpu id to set to get memory pool from, -1 means to use default - * @param setCuioHostMemoryResource true if this pinned pool should be used by cuIO for host memory + * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuIO for host memory */ - public static synchronized void initialize(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (isInitialized()) { throw new IllegalStateException("Can only initialize the pool once."); } @@ -139,7 +139,7 @@ public static synchronized void initialize(long poolSize, int gpuId, boolean set t.setDaemon(true); return t; }); - initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCuioHostMemoryResource)); + initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCudfPinnedPoolMemoryResource)); initService.shutdown(); } @@ -216,14 +216,14 @@ public static long getTotalPoolSizeBytes() { return 0; } - private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + private PinnedMemoryPool(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (gpuId > -1) { // set the gpu device to use Cuda.setDevice(gpuId); Cuda.freeZero(); } this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize); - if (setCuioHostMemoryResource) { + if (setCudfPinnedPoolMemoryResource) { Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle); } this.poolSize = poolSize; From c1ae478c2b3ce28101e35ec3f5a5af5e03d5452c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Jun 2024 15:52:34 -0700 Subject: [PATCH 09/75] update benchmark changes --- cpp/benchmarks/io/cuio_common.cpp | 12 ++++++++++++ cpp/benchmarks/io/cuio_common.hpp | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 37ced8ea703..f06938bd721 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include "rmm/mr/pinned_host_memory_resource.hpp" +#include "rmm/resource_ref.hpp" + #include #include @@ -28,6 +31,14 @@ temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"}; +// Don't use cudf's pinned pool for the source data +rmm::host_async_resource_ref pinned_memory_resource() +{ + static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{}; + + return mr; +} + std::string random_file_in_dir(std::string const& dir_path) { // `mkstemp` modifies the template in place @@ -41,6 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, + pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index d4f39a5f243..407805a8a1a 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::pinned_host_vector pinned_buffer; + cudf::detail::rmm_host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; From 1c09d0cfe15bcf6b5d2ce775129f0db7e0c662f0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 11:52:48 -0700 Subject: [PATCH 10/75] rename rmm_host_vector --- cpp/benchmarks/io/cuio_common.hpp | 4 ++-- .../{rmm_host_vector.hpp => host_vector.hpp} | 2 +- .../cudf/detail/utilities/vector_factories.hpp | 16 ++++++++-------- cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +++++----- cpp/src/io/text/data_chunk_source_factories.cpp | 8 ++++---- cpp/src/io/utilities/hostdevice_vector.hpp | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) rename cpp/include/cudf/detail/utilities/{rmm_host_vector.hpp => host_vector.hpp} (98%) diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 407805a8a1a..64d6021cf50 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::rmm_host_vector pinned_buffer; + cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp similarity index 98% rename from cpp/include/cudf/detail/utilities/rmm_host_vector.hpp rename to cpp/include/cudf/detail/utilities/host_vector.hpp index 6604020c224..e62c8017f8b 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -178,6 +178,6 @@ class rmm_host_allocator { * @brief A vector class with rmm host memory allocator */ template -using rmm_host_vector = thrust::host_vector>; +using host_vector = thrust::host_vector>; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 7343ee25c8f..6f859ded456 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 
@@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,33 +442,33 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a `rmm_host_vector` of the given size + * @brief Asynchronously construct a `host_vector` of the given size * * @note This function does not synchronize `stream`. * * @tparam T The type of the vector data * @param size The number of elements in the created vector * @param stream The stream on which to allocate memory - * @return A rmm_host_vector of the given size + * @return A host_vector of the given size */ template -rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return rmm_host_vector(size, {cudf::get_pinned_memory_resource(), stream}); + return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** - * @brief Synchronously construct a `rmm_host_vector` of the given size + * @brief Synchronously construct a `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. * * @tparam T The type of the vector data * @param size The number of elements in the created vector * @param stream The stream on which to allocate memory - * @return A rmm_host_vector of the given size + * @return A host_vector of the given size */ template -rmm_host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 190015686df..896123d95a9 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -20,8 +20,8 @@ #include "io/utilities/config_utils.hpp" #include +#include #include -#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::rmm_host_vector const& host, + static void copy_to_device(cudf::detail::host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::rmm_host_vector h_compressed_blocks; - cudf::detail::rmm_host_vector h_compressed_offsets; - cudf::detail::rmm_host_vector h_decompressed_offsets; + cudf::detail::host_vector h_compressed_blocks; + cudf::detail::host_vector h_compressed_offsets; + cudf::detail::host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 46149db929f..1e1671a1545 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -18,7 +18,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -33,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - std::unique_ptr> buffer; + std::unique_ptr> buffer; }; /** @@ -86,7 +86,7 @@ class datasource_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the 
requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } @@ -153,7 +153,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index f6f7ff14d38..1ae27a2f4ae 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::rmm_host_vector h_data; + cudf::detail::host_vector h_data; rmm::device_uvector d_data; }; From c343c3194f48dedb10e49c8610e3e0deaacf315b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 13:00:10 -0700 Subject: [PATCH 11/75] remove do_xyz --- cpp/src/utilities/pinned_memory.cpp | 39 ++++++++++------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 23d673a1382..47c09217363 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -53,7 +53,7 @@ class fixed_pinned_pool_memory_resource { pool_->deallocate_async(pool_begin_, pool_size_, stream_); } - void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { if (bytes <= pool_size_) { try { @@ -66,46 +66,33 @@ class fixed_pinned_pool_memory_resource { return upstream_mr_.allocate_async(bytes, alignment, stream); } - void do_deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { - pool_->deallocate_async(ptr, bytes, alignment, stream); - } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); - } - } - void* allocate_async(std::size_t bytes, cuda::stream_ref stream) { - return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - return do_allocate_async(bytes, alignment, stream); + return allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = do_allocate_async(bytes, alignment, stream_); + auto const result = allocate_async(bytes, alignment, stream_); stream_.wait(); return result; } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) noexcept { - return do_deallocate_async(ptr, bytes, alignment, stream); + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + pool_->deallocate_async(ptr, bytes, alignment, stream); + } else { + 
upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + } + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, From 50f4d3ee3d27a2a0a10a2a2cc8a7f425ab8c82e1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 15:14:21 -0700 Subject: [PATCH 12/75] comment --- cpp/include/cudf/detail/utilities/vector_factories.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 6f859ded456..06dfcbfc5e5 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a `host_vector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function does not synchronize `stream`. * @@ -458,7 +458,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea } /** - * @brief Synchronously construct a `cudf::detail::host_vector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. * From e5af4902dad3fcbf3c1cd1c678e271bccdd0489b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 19:49:49 -0700 Subject: [PATCH 13/75] works --- cpp/CMakeLists.txt | 1 + .../cudf/detail/utilities/cuda_copy.hpp | 41 +++++++++++++++++++ cpp/include/cudf/utilities/pinned_memory.hpp | 17 ++++++++ cpp/src/io/utilities/hostdevice_vector.hpp | 7 ++-- cpp/src/utilities/cuda_copy.cu | 41 +++++++++++++++++++ cpp/src/utilities/pinned_memory.cpp | 10 +++++ 6 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/cuda_copy.hpp create mode 100644 cpp/src/utilities/cuda_copy.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8a0f56aec53..cf836a45708 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -663,6 +663,7 @@ add_library( src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp + src/utilities/cuda_copy.cu src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp new file mode 100644 index 00000000000..2ceb70f2ef2 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cudf::detail { + +namespace impl { + +void copy_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); + +} // namespace impl + +template +void copy_pinned_to_device_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +{ + impl::copy_pinned(dst, src, size * sizeof(T), stream); +} + +template +void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +{ + impl::copy_pinned(dst, src, size * sizeof(T), stream); +} + +} // namespace cudf::detail \ No newline at end of file diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index b423eab6d38..b0d6c55999f 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -55,4 +55,21 @@ struct pinned_mr_options { */ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); +/** + * @brief Set the threshold size for using kernels for pinned memory copies. + * + * @param threshold The threshold size in bytes. If the size of the copy is less than this + * threshold, the copy will be done using kernels. If the size is greater than or equal to this + * threshold, the copy will be done using cudaMemcpyAsync. + */ + +void set_kernel_copy_threshold(size_t threshold); + +/** + * @brief Get the threshold size for using kernels for pinned memory copies. + * + * @return The threshold size in bytes. + */ +size_t get_kernel_copy_threshold(); + } // namespace cudf diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 1ae27a2f4ae..171379143a6 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,6 +18,7 @@ #include "hostdevice_span.hpp" +#include #include #include #include @@ -124,8 +125,7 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + copy_pinned_to_device_async(device_ptr(), host_ptr(), size(), stream); } void host_to_device_sync(rmm::cuda_stream_view stream) @@ -136,8 +136,7 @@ class hostdevice_vector { void device_to_host_async(rmm::cuda_stream_view stream) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + copy_device_to_pinned_async(host_ptr(), device_ptr(), size(), stream); } void device_to_host_sync(rmm::cuda_stream_view stream) diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu new file mode 100644 index 00000000000..9a574d6d0e5 --- /dev/null +++ b/cpp/src/utilities/cuda_copy.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +namespace cudf::detail::impl { + +void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + if (size < get_kernel_copy_threshold()) { + thrust::copy_n(rmm::exec_policy_nosync(stream), + static_cast(src), + size, + static_cast(dst)); + } else { + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); + } +} + +} // namespace cudf::detail::impl \ No newline at end of file diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 47c09217363..0791f404bf2 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -214,4 +214,14 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) return did_configure; } +CUDF_EXPORT auto& kernel_copy_threshold() +{ + static std::atomic threshold = 0; // use cudaMemcpyAsync for all pinned copies + return threshold; +} + +void set_kernel_copy_threshold(size_t threshold) { kernel_copy_threshold() = threshold; } + +size_t get_kernel_copy_threshold() { return kernel_copy_threshold(); } + } // namespace cudf From 9082ccc979383b0bcfa7181c54d097d5036f2904 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 5 Jun 2024 15:14:34 -0700 Subject: [PATCH 14/75] include style Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/benchmarks/io/cuio_common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index f06938bd721..45dc812e247 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "rmm/mr/pinned_host_memory_resource.hpp" -#include "rmm/resource_ref.hpp" +#include +#include #include From 17b1ee0e736a0dbbbf152d99aad3a27de3bc9c3a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 5 Jun 2024 17:05:40 -0700 Subject: [PATCH 15/75] reviews --- cpp/src/io/text/bgzip_data_chunk_source.cu | 2 +- cpp/src/io/text/data_chunk_source_factories.cpp | 2 +- java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 896123d95a9..0e3ce779089 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "cudf/detail/utilities/vector_factories.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" #include "io/utilities/config_utils.hpp" @@ -22,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 1e1671a1545..45096b7155c 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -#include "cudf/detail/utilities/vector_factories.hpp" #include "io/text/device_data_chunks.hpp" #include #include +#include #include #include diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 5ca5bc0db68..df0d9dc7c3e 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -128,7 +128,7 @@ public static synchronized void initialize(long poolSize, int gpuId) { * * @param poolSize size of the pool to initialize. * @param gpuId gpu id to set to get memory pool from, -1 means to use default - * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuIO for host memory + * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuDF for pinned memory */ public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (isInitialized()) { From 2dbb68f9d66e752eef4015082d2c877b8145e068 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 09:52:44 -0700 Subject: [PATCH 16/75] available_device_memory --- cpp/src/utilities/pinned_memory.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 47c09217363..d0709c6a40e 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -142,8 +142,7 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( if (config_size.has_value()) { return *config_size; } - size_t free{}, total{}; - CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); + auto const total = rmm::available_device_memory().second; // 0.5% of the total device memory, capped at 100MB return std::min(total / 200, size_t{100} * 1024 * 1024); }(); From cb9cc228bdc8fa74b5a517a149b11f7e6201d71b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 10:58:48 -0700 Subject: [PATCH 17/75] reviews --- cpp/src/utilities/pinned_memory.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index d0709c6a40e..53e0d10c6f4 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); @@ -113,13 +113,13 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::device_accessible) noexcept + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::device_accessible) noexcept { } - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::host_accessible) noexcept + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::host_accessible) noexcept { } }; @@ -148,7 +148,7 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( }(); // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = (size + 255) & ~255; + auto const 
aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); // make the pool with max size equal to the initial size From cf67a14795017eb4a0835bd727acebfff0a066f9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 11:01:20 -0700 Subject: [PATCH 18/75] expand anon namespace --- cpp/src/utilities/pinned_memory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 53e0d10c6f4..9cebf980d00 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -129,8 +129,6 @@ static_assert(cuda::mr::resource_with, ""); -} // namespace - CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( std::optional config_size) { @@ -190,6 +188,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +} // namespace + rmm::host_device_async_resource_ref set_pinned_memory_resource( rmm::host_device_async_resource_ref mr) { From 24c15498b9ad53ec452a99b94fb767b90f4551a0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:07:20 -0700 Subject: [PATCH 19/75] host_uvector --- cpp/benchmarks/io/cuio_common.cpp | 9 +- cpp/benchmarks/io/cuio_common.hpp | 4 +- .../cudf/detail/utilities/host_uvector.hpp | 142 ++++++++++++++ .../cudf/detail/utilities/host_vector.hpp | 183 ------------------ .../detail/utilities/vector_factories.hpp | 12 +- cpp/include/cudf/utilities/span.hpp | 6 + cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +- .../io/text/data_chunk_source_factories.cpp | 8 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 9 files changed, 172 insertions(+), 206 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_uvector.hpp delete mode 100644 cpp/include/cudf/detail/utilities/host_vector.hpp diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45dc812e247..09d7d8a9db6 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,13 +14,14 @@ * limitations under the License. 
*/ -#include -#include - #include #include #include +#include + +#include +#include #include @@ -52,7 +53,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, - pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), + pinned_buffer(0, pinned_memory_resource(), cudf::get_default_stream()), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 64d6021cf50..020fd7e00c1 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::host_vector pinned_buffer; + cudf::detail::host_uvector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp new file mode 100644 index 00000000000..39bde04e985 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -0,0 +1,142 @@ +/* + * Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::detail { + +template +class host_uvector { + public: + host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} + { + if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } + } + + host_uvector(host_uvector const&) = delete; + host_uvector(host_uvector&& other) + : _data{other._data}, + _size{other._size}, + _capacity{other._capacity}, + _mr{other._mr}, + _stream{other._stream} + { + other._data = nullptr; + other._size = 0; + other._capacity = 0; + } + + host_uvector& operator=(host_uvector const&) = delete; + host_uvector& operator=(host_uvector&& other) + { + if (this != &other) { + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + _data = other._data; + _size = other._size; + _capacity = other._capacity; + _mr = other._mr; + _stream = other._stream; + other._data = nullptr; + other._size = 0; + other._capacity = 0; + } + return *this; + } + + ~host_uvector() + { + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + } + + void resize(std::size_t new_size) + { + if (new_size > _capacity) { + auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); + _stream.synchronize(); + if (_data != nullptr) { + std::copy(_data, _data + _size, new_data); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); + } + _data = new_data; + _capacity = new_size; + } + _size = new_size; + } + + void reserve(std::size_t new_capacity) + { + if (new_capacity > _capacity) { + auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); + _stream.synchronize(); + if (_data != nullptr) { + std::copy(_data, _data + _size, new_data); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); + } + _data = new_data; + _capacity = new_capacity; + } + } + + void push_back(T const& value) + { + if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } + _data[_size++] = value; + } + + void clear() { _size = 0; } + + [[nodiscard]] std::size_t size() const { return _size; } + [[nodiscard]] std::int64_t ssize() const { return _size; } + [[nodiscard]] bool is_empty() const { return _size == 0; } + [[nodiscard]] std::size_t capacity() const { return _capacity; } + + [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } + [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } + + [[nodiscard]] T* data() { return _data; } + [[nodiscard]] T const* data() const { return _data; } + + [[nodiscard]] T& front() { return _data[0]; } + [[nodiscard]] T const& front() const { return _data[0]; } + + [[nodiscard]] T& back() { return _data[_size - 1]; } + [[nodiscard]] T const& back() const { return _data[_size - 1]; } + + [[nodiscard]] T* begin() { return _data; } + [[nodiscard]] T const* begin() const { return _data; } + + [[nodiscard]] T* end() { return _data + _size; } + [[nodiscard]] T const* end() const { return _data + _size; } + + [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } + [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } + + private: + T* _data{nullptr}; + std::size_t _size; + std::size_t _capacity; + rmm::host_async_resource_ref _mr; + rmm::cuda_stream_view _stream; +}; + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp deleted file mode 100644 index e62c8017f8b..00000000000 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include - -#include - -#include -#include -#include // for bad_alloc - -namespace cudf::detail { - -/*! \p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c a `rmm::host_async_resource_ref` for allocation. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class rmm_host_allocator; - -/*! \p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c an `cudf::host_async_resource_ref` for allocation. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template <> -class rmm_host_allocator { - public: - using value_type = void; ///< The type of the elements in the allocator - using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = void const*; ///< The type returned by address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` - */ - template - struct rebind { - using other = rmm_host_allocator; ///< The rebound type - }; -}; - -/*! 
\p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c `rmm::host_async_resource_ref` for allocation. - * - * The \p rmm_host_allocator provides an interface for host memory allocation through the user - * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of - * this reference and therefore it is the user's responsibility to ensure its lifetime for the - * duration of the lifetime of the \p rmm_host_allocator. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class rmm_host_allocator { - public: - using value_type = T; ///< The type of the elements in the allocator - using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = T const*; ///< The type returned by address() - using reference = T&; ///< The parameter type for address() - using const_reference = T const&; ///< The parameter type for address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - typedef cuda::std::true_type propagate_on_container_move_assignment; - - /** - * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` - */ - template - struct rebind { - using other = rmm_host_allocator; ///< The rebound type - }; - - /** - * @brief Cannot declare an empty host allocator. - */ - rmm_host_allocator() = delete; - - /** - * @brief Construct from a `cudf::host_async_resource_ref` - */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) - { - } - - /** - * @brief This method allocates storage for objects in host memory. - * - * @param cnt The number of objects to allocate. - * @return a \c pointer to the newly allocated objects. - * @note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. - */ - inline pointer allocate(size_type cnt) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - return static_cast( - mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); - } - - /** - * @brief This method deallocates host memory previously allocated - * with this \c rmm_host_allocator. - * - * @param p A \c pointer to the previously allocated memory. - * @note The second parameter is the number of objects previously allocated. - * @note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - inline void deallocate(pointer p, size_type cnt) - { - mr.deallocate_async(p, cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - /** - * @brief This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * @return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - constexpr inline size_type max_size() const - { - return (std::numeric_limits::max)() / sizeof(T); - } - - /** - * @brief This method tests this \p rmm_host_allocator for equality to - * another. - * - * @param x The other \p rmm_host_allocator of interest. - * @return This method always returns \c true. 
- */ - inline bool operator==(rmm_host_allocator const& x) const - { - return x.mr == mr && x.stream == stream; - } - - /** - * @brief This method tests this \p rmm_host_allocator for inequality - * to another. - * - * @param x The other \p rmm_host_allocator of interest. - * @return This method always returns \c false. - */ - inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - - private: - rmm::host_async_resource_ref mr; - rmm::cuda_stream_view stream; -}; - -/** - * @brief A vector class with rmm host memory allocator - */ -template -using host_vector = thrust::host_vector>; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 06dfcbfc5e5..f67b671c610 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 @@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_uvector` of the given size * * @note This function does not synchronize `stream`. * @@ -452,13 +452,13 @@ thrust::host_vector make_host_vector_sync( * @return A host_vector of the given size */ template -host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); + return host_uvector(size, cudf::get_pinned_memory_resource(), stream); } /** - * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_uvector` of the given size * * @note This function synchronizes `stream`. 
* @@ -468,7 +468,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea * @return A host_vector of the given size */ template -host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_uvector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 47e92d61a9f..873d3e56acb 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -212,6 +214,10 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; +template +struct is_host_span_supported_container< // + cudf::detail::host_uvector> : std::true_type {}; + template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 0e3ce779089..b7644a6fb9f 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,7 @@ #include "io/utilities/config_utils.hpp" #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::host_vector const& host, + static void copy_to_device(cudf::detail::host_uvector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::host_vector h_compressed_blocks; - cudf::detail::host_vector h_compressed_offsets; - cudf::detail::host_vector h_decompressed_offsets; + cudf::detail::host_uvector h_compressed_blocks; + cudf::detail::host_uvector h_compressed_offsets; + cudf::detail::host_uvector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 45096b7155c..2c4160e48c5 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -17,7 +17,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -33,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - std::unique_ptr> buffer; + std::unique_ptr> buffer; }; /** @@ -86,7 +86,7 @@ class datasource_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } @@ -153,7 +153,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp 
b/cpp/src/io/utilities/hostdevice_vector.hpp index 1ae27a2f4ae..ae2ab03ded3 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::host_vector h_data; + cudf::detail::host_uvector h_data; rmm::device_uvector d_data; }; From 075deca7c87b70b62f30a5b8a266da39a3e852cb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:33:10 -0700 Subject: [PATCH 20/75] style --- cpp/include/cudf/utilities/span.hpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 873d3e56acb..2f622612209 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,23 +204,28 @@ class span_base { // ===== host_span ================================================================================= template -struct is_host_span_supported_container : std::false_type {}; +struct is_host_span_supported_container : std::false_type { +}; template struct is_host_span_supported_container< // - std::vector> : std::true_type {}; + std::vector> : std::true_type { +}; template struct is_host_span_supported_container< // - thrust::host_vector> : std::true_type {}; + thrust::host_vector> : std::true_type { +}; template struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type {}; + cudf::detail::host_uvector> : std::true_type { +}; template struct is_host_span_supported_container< // - std::basic_string, Alloc>> : std::true_type {}; + std::basic_string, Alloc>> : std::true_type { +}; /** * @brief C++20 std::span with reduced feature set. @@ -275,19 +280,23 @@ struct host_span : public cudf::detail::span_base -struct is_device_span_supported_container : std::false_type {}; +struct is_device_span_supported_container : std::false_type { +}; template struct is_device_span_supported_container< // - thrust::device_vector> : std::true_type {}; + thrust::device_vector> : std::true_type { +}; template struct is_device_span_supported_container< // - rmm::device_vector> : std::true_type {}; + rmm::device_vector> : std::true_type { +}; template struct is_device_span_supported_container< // - rmm::device_uvector> : std::true_type {}; + rmm::device_uvector> : std::true_type { +}; /** * @brief Device version of C++20 std::span with reduced feature set. From 164fce20ad07632b5a9899668d9da7d23ced6b97 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:53:00 -0700 Subject: [PATCH 21/75] docs; prefixes --- cpp/src/utilities/pinned_memory.cpp | 43 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 9cebf980d00..85d4b7e2283 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -28,34 +28,39 @@ namespace cudf { namespace { + +// Asynchronous memory resource that allocates a fixed-size pool of pinned memory and falls back to +// additional pinned allocations if the pool is exhausted. 
class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; private: - upstream_mr upstream_mr_{}; - size_t pool_size_{0}; + upstream_mr _upstream_mr{}; + size_t _pool_size{0}; // Raw pointer to avoid a segfault when the pool is destroyed on exit host_pooled_mr* pool_{nullptr}; - void* pool_begin_{nullptr}; - void* pool_end_{nullptr}; - cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + // The beginning and end of the pool memory range; pool is never reallocated so these are constant + // and can be used to determine if a pointer is within the pool + void* _pool_begin{nullptr}; + void* _pool_end{nullptr}; + cuda::stream_ref _stream{cudf::detail::global_cuda_stream_pool().get_stream().value()}; public: fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + : _pool_size{size}, pool_{new host_pooled_mr(_upstream_mr, size, size)} { - if (pool_size_ == 0) { return; } + if (_pool_size == 0) { return; } // Allocate full size from the pinned pool to figure out the beginning and end address - pool_begin_ = pool_->allocate_async(pool_size_, stream_); - pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); - pool_->deallocate_async(pool_begin_, pool_size_, stream_); + _pool_begin = pool_->allocate_async(_pool_size, _stream); + _pool_end = static_cast(static_cast(_pool_begin) + _pool_size); + pool_->deallocate_async(_pool_begin, _pool_size, _stream); } void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { - if (bytes <= pool_size_) { + if (bytes <= _pool_size) { try { return pool_->allocate_async(bytes, alignment, stream); } catch (...) 
{ @@ -63,7 +68,7 @@ class fixed_pinned_pool_memory_resource { } } - return upstream_mr_.allocate_async(bytes, alignment, stream); + return _upstream_mr.allocate_async(bytes, alignment, stream); } void* allocate_async(std::size_t bytes, cuda::stream_ref stream) @@ -73,8 +78,8 @@ class fixed_pinned_pool_memory_resource { void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = allocate_async(bytes, alignment, stream_); - stream_.wait(); + auto const result = allocate_async(bytes, alignment, _stream); + _stream.wait(); return result; } @@ -83,10 +88,10 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { + if (bytes <= _pool_size && ptr >= _pool_begin && ptr < _pool_end) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + _upstream_mr.deallocate_async(ptr, bytes, alignment, stream); } } @@ -99,13 +104,13 @@ class fixed_pinned_pool_memory_resource { std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept { - deallocate_async(ptr, bytes, alignment, stream_); - stream_.wait(); + deallocate_async(ptr, bytes, alignment, _stream); + _stream.wait(); } bool operator==(fixed_pinned_pool_memory_resource const& other) const { - return pool_ == other.pool_ and stream_ == other.stream_; + return pool_ == other.pool_ and _stream == other._stream; } bool operator!=(fixed_pinned_pool_memory_resource const& other) const From b566babb87696cf54656605ab76e9e25b5c42bed Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 12:00:40 -0700 Subject: [PATCH 22/75] type aliases in host_uvector --- .../cudf/detail/utilities/host_uvector.hpp | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp index 39bde04e985..c8166217a73 100644 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -27,10 +27,21 @@ namespace cudf::detail { template class host_uvector { public: - host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + using value_type = T; + using size_type = std::size_t; + using reference = value_type&; + using const_reference = value_type const&; + using pointer = value_type*; + using const_pointer = value_type const*; + using iterator = pointer; + using const_iterator = const_pointer; + + host_uvector(size_type size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} { - if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } + if (_size != 0) { + _data = static_cast(mr.allocate_async(_size * sizeof(value_type), _stream)); + } } host_uvector(host_uvector const&) = delete; @@ -50,7 +61,7 @@ class host_uvector { host_uvector& operator=(host_uvector&& other) { if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = other._data; _size = other._size; _capacity = other._capacity; @@ -65,17 +76,18 @@ class host_uvector { ~host_uvector() { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + if (_data 
!= nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } } - void resize(std::size_t new_size) + void resize(size_type new_size) { if (new_size > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); + auto new_data = + static_cast(_mr.allocate_async(new_size * sizeof(value_type), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); + _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = new_data; _capacity = new_size; @@ -83,21 +95,22 @@ class host_uvector { _size = new_size; } - void reserve(std::size_t new_capacity) + void reserve(size_type new_capacity) { if (new_capacity > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); + auto new_data = + static_cast(_mr.allocate_async(new_capacity * sizeof(value_type), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); + _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = new_data; _capacity = new_capacity; } } - void push_back(T const& value) + void push_back(const_reference value) { if (_size == _capacity) { reserve(_capacity == 0 ? 2 : _capacity * 2); } _data[_size++] = value; @@ -105,36 +118,36 @@ class host_uvector { void clear() { _size = 0; } - [[nodiscard]] std::size_t size() const { return _size; } + [[nodiscard]] size_type size() const { return _size; } [[nodiscard]] std::int64_t ssize() const { return _size; } [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] std::size_t capacity() const { return _capacity; } + [[nodiscard]] size_type capacity() const { return _capacity; } - [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } - [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } + [[nodiscard]] reference operator[](size_type idx) { return _data[idx]; } + [[nodiscard]] const_reference operator[](size_type idx) const { return _data[idx]; } - [[nodiscard]] T* data() { return _data; } - [[nodiscard]] T const* data() const { return _data; } + [[nodiscard]] pointer data() { return _data; } + [[nodiscard]] const_pointer data() const { return _data; } - [[nodiscard]] T& front() { return _data[0]; } - [[nodiscard]] T const& front() const { return _data[0]; } + [[nodiscard]] reference front() { return _data[0]; } + [[nodiscard]] const_reference front() const { return _data[0]; } - [[nodiscard]] T& back() { return _data[_size - 1]; } - [[nodiscard]] T const& back() const { return _data[_size - 1]; } + [[nodiscard]] reference back() { return _data[_size - 1]; } + [[nodiscard]] const_reference back() const { return _data[_size - 1]; } - [[nodiscard]] T* begin() { return _data; } - [[nodiscard]] T const* begin() const { return _data; } + [[nodiscard]] iterator begin() { return _data; } + [[nodiscard]] const_iterator begin() const { return _data; } - [[nodiscard]] T* end() { return _data + _size; } - [[nodiscard]] T const* end() const { return _data + _size; } + [[nodiscard]] iterator end() { return _data + _size; } + [[nodiscard]] const_iterator end() const { return _data + _size; } [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } private: - T* _data{nullptr}; - std::size_t _size; - std::size_t _capacity; + 
pointer _data{nullptr}; + size_type _size; + size_type _capacity; rmm::host_async_resource_ref _mr; rmm::cuda_stream_view _stream; }; From 21edb534a15c836963c116f3c9ca360cadb1844c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 13:24:37 -0700 Subject: [PATCH 23/75] refactor host_ticket --- .../io/text/data_chunk_source_factories.cpp | 56 +++++++------------ 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 2c4160e48c5..39e955232e3 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "cudf/utilities/default_stream.hpp" #include "io/text/device_data_chunks.hpp" #include @@ -32,8 +33,15 @@ namespace cudf::io::text { namespace { struct host_ticket { - cudaEvent_t event; - std::unique_ptr> buffer; + cudaEvent_t event{}; // tracks the completion of the last device-to-host copy. + cudf::detail::host_uvector buffer; + + host_ticket() : buffer{cudf::detail::make_pinned_vector_sync(0, cudf::get_default_stream())} + { + cudaEventCreate(&event); + } + + ~host_ticket() { cudaEventDestroy(event); } }; /** @@ -44,20 +52,7 @@ class datasource_chunk_reader : public data_chunk_reader { constexpr static int num_tickets = 2; public: - datasource_chunk_reader(datasource* source) : _source(source) - { - // create an event to track the completion of the last device-to-host copy. - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~datasource_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } - } + datasource_chunk_reader(datasource* source) : _source(source) {} void skip_bytes(std::size_t size) override { @@ -85,16 +80,15 @@ class datasource_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( - cudf::detail::make_pinned_vector_sync(read_size, stream)); + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); } - _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer->data())); + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -124,17 +118,6 @@ class istream_data_chunk_reader : public data_chunk_reader { istream_data_chunk_reader(std::unique_ptr datastream) : _datastream(std::move(datastream)) { - // create an event to track the completion of the last device-to-host copy. 
- for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~istream_data_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } } void skip_bytes(std::size_t size) override { _datastream->ignore(size); }; @@ -152,13 +135,12 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( - cudf::detail::make_pinned_vector_sync(read_size, stream)); + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); } // read data from the host istream in to the pinned host memory buffer - _datastream->read(h_ticket.buffer->data(), read_size); + _datastream->read(h_ticket.buffer.data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -168,7 +150,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); From 3814797d5b2d3478901e14f9ecbb733d2168a06a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 13:33:50 -0700 Subject: [PATCH 24/75] style --- cpp/include/cudf/utilities/span.hpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 2f622612209..6deef974c0e 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -204,28 +204,23 @@ class span_base { // ===== host_span ================================================================================= template -struct is_host_span_supported_container : std::false_type { -}; +struct is_host_span_supported_container : std::false_type {}; template struct is_host_span_supported_container< // - std::vector> : std::true_type { -}; + std::vector> : std::true_type {}; template struct is_host_span_supported_container< // - thrust::host_vector> : std::true_type { -}; + thrust::host_vector> : std::true_type {}; template struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type { -}; + cudf::detail::host_uvector> : std::true_type {}; template struct is_host_span_supported_container< // - std::basic_string, Alloc>> : std::true_type { -}; + std::basic_string, Alloc>> : std::true_type {}; /** * @brief C++20 std::span with reduced feature set. 
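As context for the hunks around this style change: the is_host_span_supported_container specializations are the whitelist that lets host_span be built directly from the listed container types. A minimal, self-contained sketch of the same gating pattern, using only the standard library (the names below are illustrative, not the ones in span.hpp, whose real constructor carries more qualifiers):

#include <cstddef>
#include <type_traits>
#include <vector>

// Primary template: containers are not supported unless explicitly whitelisted.
template <typename T>
struct is_supported_container : std::false_type {};

// Whitelist std::vector, mirroring the is_host_span_supported_container specializations.
template <typename T, typename Alloc>
struct is_supported_container<std::vector<T, Alloc>> : std::true_type {};

// A function (or constructor) constrained this way only participates in overload
// resolution for whitelisted containers.
template <typename C, std::enable_if_t<is_supported_container<C>::value>* = nullptr>
std::size_t element_count(C const& c)
{
  return c.size();
}

The specializations reformatted above do the same job for std::vector, thrust::host_vector, cudf::detail::host_uvector, and std::basic_string.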
@@ -280,23 +275,19 @@ struct host_span : public cudf::detail::span_base -struct is_device_span_supported_container : std::false_type { -}; +struct is_device_span_supported_container : std::false_type {}; template struct is_device_span_supported_container< // - thrust::device_vector> : std::true_type { -}; + thrust::device_vector> : std::true_type {}; template struct is_device_span_supported_container< // - rmm::device_vector> : std::true_type { -}; + rmm::device_vector> : std::true_type {}; template struct is_device_span_supported_container< // - rmm::device_uvector> : std::true_type { -}; + rmm::device_uvector> : std::true_type {}; /** * @brief Device version of C++20 std::span with reduced feature set. From c9331575ac9ebe2d9b0ebf85ec2f93a42ed9b876 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 10:24:48 -0700 Subject: [PATCH 25/75] style --- cpp/include/cudf/detail/utilities/cuda_copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index 2ceb70f2ef2..fce91751f80 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -38,4 +38,4 @@ void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_st impl::copy_pinned(dst, src, size * sizeof(T), stream); } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail From 6784e073197ee450d46350fc348f28e9f085b68a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 10:35:46 -0700 Subject: [PATCH 26/75] more style --- cpp/src/utilities/cuda_copy.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index 9a574d6d0e5..da3b4760967 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -38,4 +38,4 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ } } -} // namespace cudf::detail::impl \ No newline at end of file +} // namespace cudf::detail::impl From f7999aae606269e187de88279f96d5034ad48753 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:00 -0700 Subject: [PATCH 27/75] Revert "type aliases in host_uvector" This reverts commit b566babb87696cf54656605ab76e9e25b5c42bed. 
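Note: the aliases reverted here are dropped ahead of the removal of host_uvector itself later in this series (see the Revert "host_uvector" commit below). For reference, nested aliases like value_type and size_type are what let generic code deduce element and index types from a container; a small illustrative sketch with a hypothetical helper, not part of the patch:

// Hypothetical generic helper that relies only on a container's nested type aliases.
template <typename Container>
typename Container::value_type sum_elements(Container const& c)
{
  typename Container::value_type total{};
  for (typename Container::size_type i = 0; i < c.size(); ++i) {
    total += c[i];
  }
  return total;
}

With the aliases in place, host_uvector would have worked with such a helper; after this revert it no longer exposes them.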
--- .../cudf/detail/utilities/host_uvector.hpp | 69 ++++++++----------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp index c8166217a73..39bde04e985 100644 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -27,21 +27,10 @@ namespace cudf::detail { template class host_uvector { public: - using value_type = T; - using size_type = std::size_t; - using reference = value_type&; - using const_reference = value_type const&; - using pointer = value_type*; - using const_pointer = value_type const*; - using iterator = pointer; - using const_iterator = const_pointer; - - host_uvector(size_type size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} { - if (_size != 0) { - _data = static_cast(mr.allocate_async(_size * sizeof(value_type), _stream)); - } + if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } } host_uvector(host_uvector const&) = delete; @@ -61,7 +50,7 @@ class host_uvector { host_uvector& operator=(host_uvector&& other) { if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = other._data; _size = other._size; _capacity = other._capacity; @@ -76,18 +65,17 @@ class host_uvector { ~host_uvector() { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } } - void resize(size_type new_size) + void resize(std::size_t new_size) { if (new_size > _capacity) { - auto new_data = - static_cast(_mr.allocate_async(new_size * sizeof(value_type), _stream)); + auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = new_data; _capacity = new_size; @@ -95,22 +83,21 @@ class host_uvector { _size = new_size; } - void reserve(size_type new_capacity) + void reserve(std::size_t new_capacity) { if (new_capacity > _capacity) { - auto new_data = - static_cast(_mr.allocate_async(new_capacity * sizeof(value_type), _stream)); + auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = new_data; _capacity = new_capacity; } } - void push_back(const_reference value) + void push_back(T const& value) { if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } _data[_size++] = value; @@ -118,36 +105,36 @@ class host_uvector { void clear() { _size = 0; } - [[nodiscard]] size_type size() const { return _size; } + [[nodiscard]] std::size_t size() const { return _size; } [[nodiscard]] std::int64_t ssize() const { return _size; } [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] size_type capacity() const { return _capacity; } + [[nodiscard]] std::size_t capacity() const { return _capacity; } - [[nodiscard]] reference operator[](size_type idx) { return _data[idx]; } - [[nodiscard]] const_reference operator[](size_type idx) const { return _data[idx]; } + [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } + [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } - [[nodiscard]] pointer data() { return _data; } - [[nodiscard]] const_pointer data() const { return _data; } + [[nodiscard]] T* data() { return _data; } + [[nodiscard]] T const* data() const { return _data; } - [[nodiscard]] reference front() { return _data[0]; } - [[nodiscard]] const_reference front() const { return _data[0]; } + [[nodiscard]] T& front() { return _data[0]; } + [[nodiscard]] T const& front() const { return _data[0]; } - [[nodiscard]] reference back() { return _data[_size - 1]; } - [[nodiscard]] const_reference back() const { return _data[_size - 1]; } + [[nodiscard]] T& back() { return _data[_size - 1]; } + [[nodiscard]] T const& back() const { return _data[_size - 1]; } - [[nodiscard]] iterator begin() { return _data; } - [[nodiscard]] const_iterator begin() const { return _data; } + [[nodiscard]] T* begin() { return _data; } + [[nodiscard]] T const* begin() const { return _data; } - [[nodiscard]] iterator end() { return _data + _size; } - [[nodiscard]] const_iterator end() const { return _data + _size; } + [[nodiscard]] T* end() { return _data + _size; } + [[nodiscard]] T const* end() const { return _data + _size; } [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } private: - pointer _data{nullptr}; - size_type _size; - size_type _capacity; + T* _data{nullptr}; + std::size_t _size; + std::size_t _capacity; rmm::host_async_resource_ref _mr; rmm::cuda_stream_view _stream; }; From c9a82d010a997d4c1f4afad94b36709e859d98fe Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:14 -0700 Subject: [PATCH 28/75] Revert "docs; prefixes" This reverts commit 164fce20ad07632b5a9899668d9da7d23ced6b97. --- cpp/src/utilities/pinned_memory.cpp | 43 +++++++++++++---------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 85d4b7e2283..9cebf980d00 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -28,39 +28,34 @@ namespace cudf { namespace { - -// Asynchronous memory resource that allocates a fixed-size pool of pinned memory and falls back to -// additional pinned allocations if the pool is exhausted. 
class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; private: - upstream_mr _upstream_mr{}; - size_t _pool_size{0}; + upstream_mr upstream_mr_{}; + size_t pool_size_{0}; // Raw pointer to avoid a segfault when the pool is destroyed on exit host_pooled_mr* pool_{nullptr}; - // The beginning and end of the pool memory range; pool is never reallocated so these are constant - // and can be used to determine if a pointer is within the pool - void* _pool_begin{nullptr}; - void* _pool_end{nullptr}; - cuda::stream_ref _stream{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + void* pool_begin_{nullptr}; + void* pool_end_{nullptr}; + cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; public: fixed_pinned_pool_memory_resource(size_t size) - : _pool_size{size}, pool_{new host_pooled_mr(_upstream_mr, size, size)} + : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} { - if (_pool_size == 0) { return; } + if (pool_size_ == 0) { return; } // Allocate full size from the pinned pool to figure out the beginning and end address - _pool_begin = pool_->allocate_async(_pool_size, _stream); - _pool_end = static_cast(static_cast(_pool_begin) + _pool_size); - pool_->deallocate_async(_pool_begin, _pool_size, _stream); + pool_begin_ = pool_->allocate_async(pool_size_, stream_); + pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); + pool_->deallocate_async(pool_begin_, pool_size_, stream_); } void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { - if (bytes <= _pool_size) { + if (bytes <= pool_size_) { try { return pool_->allocate_async(bytes, alignment, stream); } catch (...) 
{ @@ -68,7 +63,7 @@ class fixed_pinned_pool_memory_resource { } } - return _upstream_mr.allocate_async(bytes, alignment, stream); + return upstream_mr_.allocate_async(bytes, alignment, stream); } void* allocate_async(std::size_t bytes, cuda::stream_ref stream) @@ -78,8 +73,8 @@ class fixed_pinned_pool_memory_resource { void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = allocate_async(bytes, alignment, _stream); - _stream.wait(); + auto const result = allocate_async(bytes, alignment, stream_); + stream_.wait(); return result; } @@ -88,10 +83,10 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= _pool_size && ptr >= _pool_begin && ptr < _pool_end) { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { - _upstream_mr.deallocate_async(ptr, bytes, alignment, stream); + upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); } } @@ -104,13 +99,13 @@ class fixed_pinned_pool_memory_resource { std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept { - deallocate_async(ptr, bytes, alignment, _stream); - _stream.wait(); + deallocate_async(ptr, bytes, alignment, stream_); + stream_.wait(); } bool operator==(fixed_pinned_pool_memory_resource const& other) const { - return pool_ == other.pool_ and _stream == other._stream; + return pool_ == other.pool_ and stream_ == other.stream_; } bool operator!=(fixed_pinned_pool_memory_resource const& other) const From 930efef8fcec62a4ac87a1f8faebab9783ccabd4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:24 -0700 Subject: [PATCH 29/75] Revert "style" This reverts commit 075deca7c87b70b62f30a5b8a266da39a3e852cb. --- cpp/include/cudf/utilities/span.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 6deef974c0e..873d3e56acb 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 046694978dbe65ea515ad46b079ccbdcd9bc1206 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:59:50 -0700 Subject: [PATCH 30/75] Revert "host_uvector" This reverts commit 24c15498b9ad53ec452a99b94fb767b90f4551a0. --- cpp/benchmarks/io/cuio_common.cpp | 9 +- cpp/benchmarks/io/cuio_common.hpp | 4 +- .../cudf/detail/utilities/host_uvector.hpp | 142 -------------- .../cudf/detail/utilities/host_vector.hpp | 183 ++++++++++++++++++ .../detail/utilities/vector_factories.hpp | 12 +- cpp/include/cudf/utilities/span.hpp | 6 - cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +- .../io/text/data_chunk_source_factories.cpp | 4 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 9 files changed, 204 insertions(+), 170 deletions(-) delete mode 100644 cpp/include/cudf/detail/utilities/host_uvector.hpp create mode 100644 cpp/include/cudf/detail/utilities/host_vector.hpp diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 09d7d8a9db6..45dc812e247 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,14 +14,13 @@ * limitations under the License. 
*/ +#include +#include + #include #include #include -#include - -#include -#include #include @@ -53,7 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, - pinned_buffer(0, pinned_memory_resource(), cudf::get_default_stream()), + pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 020fd7e00c1..64d6021cf50 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::host_uvector pinned_buffer; + cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp deleted file mode 100644 index 39bde04e985..00000000000 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include -#include - -namespace cudf::detail { - -template -class host_uvector { - public: - host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) - : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} - { - if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } - } - - host_uvector(host_uvector const&) = delete; - host_uvector(host_uvector&& other) - : _data{other._data}, - _size{other._size}, - _capacity{other._capacity}, - _mr{other._mr}, - _stream{other._stream} - { - other._data = nullptr; - other._size = 0; - other._capacity = 0; - } - - host_uvector& operator=(host_uvector const&) = delete; - host_uvector& operator=(host_uvector&& other) - { - if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } - _data = other._data; - _size = other._size; - _capacity = other._capacity; - _mr = other._mr; - _stream = other._stream; - other._data = nullptr; - other._size = 0; - other._capacity = 0; - } - return *this; - } - - ~host_uvector() - { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } - } - - void resize(std::size_t new_size) - { - if (new_size > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); - _stream.synchronize(); - if (_data != nullptr) { - std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); - } - _data = new_data; - _capacity = new_size; - } - _size = new_size; - } - - void reserve(std::size_t new_capacity) - { - if (new_capacity > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); - _stream.synchronize(); - if (_data != nullptr) { - std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); - } - _data = new_data; - _capacity = new_capacity; - } - } - - void push_back(T const& value) - { - if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } - _data[_size++] = value; - } - - void clear() { _size = 0; } - - [[nodiscard]] std::size_t size() const { return _size; } - [[nodiscard]] std::int64_t ssize() const { return _size; } - [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] std::size_t capacity() const { return _capacity; } - - [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } - [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } - - [[nodiscard]] T* data() { return _data; } - [[nodiscard]] T const* data() const { return _data; } - - [[nodiscard]] T& front() { return _data[0]; } - [[nodiscard]] T const& front() const { return _data[0]; } - - [[nodiscard]] T& back() { return _data[_size - 1]; } - [[nodiscard]] T const& back() const { return _data[_size - 1]; } - - [[nodiscard]] T* begin() { return _data; } - [[nodiscard]] T const* begin() const { return _data; } - - [[nodiscard]] T* end() { return _data + _size; } - [[nodiscard]] T const* end() const { return _data + _size; } - - [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } - [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } - - private: - T* _data{nullptr}; - std::size_t _size; - std::size_t _capacity; - rmm::host_async_resource_ref _mr; - rmm::cuda_stream_view _stream; -}; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp new file mode 100644 index 00000000000..e62c8017f8b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -0,0 +1,183 @@ +/* + * Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +#include +#include +#include // for bad_alloc + +namespace cudf::detail { + +/*! \p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c a `rmm::host_async_resource_ref` for allocation. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class rmm_host_allocator; + +/*! \p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c an `cudf::host_async_resource_ref` for allocation. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template <> +class rmm_host_allocator { + public: + using value_type = void; ///< The type of the elements in the allocator + using pointer = void*; ///< The type returned by address() / allocate() + using const_pointer = void const*; ///< The type returned by address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` + */ + template + struct rebind { + using other = rmm_host_allocator; ///< The rebound type + }; +}; + +/*! 
\p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c `rmm::host_async_resource_ref` for allocation. + * + * The \p rmm_host_allocator provides an interface for host memory allocation through the user + * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of + * this reference and therefore it is the user's responsibility to ensure its lifetime for the + * duration of the lifetime of the \p rmm_host_allocator. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class rmm_host_allocator { + public: + using value_type = T; ///< The type of the elements in the allocator + using pointer = T*; ///< The type returned by address() / allocate() + using const_pointer = T const*; ///< The type returned by address() + using reference = T&; ///< The parameter type for address() + using const_reference = T const&; ///< The parameter type for address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + typedef cuda::std::true_type propagate_on_container_move_assignment; + + /** + * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` + */ + template + struct rebind { + using other = rmm_host_allocator; ///< The rebound type + }; + + /** + * @brief Cannot declare an empty host allocator. + */ + rmm_host_allocator() = delete; + + /** + * @brief Construct from a `cudf::host_async_resource_ref` + */ + rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr(_mr), stream(_stream) + { + } + + /** + * @brief This method allocates storage for objects in host memory. + * + * @param cnt The number of objects to allocate. + * @return a \c pointer to the newly allocated objects. + * @note This method does not invoke \p value_type's constructor. + * It is the responsibility of the caller to initialize the + * objects at the returned \c pointer. + */ + inline pointer allocate(size_type cnt) + { + if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + return static_cast( + mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); + } + + /** + * @brief This method deallocates host memory previously allocated + * with this \c rmm_host_allocator. + * + * @param p A \c pointer to the previously allocated memory. + * @note The second parameter is the number of objects previously allocated. + * @note This method does not invoke \p value_type's destructor. + * It is the responsibility of the caller to destroy + * the objects stored at \p p. + */ + inline void deallocate(pointer p, size_type cnt) + { + mr.deallocate_async(p, cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + /** + * @brief This method returns the maximum size of the \c cnt parameter + * accepted by the \p allocate() method. + * + * @return The maximum number of objects that may be allocated + * by a single call to \p allocate(). + */ + constexpr inline size_type max_size() const + { + return (std::numeric_limits::max)() / sizeof(T); + } + + /** + * @brief This method tests this \p rmm_host_allocator for equality to + * another. + * + * @param x The other \p rmm_host_allocator of interest. + * @return This method always returns \c true. 
+ */ + inline bool operator==(rmm_host_allocator const& x) const + { + return x.mr == mr && x.stream == stream; + } + + /** + * @brief This method tests this \p rmm_host_allocator for inequality + * to another. + * + * @param x The other \p rmm_host_allocator of interest. + * @return This method always returns \c false. + */ + inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief A vector class with rmm host memory allocator + */ +template +using host_vector = thrust::host_vector>; + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index f67b671c610..06dfcbfc5e5 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 @@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a pinned `cudf::detail::host_uvector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function does not synchronize `stream`. * @@ -452,13 +452,13 @@ thrust::host_vector make_host_vector_sync( * @return A host_vector of the given size */ template -host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return host_uvector(size, cudf::get_pinned_memory_resource(), stream); + return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** - * @brief Synchronously construct a pinned `cudf::detail::host_uvector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. 
* @@ -468,7 +468,7 @@ host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stre * @return A host_vector of the given size */ template -host_uvector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 873d3e56acb..47e92d61a9f 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,8 +16,6 @@ #pragma once -#include - #include #include #include @@ -214,10 +212,6 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; -template -struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type {}; - template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index b7644a6fb9f..0e3ce779089 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,7 @@ #include "io/utilities/config_utils.hpp" #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::host_uvector const& host, + static void copy_to_device(cudf::detail::host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::host_uvector h_compressed_blocks; - cudf::detail::host_uvector h_compressed_offsets; - cudf::detail::host_uvector h_decompressed_offsets; + cudf::detail::host_vector h_compressed_blocks; + cudf::detail::host_vector h_compressed_offsets; + cudf::detail::host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 39e955232e3..596ca3458c8 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -18,7 +18,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -34,7 +34,7 @@ namespace { struct host_ticket { cudaEvent_t event{}; // tracks the completion of the last device-to-host copy. 
- cudf::detail::host_uvector buffer; + cudf::detail::host_vector buffer; host_ticket() : buffer{cudf::detail::make_pinned_vector_sync(0, cudf::get_default_stream())} { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index ae2ab03ded3..1ae27a2f4ae 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::host_uvector h_data; + cudf::detail::host_vector h_data; rmm::device_uvector d_data; }; From f31221901aab0712b7e4e416c7454d4ef03a7019 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:38:31 -0700 Subject: [PATCH 31/75] make do without host_uvector --- cpp/include/cudf/detail/utilities/host_vector.hpp | 8 ++++++-- cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index e62c8017f8b..756fdab177a 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -117,8 +117,12 @@ class rmm_host_allocator { inline pointer allocate(size_type cnt) { if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - return static_cast( - mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); + auto const result = + mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + // synchronize to ensure the memory is allocated before thrust::host_vector initialization + // TODO: replace thrust::host_vector with a type that does not require synchronization + stream.synchronize(); + return static_cast(result); } /** diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 06dfcbfc5e5..20cb55bb1c7 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -444,7 +444,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function does not synchronize `stream`. + * @note This function may not synchronize `stream`. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector From 7cfee0ab2d3bfc3b261edce2340555a69840ebcc Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:41:16 -0700 Subject: [PATCH 32/75] missed change --- cpp/include/cudf/detail/utilities/host_vector.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 756fdab177a..6a115177ab5 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -119,7 +119,7 @@ class rmm_host_allocator { if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if auto const result = mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - // synchronize to ensure the memory is allocated before thrust::host_vector initialization + // Synchronize to ensure the memory is allocated before thrust::host_vector initialization // TODO: replace thrust::host_vector with a type that does not require synchronization stream.synchronize(); return static_cast(result); From fe4d668fb4e6dddf0a019e5443acf2ecc34ff0e8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:54:30 -0700 Subject: [PATCH 33/75] style --- cpp/benchmarks/io/cuio_common.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45dc812e247..645994f3f0d 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include -#include - #include #include #include +#include +#include + #include #include From 5a71f7702483d39b589abf55b6926aa0f07f9ec4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 12 Jun 2024 12:22:41 -0700 Subject: [PATCH 34/75] rename --- cpp/include/cudf/utilities/pinned_memory.hpp | 4 ++-- cpp/src/utilities/cuda_copy.cu | 2 +- cpp/src/utilities/pinned_memory.cpp | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index b0d6c55999f..c57c96dcb41 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -63,13 +63,13 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using cudaMemcpyAsync. */ -void set_kernel_copy_threshold(size_t threshold); +void kernel_pinned_copy_threshold(size_t threshold); /** * @brief Get the threshold size for using kernels for pinned memory copies. * * @return The threshold size in bytes. 
*/ -size_t get_kernel_copy_threshold(); +size_t get_kernel_pinned_copy_threshold(); } // namespace cudf diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index da3b4760967..7e0cab1f21b 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -28,7 +28,7 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ { if (size == 0) return; - if (size < get_kernel_copy_threshold()) { + if (size < get_kernel_pinned_copy_threshold()) { thrust::copy_n(rmm::exec_policy_nosync(stream), static_cast(src), size, diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index c1305ad1e89..86d37987b07 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -213,14 +213,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) return did_configure; } -CUDF_EXPORT auto& kernel_copy_threshold() +CUDF_EXPORT auto& kernel_pinned_copy_threshold() { - static std::atomic threshold = 0; // use cudaMemcpyAsync for all pinned copies + // use cudaMemcpyAsync for all pinned copies + static std::atomic threshold = 0; return threshold; } -void set_kernel_copy_threshold(size_t threshold) { kernel_copy_threshold() = threshold; } +void set_kernel_pinned_copy_threshold(size_t threshold) +{ + kernel_pinned_copy_threshold() = threshold; +} -size_t get_kernel_copy_threshold() { return kernel_copy_threshold(); } +size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } } // namespace cudf From 9068642c86a3d4bf2f30c705683ac52f9d9f42f3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 12 Jun 2024 14:30:35 -0700 Subject: [PATCH 35/75] refactor --- .../cudf/detail/utilities/cuda_copy.hpp | 49 +++++++++++++++++-- cpp/src/io/utilities/hostdevice_vector.hpp | 10 ++-- cpp/src/utilities/cuda_copy.cu | 43 ++++++++++++++++ 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index fce91751f80..7732e108938 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -22,20 +22,59 @@ namespace cudf::detail { namespace impl { -void copy_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_pinned_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_device_to_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); + +void copy_pageable_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_device_to_pageable(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); } // namespace impl +enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; + +/** + * @brief Asynchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. 
+ * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Direction of the copy and type of host memory + * @param stream CUDA stream used for the copy + */ + template -void copy_pinned_to_device_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +void cuda_memcpy_async( + T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) { - impl::copy_pinned(dst, src, size * sizeof(T), stream); + if (kind == copy_kind::PINNED_TO_DEVICE) { + impl::copy_pinned_to_device(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::DEVICE_TO_PINNED) { + impl::copy_device_to_pinned(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { + impl::copy_pageable_to_device(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { + impl::copy_device_to_pageable(dst, src, size * sizeof(T), stream); + } } +/** + * @brief Synchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Direction of the copy and type of host memory + * @param stream CUDA stream used for the copy + */ template -void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +void cuda_memcpy(T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) { - impl::copy_pinned(dst, src, size * sizeof(T), stream); + cuda_memcpy_async(dst, src, size, kind, stream); + stream.synchronize(); } } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index fe2100a7886..2429bca57fa 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,24 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - copy_pinned_to_device_async(device_ptr(), host_ptr(), size(), stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - host_to_device_async(stream); - stream.synchronize(); + cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - copy_device_to_pinned_async(host_ptr(), device_ptr(), size(), stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - device_to_host_async(stream); - stream.synchronize(); + cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index 7e0cab1f21b..78445c45a63 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -24,6 +24,8 @@ namespace cudf::detail::impl { +namespace { + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; @@ -38,4 +40,45 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ } } +void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, 
src, size, cudaMemcpyDefault, stream)); +} + +}; // namespace + +void copy_pinned_to_device(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pinned(dst, src, size, stream); +} + +void copy_device_to_pinned(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pinned(dst, src, size, stream); +} + +void copy_device_to_pageable(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pageable(dst, src, size, stream); +} + +void copy_pageable_to_device(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pageable(dst, src, size, stream); +} + } // namespace cudf::detail::impl From 2ec467002c94063bf03303cf36c9cc9b038c5c8f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 17 Jun 2024 16:14:38 -0700 Subject: [PATCH 36/75] missing newlines --- cpp/include/cudf/detail/utilities/cuda_copy.hpp | 1 - cpp/include/cudf/utilities/pinned_memory.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index 7732e108938..47533959ae4 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -43,7 +43,6 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ - template void cuda_memcpy_async( T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index c57c96dcb41..e41020dff1e 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -62,7 +62,6 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using kernels. If the size is greater than or equal to this * threshold, the copy will be done using cudaMemcpyAsync. 
*/ - void kernel_pinned_copy_threshold(size_t threshold); /** From a886eb4b9df0fcf3e8a536ffeff355593712add2 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 17 Jun 2024 16:47:41 -0700 Subject: [PATCH 37/75] rename files --- cpp/CMakeLists.txt | 2 +- .../cudf/detail/utilities/{cuda_copy.hpp => cuda_memcpy.hpp} | 0 cpp/src/io/utilities/hostdevice_vector.hpp | 2 +- cpp/src/utilities/{cuda_copy.cu => cuda_memcpy.cu} | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/include/cudf/detail/utilities/{cuda_copy.hpp => cuda_memcpy.hpp} (100%) rename cpp/src/utilities/{cuda_copy.cu => cuda_memcpy.cu} (98%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 13db81f3c97..afbeb7c3266 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -662,7 +662,7 @@ add_library( src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp - src/utilities/cuda_copy.cu + src/utilities/cuda_memcpy.cu src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp similarity index 100% rename from cpp/include/cudf/detail/utilities/cuda_copy.hpp rename to cpp/include/cudf/detail/utilities/cuda_memcpy.hpp diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 2429bca57fa..db1f9f1e461 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_memcpy.cu similarity index 98% rename from cpp/src/utilities/cuda_copy.cu rename to cpp/src/utilities/cuda_memcpy.cu index 78445c45a63..ff8d3bf120a 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include From dcaeaba8285fc4b3e60ee0a12d60cd9a3cbcf66c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 18 Jun 2024 15:26:36 -0700 Subject: [PATCH 38/75] test commit, please ignore --- cpp/src/io/utilities/hostdevice_vector.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index db1f9f1e461..5b9bc4d36c0 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -135,12 +135,12 @@ class hostdevice_vector { void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } /** From 0a2742f5026fac3546e5d8814d76eb8fdf794fe8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 Jun 2024 10:46:40 -0700 Subject: [PATCH 39/75] fix typo --- cpp/include/cudf/utilities/pinned_memory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index e41020dff1e..3e2fa43cb50 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -62,7 +62,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using kernels. If the size is greater than or equal to this * threshold, the copy will be done using cudaMemcpyAsync. */ -void kernel_pinned_copy_threshold(size_t threshold); +void set_kernel_pinned_copy_threshold(size_t threshold); /** * @brief Get the threshold size for using kernels for pinned memory copies. 
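Taken together with the rename in this fix, the pinned-copy threshold is configured through a plain setter/getter pair in the public pinned_memory header. A minimal usage sketch, assuming only the header shown in this patch (the 1 MiB value is an arbitrary example, not a recommendation):

#include <cudf/utilities/pinned_memory.hpp>

void configure_pinned_copy_path()
{
  // Pinned host<->device copies smaller than this many bytes use the kernel-based
  // copy path; larger copies fall back to cudaMemcpyAsync.
  cudf::set_kernel_pinned_copy_threshold(1024 * 1024);

  // Read the value back, e.g. for logging or tests.
  auto const threshold = cudf::get_kernel_pinned_copy_threshold();
  (void)threshold;
}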
From 68a03f13f9a9b1ebd33659018a44f0227e4e9432 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 Jun 2024 11:47:14 -0700 Subject: [PATCH 40/75] typeless API --- .../cudf/detail/utilities/cuda_memcpy.hpp | 32 ++----------------- cpp/src/io/utilities/hostdevice_vector.hpp | 8 ++--- cpp/src/utilities/cuda_memcpy.cu | 29 ++++++++++++++--- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 47533959ae4..3d497d0a5e2 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -20,16 +20,6 @@ namespace cudf::detail { -namespace impl { - -void copy_pinned_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); -void copy_device_to_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); - -void copy_pageable_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); -void copy_device_to_pageable(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); - -} // namespace impl - enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; /** @@ -43,20 +33,8 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ -template void cuda_memcpy_async( - T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) -{ - if (kind == copy_kind::PINNED_TO_DEVICE) { - impl::copy_pinned_to_device(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::DEVICE_TO_PINNED) { - impl::copy_device_to_pinned(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { - impl::copy_pageable_to_device(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { - impl::copy_device_to_pageable(dst, src, size * sizeof(T), stream); - } -} + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); /** * @brief Synchronously copies data between the host and device. 
@@ -69,11 +47,7 @@ void cuda_memcpy_async( * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ -template -void cuda_memcpy(T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} +void cuda_memcpy( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 5b9bc4d36c0..1cbf850bf20 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,22 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ff8d3bf120a..ed920cc90c7 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -22,7 +22,7 @@ #include -namespace cudf::detail::impl { +namespace cudf::detail { namespace { @@ -47,8 +47,6 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } -}; // namespace - void copy_pinned_to_device(void* dst, void const* src, std::size_t size, @@ -81,4 +79,27 @@ void copy_pageable_to_device(void* dst, copy_pageable(dst, src, size, stream); } -} // namespace cudf::detail::impl +}; // namespace + +void cuda_memcpy_async( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) +{ + if (kind == copy_kind::PINNED_TO_DEVICE) { + copy_pinned_to_device(dst, src, size, stream); + } else if (kind == copy_kind::DEVICE_TO_PINNED) { + copy_device_to_pinned(dst, src, size, stream); + } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { + copy_pageable_to_device(dst, src, size, stream); + } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { + copy_device_to_pageable(dst, src, size, stream); + } +} + +void cuda_memcpy( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, size, kind, stream); + stream.synchronize(); +} + +} // namespace cudf::detail From 1741037f77aada99936f4c6c3af720b5a3af7ddc Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 16:54:27 -0700 Subject: [PATCH 41/75] sorthidth --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index afbeb7c3266..9ec35acb6fb 100644 --- 
a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -661,8 +661,8 @@ add_library( src/unary/math_ops.cu src/unary/nan_ops.cu src/unary/null_ops.cu - src/utilities/default_stream.cpp src/utilities/cuda_memcpy.cu + src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp From fff667b3f5521213999c56d4e7c6e795d0269742 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 23:26:33 -0700 Subject: [PATCH 42/75] simplify --- .../cudf/detail/utilities/cuda_memcpy.hpp | 10 ++-- cpp/src/io/utilities/hostdevice_vector.hpp | 8 ++-- cpp/src/utilities/cuda_memcpy.cu | 48 +++---------------- 3 files changed, 15 insertions(+), 51 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 3d497d0a5e2..b66c461ab12 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -20,7 +20,7 @@ namespace cudf::detail { -enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; +enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; /** * @brief Asynchronously copies data between the host and device. @@ -30,11 +30,11 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param dst Destination memory address * @param src Source memory address * @param size Number of bytes to copy - * @param kind Direction of the copy and type of host memory + * @param kind Type of host memory * @param stream CUDA stream used for the copy */ void cuda_memcpy_async( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); /** * @brief Synchronously copies data between the host and device. 
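As a usage sketch of the detail API shown in the hunk above (not part of the patch): after this change the size argument is a byte count and the kind describes the host allocation rather than the copy direction. The caller, buffer sizes, and values below are illustrative assumptions.

#include <cudf/detail/utilities/cuda_memcpy.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

void copy_to_device_example(rmm::cuda_stream_view stream)
{
  std::vector<int> host_data(1024, 42);  // pageable host allocation
  rmm::device_uvector<int> device_data(host_data.size(), stream);

  // Size is passed in bytes; the kind tells the implementation what the host buffer is
  // (pinned vs. pageable), not which direction the copy goes.
  cudf::detail::cuda_memcpy_async(device_data.data(),
                                  host_data.data(),
                                  host_data.size() * sizeof(int),
                                  cudf::detail::host_memory_kind::PAGEABLE,
                                  stream);
  stream.synchronize();
}
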
@@ -44,10 +44,10 @@ void cuda_memcpy_async( * @param dst Destination memory address * @param src Source memory address * @param size Number of bytes to copy - * @param kind Direction of the copy and type of host memory + * @param kind Type of host memory * @param stream CUDA stream used for the copy */ void cuda_memcpy( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 1cbf850bf20..aed745c42dd 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,22 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ed920cc90c7..42696ac9d4b 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -47,56 +47,20 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } -void copy_pinned_to_device(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pinned(dst, src, size, stream); -} - -void copy_device_to_pinned(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pinned(dst, src, size, stream); -} - -void copy_device_to_pageable(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pageable(dst, src, size, stream); -} - -void copy_pageable_to_device(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pageable(dst, src, size, stream); -} - }; // namespace void cuda_memcpy_async( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { - if (kind == copy_kind::PINNED_TO_DEVICE) { - copy_pinned_to_device(dst, src, size, stream); - } else if (kind == copy_kind::DEVICE_TO_PINNED) { - copy_device_to_pinned(dst, src, size, stream); - } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { - copy_pageable_to_device(dst, src, size, stream); - } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { - copy_device_to_pageable(dst, src, size, stream); + switch (kind) { + case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); + case 
host_memory_kind::PAGEABLE: + default: copy_pageable(dst, src, size, stream); } } void cuda_memcpy( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { cuda_memcpy_async(dst, src, size, kind, stream); stream.synchronize(); From 84683d20f8643ca6fc36c40fcf3f342ec393a666 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 23:55:41 -0700 Subject: [PATCH 43/75] another day, another threshold --- cpp/include/cudf/utilities/pinned_memory.hpp | 16 ++++++++++++++++ cpp/src/utilities/pinned_memory.cpp | 14 ++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..7a9e48f443c 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less than this + * threshold, the memory will be allocated as pinned memory. If the size is greater than or equal + * to this threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. + */ +size_t get_allocate_host_as_pinned_threshold(); + } // namespace cudf diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 3ea4293fc60..feba66d6e8c 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -225,4 +225,18 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + } // namespace cudf From 1bbd5743c8956cb80ee2d1c2221e47ba368e3a58 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 09:34:17 -0700 Subject: [PATCH 44/75] add missing break --- cpp/src/utilities/cuda_memcpy.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 42696ac9d4b..c73bd9b7799 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -53,9 +53,9 @@ void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { switch (kind) { - case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); + case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); break; case host_memory_kind::PAGEABLE: - default: copy_pageable(dst, src, size, stream); + default: copy_pageable(dst, src, size, stream); break; } } From 101288fb0a56cf935884ee5b28f2a3511dc590c3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 16:51:37 -0700 Subject: [PATCH 
45/75] rename files to host --- cpp/CMakeLists.txt | 2 +- cpp/benchmarks/fixture/nvbench_fixture.hpp | 2 +- .../io/orc/orc_reader_multithreaded.cpp | 2 +- .../io/parquet/parquet_reader_multithread.cpp | 2 +- .../cudf/detail/utilities/vector_factories.hpp | 2 +- .../{pinned_memory.hpp => host_memory.hpp} | 0 cpp/src/utilities/cuda_memcpy.cu | 2 +- .../{pinned_memory.cpp => host_memory.cpp} | 2 +- cpp/tests/io/json_test.cpp | 15 ++++++++------- cpp/tests/utilities_tests/pinned_memory_tests.cpp | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 11 files changed, 17 insertions(+), 16 deletions(-) rename cpp/include/cudf/utilities/{pinned_memory.hpp => host_memory.hpp} (100%) rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (99%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35cf90411f2..94df0433b81 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -666,7 +666,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp + src/utilities/host_memory.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index df1492690bb..699844afe62 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index aa0ee39a179..d3574985bc1 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..71ce265e066 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..41ec6ae7e16 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/host_memory.hpp similarity index 100% rename from cpp/include/cudf/utilities/pinned_memory.hpp rename to cpp/include/cudf/utilities/host_memory.hpp diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 42696ac9d4b..3b2aefb3a99 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 99% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index feba66d6e8c..92bbff13c7f 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 
9c76c344157..1ac68b859e9 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include @@ -222,7 +222,8 @@ std::string to_records_orient(std::vector> co } template -struct JsonFixedPointReaderTest : public JsonReaderTest {}; +struct JsonFixedPointReaderTest : public JsonReaderTest { +}; template struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest { @@ -1139,7 +1140,7 @@ TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) "-33333333333333333333333", "-444444444444444444444444"}; std::vector greater_uint64_max = { - "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; + "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; std::vector less_int64_min = { "-9223372036854775807", "-9223372036854775808", "-9223372036854775809", "-9223372036854775810"}; std::vector mixed_range = { @@ -1369,10 +1370,10 @@ TEST_F(JsonReaderTest, JsonLongString) "", // null "", // null "கார்த்தி", - "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF - "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF - "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF - "𰾑𱔈𲍉", // 30000-3FFFF + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", R"( \\\\\\\\\\\\\\\\)", R"(\\\\\\\\\\\\\\\\)", diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..5b81930b2c7 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5842a980fc4..706e478842d 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include From ce58c4636b2e3fb47488f4200387fc4486eaf5b6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 17:00:39 -0700 Subject: [PATCH 46/75] lines --- cpp/src/utilities/cuda_memcpy.cu | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index c73bd9b7799..5a32f73f236 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -53,9 +53,15 @@ void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { switch (kind) { - case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); break; + case host_memory_kind::PINNED: { + copy_pinned(dst, src, size, stream); + break; + } case host_memory_kind::PAGEABLE: - default: copy_pageable(dst, src, size, stream); break; + default: { + copy_pageable(dst, src, size, stream); + break; + } } } From 3739c47789fff7891bc2b51ef677166ae75f64fa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:08:39 -0700 Subject: [PATCH 47/75] get_host_memory_resource --- cpp/include/cudf/utilities/host_memory.hpp | 8 +++ cpp/src/utilities/host_memory.cpp | 70 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/cpp/include/cudf/utilities/host_memory.hpp b/cpp/include/cudf/utilities/host_memory.hpp index 7a9e48f443c..1db747b12a3 100644 --- 
a/cpp/include/cudf/utilities/host_memory.hpp +++ b/cpp/include/cudf/utilities/host_memory.hpp @@ -87,4 +87,12 @@ void set_allocate_host_as_pinned_threshold(size_t threshold); */ size_t get_allocate_host_as_pinned_threshold(); +/** + * @brief Get the rmm resource to be used for host memory allocations. + * + * @param size The size of the allocation + * @return The rmm resource to be used for host memory allocations + */ +rmm::host_async_resource_ref get_host_memory_resource(size_t size); + } // namespace cudf diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 92bbff13c7f..8125c851b71 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -186,6 +186,70 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { + return ::operator new(size); + }); + } catch (std::bad_alloc const& e) { + RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pinned pool mr must be accessible from both host and device"); + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -239,4 +303,10 @@ void set_allocate_host_as_pinned_threshold(size_t threshold) size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } +rmm::host_async_resource_ref get_host_memory_resource(size_t size) +{ + if (size <= get_allocate_host_as_pinned_threshold()) { return get_pinned_memory_resource(); } + return get_pageable_memory_resource(); +} + } // namespace cudf From 49d65b86635e7487c8698c38e65177d646df7742 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:11:57 -0700 Subject: [PATCH 48/75] use if/else --- cpp/src/utilities/cuda_memcpy.cu | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu 
b/cpp/src/utilities/cuda_memcpy.cu index 5a32f73f236..3d0822d8545 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -52,16 +52,12 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { - switch (kind) { - case host_memory_kind::PINNED: { - copy_pinned(dst, src, size, stream); - break; - } - case host_memory_kind::PAGEABLE: - default: { - copy_pageable(dst, src, size, stream); - break; - } + if (kind == host_memory_kind::PINNED) { + copy_pinned(dst, src, size, stream); + } else if (kind == host_memory_kind::PAGEABLE) { + copy_pageable(dst, src, size, stream); + } else { + CUDF_FAIL("Unsupported host memory kind"); } } From db45aa77b9c82eb61d42990eedd7b1a6a21f9263 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:55:28 -0700 Subject: [PATCH 49/75] rename back :D --- cpp/benchmarks/fixture/nvbench_fixture.hpp | 2 +- cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp | 2 +- cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp | 2 +- cpp/include/cudf/detail/utilities/host_memory.hpp | 0 cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- .../cudf/utilities/{host_memory.hpp => pinned_memory.hpp} | 0 cpp/src/utilities/cuda_memcpy.cu | 2 +- cpp/src/utilities/host_memory.cpp | 2 +- cpp/tests/io/json_test.cpp | 2 +- cpp/tests/utilities_tests/pinned_memory_tests.cpp | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 11 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp rename cpp/include/cudf/utilities/{host_memory.hpp => pinned_memory.hpp} (100%) diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 699844afe62..df1492690bb 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index d3574985bc1..aa0ee39a179 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 71ce265e066..b4c8ed78ed8 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 41ec6ae7e16..20cb55bb1c7 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/utilities/host_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp similarity index 100% rename from cpp/include/cudf/utilities/host_memory.hpp rename to 
cpp/include/cudf/utilities/pinned_memory.hpp diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index fca6385ffbf..3d0822d8545 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 8125c851b71..b2f76e7dd78 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 1ac68b859e9..0ee139b4787 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index 5b81930b2c7..df9103640f4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 706e478842d..5842a980fc4 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include From 5a072cfdb12bbab499d4aebb8639d921b9ed798f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 27 Jun 2024 11:47:53 -0700 Subject: [PATCH 50/75] working make_host_vector --- .../cudf/detail/utilities/host_memory.hpp | 46 +++++++++++++++++++ .../cudf/detail/utilities/host_vector.hpp | 13 +++++- .../detail/utilities/vector_factories.hpp | 23 +++++++--- cpp/include/cudf/utilities/pinned_memory.hpp | 8 ---- cpp/src/utilities/host_memory.cpp | 20 ++++---- 5 files changed, 82 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index e69de29bb2d..b1a51ed660e 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource(); + +/** + * @brief Get the rmm resource to be used for host memory allocations. 
+ * + * @param size The size of the allocation + * @return The rmm resource to be used for host memory allocations + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), _stream}; + } + return {get_pageable_memory_resource(), _stream}; +} + +} // namespace cudf::detail \ No newline at end of file diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..e688d90a760 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -100,8 +100,14 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{ + cuda::has_property, + cuda::mr::device_accessible>} { } @@ -173,9 +179,12 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..306aa8e2f77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -373,9 +375,16 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = host_vector(v.size(), get_host_allocator(v.size(), stream)); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** @@ -394,8 +403,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -412,7 +421,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -435,8 +444,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 1db747b12a3..7a9e48f443c 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -87,12 +87,4 @@ void set_allocate_host_as_pinned_threshold(size_t threshold); */ size_t get_allocate_host_as_pinned_threshold(); -/** - * @brief Get the rmm resource to be used for host memory allocations. - * - * @param size The size of the allocation - * @return The rmm resource to be used for host memory allocations - */ -rmm::host_async_resource_ref get_host_memory_resource(size_t size); - } // namespace cudf diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index b2f76e7dd78..b816b9f4e2e 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -243,13 +243,6 @@ class new_delete_memory_resource { static_assert(cuda::mr::resource_with, "Pinned pool mr must be accessible from both host and device"); -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() -{ - static new_delete_memory_resource mr{}; - static rmm::host_async_resource_ref mr_ref{mr}; - return mr_ref; -} - } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -292,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = 20; return threshold; } @@ -303,10 +296,15 @@ void set_allocate_host_as_pinned_threshold(size_t threshold) size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } -rmm::host_async_resource_ref get_host_memory_resource(size_t size) +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() { - if (size <= get_allocate_host_as_pinned_threshold()) { return get_pinned_memory_resource(); } - return get_pageable_memory_resource(); + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; } +} // namespace detail + } // namespace cudf From dd93448238b329178e008a29fb70ecc2b0c40080 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 28 Jun 2024 11:23:38 -0700 Subject: [PATCH 51/75] auto --- 
cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 -- cpp/src/io/json/nested_json_gpu.cu | 6 ++---- cpp/src/lists/dremel.cu | 6 ++---- cpp/src/utilities/host_memory.cpp | 2 +- cpp/tests/io/json_tree.cpp | 6 ++---- cpp/tests/strings/integers_tests.cpp | 4 +--- 6 files changed, 8 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 306aa8e2f77..ea16ac0ef66 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -34,8 +34,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 031edfde4f6..405084cc4ad 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1698,10 +1698,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index b816b9f4e2e..d3bcf7a085d 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -285,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 20; + static std::atomic threshold = 16 * 1024; return threshold; } diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 7a72b77e1fb..8bcd5790e99 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu( { constexpr bool include_quote_char = true; // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(tokens_gpu, stream); - thrust::host_vector token_indices = - cudf::detail::make_host_vector_async(token_indices_gpu1, stream); + auto tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream); + auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 
51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); From 02e7bfb8fcc5a6138e80522402deb2bfabef312f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Jul 2024 11:31:06 -0700 Subject: [PATCH 52/75] derive host_vector --- .../cudf/detail/utilities/host_vector.hpp | 6 +++- .../detail/utilities/vector_factories.hpp | 31 ++++++++++++++++++- cpp/include/cudf/utilities/span.hpp | 6 ++++ cpp/src/io/parquet/predicate_pushdown.cpp | 20 +++++++----- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index e688d90a760..71c5bc842c9 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -191,6 +191,10 @@ class rmm_host_allocator { * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index ea16ac0ef66..f4a421138f1 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -181,6 +181,21 @@ rmm::device_uvector make_device_uvector_async( device_span{c}, stream, mr); } +template +rmm::device_uvector make_device_uvector_async(host_vector const& v, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + rmm::device_uvector ret(v.size(), stream, mr); + auto const is_pinned = v.get_allocator().is_device_accessible(); + cuda_memcpy_async(ret.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return ret; +} + /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` @@ -361,6 +376,20 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(0, get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` @@ -375,7 +404,7 @@ std::vector make_std_vector_sync(Container const template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = host_vector(v.size(), get_host_allocator(v.size(), stream)); + auto result = make_host_vector(v.size(), stream); auto const is_pinned = result.get_allocator().is_device_accessible(); cuda_memcpy_async(result.data(), v.data(), diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..34e39d01a6a 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -216,6 +218,10 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; +template +struct is_host_span_supported_container< // + cudf::detail::host_vector> : std::true_type {}; + template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { From ef4e1de5402c4db1f428cd0436f214c41d00c3b5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Jul 2024 11:31:32 -0700 Subject: [PATCH 53/75] use host_vector pt2 --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/src/io/orc/writer_impl.cu | 23 ++++---- cpp/src/io/parquet/reader_impl_chunking.cu | 61 ++++++++++++---------- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..04dbe0a9294 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = cudf::detail::make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e9e031a407a..409bf91997a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1335,7 +1335,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1676,39 +1676,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. - std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1723,7 +1723,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1814,8 +1814,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index d371ef5de93..5fba54ab309 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -804,10 +804,10 @@ std::vector compute_page_splits_by_row(device_span> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); + auto comp_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto comp_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any std::vector> copy_in; @@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); + comp_in.push_back( + {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); + comp_out.push_back( + {dst_base + offset, static_cast(page.uncompressed_page_size - offset)}); page.page_data = dst_base; decomp_offset += page.uncompressed_page_size; }); + } + auto d_comp_in = cudf::detail::make_device_uvector_async( + comp_in, stream, rmm::mr::get_current_device_resource()); + auto d_comp_out = cudf::detail::make_device_uvector_async( + comp_out, stream, rmm::mr::get_current_device_resource()); + + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + device_span const> d_comp_in_view{d_comp_in.data() + start_pos, + codec.num_pages}; + + device_span const> d_comp_out_view(d_comp_out.data() + start_pos, + codec.num_pages); - host_span const> 
comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + gpuinflate( + d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream); break; case SNAPPY: if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream); } break; case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span Date: Fri, 5 Jul 2024 11:48:59 -0700 Subject: [PATCH 54/75] include changes --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/null_mask.cuh | 4 +++- cpp/include/cudf/detail/utilities/host_vector.hpp | 3 +++ cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- cpp/include/cudf/io/text/detail/trie.hpp | 4 ++-- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 04dbe0a9294..d3e9fc4974d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - auto target_masks = cudf::detail::make_host_vector(target.size(), stream); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. 
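The replacement shown next follows the reserve-then-fill pattern enabled by the factories introduced earlier in this series. As a self-contained sketch under assumed names (fill_and_upload and num_items are illustrative, not from the patch):

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

rmm::device_uvector<cudf::size_type> fill_and_upload(cudf::size_type num_items,
                                                     rmm::cuda_stream_view stream)
{
  // Host memory is pinned or pageable depending on the allocate-host-as-pinned threshold.
  auto h_vals = cudf::detail::make_empty_host_vector<cudf::size_type>(num_items, stream);
  for (cudf::size_type i = 0; i < num_items; ++i) {
    h_vals.push_back(i);  // capacity was reserved up front, so no reallocation
  }
  // The host_vector overload checks is_device_accessible() to pick the copy path.
  return cudf::detail::make_device_uvector_async(
    h_vals, stream, rmm::mr::get_current_device_resource());
}
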
- auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 71c5bc842c9..b99e79b2e88 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -194,6 +194,9 @@ template class host_vector : public thrust::host_vector> { public: using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} }; diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index f4a421138f1..3f29d9d7a33 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -385,7 +385,7 @@ host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) template host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) { - auto result = host_vector(0, get_host_allocator(capacity, stream)); + auto result = host_vector(get_host_allocator(capacity, stream)); result.reserve(capacity); return result; } diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..28862d97ede 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -223,11 +223,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } From 58900ddd3b3f98d4d699b35b52ff5e7ba5f1a4f4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:47:38 -0700 Subject: [PATCH 55/75] orc --- cpp/src/io/orc/reader_impl_decode.cu | 10 ++++++++-- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 25 +++++++++++++++---------- cpp/src/io/orc/writer_impl.hpp | 9 ++++----- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 72eb41b1360..ab3c54584cb 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * 
col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b6fc4e3510f..8b06fd05cb0 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 409bf91997a..ba1a4eef99f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -443,14 +443,15 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -469,7 +470,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -478,7 +481,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -2297,7 +2302,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 
bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. */ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } From 395898a82abd2f5cf1eb7599037ef35cc884d934 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:51:55 -0700 Subject: [PATCH 56/75] copying --- cpp/src/copying/concatenate.cu | 6 +++--- cpp/src/copying/contiguous_split.cu | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 47e74a5cb48..b66e5cab333 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -73,8 +73,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +84,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1539,7 +1539,8 @@ std::unique_ptr chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); From be916f9c2e100cec2e25ec9d68e177f35c404402 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:55:52 -0700 Subject: [PATCH 57/75] few more --- cpp/src/datetime/timezone.cpp | 6 ++---- cpp/src/dictionary/detail/concatenate.cu | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..070b2f1a77e 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr make_timezone_transition_table(std::optional ttimes_typed; - 
ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), From 2225e3b0d28aa4492303ffb16c1e4e12f0b95724 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 14:04:32 -0700 Subject: [PATCH 58/75] partial IO --- cpp/src/io/avro/reader_impl.cu | 8 ++-- cpp/src/io/csv/reader_impl.cu | 67 ++++++++++++++++++++-------------- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/read_json.cu | 3 +- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..9a3d777593b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -506,7 +506,7 @@ void get_data_types_from_column_names(std::map const& us } void infer_column_types(parse_options const& parse_opts, - host_span column_flags, + cudf::detail::host_vector const& column_flags, device_span data, device_span row_offsets, int32_t num_records, @@ -566,17 +566,18 @@ void infer_column_types(parse_options const& parse_opts, } } -std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::vector decode_data( + parse_options const& parse_opts, + cudf::detail::host_vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + cudf::detail::host_vector const& column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + 
rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -592,8 +593,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +623,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + cudf::detail::host_vector& column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +656,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +701,10 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and duplicated strings @@ -842,8 +848,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector 
is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device. - std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 74001e5e01a..0f486457452 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), From 0446d345f636975045d333507633ff6658a0129c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 15:38:53 -0700 Subject: [PATCH 59/75] parquet --- cpp/src/io/parquet/reader_impl_chunking.cu | 17 ++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +++++----- cpp/src/io/parquet/writer_impl.cu | 9 ++++++--- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5fba54ab309..2c560049e45 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -810,10 +810,10 @@ std::vector compute_page_splits_by_row(device_span>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); + auto copy_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto copy_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); rmm::device_uvector comp_res(num_comp_pages, stream); thrust::fill(rmm::exec_policy_nosync(stream), @@ -835,8 +835,8 @@ std::vector compute_page_splits_by_row(device_span(offset)}); + copy_out.push_back({dst_base, static_cast(offset)}); } comp_in.push_back( {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); @@ -1134,9 +1134,8 @@ void include_decompression_scratch_size(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..18290432aca 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = 
cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1512,8 +1512,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index bed4dbc5a66..c26622db047 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1135,7 +1135,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector& f * @param stream CUDA stream used for device memory operations and kernel launches */ void calculate_page_fragments(device_span frag, - host_span frag_sizes, + cudf::detail::host_vector const& frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( @@ -1737,7 +1737,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1793,7 +1794,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); From 6a7ff7345336e75639fa7b5ea337f93b72a0d17b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 21:58:48 -0700 Subject: [PATCH 60/75] rest of it --- cpp/include/cudf/lists/detail/dremel.hpp | 8 ++++---- cpp/src/strings/combine/join.cu | 6 ++++-- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/translate.cu | 2 +- cpp/src/table/row_operators.cu | 5 ++++- 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..11f641a3fce 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -31,8 +31,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + 
uint8_t max_def_level; }; /** @@ -45,8 +45,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), 
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); From a0a6caac3782951788b96a6360eefa433d1e2015 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 8 Jul 2024 10:42:38 -0700 Subject: [PATCH 61/75] style --- cpp/include/cudf/detail/utilities/host_memory.hpp | 2 +- cpp/include/cudf/lists/detail/dremel.hpp | 2 +- cpp/tests/io/json_test.cpp | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index b1a51ed660e..f2500659d5f 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -43,4 +43,4 @@ rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view return {get_pageable_memory_resource(), _stream}; } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 11f641a3fce..53448424827 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0ee139b4787..9c76c344157 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -222,8 +222,7 @@ std::string to_records_orient(std::vector> co } template -struct JsonFixedPointReaderTest : public JsonReaderTest { -}; +struct JsonFixedPointReaderTest : public JsonReaderTest {}; template struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest { @@ -1140,7 +1139,7 @@ TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) "-33333333333333333333333", "-444444444444444444444444"}; std::vector greater_uint64_max = { - "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; + "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; std::vector less_int64_min = { "-9223372036854775807", "-9223372036854775808", "-9223372036854775809", "-9223372036854775810"}; std::vector mixed_range = { @@ -1370,10 +1369,10 @@ TEST_F(JsonReaderTest, JsonLongString) "", // null "", // null "கார்த்தி", - "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF - "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF - "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF - "𰾑𱔈𲍉", // 30000-3FFFF + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", R"( \\\\\\\\\\\\\\\\)", R"(\\\\\\\\\\\\\\\\)", From 7789e39a8308049b3882633cf1f52b0394071f15 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 9 Jul 2024 00:54:17 -0700 Subject: [PATCH 62/75] improve docs --- .../cudf/detail/utilities/host_memory.hpp | 14 ++-- .../detail/utilities/vector_factories.hpp | 69 +++++++++++++------ cpp/src/io/orc/writer_impl.cu | 2 + 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index f2500659d5f..fd82b584c7e 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -25,14 +25,18 @@ #include namespace cudf::detail { - -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource(); +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); /** - * @brief Get the rmm resource to be used for host memory allocations. + * @brief Get the memory resource to be used for the host memory allocation. 
* - * @param size The size of the allocation - * @return The rmm resource to be used for host memory allocations + * @param size The number of elements of type T to allocate + * @return The memory resource to be used for the host memory allocation */ template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 3f29d9d7a33..26712369b7d 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -180,7 +180,18 @@ rmm::device_uvector make_device_uvector_async( return make_device_uvector_async( device_span{c}, stream, mr); } - +/** + * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a + * `host_vector` + * + * @note This function does not synchronize `stream` after the copy. + * + * @tparam T The type of the data to copy + * @param v The host_vector of data to deep copy + * @param stream The stream on which to allocate memory and perform the copy + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing the copied data + */ template rmm::device_uvector make_device_uvector_async(host_vector const& v, rmm::cuda_stream_view stream, @@ -286,21 +297,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -310,14 +311,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -339,7 +343,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -376,12 +380,32 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `thrust::host_vector` of the given size. 
+ * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ template host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) { return host_vector(size, get_host_allocator(size, stream)); } +/** + * @brief Construct an empty `thrust::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ template host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) { @@ -394,7 +418,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -418,7 +443,8 @@ host_vector make_host_vector_async(device_span v, rmm::cuda_stream_v * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -440,7 +466,8 @@ host_vector make_host_vector_async(Container con * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -459,7 +486,7 @@ host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_vi * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -480,7 +507,7 @@ host_vector make_host_vector_sync(Container cons /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -496,7 +523,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ba1a4eef99f..1aed3b8da7c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -446,6 +446,8 @@ file_segmentation calculate_segmentation(host_span column stripe_size_limits max_stripe_size, rmm::cuda_stream_view stream) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. auto infos = cudf::detail::make_empty_host_vector(1, stream); size_type const num_rowgroups = rowgroup_bounds.size().first; size_type stripe_start = 0; From d55fb39098a7b55c21c1dfbc3e5ccb2598d85aeb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 10 Jul 2024 12:45:16 -0700 Subject: [PATCH 63/75] add missing overload --- .../detail/utilities/vector_factories.hpp | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 26712369b7d..493e7f788b8 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -180,6 +180,7 @@ rmm::device_uvector make_device_uvector_async( return make_device_uvector_async( device_span{c}, stream, mr); } + /** * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a * `host_vector` @@ -207,6 +208,28 @@ rmm::device_uvector make_device_uvector_async(host_vector const& v, return ret; } +/** + * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a + * `host_vector` + * + * @note This function synchronizes `stream` after the copy. + * + * @tparam T The type of the data to copy + * @param v The host_vector of data to deep copy + * @param stream The stream on which to allocate memory and perform the copy + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing the copied data + */ +template +rmm::device_uvector make_device_uvector_sync(host_vector const& v, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto ret = make_device_uvector_async(v, stream, mr); + stream.synchronize(); + return ret; +} + /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` From d8f0e58e795d5ace560a051b3caf84d0de88569d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 10 Jul 2024 17:56:51 -0700 Subject: [PATCH 64/75] typo fixes; clean up --- cpp/include/cudf/detail/utilities/host_vector.hpp | 8 +++++--- cpp/include/cudf/utilities/pinned_memory.hpp | 6 +++--- cpp/src/datetime/timezone.cpp | 2 +- cpp/src/utilities/host_memory.cpp | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index b99e79b2e88..f4e5f718da4 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -61,6 +61,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. 
* @@ -105,9 +109,7 @@ class rmm_host_allocator { rmm::cuda_stream_view _stream) : mr(_mr), stream(_stream), - _is_device_accessible{ - cuda::has_property, - cuda::mr::device_accessible>} + _is_device_accessible{contains_property} { } diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 7a9e48f443c..fa7e1b35327 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -74,9 +74,9 @@ size_t get_kernel_pinned_copy_threshold(); /** * @brief Set the threshold size for allocating host memory as pinned memory. * - * @param threshold The threshold size in bytes. If the size of the allocation is less than this - * threshold, the memory will be allocated as pinned memory. If the size is greater than or equal - * to this threshold, the memory will be allocated as pageable memory. + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. */ void set_allocate_host_as_pinned_threshold(size_t threshold); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 070b2f1a77e..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -490,7 +490,7 @@ std::unique_ptr
make_timezone_transition_table(std::optional(transition_times.size(), stream); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index d3bcf7a085d..53bcb00edc5 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -241,7 +241,7 @@ class new_delete_memory_resource { }; static_assert(cuda::mr::resource_with, - "Pinned pool mr must be accessible from both host and device"); + "Pageable pool mr must be accessible from the host"); } // namespace @@ -285,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 16 * 1024; + static std::atomic threshold = 0; return threshold; } From b94d26c11f1503b5c6950f162fbb790cc1e9a420 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 15 Jul 2024 16:12:34 +0000 Subject: [PATCH 65/75] fix return type --- cpp/src/utilities/host_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 53bcb00edc5..23793641dc5 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -298,7 +298,7 @@ size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_ namespace detail { -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() { static new_delete_memory_resource mr{}; static rmm::host_async_resource_ref mr_ref{mr}; From 0dfaee48439337789a6a4e6ebf370ffb1cdc8684 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 15 Jul 2024 16:37:30 +0000 Subject: [PATCH 66/75] remove noexcept on deallocates --- cpp/src/utilities/host_memory.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 23793641dc5..98b1edc1c4e 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -214,7 +214,7 @@ class new_delete_memory_resource { void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { rmm::detail::aligned_host_deallocate( ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); @@ -223,12 
+223,12 @@ class new_delete_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { deallocate(ptr, bytes, alignment); } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); } From 66da0018aa8cd0db80bf51517d96c1781bf379a7 Mon Sep 17 00:00:00 2001 From: vukasin Date: Wed, 17 Jul 2024 12:12:21 +0000 Subject: [PATCH 67/75] tests --- .../utilities_tests/pinned_memory_tests.cpp | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +} From bbf5f2968fdd6d20b5e5dcc461a1f7d56f7b401d Mon Sep 17 00:00:00 2001 From: vukasin Date: Wed, 17 Jul 2024 15:13:04 +0000 Subject: [PATCH 68/75] avoid copy_n --- cpp/src/utilities/cuda_memcpy.cu | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) 
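Rationale for this change: thrust::copy_n can dispatch to cudaMemcpyAsync internally, so the sub-threshold branch in copy_pinned did not reliably perform a kernel-based copy. Replacing it with a simple element-wise kernel guarantees that pinned transfers smaller than get_kernel_pinned_copy_threshold() are copied by a kernel rather than the copy engine. A minimal caller-side sketch of how the rest of the series consumes this API; the header path, namespace qualification of host_memory_kind, and the helper's name are illustrative assumptions, not part of this patch:

  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  // assumed location of cuda_memcpy_async / host_memory_kind in this series
  #include <cudf/detail/utilities/cuda_memcpy.hpp>
  #include <cudf/detail/utilities/host_vector.hpp>

  // Hypothetical helper: copy a detail::host_vector to device, picking the memory kind
  // from the allocator so pinned buffers take the copy_pinned path below.
  template <typename T>
  void copy_host_to_device(rmm::device_uvector<T>& dst,
                           cudf::detail::host_vector<T> const& src,
                           rmm::cuda_stream_view stream)
  {
    // Pinned source: copy_pinned uses the kernel below under the threshold,
    // and falls back to cudaMemcpyAsync above it.
    auto const kind = src.get_allocator().is_device_accessible()
                        ? cudf::detail::host_memory_kind::PINNED
                        : cudf::detail::host_memory_kind::PAGEABLE;
    cudf::detail::cuda_memcpy_async(dst.data(), src.data(), src.size() * sizeof(T), kind, stream);
  }
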
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..ccfc7542c80 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "cudf/detail/utilities/integer_utils.hpp" + #include #include #include @@ -26,15 +28,21 @@ namespace cudf::detail { namespace { +__global__ void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } From 6e39c35a6b24ef780c5930eb84c9f26dd53f4a60 Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 18:38:13 +0000 Subject: [PATCH 69/75] add is_device_accessible to span --- .../detail/utilities/vector_factories.hpp | 60 ++----------------- cpp/include/cudf/utilities/span.hpp | 34 +++++++++-- cpp/src/io/csv/reader_impl.cu | 4 +- 3 files changed, 38 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 493e7f788b8..9e3b0fb0152 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -100,11 +100,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -181,55 +182,6 @@ rmm::device_uvector make_device_uvector_async( device_span{c}, stream, mr); } -/** - * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a - * `host_vector` - * - * @note This function does not synchronize `stream` after the copy. - * - * @tparam T The type of the data to copy - * @param v The host_vector of data to deep copy - * @param stream The stream on which to allocate memory and perform the copy - * @param mr The memory resource to use for allocating the returned device_uvector - * @return A device_uvector containing the copied data - */ -template -rmm::device_uvector make_device_uvector_async(host_vector const& v, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - rmm::device_uvector ret(v.size(), stream, mr); - auto const is_pinned = v.get_allocator().is_device_accessible(); - cuda_memcpy_async(ret.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); - return ret; -} - -/** - * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a - * `host_vector` - * - * @note This function synchronizes `stream` after the copy. - * - * @tparam T The type of the data to copy - * @param v The host_vector of data to deep copy - * @param stream The stream on which to allocate memory and perform the copy - * @param mr The memory resource to use for allocating the returned device_uvector - * @return A device_uvector containing the copied data - */ -template -rmm::device_uvector make_device_uvector_sync(host_vector const& v, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto ret = make_device_uvector_async(v, stream, mr); - stream.synchronize(); - return ret; -} - /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 34e39d01a6a..c5054c733a7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -218,10 +218,6 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; -template -struct is_host_span_supported_container< // - cudf::detail::host_vector> : std::true_type {}; - template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; @@ -263,6 +259,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template const& us } void infer_column_types(parse_options const& parse_opts, - cudf::detail::host_vector const& column_flags, + host_span column_flags, device_span data, device_span row_offsets, int32_t num_records, @@ -630,7 +630,7 @@ cudf::detail::host_vector determine_column_types( device_span data, device_span row_offsets, int32_t num_records, - cudf::detail::host_vector& column_flags, + host_span column_flags, cudf::size_type num_active_columns, rmm::cuda_stream_view stream) { From c262c30231a5165ff83ff57d4ea3ac826b571a74 Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 19:04:41 +0000 Subject: [PATCH 70/75] pass host_span --- cpp/src/io/csv/reader_impl.cu | 23 +++++++++++------------ cpp/src/io/parquet/writer_impl.cu | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 9a4ecfdb0ab..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -566,18 +566,17 @@ void infer_column_types(parse_options const& parse_opts, } } -std::vector decode_data( - parse_options const& parse_opts, - cudf::detail::host_vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - cudf::detail::host_vector const& column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - 
rmm::device_async_resource_ref mr) +std::vector decode_data(parse_options const& parse_opts, + host_span column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 39dcd5debab..2df71b77301 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1135,7 +1135,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector& f * @param stream CUDA stream used for device memory operations and kernel launches */ void calculate_page_fragments(device_span frag, - cudf::detail::host_vector const& frag_sizes, + host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( From 6cd16b5013f9501b9dcae119af4244e69ffe7a0d Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 19:35:19 +0000 Subject: [PATCH 71/75] address review --- cpp/src/utilities/host_memory.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 98b1edc1c4e..7c3cea42023 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -192,11 +192,9 @@ class new_delete_memory_resource { { try { return rmm::detail::aligned_host_allocate( - bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { - return ::operator new(size); - }); + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); } catch (std::bad_alloc const& e) { - RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); } } @@ -217,13 +215,13 @@ class new_delete_memory_resource { std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { rmm::detail::aligned_host_deallocate( - ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); } void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) + [[maybe_unused]] cuda::stream_ref stream) { deallocate(ptr, bytes, alignment); } From 044836a7bacb7cd3960f5686a78076a2d345da71 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 13:47:03 +0000 Subject: [PATCH 72/75] reviews --- cpp/src/utilities/cuda_memcpy.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ccfc7542c80..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,6 +16,7 @@ #include "cudf/detail/utilities/integer_utils.hpp" +#include #include #include #include @@ -28,9 +29,10 @@ namespace cudf::detail { namespace { -__global__ void copy_kernel(char const* src, char* dst, size_t n) +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } } @@ -41,6 +43,8 @@ void copy_pinned(void* dst, void 
const* src, std::size_t size, rmm::cuda_stream_ if (size < get_kernel_pinned_copy_threshold()) { const int block_size = 256; auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel copy_kernel<<>>( static_cast(src), static_cast(dst), size); } else { From 32c7b725cc8e237734fa5e4c7b11360be0fadd6e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 22 Jul 2024 06:54:01 -0700 Subject: [PATCH 73/75] review suggestion Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/detail/utilities/host_memory.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index fd82b584c7e..9f9a89c91fe 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -41,10 +41,9 @@ CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) { - if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { - return {get_pinned_memory_resource(), _stream}; - } - return {get_pageable_memory_resource(), _stream}; + return { size * sizeof(T) <= get_allocate_host_as_pinned_threshold() ? + get_pinned_memory_resource() : get_pageable_memory_resource(), + _stream}; } } // namespace cudf::detail From cecb289c89d380131c582fbf989d638463043b82 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 14:09:01 +0000 Subject: [PATCH 74/75] fix docs --- cpp/include/cudf/detail/utilities/host_memory.hpp | 14 ++++++++------ .../cudf/detail/utilities/vector_factories.hpp | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index 9f9a89c91fe..3975e694559 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -33,17 +33,19 @@ namespace cudf::detail { CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); /** - * @brief Get the memory resource to be used for the host memory allocation. + * @brief Get the allocator to be used for the host memory allocation. * * @param size The number of elements of type T to allocate - * @return The memory resource to be used for the host memory allocation + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation */ template -rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) { - return { size * sizeof(T) <= get_allocate_host_as_pinned_threshold() ? - get_pinned_memory_resource() : get_pageable_memory_resource(), - _stream}; + return {size * sizeof(T) <= get_allocate_host_as_pinned_threshold() + ? 
get_pinned_memory_resource() + : get_pageable_memory_resource(), + stream}; } } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 9e3b0fb0152..45dc839c9bd 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -356,7 +356,7 @@ std::vector make_std_vector_sync(Container const } /** - * @brief Construct a `thrust::host_vector` of the given size. + * @brief Construct a `cudf::detail::host_vector` of the given size. * * @note The returned vector may be using a pinned memory resource. * @@ -372,7 +372,7 @@ host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) } /** - * @brief Construct an empty `thrust::host_vector` with the given capacity. + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. * * @note The returned vector may be using a pinned memory resource. * From 5d15a4d1345a88612ee04464b5493a035dd3f915 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 19:07:01 +0000 Subject: [PATCH 75/75] revert to fix get_host_allocator --- cpp/include/cudf/detail/utilities/host_memory.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index 3975e694559..c6775a950c9 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -42,10 +42,10 @@ CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) { - return {size * sizeof(T) <= get_allocate_host_as_pinned_threshold() - ? get_pinned_memory_resource() - : get_pageable_memory_resource(), - stream}; + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; } } // namespace cudf::detail
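
For context, the sketch below shows how the pieces touched in this series fit together; it is illustrative only and not part of any patch. `copy_to_device` is a made-up helper name, and the headers and namespace-qualified calls are assumed to match the files changed above (`vector_factories.hpp`, `span.hpp`, `host_memory.hpp`); everything under `cudf::detail` remains an internal API, so treat this as a sketch rather than a supported usage pattern.

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative sketch: allocate a host staging buffer and copy it to the device.
rmm::device_uvector<std::uint8_t> copy_to_device(std::size_t num_bytes)
{
  auto const stream = cudf::get_default_stream();

  // Backed by the pinned resource when num_bytes is at or below
  // get_allocate_host_as_pinned_threshold(), by pageable memory otherwise;
  // the vector's rmm_host_allocator records which resource was used.
  auto host_data = cudf::detail::make_host_vector<std::uint8_t>(num_bytes, stream);
  std::fill(host_data.begin(), host_data.end(), std::uint8_t{0});

  // The new host_span constructors propagate is_device_accessible(), so the
  // H2D copy below selects the PINNED or PAGEABLE path in cuda_memcpy_async
  // without the caller having to state which kind of host memory it holds.
  cudf::host_span<std::uint8_t const> const h_span{host_data};
  return cudf::detail::make_device_uvector_async(
    h_span, stream, rmm::mr::get_current_device_resource());
}

Passing `host_data` directly to `make_device_uvector_async` should also work through the generic container overload, since the new span constructors make `host_vector` convertible to `host_span`; that is what allows the dedicated `host_vector` overloads to be removed earlier in this series.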