From eb39019d55615160766af37ff6b36d51c23522a6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 16:24:30 -0700 Subject: [PATCH 01/75] remove pinned_host_vector --- cpp/benchmarks/io/text/multibyte_split.cpp | 8 +- .../detail/utilities/pinned_host_vector.hpp | 216 ------------------ .../cudf/detail/utilities/rmm_host_vector.hpp | 8 +- .../detail/utilities/vector_factories.hpp | 38 ++- cpp/src/io/csv/reader_impl.cu | 1 + cpp/src/io/orc/reader_impl_chunking.cu | 1 + cpp/src/io/orc/writer_impl.cu | 5 +- cpp/src/io/parquet/reader_impl_helpers.cpp | 2 + cpp/src/io/parquet/writer_impl.cu | 3 +- cpp/src/io/text/bgzip_data_chunk_source.cu | 16 +- .../io/text/data_chunk_source_factories.cpp | 23 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 12 files changed, 77 insertions(+), 248 deletions(-) delete mode 100644 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index b5d855d8881..172182c3607 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -22,7 +22,6 @@ #include #include -#include #include #include #include @@ -132,9 +131,10 @@ static void bench_multibyte_split(nvbench::state& state, auto const delim_factor = static_cast(delim_percent) / 100; std::unique_ptr datasource; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); - auto host_input = std::vector{}; - auto host_pinned_input = cudf::detail::pinned_host_vector{}; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = std::vector{}; + auto host_pinned_input = + cudf::detail::make_pinned_vector_async(0, cudf::get_default_stream()); if (source_type != data_chunk_source_type::device && source_type != data_chunk_source_type::host_pinned) { diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp deleted file mode 100644 index c22b6a6ba15..00000000000 --- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp +++ /dev/null @@ -1,216 +0,0 @@ -/* - * Copyright (c) 2008-2024, NVIDIA CORPORATION - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -#include -#include -#include // for bad_alloc - -namespace cudf::detail { - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. 
- * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template <> -class pinned_allocator { - public: - using value_type = void; ///< The type of the elements in the allocator - using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = void const*; ///< The type returned by address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; -}; - -/*! \p pinned_allocator is a CUDA-specific host memory allocator - * that employs \c cudaMallocHost for allocation. - * - * This implementation is ported from the experimental/pinned_allocator - * that Thrust used to provide. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class pinned_allocator { - public: - using value_type = T; ///< The type of the elements in the allocator - using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = T const*; ///< The type returned by address() - using reference = T&; ///< The parameter type for address() - using const_reference = T const&; ///< The parameter type for address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `pinned_allocator` to `pinned_allocator` - */ - template - struct rebind { - using other = pinned_allocator; ///< The rebound type - }; - - /** - * @brief pinned_allocator's null constructor does nothing. - */ - __host__ __device__ inline pinned_allocator() {} - - /** - * @brief pinned_allocator's null destructor does nothing. - */ - __host__ __device__ inline ~pinned_allocator() {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - */ - __host__ __device__ inline pinned_allocator(pinned_allocator const&) {} - - /** - * @brief pinned_allocator's copy constructor does nothing. - * - * This version of pinned_allocator's copy constructor - * is templated on the \c value_type of the pinned_allocator - * to copy from. It is provided merely for convenience; it - * does nothing. - */ - template - __host__ __device__ inline pinned_allocator(pinned_allocator const&) - { - } - - /** - * @brief This method returns the address of a \c reference of - * interest. - * - * @param r The \c reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline pointer address(reference r) { return &r; } - - /** - * @brief This method returns the address of a \c const_reference - * of interest. - * - * @param r The \c const_reference of interest. - * @return \c r's address. - */ - __host__ __device__ inline const_pointer address(const_reference r) { return &r; } - - /** - * @brief This method allocates storage for objects in pinned host - * memory. - * - * @param cnt The number of objects to allocate. - * @return a \c pointer to the newly allocated objects. - * @note The second parameter to this function is meant as a - * hint pointer to a nearby memory location, but is - * not used by this allocator. - * @note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. 
- */ - __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - - pointer result(0); - CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); - return result; - } - - /** - * @brief This method deallocates pinned host memory previously allocated - * with this \c pinned_allocator. - * - * @param p A \c pointer to the previously allocated memory. - * @note The second parameter is the number of objects previously allocated - * but is ignored by this allocator. - * @note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - __host__ inline void deallocate(pointer p, size_type /*cnt*/) - { - auto dealloc_worked = cudaFreeHost(p); - (void)dealloc_worked; - assert(dealloc_worked == cudaSuccess); - } - - /** - * @brief This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * @return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } - - /** - * @brief This method tests this \p pinned_allocator for equality to - * another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c true. - */ - __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; } - - /** - * @brief This method tests this \p pinned_allocator for inequality - * to another. - * - * @param x The other \p pinned_allocator of interest. - * @return This method always returns \c false. - */ - __host__ __device__ inline bool operator!=(pinned_allocator const& x) const - { - return !operator==(x); - } -}; - -/** - * @brief A vector class with pinned host memory allocator - */ -template -using pinned_host_vector = thrust::host_vector>; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp index 6901a19473e..6604020c224 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -32,8 +33,6 @@ namespace cudf::detail { /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c a `rmm::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template @@ -42,8 +41,6 @@ class rmm_host_allocator; /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c an `cudf::host_async_resource_ref` for allocation. * - * This implementation is ported from pinned_host_vector in cudf. - * * \see https://en.cppreference.com/w/cpp/memory/allocator */ template <> @@ -70,8 +67,7 @@ class rmm_host_allocator { * The \p rmm_host_allocator provides an interface for host memory allocation through the user * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of * this reference and therefore it is the user's responsibility to ensure its lifetime for the - * duration of the lifetime of the \p rmm_host_allocator. This implementation is ported from - * pinned_host_vector in cudf. + * duration of the lifetime of the \p rmm_host_allocator. 
* * \see https://en.cppreference.com/w/cpp/memory/allocator */ diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 293a4096c57..6f2287fc1c8 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -380,7 +382,7 @@ thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_ * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function does not synchronize `stream`. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -439,6 +441,40 @@ thrust::host_vector make_host_vector_sync( return make_host_vector_sync(device_span{c}, stream); } +/** + * @brief Asynchronously construct a `rmm_host_vector` of the given size + * + * @note This function does not synchronize `stream`. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A rmm_host_vector of the given size + */ +template +rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +{ + return rmm_host_vector(size, {cudf::io::get_host_memory_resource(), stream}); +} + +/** + * @brief Synchronously construct a `rmm_host_vector` of the given size + * + * @note This function synchronizes `stream`. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A rmm_host_vector of the given size + */ +template +rmm_host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +{ + auto result = make_pinned_vector_async(size, stream); + stream.synchronize(); + return result; +} + } // namespace detail } // namespace cudf diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 5dee0c17a33..05faded651d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -27,6 +27,7 @@ #include "io/utilities/parsing_utils.cuh" #include +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu index 5034aa14a95..43301826003 100644 --- a/cpp/src/io/orc/reader_impl_chunking.cu +++ b/cpp/src/io/orc/reader_impl_chunking.cu @@ -22,6 +22,7 @@ #include #include +#include #include #include diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 344e216cdc8..e9e031a407a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include @@ -2339,7 +2338,7 @@ auto convert_table_to_orc_data(table_view const& input, std::move(streams), std::move(stripes), std::move(stripe_dicts.views), - cudf::detail::pinned_host_vector()}; + cudf::detail::make_pinned_vector_async(0, stream)}; } // Allocate intermediate output stream buffer @@ -2407,7 +2406,7 @@ auto convert_table_to_orc_data(table_view const& input, return max_stream_size; }(); - cudf::detail::pinned_host_vector bounce_buffer(max_out_stream_size); + auto bounce_buffer = cudf::detail::make_pinned_vector_async(max_out_stream_size, stream); auto intermediate_stats = gather_statistic_blobs(stats_freq, 
orc_table, segmentation, stream); diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp index eb653c6b9ac..9de8a9e2719 100644 --- a/cpp/src/io/parquet/reader_impl_helpers.cpp +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -23,6 +23,8 @@ #include "ipc/Message_generated.h" #include "ipc/Schema_generated.h" +#include + #include #include diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 1dfced94f5b..6d466748c17 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include @@ -2278,7 +2277,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, } auto bounce_buffer = - cudf::detail::pinned_host_vector(all_device_write ? 0 : max_write_size); + cudf::detail::make_pinned_vector_async(all_device_write ? 0 : max_write_size, stream); return std::tuple{std::move(agg_meta), std::move(pages), diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index faa09e586ab..190015686df 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -14,13 +14,14 @@ * limitations under the License. */ +#include "cudf/detail/utilities/vector_factories.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" #include "io/utilities/config_utils.hpp" #include #include -#include +#include #include #include #include @@ -66,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::pinned_host_vector const& host, + static void copy_to_device(cudf::detail::rmm_host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -84,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::pinned_host_vector h_compressed_blocks; - cudf::detail::pinned_host_vector h_compressed_offsets; - cudf::detail::pinned_host_vector h_decompressed_offsets; + cudf::detail::rmm_host_vector h_compressed_blocks; + cudf::detail::rmm_host_vector h_compressed_offsets; + cudf::detail::rmm_host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; @@ -103,7 +104,10 @@ class bgzip_data_chunk_reader : public data_chunk_reader { bool is_decompressed{}; decompression_blocks(rmm::cuda_stream_view init_stream) - : d_compressed_blocks(0, init_stream), + : h_compressed_blocks{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_compressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + h_decompressed_offsets{cudf::detail::make_pinned_vector_async(0, init_stream)}, + d_compressed_blocks(0, init_stream), d_decompressed_blocks(0, init_stream), d_compressed_offsets(0, init_stream), d_decompressed_offsets(0, init_stream), diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 9d1d0498ace..8278b2c25cb 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,10 +14,11 @@ * limitations under the License. 
*/ +#include "cudf/detail/utilities/vector_factories.hpp" #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -32,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - cudf::detail::pinned_host_vector buffer; + std::unique_ptr> buffer; }; /** @@ -84,13 +85,16 @@ class datasource_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { + h_ticket.buffer = std::make_unique>( + cudf::detail::make_pinned_vector_sync(read_size, stream)); + } - _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer->data())); // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -148,10 +152,13 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { + h_ticket.buffer = std::make_unique>( + cudf::detail::make_pinned_vector_sync(read_size, stream)); + } // read data from the host istream in to the pinned host memory buffer - _datastream->read(h_ticket.buffer.data(), read_size); + _datastream->read(h_ticket.buffer->data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -161,7 +168,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. 
CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 0883ac3609f..492084bd5bc 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -16,10 +16,10 @@ #pragma once -#include "config_utils.hpp" #include "hostdevice_span.hpp" #include +#include #include #include #include @@ -53,7 +53,7 @@ class hostdevice_vector { } explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream) - : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(max_size, stream) + : h_data{make_pinned_vector_async(0, stream)}, d_data(max_size, stream) { CUDF_EXPECTS(initial_size <= max_size, "initial_size cannot be larger than max_size"); From 24b12451ffea31766d01c9cbe8d4e10bbe3734be Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 16:54:36 -0700 Subject: [PATCH 02/75] switch to host_device resource ref --- cpp/benchmarks/fixture/nvbench_fixture.hpp | 5 +++-- cpp/include/cudf/io/memory_resource.hpp | 7 +++---- cpp/src/io/utilities/config_utils.cpp | 21 +++++++++++---------- cpp/src/io/utilities/hostdevice_vector.hpp | 1 - 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index ebcbcb17e98..3a5a9bfd2fa 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -81,14 +81,15 @@ struct nvbench_base_fixture { "\nExpecting: cuda, pool, async, arena, managed, or managed_pool"); } - inline rmm::host_async_resource_ref make_cuio_host_pinned() + inline rmm::host_device_async_resource_ref make_cuio_host_pinned() { static std::shared_ptr mr = std::make_shared(); return *mr; } - inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode) + inline rmm::host_device_async_resource_ref create_cuio_host_memory_resource( + std::string const& mode) { if (mode == "pinned") return make_cuio_host_pinned(); if (mode == "pinned_pool") return cudf::io::get_host_memory_resource(); diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp index a36e220ae7b..2af5755f824 100644 --- a/cpp/include/cudf/io/memory_resource.hpp +++ b/cpp/include/cudf/io/memory_resource.hpp @@ -33,7 +33,8 @@ namespace cudf::io { * @param mr The rmm resource to be used for host-side allocations * @return The previous resource that was in use */ -rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr); +rmm::host_device_async_resource_ref set_host_memory_resource( + rmm::host_device_async_resource_ref mr); /** * @brief Get the rmm resource being used for host memory allocations by @@ -41,7 +42,7 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r * * @return The rmm resource used for host-side allocations */ -rmm::host_async_resource_ref get_host_memory_resource(); +rmm::host_device_async_resource_ref get_host_memory_resource(); /** * @brief Options to configure the default host memory resource @@ -54,8 +55,6 @@ struct host_mr_options { /** * @brief Configure the size of the default host memory resource. 
* - * @throws cudf::logic_error if called after the default host memory resource has been created - * * @param opts Options to configure the default host memory resource * @return True if this call successfully configured the host memory resource, false if a * a resource was already configured. diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index dad1135e766..25649d17c76 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -209,7 +209,8 @@ static_assert(cuda::mr::resource_with config_size) +CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( + std::optional config_size) { static fixed_pinned_pool_memory_resource mr = [config_size]() { auto const size = [&config_size]() -> size_t { @@ -233,7 +234,7 @@ CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional const& opts, - bool* did_configure = nullptr) +CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( + std::optional const& opts, bool* did_configure = nullptr) { - static rmm::host_async_resource_ref* mr_ref = nullptr; - bool configured = false; + static rmm::host_device_async_resource_ref* mr_ref = nullptr; + bool configured = false; if (mr_ref == nullptr) { configured = true; mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); @@ -262,13 +263,13 @@ CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional #include -#include #include #include #include From 6c896f6ebf2d6177f62903dd719cebf88da08565 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 18:07:00 -0700 Subject: [PATCH 03/75] rebrand host memory resource --- cpp/CMakeLists.txt | 1 + cpp/benchmarks/fixture/nvbench_fixture.hpp | 8 +- .../io/parquet/parquet_reader_multithread.cpp | 2 +- .../detail/utilities/vector_factories.hpp | 4 +- cpp/include/cudf/io/memory_resource.hpp | 64 ----- cpp/include/cudf/utilities/pinned_memory.hpp | 58 +++++ cpp/src/io/utilities/config_utils.cpp | 215 +--------------- cpp/src/utilities/pinned_memory.cpp | 230 ++++++++++++++++++ cpp/tests/io/json_test.cpp | 6 +- .../utilities_tests/io_utilities_tests.cpp | 8 +- 10 files changed, 305 insertions(+), 291 deletions(-) delete mode 100644 cpp/include/cudf/io/memory_resource.hpp create mode 100644 cpp/include/cudf/utilities/pinned_memory.hpp create mode 100644 cpp/src/utilities/pinned_memory.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index f637db66c2c..8a0f56aec53 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -665,6 +665,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp + src/utilities/pinned_memory.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 3a5a9bfd2fa..df1492690bb 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -15,8 +15,8 @@ */ #pragma once -#include #include +#include #include #include @@ -92,7 +92,7 @@ struct nvbench_base_fixture { std::string const& mode) { if (mode == "pinned") return make_cuio_host_pinned(); - if (mode == "pinned_pool") return cudf::io::get_host_memory_resource(); + if (mode == "pinned_pool") return cudf::get_pinned_memory_resource(); CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool"); } @@ -113,14 +113,14 @@ struct nvbench_base_fixture { 
rmm::mr::set_current_device_resource(mr.get()); std::cout << "RMM memory resource = " << rmm_mode << "\n"; - cudf::io::set_host_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); + cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode)); std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n"; } ~nvbench_base_fixture() { // Ensure the the pool is freed before the CUDA context is destroyed: - cudf::io::set_host_memory_resource(this->make_cuio_host_pinned()); + cudf::set_pinned_memory_resource(this->make_cuio_host_pinned()); } std::shared_ptr mr; diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index bd80c4e0e88..f340b4aeb7a 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -20,9 +20,9 @@ #include #include -#include #include #include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 6f2287fc1c8..7343ee25c8f 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -22,9 +22,9 @@ */ #include -#include #include #include +#include #include #include @@ -454,7 +454,7 @@ thrust::host_vector make_host_vector_sync( template rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return rmm_host_vector(size, {cudf::io::get_host_memory_resource(), stream}); + return rmm_host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp deleted file mode 100644 index 2af5755f824..00000000000 --- a/cpp/include/cudf/io/memory_resource.hpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include - -#include - -namespace cudf::io { - -/** - * @brief Set the rmm resource to be used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * hostdevice_vector is a utility class that uses a pair of host and device-side buffers for - * bouncing state between the cpu and the gpu. The resource set with this function (typically a - * pinned memory allocator) is what it uses to allocate space for it's host-side buffer. 
- * - * @param mr The rmm resource to be used for host-side allocations - * @return The previous resource that was in use - */ -rmm::host_device_async_resource_ref set_host_memory_resource( - rmm::host_device_async_resource_ref mr); - -/** - * @brief Get the rmm resource being used for host memory allocations by - * cudf::detail::hostdevice_vector - * - * @return The rmm resource used for host-side allocations - */ -rmm::host_device_async_resource_ref get_host_memory_resource(); - -/** - * @brief Options to configure the default host memory resource - */ -struct host_mr_options { - std::optional pool_size; ///< The size of the pool to use for the default host memory - ///< resource. If not set, the default pool size is used. -}; - -/** - * @brief Configure the size of the default host memory resource. - * - * @param opts Options to configure the default host memory resource - * @return True if this call successfully configured the host memory resource, false if a - * a resource was already configured. - */ -bool config_default_host_memory_resource(host_mr_options const& opts); - -} // namespace cudf::io diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp new file mode 100644 index 00000000000..b423eab6d38 --- /dev/null +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include + +namespace cudf { + +/** + * @brief Set the rmm resource to be used for pinned memory allocations. + * + * @param mr The rmm resource to be used for pinned allocations + * @return The previous resource that was in use + */ +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr); + +/** + * @brief Get the rmm resource being used for pinned memory allocations. + * + * @return The rmm resource used for pinned allocations + */ +rmm::host_device_async_resource_ref get_pinned_memory_resource(); + +/** + * @brief Options to configure the default pinned memory resource + */ +struct pinned_mr_options { + std::optional pool_size; ///< The size of the pool to use for the default pinned memory + ///< resource. If not set, the default pool size is used. +}; + +/** + * @brief Configure the size of the default pinned memory resource. + * + * @param opts Options to configure the default pinned memory resource + * @return True if this call successfully configured the pinned memory resource, false if a + * a resource was already configured. 
+ */ +bool config_default_pinned_memory_resource(pinned_mr_options const& opts); + +} // namespace cudf diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp index 25649d17c76..20ac89b4d53 100644 --- a/cpp/src/io/utilities/config_utils.cpp +++ b/cpp/src/io/utilities/config_utils.cpp @@ -16,22 +16,12 @@ #include "config_utils.hpp" -#include -#include #include -#include - -#include -#include -#include -#include #include #include -namespace cudf::io { - -namespace detail { +namespace cudf::io::detail { namespace cufile_integration { @@ -90,205 +80,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_ } // namespace nvcomp_integration -} // namespace detail - -namespace { -class fixed_pinned_pool_memory_resource { - using upstream_mr = rmm::mr::pinned_host_memory_resource; - using host_pooled_mr = rmm::mr::pool_memory_resource; - - private: - upstream_mr upstream_mr_{}; - size_t pool_size_{0}; - // Raw pointer to avoid a segfault when the pool is destroyed on exit - host_pooled_mr* pool_{nullptr}; - void* pool_begin_{nullptr}; - void* pool_end_{nullptr}; - cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; - - public: - fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} - { - if (pool_size_ == 0) { return; } - - // Allocate full size from the pinned pool to figure out the beginning and end address - pool_begin_ = pool_->allocate_async(pool_size_, stream_); - pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); - pool_->deallocate_async(pool_begin_, pool_size_, stream_); - } - - void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - if (bytes <= pool_size_) { - try { - return pool_->allocate_async(bytes, alignment, stream); - } catch (...) 
{ - // If the pool is exhausted, fall back to the upstream memory resource - } - } - - return upstream_mr_.allocate_async(bytes, alignment, stream); - } - - void do_deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { - pool_->deallocate_async(ptr, bytes, alignment, stream); - } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); - } - } - - void* allocate_async(std::size_t bytes, cuda::stream_ref stream) - { - return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - return do_allocate_async(bytes, alignment, stream); - } - - void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) - { - auto const result = do_allocate_async(bytes, alignment, stream_); - stream_.wait(); - return result; - } - - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, alignment, stream); - } - - void deallocate(void* ptr, - std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept - { - deallocate_async(ptr, bytes, alignment, stream_); - stream_.wait(); - } - - bool operator==(fixed_pinned_pool_memory_resource const& other) const - { - return pool_ == other.pool_ and stream_ == other.stream_; - } - - bool operator!=(fixed_pinned_pool_memory_resource const& other) const - { - return !operator==(other); - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::device_accessible) noexcept - { - } - - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::host_accessible) noexcept - { - } -}; - -static_assert(cuda::mr::resource_with, - ""); - -} // namespace - -CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( - std::optional config_size) -{ - static fixed_pinned_pool_memory_resource mr = [config_size]() { - auto const size = [&config_size]() -> size_t { - if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { - return std::atol(env_val); - } - - if (config_size.has_value()) { return *config_size; } - - size_t free{}, total{}; - CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); - // 0.5% of the total device memory, capped at 100MB - return std::min(total / 200, size_t{100} * 1024 * 1024); - }(); - - // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = (size + 255) & ~255; - CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); - - // make the pool with max size equal to the initial size - return fixed_pinned_pool_memory_resource{aligned_size}; - }(); - - static rmm::host_device_async_resource_ref mr_ref{mr}; - return mr_ref; -} - -CUDF_EXPORT std::mutex& host_mr_mutex() -{ - static std::mutex map_lock; - return map_lock; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( - std::optional const& opts, bool* did_configure = nullptr) -{ - static rmm::host_device_async_resource_ref* mr_ref = nullptr; - bool configured = false; - if (mr_ref == nullptr) { - configured = true; - 
mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); - } - - // If the user passed an out param to detect whether this call configured a resource - // set the result - if (did_configure != nullptr) { *did_configure = configured; } - - return *mr_ref; -} - -// Must be called with the host_mr_mutex mutex held -CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() -{ - static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt); - return mr_ref; -} - -rmm::host_device_async_resource_ref set_host_memory_resource(rmm::host_device_async_resource_ref mr) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto last_mr = host_mr(); - host_mr() = mr; - return last_mr; -} - -rmm::host_device_async_resource_ref get_host_memory_resource() -{ - std::scoped_lock lock{host_mr_mutex()}; - return host_mr(); -} - -bool config_default_host_memory_resource(host_mr_options const& opts) -{ - std::scoped_lock lock{host_mr_mutex()}; - auto did_configure = false; - make_host_mr(opts, &did_configure); - return did_configure; -} - -} // namespace cudf::io +} // namespace cudf::io::detail diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp new file mode 100644 index 00000000000..23d673a1382 --- /dev/null +++ b/cpp/src/utilities/pinned_memory.cpp @@ -0,0 +1,230 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf { + +namespace { +class fixed_pinned_pool_memory_resource { + using upstream_mr = rmm::mr::pinned_host_memory_resource; + using host_pooled_mr = rmm::mr::pool_memory_resource; + + private: + upstream_mr upstream_mr_{}; + size_t pool_size_{0}; + // Raw pointer to avoid a segfault when the pool is destroyed on exit + host_pooled_mr* pool_{nullptr}; + void* pool_begin_{nullptr}; + void* pool_end_{nullptr}; + cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + + public: + fixed_pinned_pool_memory_resource(size_t size) + : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + { + if (pool_size_ == 0) { return; } + + // Allocate full size from the pinned pool to figure out the beginning and end address + pool_begin_ = pool_->allocate_async(pool_size_, stream_); + pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); + pool_->deallocate_async(pool_begin_, pool_size_, stream_); + } + + void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + { + if (bytes <= pool_size_) { + try { + return pool_->allocate_async(bytes, alignment, stream); + } catch (...) 
{ + // If the pool is exhausted, fall back to the upstream memory resource + } + } + + return upstream_mr_.allocate_async(bytes, alignment, stream); + } + + void do_deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + pool_->deallocate_async(ptr, bytes, alignment, stream); + } else { + upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + } + } + + void* allocate_async(std::size_t bytes, cuda::stream_ref stream) + { + return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + { + return do_allocate_async(bytes, alignment, stream); + } + + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + auto const result = do_allocate_async(bytes, alignment, stream_); + stream_.wait(); + return result; + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + return do_deallocate_async(ptr, bytes, alignment, stream); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + { + deallocate_async(ptr, bytes, alignment, stream_); + stream_.wait(); + } + + bool operator==(fixed_pinned_pool_memory_resource const& other) const + { + return pool_ == other.pool_ and stream_ == other.stream_; + } + + bool operator!=(fixed_pinned_pool_memory_resource const& other) const + { + return !operator==(other); + } + + [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::device_accessible) noexcept + { + } + + [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::host_accessible) noexcept + { + } +}; + +static_assert(cuda::mr::resource_with, + ""); + +} // namespace + +CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( + std::optional config_size) +{ + static fixed_pinned_pool_memory_resource mr = [config_size]() { + auto const size = [&config_size]() -> size_t { + if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) { + return std::atol(env_val); + } + + if (config_size.has_value()) { return *config_size; } + + size_t free{}, total{}; + CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); + // 0.5% of the total device memory, capped at 100MB + return std::min(total / 200, size_t{100} * 1024 * 1024); + }(); + + // rmm requires the pool size to be a multiple of 256 bytes + auto const aligned_size = (size + 255) & ~255; + CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); + + // make the pool with max size equal to the initial size + return fixed_pinned_pool_memory_resource{aligned_size}; + }(); + + static rmm::host_device_async_resource_ref mr_ref{mr}; + return mr_ref; +} + +CUDF_EXPORT std::mutex& host_mr_mutex() +{ + static std::mutex map_lock; + return map_lock; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr( + std::optional const& opts, bool* did_configure = nullptr) +{ + static rmm::host_device_async_resource_ref* mr_ref = nullptr; + bool configured = false; + if (mr_ref == nullptr) { + configured = true; + 
mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt); + } + + // If the user passed an out param to detect whether this call configured a resource + // set the result + if (did_configure != nullptr) { *did_configure = configured; } + + return *mr_ref; +} + +// Must be called with the host_mr_mutex mutex held +CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() +{ + static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt); + return mr_ref; +} + +rmm::host_device_async_resource_ref set_pinned_memory_resource( + rmm::host_device_async_resource_ref mr) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto last_mr = host_mr(); + host_mr() = mr; + return last_mr; +} + +rmm::host_device_async_resource_ref get_pinned_memory_resource() +{ + std::scoped_lock lock{host_mr_mutex()}; + return host_mr(); +} + +bool config_default_pinned_memory_resource(pinned_mr_options const& opts) +{ + std::scoped_lock lock{host_mr_mutex()}; + auto did_configure = false; + make_host_mr(opts, &did_configure); + return did_configure; +} + +} // namespace cudf diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 9d766e80094..4b3793b22f6 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -28,13 +28,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include @@ -2068,7 +2068,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) size_t{128} * 1024 * 1024}; // Set new resource - auto last_mr = cudf::io::set_host_memory_resource(mr); + auto last_mr = cudf::set_pinned_memory_resource(mr); /** * @brief Spark has the specific need to ignore extra characters that come after the first record @@ -2158,7 +2158,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()}); // Restore original memory source - cudf::io::set_host_memory_resource(last_mr); + cudf::set_pinned_memory_resource(last_mr); } TEST_F(JsonReaderTest, MixedTypes) diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp index e5a153bf781..c00f1ab8863 100644 --- a/cpp/tests/utilities_tests/io_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -44,8 +44,8 @@ TEST(IoUtilitiesTest, HostMemoryGetAndSet) size_t{128} * 1024 * 1024); // set new resource - auto last_mr = cudf::io::get_host_memory_resource(); - cudf::io::set_host_memory_resource(mr); + auto last_mr = cudf::get_pinned_memory_resource(); + cudf::set_pinned_memory_resource(mr); constexpr int num_rows = 32 * 1024; auto valids = @@ -66,7 +66,7 @@ TEST(IoUtilitiesTest, HostMemoryGetAndSet) CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); // reset memory resource back - cudf::io::set_host_memory_resource(last_mr); + cudf::set_pinned_memory_resource(last_mr); } TEST(IoUtilitiesTest, Base64EncodeAndDecode) From 0048c5951171d931b676dee3ab40ca312a7eb560 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 30 May 2024 19:09:32 -0700 Subject: [PATCH 04/75] style --- cpp/benchmarks/io/text/multibyte_split.cpp | 2 +- cpp/src/io/text/data_chunk_source_factories.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 172182c3607..67705863d41 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ 
b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 8278b2c25cb..46149db929f 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 1964523554cd86763be1a0a6b1580f7d12016270 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 10:39:55 -0700 Subject: [PATCH 05/75] java update because breaking --- java/src/main/native/src/RmmJni.cpp | 32 +++++++++++++++-------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index fa78f6ca4e2..e1cb7845b77 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include @@ -395,15 +395,17 @@ class java_debug_event_handler_memory_resource final : public java_event_handler } }; -inline auto& prior_cuio_host_mr() +inline auto& prior_cudf_pinned_mr() { - static rmm::host_async_resource_ref _prior_cuio_host_mr = cudf::io::get_host_memory_resource(); - return _prior_cuio_host_mr; + static rmm::host_device_async_resource_ref _prior_cudf_pinned_mr = + cudf::get_pinned_memory_resource(); + return _prior_cudf_pinned_mr; } /** * This is a pinned fallback memory resource that will try to allocate `pool` - * and if that fails, attempt to allocate from the prior resource used by cuIO `prior_cuio_host_mr`. + * and if that fails, attempt to allocate from the prior resource used by cuDF + * `prior_cudf_pinned_mr`. * * We detect whether a pointer to free is inside of the pool by checking its address (see * constructor) @@ -433,7 +435,7 @@ class pinned_fallback_host_memory_resource { /** * @brief Allocates pinned host memory of size at least \p bytes bytes from either the - * _pool argument provided, or prior_cuio_host_mr. + * _pool argument provided, or prior_cudf_pinned_mr. * * @throws rmm::bad_alloc if the requested allocation could not be fulfilled due to any other * reason. @@ -450,7 +452,7 @@ class pinned_fallback_host_memory_resource { return _pool->allocate(bytes, alignment); } catch (const std::exception& unused) { // try to allocate using the underlying pinned resource - return prior_cuio_host_mr().allocate(bytes, alignment); + return prior_cudf_pinned_mr().allocate(bytes, alignment); } // we should not reached here return nullptr; @@ -459,7 +461,7 @@ class pinned_fallback_host_memory_resource { /** * @brief Deallocate memory pointed to by \p ptr of size \p bytes bytes. We attempt * to deallocate from _pool, if ptr is detected to be in the pool address range, - * otherwise we deallocate from `prior_cuio_host_mr`. + * otherwise we deallocate from `prior_cudf_pinned_mr`. * * @param ptr Pointer to be deallocated. * @param bytes Size of the allocation. 
@@ -472,7 +474,7 @@ class pinned_fallback_host_memory_resource { if (ptr >= pool_begin_ && ptr <= pool_end_) { _pool->deallocate(ptr, bytes, alignment); } else { - prior_cuio_host_mr().deallocate(ptr, bytes, alignment); + prior_cudf_pinned_mr().deallocate(ptr, bytes, alignment); } } @@ -1025,7 +1027,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIE CATCH_STD(env, 0) } -JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv* env, +JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCudfPinnedPoolMemoryResource(JNIEnv* env, jclass clazz, jlong pool_ptr) { @@ -1035,7 +1037,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J // create a pinned fallback pool that will allocate pinned memory // if the regular pinned pool is exhausted pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool)); - prior_cuio_host_mr() = cudf::io::set_host_memory_resource(*pinned_fallback_mr); + prior_cudf_pinned_mr() = cudf::set_pinned_memory_resource(*pinned_fallback_mr); } CATCH_STD(env, ) } @@ -1048,7 +1050,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J cudf::jni::auto_set_device(env); // set the cuio host memory resource to what it was before, or the same // if we didn't overwrite it with setCuioPinnedPoolMemoryResource - cudf::io::set_host_memory_resource(prior_cuio_host_mr()); + cudf::set_pinned_memory_resource(prior_cudf_pinned_mr()); pinned_fallback_mr.reset(); delete reinterpret_cast(pool_ptr); } @@ -1088,7 +1090,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIE jlong size) { cudf::jni::auto_set_device(env); - void* ret = cudf::io::get_host_memory_resource().allocate(size); + void* ret = cudf::get_pinned_memory_resource().allocate(size); return reinterpret_cast(ret); } @@ -1101,7 +1103,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv try { cudf::jni::auto_set_device(env); void* cptr = reinterpret_cast(ptr); - cudf::io::get_host_memory_resource().deallocate(cptr, size); + cudf::get_pinned_memory_resource().deallocate(cptr, size); } CATCH_STD(env, ) } @@ -1112,7 +1114,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoo { try { cudf::jni::auto_set_device(env); - return cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size}); + return cudf::config_default_pinned_memory_resource(cudf::pinned_mr_options{size}); } CATCH_STD(env, false) } From ac0ce9c0ef5c2b3c6c4dfe1e1b4ee5330100f999 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 12:04:05 -0700 Subject: [PATCH 06/75] java fix --- java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 2 +- java/src/main/java/ai/rapids/cudf/Rmm.java | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 83b801db7fb..5050834303d 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -224,7 +224,7 @@ private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryReso } this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize); if (setCuioHostMemoryResource) { - Rmm.setCuioPinnedPoolMemoryResource(this.poolHandle); + Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle); } this.poolSize = poolSize; } diff 
--git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java index 4dee1b7aa24..ed029c918e4 100755 --- a/java/src/main/java/ai/rapids/cudf/Rmm.java +++ b/java/src/main/java/ai/rapids/cudf/Rmm.java @@ -597,7 +597,7 @@ static native long newEventHandlerResourceAdaptor(long handle, long trackerHandl public static native long newPinnedPoolMemoryResource(long initSize, long maxSize); - public static native long setCuioPinnedPoolMemoryResource(long poolPtr); + public static native long setCudfPinnedPoolMemoryResource(long poolPtr); public static native void releasePinnedPoolMemoryResource(long poolPtr); diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index e1cb7845b77..8bd0f7793b4 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -1049,7 +1049,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J try { cudf::jni::auto_set_device(env); // set the cuio host memory resource to what it was before, or the same - // if we didn't overwrite it with setCuioPinnedPoolMemoryResource + // if we didn't overwrite it with setCudfPinnedPoolMemoryResource cudf::set_pinned_memory_resource(prior_cudf_pinned_mr()); pinned_fallback_mr.reset(); delete reinterpret_cast(pool_ptr); From ab36162ac32cd0be5fe69ef3d92e421f4e5ea798 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 31 May 2024 15:48:45 -0700 Subject: [PATCH 07/75] move test out of io util --- cpp/tests/CMakeLists.txt | 5 +- .../utilities_tests/io_utilities_tests.cpp | 45 ------------- .../utilities_tests/pinned_memory_tests.cpp | 65 +++++++++++++++++++ 3 files changed, 68 insertions(+), 47 deletions(-) create mode 100644 cpp/tests/utilities_tests/pinned_memory_tests.cpp diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 2f2c12f265c..19c87facb51 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -378,15 +378,16 @@ ConfigureTest( # * utilities tests ------------------------------------------------------------------------------- ConfigureTest( UTILITIES_TEST - utilities_tests/type_list_tests.cpp utilities_tests/column_debug_tests.cpp utilities_tests/column_utilities_tests.cpp utilities_tests/column_wrapper_tests.cpp + utilities_tests/default_stream_tests.cpp utilities_tests/io_utilities_tests.cpp utilities_tests/lists_column_wrapper_tests.cpp utilities_tests/logger_tests.cpp - utilities_tests/default_stream_tests.cpp + utilities_tests/pinned_memory_tests.cpp utilities_tests/type_check_tests.cpp + utilities_tests/type_list_tests.cpp ) # ################################################################################################## diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp index c00f1ab8863..9ed8f18f5cc 100644 --- a/cpp/tests/utilities_tests/io_utilities_tests.cpp +++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp @@ -16,14 +16,6 @@ #include #include -#include - -#include -#include - -#include -#include -#include #include @@ -32,43 +24,6 @@ using cudf::io::detail::base64_encode; class IoUtilitiesTest : public cudf::test::BaseFixture {}; -TEST(IoUtilitiesTest, HostMemoryGetAndSet) -{ - // Global environment for temporary files - auto const temp_env = static_cast( - ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); - - // pinned/pooled host memory resource - using host_pooled_mr = rmm::mr::pool_memory_resource; - host_pooled_mr mr(std::make_shared().get(), - 
size_t{128} * 1024 * 1024); - - // set new resource - auto last_mr = cudf::get_pinned_memory_resource(); - cudf::set_pinned_memory_resource(mr); - - constexpr int num_rows = 32 * 1024; - auto valids = - cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); - auto values = thrust::make_counting_iterator(0); - - cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); - - cudf::table_view expected({col}); - auto filepath = temp_env->get_temp_filepath("IoUtilsMemTest.parquet"); - cudf::io::parquet_writer_options out_args = - cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); - cudf::io::write_parquet(out_args); - - cudf::io::parquet_reader_options const read_opts = - cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); - auto const result = cudf::io::read_parquet(read_opts); - CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); - - // reset memory resource back - cudf::set_pinned_memory_resource(last_mr); -} - TEST(IoUtilitiesTest, Base64EncodeAndDecode) { // a vector of lorem ipsum strings diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp new file mode 100644 index 00000000000..df9103640f4 --- /dev/null +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +class PinnedMemoryTest : public cudf::test::BaseFixture {}; + +TEST(PinnedMemoryTest, MemoryResourceGetAndSet) +{ + // Global environment for temporary files + auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + + // pinned/pooled host memory resource + using host_pooled_mr = rmm::mr::pool_memory_resource; + host_pooled_mr mr(std::make_shared().get(), + 4 * 1024 * 1024); + + // set new resource + auto last_mr = cudf::get_pinned_memory_resource(); + cudf::set_pinned_memory_resource(mr); + + constexpr int num_rows = 32 * 1024; + auto valids = + cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; }); + auto values = thrust::make_counting_iterator(0); + + cudf::test::fixed_width_column_wrapper col(values, values + num_rows, valids); + + cudf::table_view expected({col}); + auto filepath = temp_env->get_temp_filepath("MemoryResourceGetAndSetTest.parquet"); + cudf::io::parquet_writer_options out_args = + cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected); + cudf::io::write_parquet(out_args); + + cudf::io::parquet_reader_options const read_opts = + cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}); + auto const result = cudf::io::read_parquet(read_opts); + CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected); + + // reset memory resource back + cudf::set_pinned_memory_resource(last_mr); +} From 83f665a15f4aba3040079b58306fd964621a91c8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Jun 2024 15:01:38 -0700 Subject: [PATCH 08/75] missed rename --- .../src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 5050834303d..5ca5bc0db68 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -128,9 +128,9 @@ public static synchronized void initialize(long poolSize, int gpuId) { * * @param poolSize size of the pool to initialize. 
* @param gpuId gpu id to set to get memory pool from, -1 means to use default - * @param setCuioHostMemoryResource true if this pinned pool should be used by cuIO for host memory + * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuIO for host memory */ - public static synchronized void initialize(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (isInitialized()) { throw new IllegalStateException("Can only initialize the pool once."); } @@ -139,7 +139,7 @@ public static synchronized void initialize(long poolSize, int gpuId, boolean set t.setDaemon(true); return t; }); - initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCuioHostMemoryResource)); + initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCudfPinnedPoolMemoryResource)); initService.shutdown(); } @@ -216,14 +216,14 @@ public static long getTotalPoolSizeBytes() { return 0; } - private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryResource) { + private PinnedMemoryPool(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (gpuId > -1) { // set the gpu device to use Cuda.setDevice(gpuId); Cuda.freeZero(); } this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize); - if (setCuioHostMemoryResource) { + if (setCudfPinnedPoolMemoryResource) { Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle); } this.poolSize = poolSize; From c1ae478c2b3ce28101e35ec3f5a5af5e03d5452c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Jun 2024 15:52:34 -0700 Subject: [PATCH 09/75] update benchmark changes --- cpp/benchmarks/io/cuio_common.cpp | 12 ++++++++++++ cpp/benchmarks/io/cuio_common.hpp | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 37ced8ea703..f06938bd721 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,6 +14,9 @@ * limitations under the License. 
*/ +#include "rmm/mr/pinned_host_memory_resource.hpp" +#include "rmm/resource_ref.hpp" + #include #include @@ -28,6 +31,14 @@ temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"}; +// Don't use cudf's pinned pool for the source data +rmm::host_async_resource_ref pinned_memory_resource() +{ + static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{}; + + return mr; +} + std::string random_file_in_dir(std::string const& dir_path) { // `mkstemp` modifies the template in place @@ -41,6 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, + pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index d4f39a5f243..407805a8a1a 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::pinned_host_vector pinned_buffer; + cudf::detail::rmm_host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; From 1c09d0cfe15bcf6b5d2ce775129f0db7e0c662f0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 11:52:48 -0700 Subject: [PATCH 10/75] rename rmm_host_vector --- cpp/benchmarks/io/cuio_common.hpp | 4 ++-- .../{rmm_host_vector.hpp => host_vector.hpp} | 2 +- .../cudf/detail/utilities/vector_factories.hpp | 16 ++++++++-------- cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +++++----- cpp/src/io/text/data_chunk_source_factories.cpp | 8 ++++---- cpp/src/io/utilities/hostdevice_vector.hpp | 4 ++-- 6 files changed, 22 insertions(+), 22 deletions(-) rename cpp/include/cudf/detail/utilities/{rmm_host_vector.hpp => host_vector.hpp} (98%) diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 407805a8a1a..64d6021cf50 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::rmm_host_vector pinned_buffer; + cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp similarity index 98% rename from cpp/include/cudf/detail/utilities/rmm_host_vector.hpp rename to cpp/include/cudf/detail/utilities/host_vector.hpp index 6604020c224..e62c8017f8b 100644 --- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -178,6 +178,6 @@ class rmm_host_allocator { * @brief A vector class with rmm host memory allocator */ template -using rmm_host_vector = thrust::host_vector>; +using host_vector = thrust::host_vector>; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 7343ee25c8f..6f859ded456 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 
@@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,33 +442,33 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a `rmm_host_vector` of the given size + * @brief Asynchronously construct a `host_vector` of the given size * * @note This function does not synchronize `stream`. * * @tparam T The type of the vector data * @param size The number of elements in the created vector * @param stream The stream on which to allocate memory - * @return A rmm_host_vector of the given size + * @return A host_vector of the given size */ template -rmm_host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return rmm_host_vector(size, {cudf::get_pinned_memory_resource(), stream}); + return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** - * @brief Synchronously construct a `rmm_host_vector` of the given size + * @brief Synchronously construct a `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. * * @tparam T The type of the vector data * @param size The number of elements in the created vector * @param stream The stream on which to allocate memory - * @return A rmm_host_vector of the given size + * @return A host_vector of the given size */ template -rmm_host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 190015686df..896123d95a9 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -20,8 +20,8 @@ #include "io/utilities/config_utils.hpp" #include +#include #include -#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::rmm_host_vector const& host, + static void copy_to_device(cudf::detail::host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::rmm_host_vector h_compressed_blocks; - cudf::detail::rmm_host_vector h_compressed_offsets; - cudf::detail::rmm_host_vector h_decompressed_offsets; + cudf::detail::host_vector h_compressed_blocks; + cudf::detail::host_vector h_compressed_offsets; + cudf::detail::host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 46149db929f..1e1671a1545 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -18,7 +18,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -33,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - std::unique_ptr> buffer; + std::unique_ptr> buffer; }; /** @@ -86,7 +86,7 @@ class datasource_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the 
requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } @@ -153,7 +153,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index f6f7ff14d38..1ae27a2f4ae 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::rmm_host_vector h_data; + cudf::detail::host_vector h_data; rmm::device_uvector d_data; }; From c343c3194f48dedb10e49c8610e3e0deaacf315b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 13:00:10 -0700 Subject: [PATCH 11/75] remove do_xyz --- cpp/src/utilities/pinned_memory.cpp | 39 ++++++++++------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 23d673a1382..47c09217363 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -53,7 +53,7 @@ class fixed_pinned_pool_memory_resource { pool_->deallocate_async(pool_begin_, pool_size_, stream_); } - void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) + void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { if (bytes <= pool_size_) { try { @@ -66,46 +66,33 @@ class fixed_pinned_pool_memory_resource { return upstream_mr_.allocate_async(bytes, alignment, stream); } - void do_deallocate_async(void* ptr, - std::size_t bytes, - std::size_t alignment, - cuda::stream_ref stream) noexcept - { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { - pool_->deallocate_async(ptr, bytes, alignment, stream); - } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); - } - } - void* allocate_async(std::size_t bytes, cuda::stream_ref stream) { - return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) - { - return do_allocate_async(bytes, alignment, stream); + return allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = do_allocate_async(bytes, alignment, stream_); + auto const result = allocate_async(bytes, alignment, stream_); stream_.wait(); return result; } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept - { - return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) noexcept { - return do_deallocate_async(ptr, bytes, alignment, stream); + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + pool_->deallocate_async(ptr, bytes, alignment, stream); + } else { + 
upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + } + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, From 50f4d3ee3d27a2a0a10a2a2cc8a7f425ab8c82e1 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 15:14:21 -0700 Subject: [PATCH 12/75] comment --- cpp/include/cudf/detail/utilities/vector_factories.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 6f859ded456..06dfcbfc5e5 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a `host_vector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function does not synchronize `stream`. * @@ -458,7 +458,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea } /** - * @brief Synchronously construct a `cudf::detail::host_vector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. * From e5af4902dad3fcbf3c1cd1c678e271bccdd0489b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 4 Jun 2024 19:49:49 -0700 Subject: [PATCH 13/75] works --- cpp/CMakeLists.txt | 1 + .../cudf/detail/utilities/cuda_copy.hpp | 41 +++++++++++++++++++ cpp/include/cudf/utilities/pinned_memory.hpp | 17 ++++++++ cpp/src/io/utilities/hostdevice_vector.hpp | 7 ++-- cpp/src/utilities/cuda_copy.cu | 41 +++++++++++++++++++ cpp/src/utilities/pinned_memory.cpp | 10 +++++ 6 files changed, 113 insertions(+), 4 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/cuda_copy.hpp create mode 100644 cpp/src/utilities/cuda_copy.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8a0f56aec53..cf836a45708 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -663,6 +663,7 @@ add_library( src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp + src/utilities/cuda_copy.cu src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp new file mode 100644 index 00000000000..2ceb70f2ef2 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +namespace cudf::detail { + +namespace impl { + +void copy_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); + +} // namespace impl + +template +void copy_pinned_to_device_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +{ + impl::copy_pinned(dst, src, size * sizeof(T), stream); +} + +template +void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +{ + impl::copy_pinned(dst, src, size * sizeof(T), stream); +} + +} // namespace cudf::detail \ No newline at end of file diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index b423eab6d38..b0d6c55999f 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -55,4 +55,21 @@ struct pinned_mr_options { */ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); +/** + * @brief Set the threshold size for using kernels for pinned memory copies. + * + * @param threshold The threshold size in bytes. If the size of the copy is less than this + * threshold, the copy will be done using kernels. If the size is greater than or equal to this + * threshold, the copy will be done using cudaMemcpyAsync. + */ + +void set_kernel_copy_threshold(size_t threshold); + +/** + * @brief Get the threshold size for using kernels for pinned memory copies. + * + * @return The threshold size in bytes. + */ +size_t get_kernel_copy_threshold(); + } // namespace cudf diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 1ae27a2f4ae..171379143a6 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,6 +18,7 @@ #include "hostdevice_span.hpp" +#include #include #include #include @@ -124,8 +125,7 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + copy_pinned_to_device_async(device_ptr(), host_ptr(), size(), stream); } void host_to_device_sync(rmm::cuda_stream_view stream) @@ -136,8 +136,7 @@ class hostdevice_vector { void device_to_host_async(rmm::cuda_stream_view stream) { - CUDF_CUDA_TRY( - cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value())); + copy_device_to_pinned_async(host_ptr(), device_ptr(), size(), stream); } void device_to_host_sync(rmm::cuda_stream_view stream) diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu new file mode 100644 index 00000000000..9a574d6d0e5 --- /dev/null +++ b/cpp/src/utilities/cuda_copy.cu @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include + +#include + +#include + +namespace cudf::detail::impl { + +void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + if (size < get_kernel_copy_threshold()) { + thrust::copy_n(rmm::exec_policy_nosync(stream), + static_cast(src), + size, + static_cast(dst)); + } else { + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); + } +} + +} // namespace cudf::detail::impl \ No newline at end of file diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 47c09217363..0791f404bf2 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -214,4 +214,14 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) return did_configure; } +CUDF_EXPORT auto& kernel_copy_threshold() +{ + static std::atomic threshold = 0; // use cudaMemcpyAsync for all pinned copies + return threshold; +} + +void set_kernel_copy_threshold(size_t threshold) { kernel_copy_threshold() = threshold; } + +size_t get_kernel_copy_threshold() { return kernel_copy_threshold(); } + } // namespace cudf From 9082ccc979383b0bcfa7181c54d097d5036f2904 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 5 Jun 2024 15:14:34 -0700 Subject: [PATCH 14/75] include style Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/benchmarks/io/cuio_common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index f06938bd721..45dc812e247 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,8 +14,8 @@ * limitations under the License. */ -#include "rmm/mr/pinned_host_memory_resource.hpp" -#include "rmm/resource_ref.hpp" +#include +#include #include From 17b1ee0e736a0dbbbf152d99aad3a27de3bc9c3a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 5 Jun 2024 17:05:40 -0700 Subject: [PATCH 15/75] reviews --- cpp/src/io/text/bgzip_data_chunk_source.cu | 2 +- cpp/src/io/text/data_chunk_source_factories.cpp | 2 +- java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 896123d95a9..0e3ce779089 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -14,7 +14,6 @@ * limitations under the License. */ -#include "cudf/detail/utilities/vector_factories.hpp" #include "io/comp/nvcomp_adapter.hpp" #include "io/text/device_data_chunks.hpp" #include "io/utilities/config_utils.hpp" @@ -22,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 1e1671a1545..45096b7155c 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,11 +14,11 @@ * limitations under the License. 
*/ -#include "cudf/detail/utilities/vector_factories.hpp" #include "io/text/device_data_chunks.hpp" #include #include +#include #include #include diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java index 5ca5bc0db68..df0d9dc7c3e 100644 --- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java +++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java @@ -128,7 +128,7 @@ public static synchronized void initialize(long poolSize, int gpuId) { * * @param poolSize size of the pool to initialize. * @param gpuId gpu id to set to get memory pool from, -1 means to use default - * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuIO for host memory + * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuDF for pinned memory */ public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) { if (isInitialized()) { From 2dbb68f9d66e752eef4015082d2c877b8145e068 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 09:52:44 -0700 Subject: [PATCH 16/75] available_device_memory --- cpp/src/utilities/pinned_memory.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 47c09217363..d0709c6a40e 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -142,8 +142,7 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( if (config_size.has_value()) { return *config_size; } - size_t free{}, total{}; - CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total)); + auto const total = rmm::available_device_memory().second; // 0.5% of the total device memory, capped at 100MB return std::min(total / 200, size_t{100} * 1024 * 1024); }(); From cb9cc228bdc8fa74b5a517a149b11f7e6201d71b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 10:58:48 -0700 Subject: [PATCH 17/75] reviews --- cpp/src/utilities/pinned_memory.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index d0709c6a40e..53e0d10c6f4 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); @@ -113,13 +113,13 @@ class fixed_pinned_pool_memory_resource { return !operator==(other); } - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::device_accessible) noexcept + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::device_accessible) noexcept { } - [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&, - cuda::mr::host_accessible) noexcept + friend void get_property(fixed_pinned_pool_memory_resource const&, + cuda::mr::host_accessible) noexcept { } }; @@ -148,7 +148,7 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( }(); // rmm requires the pool size to be a multiple of 256 bytes - auto const aligned_size = (size + 255) & ~255; + auto const 
aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT); CUDF_LOG_INFO("Pinned pool size = {}", aligned_size); // make the pool with max size equal to the initial size From cf67a14795017eb4a0835bd727acebfff0a066f9 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 6 Jun 2024 11:01:20 -0700 Subject: [PATCH 18/75] expand anon namespace --- cpp/src/utilities/pinned_memory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 53e0d10c6f4..9cebf980d00 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -129,8 +129,6 @@ static_assert(cuda::mr::resource_with, ""); -} // namespace - CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr( std::optional config_size) { @@ -190,6 +188,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +} // namespace + rmm::host_device_async_resource_ref set_pinned_memory_resource( rmm::host_device_async_resource_ref mr) { From 24c15498b9ad53ec452a99b94fb767b90f4551a0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:07:20 -0700 Subject: [PATCH 19/75] host_uvector --- cpp/benchmarks/io/cuio_common.cpp | 9 +- cpp/benchmarks/io/cuio_common.hpp | 4 +- .../cudf/detail/utilities/host_uvector.hpp | 142 ++++++++++++++ .../cudf/detail/utilities/host_vector.hpp | 183 ------------------ .../detail/utilities/vector_factories.hpp | 12 +- cpp/include/cudf/utilities/span.hpp | 6 + cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +- .../io/text/data_chunk_source_factories.cpp | 8 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 9 files changed, 172 insertions(+), 206 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_uvector.hpp delete mode 100644 cpp/include/cudf/detail/utilities/host_vector.hpp diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45dc812e247..09d7d8a9db6 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,13 +14,14 @@ * limitations under the License. 
*/ -#include -#include - #include #include #include +#include + +#include +#include #include @@ -52,7 +53,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, - pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), + pinned_buffer(0, pinned_memory_resource(), cudf::get_default_stream()), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 64d6021cf50..020fd7e00c1 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::host_vector pinned_buffer; + cudf::detail::host_uvector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp new file mode 100644 index 00000000000..39bde04e985 --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -0,0 +1,142 @@ +/* + * Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include +#include + +namespace cudf::detail { + +template +class host_uvector { + public: + host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} + { + if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } + } + + host_uvector(host_uvector const&) = delete; + host_uvector(host_uvector&& other) + : _data{other._data}, + _size{other._size}, + _capacity{other._capacity}, + _mr{other._mr}, + _stream{other._stream} + { + other._data = nullptr; + other._size = 0; + other._capacity = 0; + } + + host_uvector& operator=(host_uvector const&) = delete; + host_uvector& operator=(host_uvector&& other) + { + if (this != &other) { + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + _data = other._data; + _size = other._size; + _capacity = other._capacity; + _mr = other._mr; + _stream = other._stream; + other._data = nullptr; + other._size = 0; + other._capacity = 0; + } + return *this; + } + + ~host_uvector() + { + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + } + + void resize(std::size_t new_size) + { + if (new_size > _capacity) { + auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); + _stream.synchronize(); + if (_data != nullptr) { + std::copy(_data, _data + _size, new_data); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); + } + _data = new_data; + _capacity = new_size; + } + _size = new_size; + } + + void reserve(std::size_t new_capacity) + { + if (new_capacity > _capacity) { + auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); + _stream.synchronize(); + if (_data != nullptr) { + std::copy(_data, _data + _size, new_data); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); + } + _data = new_data; + _capacity = new_capacity; + } + } + + void push_back(T const& value) + { + if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } + _data[_size++] = value; + } + + void clear() { _size = 0; } + + [[nodiscard]] std::size_t size() const { return _size; } + [[nodiscard]] std::int64_t ssize() const { return _size; } + [[nodiscard]] bool is_empty() const { return _size == 0; } + [[nodiscard]] std::size_t capacity() const { return _capacity; } + + [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } + [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } + + [[nodiscard]] T* data() { return _data; } + [[nodiscard]] T const* data() const { return _data; } + + [[nodiscard]] T& front() { return _data[0]; } + [[nodiscard]] T const& front() const { return _data[0]; } + + [[nodiscard]] T& back() { return _data[_size - 1]; } + [[nodiscard]] T const& back() const { return _data[_size - 1]; } + + [[nodiscard]] T* begin() { return _data; } + [[nodiscard]] T const* begin() const { return _data; } + + [[nodiscard]] T* end() { return _data + _size; } + [[nodiscard]] T const* end() const { return _data + _size; } + + [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } + [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } + + private: + T* _data{nullptr}; + std::size_t _size; + std::size_t _capacity; + rmm::host_async_resource_ref _mr; + rmm::cuda_stream_view _stream; +}; + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp deleted file mode 100644 index e62c8017f8b..00000000000 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -#include -#include - -#include - -#include -#include -#include // for bad_alloc - -namespace cudf::detail { - -/*! \p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c a `rmm::host_async_resource_ref` for allocation. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class rmm_host_allocator; - -/*! \p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c an `cudf::host_async_resource_ref` for allocation. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template <> -class rmm_host_allocator { - public: - using value_type = void; ///< The type of the elements in the allocator - using pointer = void*; ///< The type returned by address() / allocate() - using const_pointer = void const*; ///< The type returned by address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - /** - * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` - */ - template - struct rebind { - using other = rmm_host_allocator; ///< The rebound type - }; -}; - -/*! 
\p rmm_host_allocator is a CUDA-specific host memory allocator - * that employs \c `rmm::host_async_resource_ref` for allocation. - * - * The \p rmm_host_allocator provides an interface for host memory allocation through the user - * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of - * this reference and therefore it is the user's responsibility to ensure its lifetime for the - * duration of the lifetime of the \p rmm_host_allocator. - * - * \see https://en.cppreference.com/w/cpp/memory/allocator - */ -template -class rmm_host_allocator { - public: - using value_type = T; ///< The type of the elements in the allocator - using pointer = T*; ///< The type returned by address() / allocate() - using const_pointer = T const*; ///< The type returned by address() - using reference = T&; ///< The parameter type for address() - using const_reference = T const&; ///< The parameter type for address() - using size_type = std::size_t; ///< The type used for the size of the allocation - using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers - - typedef cuda::std::true_type propagate_on_container_move_assignment; - - /** - * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` - */ - template - struct rebind { - using other = rmm_host_allocator; ///< The rebound type - }; - - /** - * @brief Cannot declare an empty host allocator. - */ - rmm_host_allocator() = delete; - - /** - * @brief Construct from a `cudf::host_async_resource_ref` - */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) - { - } - - /** - * @brief This method allocates storage for objects in host memory. - * - * @param cnt The number of objects to allocate. - * @return a \c pointer to the newly allocated objects. - * @note This method does not invoke \p value_type's constructor. - * It is the responsibility of the caller to initialize the - * objects at the returned \c pointer. - */ - inline pointer allocate(size_type cnt) - { - if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - return static_cast( - mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); - } - - /** - * @brief This method deallocates host memory previously allocated - * with this \c rmm_host_allocator. - * - * @param p A \c pointer to the previously allocated memory. - * @note The second parameter is the number of objects previously allocated. - * @note This method does not invoke \p value_type's destructor. - * It is the responsibility of the caller to destroy - * the objects stored at \p p. - */ - inline void deallocate(pointer p, size_type cnt) - { - mr.deallocate_async(p, cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - } - - /** - * @brief This method returns the maximum size of the \c cnt parameter - * accepted by the \p allocate() method. - * - * @return The maximum number of objects that may be allocated - * by a single call to \p allocate(). - */ - constexpr inline size_type max_size() const - { - return (std::numeric_limits::max)() / sizeof(T); - } - - /** - * @brief This method tests this \p rmm_host_allocator for equality to - * another. - * - * @param x The other \p rmm_host_allocator of interest. - * @return This method always returns \c true. 
- */ - inline bool operator==(rmm_host_allocator const& x) const - { - return x.mr == mr && x.stream == stream; - } - - /** - * @brief This method tests this \p rmm_host_allocator for inequality - * to another. - * - * @param x The other \p rmm_host_allocator of interest. - * @return This method always returns \c false. - */ - inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } - - private: - rmm::host_async_resource_ref mr; - rmm::cuda_stream_view stream; -}; - -/** - * @brief A vector class with rmm host memory allocator - */ -template -using host_vector = thrust::host_vector>; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 06dfcbfc5e5..f67b671c610 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 @@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_uvector` of the given size * * @note This function does not synchronize `stream`. * @@ -452,13 +452,13 @@ thrust::host_vector make_host_vector_sync( * @return A host_vector of the given size */ template -host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); + return host_uvector(size, cudf::get_pinned_memory_resource(), stream); } /** - * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_uvector` of the given size * * @note This function synchronizes `stream`. 
* @@ -468,7 +468,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea * @return A host_vector of the given size */ template -host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_uvector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 47e92d61a9f..873d3e56acb 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -212,6 +214,10 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; +template +struct is_host_span_supported_container< // + cudf::detail::host_uvector> : std::true_type {}; + template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 0e3ce779089..b7644a6fb9f 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,7 @@ #include "io/utilities/config_utils.hpp" #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::host_vector const& host, + static void copy_to_device(cudf::detail::host_uvector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::host_vector h_compressed_blocks; - cudf::detail::host_vector h_compressed_offsets; - cudf::detail::host_vector h_decompressed_offsets; + cudf::detail::host_uvector h_compressed_blocks; + cudf::detail::host_uvector h_compressed_offsets; + cudf::detail::host_uvector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 45096b7155c..2c4160e48c5 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -17,7 +17,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -33,7 +33,7 @@ namespace { struct host_ticket { cudaEvent_t event; - std::unique_ptr> buffer; + std::unique_ptr> buffer; }; /** @@ -86,7 +86,7 @@ class datasource_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } @@ -153,7 +153,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // resize the host buffer as necessary to contain the requested number of bytes if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( + h_ticket.buffer = std::make_unique>( cudf::detail::make_pinned_vector_sync(read_size, stream)); } diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp 
b/cpp/src/io/utilities/hostdevice_vector.hpp index 1ae27a2f4ae..ae2ab03ded3 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::host_vector h_data; + cudf::detail::host_uvector h_data; rmm::device_uvector d_data; }; From 075deca7c87b70b62f30a5b8a266da39a3e852cb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:33:10 -0700 Subject: [PATCH 20/75] style --- cpp/include/cudf/utilities/span.hpp | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 873d3e56acb..2f622612209 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -204,23 +204,28 @@ class span_base { // ===== host_span ================================================================================= template -struct is_host_span_supported_container : std::false_type {}; +struct is_host_span_supported_container : std::false_type { +}; template struct is_host_span_supported_container< // - std::vector> : std::true_type {}; + std::vector> : std::true_type { +}; template struct is_host_span_supported_container< // - thrust::host_vector> : std::true_type {}; + thrust::host_vector> : std::true_type { +}; template struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type {}; + cudf::detail::host_uvector> : std::true_type { +}; template struct is_host_span_supported_container< // - std::basic_string, Alloc>> : std::true_type {}; + std::basic_string, Alloc>> : std::true_type { +}; /** * @brief C++20 std::span with reduced feature set. @@ -275,19 +280,23 @@ struct host_span : public cudf::detail::span_base -struct is_device_span_supported_container : std::false_type {}; +struct is_device_span_supported_container : std::false_type { +}; template struct is_device_span_supported_container< // - thrust::device_vector> : std::true_type {}; + thrust::device_vector> : std::true_type { +}; template struct is_device_span_supported_container< // - rmm::device_vector> : std::true_type {}; + rmm::device_vector> : std::true_type { +}; template struct is_device_span_supported_container< // - rmm::device_uvector> : std::true_type {}; + rmm::device_uvector> : std::true_type { +}; /** * @brief Device version of C++20 std::span with reduced feature set. From 164fce20ad07632b5a9899668d9da7d23ced6b97 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 11:53:00 -0700 Subject: [PATCH 21/75] docs; prefixes --- cpp/src/utilities/pinned_memory.cpp | 43 ++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 9cebf980d00..85d4b7e2283 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -28,34 +28,39 @@ namespace cudf { namespace { + +// Asynchronous memory resource that allocates a fixed-size pool of pinned memory and falls back to +// additional pinned allocations if the pool is exhausted. 
class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; private: - upstream_mr upstream_mr_{}; - size_t pool_size_{0}; + upstream_mr _upstream_mr{}; + size_t _pool_size{0}; // Raw pointer to avoid a segfault when the pool is destroyed on exit host_pooled_mr* pool_{nullptr}; - void* pool_begin_{nullptr}; - void* pool_end_{nullptr}; - cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + // The beginning and end of the pool memory range; pool is never reallocated so these are constant + // and can be used to determine if a pointer is within the pool + void* _pool_begin{nullptr}; + void* _pool_end{nullptr}; + cuda::stream_ref _stream{cudf::detail::global_cuda_stream_pool().get_stream().value()}; public: fixed_pinned_pool_memory_resource(size_t size) - : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} + : _pool_size{size}, pool_{new host_pooled_mr(_upstream_mr, size, size)} { - if (pool_size_ == 0) { return; } + if (_pool_size == 0) { return; } // Allocate full size from the pinned pool to figure out the beginning and end address - pool_begin_ = pool_->allocate_async(pool_size_, stream_); - pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); - pool_->deallocate_async(pool_begin_, pool_size_, stream_); + _pool_begin = pool_->allocate_async(_pool_size, _stream); + _pool_end = static_cast(static_cast(_pool_begin) + _pool_size); + pool_->deallocate_async(_pool_begin, _pool_size, _stream); } void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { - if (bytes <= pool_size_) { + if (bytes <= _pool_size) { try { return pool_->allocate_async(bytes, alignment, stream); } catch (...) 
{ @@ -63,7 +68,7 @@ class fixed_pinned_pool_memory_resource { } } - return upstream_mr_.allocate_async(bytes, alignment, stream); + return _upstream_mr.allocate_async(bytes, alignment, stream); } void* allocate_async(std::size_t bytes, cuda::stream_ref stream) @@ -73,8 +78,8 @@ class fixed_pinned_pool_memory_resource { void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = allocate_async(bytes, alignment, stream_); - stream_.wait(); + auto const result = allocate_async(bytes, alignment, _stream); + _stream.wait(); return result; } @@ -83,10 +88,10 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { + if (bytes <= _pool_size && ptr >= _pool_begin && ptr < _pool_end) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { - upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); + _upstream_mr.deallocate_async(ptr, bytes, alignment, stream); } } @@ -99,13 +104,13 @@ class fixed_pinned_pool_memory_resource { std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept { - deallocate_async(ptr, bytes, alignment, stream_); - stream_.wait(); + deallocate_async(ptr, bytes, alignment, _stream); + _stream.wait(); } bool operator==(fixed_pinned_pool_memory_resource const& other) const { - return pool_ == other.pool_ and stream_ == other.stream_; + return pool_ == other.pool_ and _stream == other._stream; } bool operator!=(fixed_pinned_pool_memory_resource const& other) const From b566babb87696cf54656605ab76e9e25b5c42bed Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 12:00:40 -0700 Subject: [PATCH 22/75] type aliases in host_uvector --- .../cudf/detail/utilities/host_uvector.hpp | 69 +++++++++++-------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp index 39bde04e985..c8166217a73 100644 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -27,10 +27,21 @@ namespace cudf::detail { template class host_uvector { public: - host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + using value_type = T; + using size_type = std::size_t; + using reference = value_type&; + using const_reference = value_type const&; + using pointer = value_type*; + using const_pointer = value_type const*; + using iterator = pointer; + using const_iterator = const_pointer; + + host_uvector(size_type size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} { - if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } + if (_size != 0) { + _data = static_cast(mr.allocate_async(_size * sizeof(value_type), _stream)); + } } host_uvector(host_uvector const&) = delete; @@ -50,7 +61,7 @@ class host_uvector { host_uvector& operator=(host_uvector&& other) { if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = other._data; _size = other._size; _capacity = other._capacity; @@ -65,17 +76,18 @@ class host_uvector { ~host_uvector() { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } + if (_data 
!= nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } } - void resize(std::size_t new_size) + void resize(size_type new_size) { if (new_size > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); + auto new_data = + static_cast(_mr.allocate_async(new_size * sizeof(value_type), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); + _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = new_data; _capacity = new_size; @@ -83,21 +95,22 @@ class host_uvector { _size = new_size; } - void reserve(std::size_t new_capacity) + void reserve(size_type new_capacity) { if (new_capacity > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); + auto new_data = + static_cast(_mr.allocate_async(new_capacity * sizeof(value_type), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); + _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } _data = new_data; _capacity = new_capacity; } } - void push_back(T const& value) + void push_back(const_reference value) { if (_size == _capacity) { reserve(_capacity == 0 ? 2 : _capacity * 2); } _data[_size++] = value; @@ -105,36 +118,36 @@ class host_uvector { void clear() { _size = 0; } - [[nodiscard]] std::size_t size() const { return _size; } + [[nodiscard]] size_type size() const { return _size; } [[nodiscard]] std::int64_t ssize() const { return _size; } [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] std::size_t capacity() const { return _capacity; } + [[nodiscard]] size_type capacity() const { return _capacity; } - [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } - [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } + [[nodiscard]] reference operator[](size_type idx) { return _data[idx]; } + [[nodiscard]] const_reference operator[](size_type idx) const { return _data[idx]; } - [[nodiscard]] T* data() { return _data; } - [[nodiscard]] T const* data() const { return _data; } + [[nodiscard]] pointer data() { return _data; } + [[nodiscard]] const_pointer data() const { return _data; } - [[nodiscard]] T& front() { return _data[0]; } - [[nodiscard]] T const& front() const { return _data[0]; } + [[nodiscard]] reference front() { return _data[0]; } + [[nodiscard]] const_reference front() const { return _data[0]; } - [[nodiscard]] T& back() { return _data[_size - 1]; } - [[nodiscard]] T const& back() const { return _data[_size - 1]; } + [[nodiscard]] reference back() { return _data[_size - 1]; } + [[nodiscard]] const_reference back() const { return _data[_size - 1]; } - [[nodiscard]] T* begin() { return _data; } - [[nodiscard]] T const* begin() const { return _data; } + [[nodiscard]] iterator begin() { return _data; } + [[nodiscard]] const_iterator begin() const { return _data; } - [[nodiscard]] T* end() { return _data + _size; } - [[nodiscard]] T const* end() const { return _data + _size; } + [[nodiscard]] iterator end() { return _data + _size; } + [[nodiscard]] const_iterator end() const { return _data + _size; } [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } private: - T* _data{nullptr}; - std::size_t _size; - std::size_t _capacity; + 
pointer _data{nullptr}; + size_type _size; + size_type _capacity; rmm::host_async_resource_ref _mr; rmm::cuda_stream_view _stream; }; From 21edb534a15c836963c116f3c9ca360cadb1844c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 13:24:37 -0700 Subject: [PATCH 23/75] refactor host_ticket --- .../io/text/data_chunk_source_factories.cpp | 56 +++++++------------ 1 file changed, 19 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 2c4160e48c5..39e955232e3 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "cudf/utilities/default_stream.hpp" #include "io/text/device_data_chunks.hpp" #include @@ -32,8 +33,15 @@ namespace cudf::io::text { namespace { struct host_ticket { - cudaEvent_t event; - std::unique_ptr> buffer; + cudaEvent_t event{}; // tracks the completion of the last device-to-host copy. + cudf::detail::host_uvector buffer; + + host_ticket() : buffer{cudf::detail::make_pinned_vector_sync(0, cudf::get_default_stream())} + { + cudaEventCreate(&event); + } + + ~host_ticket() { cudaEventDestroy(event); } }; /** @@ -44,20 +52,7 @@ class datasource_chunk_reader : public data_chunk_reader { constexpr static int num_tickets = 2; public: - datasource_chunk_reader(datasource* source) : _source(source) - { - // create an event to track the completion of the last device-to-host copy. - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~datasource_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } - } + datasource_chunk_reader(datasource* source) : _source(source) {} void skip_bytes(std::size_t size) override { @@ -85,16 +80,15 @@ class datasource_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( - cudf::detail::make_pinned_vector_sync(read_size, stream)); + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); } - _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer->data())); + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -124,17 +118,6 @@ class istream_data_chunk_reader : public data_chunk_reader { istream_data_chunk_reader(std::unique_ptr datastream) : _datastream(std::move(datastream)) { - // create an event to track the completion of the last device-to-host copy. 
- for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); - } - } - - ~istream_data_chunk_reader() override - { - for (auto& ticket : _tickets) { - CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); - } } void skip_bytes(std::size_t size) override { _datastream->ignore(size); }; @@ -152,13 +135,12 @@ class istream_data_chunk_reader : public data_chunk_reader { CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); // resize the host buffer as necessary to contain the requested number of bytes - if (h_ticket.buffer == nullptr or h_ticket.buffer->size() < read_size) { - h_ticket.buffer = std::make_unique>( - cudf::detail::make_pinned_vector_sync(read_size, stream)); + if (h_ticket.buffer.size() < read_size) { + h_ticket.buffer = cudf::detail::make_pinned_vector_sync(read_size, stream); } // read data from the host istream in to the pinned host memory buffer - _datastream->read(h_ticket.buffer->data(), read_size); + _datastream->read(h_ticket.buffer.data(), read_size); // adjust the read size to reflect how many bytes were actually read from the data stream read_size = _datastream->gcount(); @@ -168,7 +150,7 @@ class istream_data_chunk_reader : public data_chunk_reader { // copy the host-pinned data on to device CUDF_CUDA_TRY(cudaMemcpyAsync( - chunk.data(), h_ticket.buffer->data(), read_size, cudaMemcpyDefault, stream.value())); + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyDefault, stream.value())); // record the host-to-device copy. CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); From 3814797d5b2d3478901e14f9ecbb733d2168a06a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 7 Jun 2024 13:33:50 -0700 Subject: [PATCH 24/75] style --- cpp/include/cudf/utilities/span.hpp | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 2f622612209..6deef974c0e 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -204,28 +204,23 @@ class span_base { // ===== host_span ================================================================================= template -struct is_host_span_supported_container : std::false_type { -}; +struct is_host_span_supported_container : std::false_type {}; template struct is_host_span_supported_container< // - std::vector> : std::true_type { -}; + std::vector> : std::true_type {}; template struct is_host_span_supported_container< // - thrust::host_vector> : std::true_type { -}; + thrust::host_vector> : std::true_type {}; template struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type { -}; + cudf::detail::host_uvector> : std::true_type {}; template struct is_host_span_supported_container< // - std::basic_string, Alloc>> : std::true_type { -}; + std::basic_string, Alloc>> : std::true_type {}; /** * @brief C++20 std::span with reduced feature set. 
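As context for the hunks around this style change: the is_host_span_supported_container specializations are the whitelist that lets host_span be built directly from the listed container types. A minimal, self-contained sketch of the same gating pattern, using only the standard library (the names below are illustrative, not the ones in span.hpp, whose real constructor carries more qualifiers):

#include <cstddef>
#include <type_traits>
#include <vector>

// Primary template: containers are not supported unless explicitly whitelisted.
template <typename T>
struct is_supported_container : std::false_type {};

// Whitelist std::vector, mirroring the is_host_span_supported_container specializations.
template <typename T, typename Alloc>
struct is_supported_container<std::vector<T, Alloc>> : std::true_type {};

// A function (or constructor) constrained this way only participates in overload
// resolution for whitelisted containers.
template <typename C, std::enable_if_t<is_supported_container<C>::value>* = nullptr>
std::size_t element_count(C const& c)
{
  return c.size();
}

The specializations reformatted above do the same job for std::vector, thrust::host_vector, cudf::detail::host_uvector, and std::basic_string.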
@@ -280,23 +275,19 @@ struct host_span : public cudf::detail::span_base -struct is_device_span_supported_container : std::false_type { -}; +struct is_device_span_supported_container : std::false_type {}; template struct is_device_span_supported_container< // - thrust::device_vector> : std::true_type { -}; + thrust::device_vector> : std::true_type {}; template struct is_device_span_supported_container< // - rmm::device_vector> : std::true_type { -}; + rmm::device_vector> : std::true_type {}; template struct is_device_span_supported_container< // - rmm::device_uvector> : std::true_type { -}; + rmm::device_uvector> : std::true_type {}; /** * @brief Device version of C++20 std::span with reduced feature set. From c9331575ac9ebe2d9b0ebf85ec2f93a42ed9b876 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 10:24:48 -0700 Subject: [PATCH 25/75] style --- cpp/include/cudf/detail/utilities/cuda_copy.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index 2ceb70f2ef2..fce91751f80 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -38,4 +38,4 @@ void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_st impl::copy_pinned(dst, src, size * sizeof(T), stream); } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail From 6784e073197ee450d46350fc348f28e9f085b68a Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 10:35:46 -0700 Subject: [PATCH 26/75] more style --- cpp/src/utilities/cuda_copy.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index 9a574d6d0e5..da3b4760967 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -38,4 +38,4 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ } } -} // namespace cudf::detail::impl \ No newline at end of file +} // namespace cudf::detail::impl From f7999aae606269e187de88279f96d5034ad48753 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:00 -0700 Subject: [PATCH 27/75] Revert "type aliases in host_uvector" This reverts commit b566babb87696cf54656605ab76e9e25b5c42bed. 
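Note: the aliases reverted here are dropped ahead of the removal of host_uvector itself later in this series (see the Revert "host_uvector" commit below). For reference, nested aliases like value_type and size_type are what let generic code deduce element and index types from a container; a small illustrative sketch with a hypothetical helper, not part of the patch:

// Hypothetical generic helper that relies only on a container's nested type aliases.
template <typename Container>
typename Container::value_type sum_elements(Container const& c)
{
  typename Container::value_type total{};
  for (typename Container::size_type i = 0; i < c.size(); ++i) {
    total += c[i];
  }
  return total;
}

With the aliases in place, host_uvector would have worked with such a helper; after this revert it no longer exposes them.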
--- .../cudf/detail/utilities/host_uvector.hpp | 69 ++++++++----------- 1 file changed, 28 insertions(+), 41 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp index c8166217a73..39bde04e985 100644 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ b/cpp/include/cudf/detail/utilities/host_uvector.hpp @@ -27,21 +27,10 @@ namespace cudf::detail { template class host_uvector { public: - using value_type = T; - using size_type = std::size_t; - using reference = value_type&; - using const_reference = value_type const&; - using pointer = value_type*; - using const_pointer = value_type const*; - using iterator = pointer; - using const_iterator = const_pointer; - - host_uvector(size_type size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) + host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} { - if (_size != 0) { - _data = static_cast(mr.allocate_async(_size * sizeof(value_type), _stream)); - } + if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } } host_uvector(host_uvector const&) = delete; @@ -61,7 +50,7 @@ class host_uvector { host_uvector& operator=(host_uvector&& other) { if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = other._data; _size = other._size; _capacity = other._capacity; @@ -76,18 +65,17 @@ class host_uvector { ~host_uvector() { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); } + if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } } - void resize(size_type new_size) + void resize(std::size_t new_size) { if (new_size > _capacity) { - auto new_data = - static_cast(_mr.allocate_async(new_size * sizeof(value_type), _stream)); + auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = new_data; _capacity = new_size; @@ -95,22 +83,21 @@ class host_uvector { _size = new_size; } - void reserve(size_type new_capacity) + void reserve(std::size_t new_capacity) { if (new_capacity > _capacity) { - auto new_data = - static_cast(_mr.allocate_async(new_capacity * sizeof(value_type), _stream)); + auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); _stream.synchronize(); if (_data != nullptr) { std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(value_type), _stream); + _mr.deallocate_async(_data, _size * sizeof(T), _stream); } _data = new_data; _capacity = new_capacity; } } - void push_back(const_reference value) + void push_back(T const& value) { if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } _data[_size++] = value; @@ -118,36 +105,36 @@ class host_uvector { void clear() { _size = 0; } - [[nodiscard]] size_type size() const { return _size; } + [[nodiscard]] std::size_t size() const { return _size; } [[nodiscard]] std::int64_t ssize() const { return _size; } [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] size_type capacity() const { return _capacity; } + [[nodiscard]] std::size_t capacity() const { return _capacity; } - [[nodiscard]] reference operator[](size_type idx) { return _data[idx]; } - [[nodiscard]] const_reference operator[](size_type idx) const { return _data[idx]; } + [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } + [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } - [[nodiscard]] pointer data() { return _data; } - [[nodiscard]] const_pointer data() const { return _data; } + [[nodiscard]] T* data() { return _data; } + [[nodiscard]] T const* data() const { return _data; } - [[nodiscard]] reference front() { return _data[0]; } - [[nodiscard]] const_reference front() const { return _data[0]; } + [[nodiscard]] T& front() { return _data[0]; } + [[nodiscard]] T const& front() const { return _data[0]; } - [[nodiscard]] reference back() { return _data[_size - 1]; } - [[nodiscard]] const_reference back() const { return _data[_size - 1]; } + [[nodiscard]] T& back() { return _data[_size - 1]; } + [[nodiscard]] T const& back() const { return _data[_size - 1]; } - [[nodiscard]] iterator begin() { return _data; } - [[nodiscard]] const_iterator begin() const { return _data; } + [[nodiscard]] T* begin() { return _data; } + [[nodiscard]] T const* begin() const { return _data; } - [[nodiscard]] iterator end() { return _data + _size; } - [[nodiscard]] const_iterator end() const { return _data + _size; } + [[nodiscard]] T* end() { return _data + _size; } + [[nodiscard]] T const* end() const { return _data + _size; } [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } private: - pointer _data{nullptr}; - size_type _size; - size_type _capacity; + T* _data{nullptr}; + std::size_t _size; + std::size_t _capacity; rmm::host_async_resource_ref _mr; rmm::cuda_stream_view _stream; }; From c9a82d010a997d4c1f4afad94b36709e859d98fe Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:14 -0700 Subject: [PATCH 28/75] Revert "docs; prefixes" This reverts commit 164fce20ad07632b5a9899668d9da7d23ced6b97. --- cpp/src/utilities/pinned_memory.cpp | 43 +++++++++++++---------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 85d4b7e2283..9cebf980d00 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -28,39 +28,34 @@ namespace cudf { namespace { - -// Asynchronous memory resource that allocates a fixed-size pool of pinned memory and falls back to -// additional pinned allocations if the pool is exhausted. 
class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource; private: - upstream_mr _upstream_mr{}; - size_t _pool_size{0}; + upstream_mr upstream_mr_{}; + size_t pool_size_{0}; // Raw pointer to avoid a segfault when the pool is destroyed on exit host_pooled_mr* pool_{nullptr}; - // The beginning and end of the pool memory range; pool is never reallocated so these are constant - // and can be used to determine if a pointer is within the pool - void* _pool_begin{nullptr}; - void* _pool_end{nullptr}; - cuda::stream_ref _stream{cudf::detail::global_cuda_stream_pool().get_stream().value()}; + void* pool_begin_{nullptr}; + void* pool_end_{nullptr}; + cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()}; public: fixed_pinned_pool_memory_resource(size_t size) - : _pool_size{size}, pool_{new host_pooled_mr(_upstream_mr, size, size)} + : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)} { - if (_pool_size == 0) { return; } + if (pool_size_ == 0) { return; } // Allocate full size from the pinned pool to figure out the beginning and end address - _pool_begin = pool_->allocate_async(_pool_size, _stream); - _pool_end = static_cast(static_cast(_pool_begin) + _pool_size); - pool_->deallocate_async(_pool_begin, _pool_size, _stream); + pool_begin_ = pool_->allocate_async(pool_size_, stream_); + pool_end_ = static_cast(static_cast(pool_begin_) + pool_size_); + pool_->deallocate_async(pool_begin_, pool_size_, stream_); } void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream) { - if (bytes <= _pool_size) { + if (bytes <= pool_size_) { try { return pool_->allocate_async(bytes, alignment, stream); } catch (...) 
{ @@ -68,7 +63,7 @@ class fixed_pinned_pool_memory_resource { } } - return _upstream_mr.allocate_async(bytes, alignment, stream); + return upstream_mr_.allocate_async(bytes, alignment, stream); } void* allocate_async(std::size_t bytes, cuda::stream_ref stream) @@ -78,8 +73,8 @@ class fixed_pinned_pool_memory_resource { void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { - auto const result = allocate_async(bytes, alignment, _stream); - _stream.wait(); + auto const result = allocate_async(bytes, alignment, stream_); + stream_.wait(); return result; } @@ -88,10 +83,10 @@ class fixed_pinned_pool_memory_resource { std::size_t alignment, cuda::stream_ref stream) noexcept { - if (bytes <= _pool_size && ptr >= _pool_begin && ptr < _pool_end) { + if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); } else { - _upstream_mr.deallocate_async(ptr, bytes, alignment, stream); + upstream_mr_.deallocate_async(ptr, bytes, alignment, stream); } } @@ -104,13 +99,13 @@ class fixed_pinned_pool_memory_resource { std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept { - deallocate_async(ptr, bytes, alignment, _stream); - _stream.wait(); + deallocate_async(ptr, bytes, alignment, stream_); + stream_.wait(); } bool operator==(fixed_pinned_pool_memory_resource const& other) const { - return pool_ == other.pool_ and _stream == other._stream; + return pool_ == other.pool_ and stream_ == other.stream_; } bool operator!=(fixed_pinned_pool_memory_resource const& other) const From 930efef8fcec62a4ac87a1f8faebab9783ccabd4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:58:24 -0700 Subject: [PATCH 29/75] Revert "style" This reverts commit 075deca7c87b70b62f30a5b8a266da39a3e852cb. --- cpp/include/cudf/utilities/span.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 6deef974c0e..873d3e56acb 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. From 046694978dbe65ea515ad46b079ccbdcd9bc1206 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 16:59:50 -0700 Subject: [PATCH 30/75] Revert "host_uvector" This reverts commit 24c15498b9ad53ec452a99b94fb767b90f4551a0. --- cpp/benchmarks/io/cuio_common.cpp | 9 +- cpp/benchmarks/io/cuio_common.hpp | 4 +- .../cudf/detail/utilities/host_uvector.hpp | 142 -------------- .../cudf/detail/utilities/host_vector.hpp | 183 ++++++++++++++++++ .../detail/utilities/vector_factories.hpp | 12 +- cpp/include/cudf/utilities/span.hpp | 6 - cpp/src/io/text/bgzip_data_chunk_source.cu | 10 +- .../io/text/data_chunk_source_factories.cpp | 4 +- cpp/src/io/utilities/hostdevice_vector.hpp | 4 +- 9 files changed, 204 insertions(+), 170 deletions(-) delete mode 100644 cpp/include/cudf/detail/utilities/host_uvector.hpp create mode 100644 cpp/include/cudf/detail/utilities/host_vector.hpp diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 09d7d8a9db6..45dc812e247 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,14 +14,13 @@ * limitations under the License. 
*/ +#include +#include + #include #include #include -#include - -#include -#include #include @@ -53,7 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path) cuio_source_sink_pair::cuio_source_sink_pair(io_type type) : type{type}, - pinned_buffer(0, pinned_memory_resource(), cudf::get_default_stream()), + pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}), d_buffer{0, cudf::get_default_stream()}, file_name{random_file_in_dir(tmpdir.path())}, void_sink{cudf::io::data_sink::create()} diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp index 020fd7e00c1..64d6021cf50 100644 --- a/cpp/benchmarks/io/cuio_common.hpp +++ b/cpp/benchmarks/io/cuio_common.hpp @@ -18,7 +18,7 @@ #include -#include +#include #include #include @@ -79,7 +79,7 @@ class cuio_source_sink_pair { io_type const type; std::vector h_buffer; - cudf::detail::host_uvector pinned_buffer; + cudf::detail::host_vector pinned_buffer; rmm::device_uvector d_buffer; std::string const file_name; std::unique_ptr void_sink; diff --git a/cpp/include/cudf/detail/utilities/host_uvector.hpp b/cpp/include/cudf/detail/utilities/host_uvector.hpp deleted file mode 100644 index 39bde04e985..00000000000 --- a/cpp/include/cudf/detail/utilities/host_uvector.hpp +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright 2024 NVIDIA Corporation - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -#include -#include - -#include -#include - -namespace cudf::detail { - -template -class host_uvector { - public: - host_uvector(std::size_t size, rmm::host_async_resource_ref mr, rmm::cuda_stream_view stream) - : _size{size}, _capacity{size}, _mr{mr}, _stream{stream} - { - if (_size != 0) { _data = static_cast(mr.allocate_async(_size * sizeof(T), _stream)); } - } - - host_uvector(host_uvector const&) = delete; - host_uvector(host_uvector&& other) - : _data{other._data}, - _size{other._size}, - _capacity{other._capacity}, - _mr{other._mr}, - _stream{other._stream} - { - other._data = nullptr; - other._size = 0; - other._capacity = 0; - } - - host_uvector& operator=(host_uvector const&) = delete; - host_uvector& operator=(host_uvector&& other) - { - if (this != &other) { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } - _data = other._data; - _size = other._size; - _capacity = other._capacity; - _mr = other._mr; - _stream = other._stream; - other._data = nullptr; - other._size = 0; - other._capacity = 0; - } - return *this; - } - - ~host_uvector() - { - if (_data != nullptr) { _mr.deallocate_async(_data, _size * sizeof(T), _stream); } - } - - void resize(std::size_t new_size) - { - if (new_size > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_size * sizeof(T), _stream)); - _stream.synchronize(); - if (_data != nullptr) { - std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); - } - _data = new_data; - _capacity = new_size; - } - _size = new_size; - } - - void reserve(std::size_t new_capacity) - { - if (new_capacity > _capacity) { - auto new_data = static_cast(_mr.allocate_async(new_capacity * sizeof(T), _stream)); - _stream.synchronize(); - if (_data != nullptr) { - std::copy(_data, _data + _size, new_data); - _mr.deallocate_async(_data, _size * sizeof(T), _stream); - } - _data = new_data; - _capacity = new_capacity; - } - } - - void push_back(T const& value) - { - if (_size == _capacity) { reserve(_capacity == 0 ? 
2 : _capacity * 2); } - _data[_size++] = value; - } - - void clear() { _size = 0; } - - [[nodiscard]] std::size_t size() const { return _size; } - [[nodiscard]] std::int64_t ssize() const { return _size; } - [[nodiscard]] bool is_empty() const { return _size == 0; } - [[nodiscard]] std::size_t capacity() const { return _capacity; } - - [[nodiscard]] T& operator[](std::size_t idx) { return _data[idx]; } - [[nodiscard]] T const& operator[](std::size_t idx) const { return _data[idx]; } - - [[nodiscard]] T* data() { return _data; } - [[nodiscard]] T const* data() const { return _data; } - - [[nodiscard]] T& front() { return _data[0]; } - [[nodiscard]] T const& front() const { return _data[0]; } - - [[nodiscard]] T& back() { return _data[_size - 1]; } - [[nodiscard]] T const& back() const { return _data[_size - 1]; } - - [[nodiscard]] T* begin() { return _data; } - [[nodiscard]] T const* begin() const { return _data; } - - [[nodiscard]] T* end() { return _data + _size; } - [[nodiscard]] T const* end() const { return _data + _size; } - - [[nodiscard]] rmm::host_async_resource_ref memory_resource() const { return _mr; } - [[nodiscard]] rmm::cuda_stream_view stream() const { return _stream; } - - private: - T* _data{nullptr}; - std::size_t _size; - std::size_t _capacity; - rmm::host_async_resource_ref _mr; - rmm::cuda_stream_view _stream; -}; - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp new file mode 100644 index 00000000000..e62c8017f8b --- /dev/null +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -0,0 +1,183 @@ +/* + * Copyright 2024 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include +#include + +#include + +#include +#include +#include // for bad_alloc + +namespace cudf::detail { + +/*! \p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c a `rmm::host_async_resource_ref` for allocation. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class rmm_host_allocator; + +/*! \p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c an `cudf::host_async_resource_ref` for allocation. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template <> +class rmm_host_allocator { + public: + using value_type = void; ///< The type of the elements in the allocator + using pointer = void*; ///< The type returned by address() / allocate() + using const_pointer = void const*; ///< The type returned by address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` + */ + template + struct rebind { + using other = rmm_host_allocator; ///< The rebound type + }; +}; + +/*! 
\p rmm_host_allocator is a CUDA-specific host memory allocator + * that employs \c `rmm::host_async_resource_ref` for allocation. + * + * The \p rmm_host_allocator provides an interface for host memory allocation through the user + * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of + * this reference and therefore it is the user's responsibility to ensure its lifetime for the + * duration of the lifetime of the \p rmm_host_allocator. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class rmm_host_allocator { + public: + using value_type = T; ///< The type of the elements in the allocator + using pointer = T*; ///< The type returned by address() / allocate() + using const_pointer = T const*; ///< The type returned by address() + using reference = T&; ///< The parameter type for address() + using const_reference = T const&; ///< The parameter type for address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + typedef cuda::std::true_type propagate_on_container_move_assignment; + + /** + * @brief converts a `rmm_host_allocator` to `rmm_host_allocator` + */ + template + struct rebind { + using other = rmm_host_allocator; ///< The rebound type + }; + + /** + * @brief Cannot declare an empty host allocator. + */ + rmm_host_allocator() = delete; + + /** + * @brief Construct from a `cudf::host_async_resource_ref` + */ + rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) + : mr(_mr), stream(_stream) + { + } + + /** + * @brief This method allocates storage for objects in host memory. + * + * @param cnt The number of objects to allocate. + * @return a \c pointer to the newly allocated objects. + * @note This method does not invoke \p value_type's constructor. + * It is the responsibility of the caller to initialize the + * objects at the returned \c pointer. + */ + inline pointer allocate(size_type cnt) + { + if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + return static_cast( + mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); + } + + /** + * @brief This method deallocates host memory previously allocated + * with this \c rmm_host_allocator. + * + * @param p A \c pointer to the previously allocated memory. + * @note The second parameter is the number of objects previously allocated. + * @note This method does not invoke \p value_type's destructor. + * It is the responsibility of the caller to destroy + * the objects stored at \p p. + */ + inline void deallocate(pointer p, size_type cnt) + { + mr.deallocate_async(p, cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + } + + /** + * @brief This method returns the maximum size of the \c cnt parameter + * accepted by the \p allocate() method. + * + * @return The maximum number of objects that may be allocated + * by a single call to \p allocate(). + */ + constexpr inline size_type max_size() const + { + return (std::numeric_limits::max)() / sizeof(T); + } + + /** + * @brief This method tests this \p rmm_host_allocator for equality to + * another. + * + * @param x The other \p rmm_host_allocator of interest. + * @return This method always returns \c true. 
+ */ + inline bool operator==(rmm_host_allocator const& x) const + { + return x.mr == mr && x.stream == stream; + } + + /** + * @brief This method tests this \p rmm_host_allocator for inequality + * to another. + * + * @param x The other \p rmm_host_allocator of interest. + * @return This method always returns \c false. + */ + inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + + private: + rmm::host_async_resource_ref mr; + rmm::cuda_stream_view stream; +}; + +/** + * @brief A vector class with rmm host memory allocator + */ +template +using host_vector = thrust::host_vector>; + +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index f67b671c610..06dfcbfc5e5 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,7 +21,7 @@ * @file vector_factories.hpp */ -#include +#include #include #include #include @@ -442,7 +442,7 @@ thrust::host_vector make_host_vector_sync( } /** - * @brief Asynchronously construct a pinned `cudf::detail::host_uvector` of the given size + * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function does not synchronize `stream`. * @@ -452,13 +452,13 @@ thrust::host_vector make_host_vector_sync( * @return A host_vector of the given size */ template -host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream) { - return host_uvector(size, cudf::get_pinned_memory_resource(), stream); + return host_vector(size, {cudf::get_pinned_memory_resource(), stream}); } /** - * @brief Synchronously construct a pinned `cudf::detail::host_uvector` of the given size + * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * * @note This function synchronizes `stream`. 
* @@ -468,7 +468,7 @@ host_uvector make_pinned_vector_async(size_t size, rmm::cuda_stream_view stre * @return A host_vector of the given size */ template -host_uvector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) +host_vector make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream) { auto result = make_pinned_vector_async(size, stream); stream.synchronize(); diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 873d3e56acb..47e92d61a9f 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,8 +16,6 @@ #pragma once -#include - #include #include #include @@ -214,10 +212,6 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; -template -struct is_host_span_supported_container< // - cudf::detail::host_uvector> : std::true_type {}; - template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index b7644a6fb9f..0e3ce779089 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,7 @@ #include "io/utilities/config_utils.hpp" #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: template - static void copy_to_device(cudf::detail::host_uvector const& host, + static void copy_to_device(cudf::detail::host_vector const& host, rmm::device_uvector& device, rmm::cuda_stream_view stream) { @@ -85,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 1 << 16; // 64k offset allocation, resized on demand cudaEvent_t event; - cudf::detail::host_uvector h_compressed_blocks; - cudf::detail::host_uvector h_compressed_offsets; - cudf::detail::host_uvector h_decompressed_offsets; + cudf::detail::host_vector h_compressed_blocks; + cudf::detail::host_vector h_compressed_offsets; + cudf::detail::host_vector h_decompressed_offsets; rmm::device_uvector d_compressed_blocks; rmm::device_uvector d_decompressed_blocks; rmm::device_uvector d_compressed_offsets; diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 39e955232e3..596ca3458c8 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -18,7 +18,7 @@ #include "io/text/device_data_chunks.hpp" #include -#include +#include #include #include @@ -34,7 +34,7 @@ namespace { struct host_ticket { cudaEvent_t event{}; // tracks the completion of the last device-to-host copy. 
- cudf::detail::host_uvector buffer; + cudf::detail::host_vector buffer; host_ticket() : buffer{cudf::detail::make_pinned_vector_sync(0, cudf::get_default_stream())} { diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index ae2ab03ded3..1ae27a2f4ae 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include @@ -172,7 +172,7 @@ class hostdevice_vector { } private: - cudf::detail::host_uvector h_data; + cudf::detail::host_vector h_data; rmm::device_uvector d_data; }; From f31221901aab0712b7e4e416c7454d4ef03a7019 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:38:31 -0700 Subject: [PATCH 31/75] make do without host_uvector --- cpp/include/cudf/detail/utilities/host_vector.hpp | 8 ++++++-- cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index e62c8017f8b..756fdab177a 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -117,8 +117,12 @@ class rmm_host_allocator { inline pointer allocate(size_type cnt) { if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if - return static_cast( - mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream)); + auto const result = + mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); + // synchronize to ensure the memory is allocated before thrust::host_vector initialization + // TODO: replace thrust::host_vector with a type that does not require synchronization + stream.synchronize(); + return static_cast(result); } /** diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 06dfcbfc5e5..20cb55bb1c7 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -444,7 +444,7 @@ thrust::host_vector make_host_vector_sync( /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function does not synchronize `stream`. + * @note This function may not synchronize `stream`. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector From 7cfee0ab2d3bfc3b261edce2340555a69840ebcc Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:41:16 -0700 Subject: [PATCH 32/75] missed change --- cpp/include/cudf/detail/utilities/host_vector.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 756fdab177a..6a115177ab5 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -119,7 +119,7 @@ class rmm_host_allocator { if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if auto const result = mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); - // synchronize to ensure the memory is allocated before thrust::host_vector initialization + // Synchronize to ensure the memory is allocated before thrust::host_vector initialization // TODO: replace thrust::host_vector with a type that does not require synchronization stream.synchronize(); return static_cast(result); From fe4d668fb4e6dddf0a019e5443acf2ecc34ff0e8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 10 Jun 2024 17:54:30 -0700 Subject: [PATCH 33/75] style --- cpp/benchmarks/io/cuio_common.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index 45dc812e247..645994f3f0d 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -14,14 +14,14 @@ * limitations under the License. */ -#include -#include - #include #include #include +#include +#include + #include #include From 5a71f7702483d39b589abf55b6926aa0f07f9ec4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 12 Jun 2024 12:22:41 -0700 Subject: [PATCH 34/75] rename --- cpp/include/cudf/utilities/pinned_memory.hpp | 4 ++-- cpp/src/utilities/cuda_copy.cu | 2 +- cpp/src/utilities/pinned_memory.cpp | 12 ++++++++---- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index b0d6c55999f..c57c96dcb41 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -63,13 +63,13 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using cudaMemcpyAsync. */ -void set_kernel_copy_threshold(size_t threshold); +void kernel_pinned_copy_threshold(size_t threshold); /** * @brief Get the threshold size for using kernels for pinned memory copies. * * @return The threshold size in bytes. 
*/ -size_t get_kernel_copy_threshold(); +size_t get_kernel_pinned_copy_threshold(); } // namespace cudf diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index da3b4760967..7e0cab1f21b 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -28,7 +28,7 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ { if (size == 0) return; - if (size < get_kernel_copy_threshold()) { + if (size < get_kernel_pinned_copy_threshold()) { thrust::copy_n(rmm::exec_policy_nosync(stream), static_cast(src), size, diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index c1305ad1e89..86d37987b07 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -213,14 +213,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts) return did_configure; } -CUDF_EXPORT auto& kernel_copy_threshold() +CUDF_EXPORT auto& kernel_pinned_copy_threshold() { - static std::atomic threshold = 0; // use cudaMemcpyAsync for all pinned copies + // use cudaMemcpyAsync for all pinned copies + static std::atomic threshold = 0; return threshold; } -void set_kernel_copy_threshold(size_t threshold) { kernel_copy_threshold() = threshold; } +void set_kernel_pinned_copy_threshold(size_t threshold) +{ + kernel_pinned_copy_threshold() = threshold; +} -size_t get_kernel_copy_threshold() { return kernel_copy_threshold(); } +size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } } // namespace cudf From 9068642c86a3d4bf2f30c705683ac52f9d9f42f3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 12 Jun 2024 14:30:35 -0700 Subject: [PATCH 35/75] refactor --- .../cudf/detail/utilities/cuda_copy.hpp | 49 +++++++++++++++++-- cpp/src/io/utilities/hostdevice_vector.hpp | 10 ++-- cpp/src/utilities/cuda_copy.cu | 43 ++++++++++++++++ 3 files changed, 91 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index fce91751f80..7732e108938 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -22,20 +22,59 @@ namespace cudf::detail { namespace impl { -void copy_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_pinned_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_device_to_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); + +void copy_pageable_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); +void copy_device_to_pageable(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); } // namespace impl +enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; + +/** + * @brief Asynchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. 
+ * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Direction of the copy and type of host memory + * @param stream CUDA stream used for the copy + */ + template -void copy_pinned_to_device_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +void cuda_memcpy_async( + T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) { - impl::copy_pinned(dst, src, size * sizeof(T), stream); + if (kind == copy_kind::PINNED_TO_DEVICE) { + impl::copy_pinned_to_device(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::DEVICE_TO_PINNED) { + impl::copy_device_to_pinned(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { + impl::copy_pageable_to_device(dst, src, size * sizeof(T), stream); + } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { + impl::copy_device_to_pageable(dst, src, size * sizeof(T), stream); + } } +/** + * @brief Synchronously copies data between the host and device. + * + * Implementation may use different strategies depending on the size and type of host data. + * + * @param dst Destination memory address + * @param src Source memory address + * @param size Number of bytes to copy + * @param kind Direction of the copy and type of host memory + * @param stream CUDA stream used for the copy + */ template -void copy_device_to_pinned_async(T* dst, T const* src, size_t size, rmm::cuda_stream_view stream) +void cuda_memcpy(T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) { - impl::copy_pinned(dst, src, size * sizeof(T), stream); + cuda_memcpy_async(dst, src, size, kind, stream); + stream.synchronize(); } } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index fe2100a7886..2429bca57fa 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,24 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - copy_pinned_to_device_async(device_ptr(), host_ptr(), size(), stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - host_to_device_async(stream); - stream.synchronize(); + cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - copy_device_to_pinned_async(host_ptr(), device_ptr(), size(), stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - device_to_host_async(stream); - stream.synchronize(); + cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_copy.cu index 7e0cab1f21b..78445c45a63 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_copy.cu @@ -24,6 +24,8 @@ namespace cudf::detail::impl { +namespace { + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; @@ -38,4 +40,45 @@ void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_ } } +void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) +{ + if (size == 0) return; + + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, 
src, size, cudaMemcpyDefault, stream)); +} + +}; // namespace + +void copy_pinned_to_device(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pinned(dst, src, size, stream); +} + +void copy_device_to_pinned(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pinned(dst, src, size, stream); +} + +void copy_device_to_pageable(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pageable(dst, src, size, stream); +} + +void copy_pageable_to_device(void* dst, + void const* src, + std::size_t size, + rmm::cuda_stream_view stream) +{ + copy_pageable(dst, src, size, stream); +} + } // namespace cudf::detail::impl From 2ec467002c94063bf03303cf36c9cc9b038c5c8f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 17 Jun 2024 16:14:38 -0700 Subject: [PATCH 36/75] missing newlines --- cpp/include/cudf/detail/utilities/cuda_copy.hpp | 1 - cpp/include/cudf/utilities/pinned_memory.hpp | 1 - 2 files changed, 2 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_copy.hpp index 7732e108938..47533959ae4 100644 --- a/cpp/include/cudf/detail/utilities/cuda_copy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_copy.hpp @@ -43,7 +43,6 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ - template void cuda_memcpy_async( T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index c57c96dcb41..e41020dff1e 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -62,7 +62,6 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using kernels. If the size is greater than or equal to this * threshold, the copy will be done using cudaMemcpyAsync. 
*/ - void kernel_pinned_copy_threshold(size_t threshold); /** From a886eb4b9df0fcf3e8a536ffeff355593712add2 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 17 Jun 2024 16:47:41 -0700 Subject: [PATCH 37/75] rename files --- cpp/CMakeLists.txt | 2 +- .../cudf/detail/utilities/{cuda_copy.hpp => cuda_memcpy.hpp} | 0 cpp/src/io/utilities/hostdevice_vector.hpp | 2 +- cpp/src/utilities/{cuda_copy.cu => cuda_memcpy.cu} | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename cpp/include/cudf/detail/utilities/{cuda_copy.hpp => cuda_memcpy.hpp} (100%) rename cpp/src/utilities/{cuda_copy.cu => cuda_memcpy.cu} (98%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 13db81f3c97..afbeb7c3266 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -662,7 +662,7 @@ add_library( src/unary/nan_ops.cu src/unary/null_ops.cu src/utilities/default_stream.cpp - src/utilities/cuda_copy.cu + src/utilities/cuda_memcpy.cu src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp diff --git a/cpp/include/cudf/detail/utilities/cuda_copy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp similarity index 100% rename from cpp/include/cudf/detail/utilities/cuda_copy.hpp rename to cpp/include/cudf/detail/utilities/cuda_memcpy.hpp diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 2429bca57fa..db1f9f1e461 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -18,7 +18,7 @@ #include "hostdevice_span.hpp" -#include +#include #include #include #include diff --git a/cpp/src/utilities/cuda_copy.cu b/cpp/src/utilities/cuda_memcpy.cu similarity index 98% rename from cpp/src/utilities/cuda_copy.cu rename to cpp/src/utilities/cuda_memcpy.cu index 78445c45a63..ff8d3bf120a 100644 --- a/cpp/src/utilities/cuda_copy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include #include #include From dcaeaba8285fc4b3e60ee0a12d60cd9a3cbcf66c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 18 Jun 2024 15:26:36 -0700 Subject: [PATCH 38/75] test commit, please ignore --- cpp/src/io/utilities/hostdevice_vector.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index db1f9f1e461..5b9bc4d36c0 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -135,12 +135,12 @@ class hostdevice_vector { void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); } /** From 0a2742f5026fac3546e5d8814d76eb8fdf794fe8 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 Jun 2024 10:46:40 -0700 Subject: [PATCH 39/75] fix typo --- cpp/include/cudf/utilities/pinned_memory.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index e41020dff1e..3e2fa43cb50 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -62,7 +62,7 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts); * threshold, the copy will be done using kernels. If the size is greater than or equal to this * threshold, the copy will be done using cudaMemcpyAsync. */ -void kernel_pinned_copy_threshold(size_t threshold); +void set_kernel_pinned_copy_threshold(size_t threshold); /** * @brief Get the threshold size for using kernels for pinned memory copies. 
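Taken together with the rename in this fix, the pinned-copy threshold is configured through a plain setter/getter pair in the public pinned_memory header. A minimal usage sketch, assuming only the header shown in this patch (the 1 MiB value is an arbitrary example, not a recommendation):

#include <cudf/utilities/pinned_memory.hpp>

void configure_pinned_copy_path()
{
  // Pinned host<->device copies smaller than this many bytes use the kernel-based
  // copy path; larger copies fall back to cudaMemcpyAsync.
  cudf::set_kernel_pinned_copy_threshold(1024 * 1024);

  // Read the value back, e.g. for logging or tests.
  auto const threshold = cudf::get_kernel_pinned_copy_threshold();
  (void)threshold;
}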
From 68a03f13f9a9b1ebd33659018a44f0227e4e9432 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 24 Jun 2024 11:47:14 -0700 Subject: [PATCH 40/75] typeless API --- .../cudf/detail/utilities/cuda_memcpy.hpp | 32 ++----------------- cpp/src/io/utilities/hostdevice_vector.hpp | 8 ++--- cpp/src/utilities/cuda_memcpy.cu | 29 ++++++++++++++--- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 47533959ae4..3d497d0a5e2 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -20,16 +20,6 @@ namespace cudf::detail { -namespace impl { - -void copy_pinned_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); -void copy_device_to_pinned(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); - -void copy_pageable_to_device(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); -void copy_device_to_pageable(void* dst, void const* src, size_t size, rmm::cuda_stream_view stream); - -} // namespace impl - enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; /** @@ -43,20 +33,8 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ -template void cuda_memcpy_async( - T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) -{ - if (kind == copy_kind::PINNED_TO_DEVICE) { - impl::copy_pinned_to_device(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::DEVICE_TO_PINNED) { - impl::copy_device_to_pinned(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { - impl::copy_pageable_to_device(dst, src, size * sizeof(T), stream); - } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { - impl::copy_device_to_pageable(dst, src, size * sizeof(T), stream); - } -} + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); /** * @brief Synchronously copies data between the host and device. 
@@ -69,11 +47,7 @@ void cuda_memcpy_async( * @param kind Direction of the copy and type of host memory * @param stream CUDA stream used for the copy */ -template -void cuda_memcpy(T* dst, T const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) -{ - cuda_memcpy_async(dst, src, size, kind, stream); - stream.synchronize(); -} +void cuda_memcpy( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 5b9bc4d36c0..1cbf850bf20 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,22 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(host_ptr(), device_ptr(), size(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ff8d3bf120a..ed920cc90c7 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -22,7 +22,7 @@ #include -namespace cudf::detail::impl { +namespace cudf::detail { namespace { @@ -47,8 +47,6 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } -}; // namespace - void copy_pinned_to_device(void* dst, void const* src, std::size_t size, @@ -81,4 +79,27 @@ void copy_pageable_to_device(void* dst, copy_pageable(dst, src, size, stream); } -} // namespace cudf::detail::impl +}; // namespace + +void cuda_memcpy_async( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) +{ + if (kind == copy_kind::PINNED_TO_DEVICE) { + copy_pinned_to_device(dst, src, size, stream); + } else if (kind == copy_kind::DEVICE_TO_PINNED) { + copy_device_to_pinned(dst, src, size, stream); + } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { + copy_pageable_to_device(dst, src, size, stream); + } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { + copy_device_to_pageable(dst, src, size, stream); + } +} + +void cuda_memcpy( + void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) +{ + cuda_memcpy_async(dst, src, size, kind, stream); + stream.synchronize(); +} + +} // namespace cudf::detail From 1741037f77aada99936f4c6c3af720b5a3af7ddc Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 16:54:27 -0700 Subject: [PATCH 41/75] sorthidth --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index afbeb7c3266..9ec35acb6fb 100644 --- 
a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -661,8 +661,8 @@ add_library( src/unary/math_ops.cu src/unary/nan_ops.cu src/unary/null_ops.cu - src/utilities/default_stream.cpp src/utilities/cuda_memcpy.cu + src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp src/utilities/pinned_memory.cpp From fff667b3f5521213999c56d4e7c6e795d0269742 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 23:26:33 -0700 Subject: [PATCH 42/75] simplify --- .../cudf/detail/utilities/cuda_memcpy.hpp | 10 ++-- cpp/src/io/utilities/hostdevice_vector.hpp | 8 ++-- cpp/src/utilities/cuda_memcpy.cu | 48 +++---------------- 3 files changed, 15 insertions(+), 51 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp index 3d497d0a5e2..b66c461ab12 100644 --- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp +++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp @@ -20,7 +20,7 @@ namespace cudf::detail { -enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, DEVICE_TO_PAGEABLE }; +enum class host_memory_kind : uint8_t { PINNED, PAGEABLE }; /** * @brief Asynchronously copies data between the host and device. @@ -30,11 +30,11 @@ enum class copy_kind { PINNED_TO_DEVICE, DEVICE_TO_PINNED, PAGEABLE_TO_DEVICE, D * @param dst Destination memory address * @param src Source memory address * @param size Number of bytes to copy - * @param kind Direction of the copy and type of host memory + * @param kind Type of host memory * @param stream CUDA stream used for the copy */ void cuda_memcpy_async( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); /** * @brief Synchronously copies data between the host and device. 
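As a usage sketch of the detail API shown in the hunk above (not part of the patch): after this change the size argument is a byte count and the kind describes the host allocation rather than the copy direction. The caller, buffer sizes, and values below are illustrative assumptions.

#include <cudf/detail/utilities/cuda_memcpy.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <vector>

void copy_to_device_example(rmm::cuda_stream_view stream)
{
  std::vector<int> host_data(1024, 42);  // pageable host allocation
  rmm::device_uvector<int> device_data(host_data.size(), stream);

  // Size is passed in bytes; the kind tells the implementation what the host buffer is
  // (pinned vs. pageable), not which direction the copy goes.
  cudf::detail::cuda_memcpy_async(device_data.data(),
                                  host_data.data(),
                                  host_data.size() * sizeof(int),
                                  cudf::detail::host_memory_kind::PAGEABLE,
                                  stream);
  stream.synchronize();
}
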
@@ -44,10 +44,10 @@ void cuda_memcpy_async( * @param dst Destination memory address * @param src Source memory address * @param size Number of bytes to copy - * @param kind Direction of the copy and type of host memory + * @param kind Type of host memory * @param stream CUDA stream used for the copy */ void cuda_memcpy( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream); + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream); } // namespace cudf::detail diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp index 1cbf850bf20..aed745c42dd 100644 --- a/cpp/src/io/utilities/hostdevice_vector.hpp +++ b/cpp/src/io/utilities/hostdevice_vector.hpp @@ -125,22 +125,22 @@ class hostdevice_vector { void host_to_device_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void host_to_device_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), copy_kind::PINNED_TO_DEVICE, stream); + cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void device_to_host_async(rmm::cuda_stream_view stream) { - cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } void device_to_host_sync(rmm::cuda_stream_view stream) { - cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), copy_kind::DEVICE_TO_PINNED, stream); + cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream); } /** diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ed920cc90c7..42696ac9d4b 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -47,56 +47,20 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } -void copy_pinned_to_device(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pinned(dst, src, size, stream); -} - -void copy_device_to_pinned(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pinned(dst, src, size, stream); -} - -void copy_device_to_pageable(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pageable(dst, src, size, stream); -} - -void copy_pageable_to_device(void* dst, - void const* src, - std::size_t size, - rmm::cuda_stream_view stream) -{ - copy_pageable(dst, src, size, stream); -} - }; // namespace void cuda_memcpy_async( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { - if (kind == copy_kind::PINNED_TO_DEVICE) { - copy_pinned_to_device(dst, src, size, stream); - } else if (kind == copy_kind::DEVICE_TO_PINNED) { - copy_device_to_pinned(dst, src, size, stream); - } else if (kind == copy_kind::PAGEABLE_TO_DEVICE) { - copy_pageable_to_device(dst, src, size, stream); - } else if (kind == copy_kind::DEVICE_TO_PAGEABLE) { - copy_device_to_pageable(dst, src, size, stream); + switch (kind) { + case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); + case 
host_memory_kind::PAGEABLE: + default: copy_pageable(dst, src, size, stream); } } void cuda_memcpy( - void* dst, void const* src, size_t size, copy_kind kind, rmm::cuda_stream_view stream) + void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { cuda_memcpy_async(dst, src, size, kind, stream); stream.synchronize(); From 84683d20f8643ca6fc36c40fcf3f342ec393a666 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 25 Jun 2024 23:55:41 -0700 Subject: [PATCH 43/75] another day, another threshold --- cpp/include/cudf/utilities/pinned_memory.hpp | 16 ++++++++++++++++ cpp/src/utilities/pinned_memory.cpp | 14 ++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 3e2fa43cb50..7a9e48f443c 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold); */ size_t get_kernel_pinned_copy_threshold(); +/** + * @brief Set the threshold size for allocating host memory as pinned memory. + * + * @param threshold The threshold size in bytes. If the size of the allocation is less than this + * threshold, the memory will be allocated as pinned memory. If the size is greater than or equal + * to this threshold, the memory will be allocated as pageable memory. + */ +void set_allocate_host_as_pinned_threshold(size_t threshold); + +/** + * @brief Get the threshold size for allocating host memory as pinned memory. + * + * @return The threshold size in bytes. + */ +size_t get_allocate_host_as_pinned_threshold(); + } // namespace cudf diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp index 3ea4293fc60..feba66d6e8c 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/pinned_memory.cpp @@ -225,4 +225,18 @@ void set_kernel_pinned_copy_threshold(size_t threshold) size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); } +CUDF_EXPORT auto& allocate_host_as_pinned_threshold() +{ + // use pageable memory for all host allocations + static std::atomic threshold = 0; + return threshold; +} + +void set_allocate_host_as_pinned_threshold(size_t threshold) +{ + allocate_host_as_pinned_threshold() = threshold; +} + +size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } + } // namespace cudf From 1bbd5743c8956cb80ee2d1c2221e47ba368e3a58 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 09:34:17 -0700 Subject: [PATCH 44/75] add missing break --- cpp/src/utilities/cuda_memcpy.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 42696ac9d4b..c73bd9b7799 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -53,9 +53,9 @@ void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { switch (kind) { - case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); + case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); break; case host_memory_kind::PAGEABLE: - default: copy_pageable(dst, src, size, stream); + default: copy_pageable(dst, src, size, stream); break; } } From 101288fb0a56cf935884ee5b28f2a3511dc590c3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 16:51:37 -0700 Subject: [PATCH 
45/75] rename files to host --- cpp/CMakeLists.txt | 2 +- cpp/benchmarks/fixture/nvbench_fixture.hpp | 2 +- .../io/orc/orc_reader_multithreaded.cpp | 2 +- .../io/parquet/parquet_reader_multithread.cpp | 2 +- .../cudf/detail/utilities/vector_factories.hpp | 2 +- .../{pinned_memory.hpp => host_memory.hpp} | 0 cpp/src/utilities/cuda_memcpy.cu | 2 +- .../{pinned_memory.cpp => host_memory.cpp} | 2 +- cpp/tests/io/json_test.cpp | 15 ++++++++------- cpp/tests/utilities_tests/pinned_memory_tests.cpp | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 11 files changed, 17 insertions(+), 16 deletions(-) rename cpp/include/cudf/utilities/{pinned_memory.hpp => host_memory.hpp} (100%) rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (99%) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 35cf90411f2..94df0433b81 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -666,7 +666,7 @@ add_library( src/utilities/default_stream.cpp src/utilities/linked_column.cpp src/utilities/logger.cpp - src/utilities/pinned_memory.cpp + src/utilities/host_memory.cpp src/utilities/stacktrace.cpp src/utilities/stream_pool.cpp src/utilities/traits.cpp diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index df1492690bb..699844afe62 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index aa0ee39a179..d3574985bc1 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index b4c8ed78ed8..71ce265e066 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..41ec6ae7e16 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/host_memory.hpp similarity index 100% rename from cpp/include/cudf/utilities/pinned_memory.hpp rename to cpp/include/cudf/utilities/host_memory.hpp diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 42696ac9d4b..3b2aefb3a99 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp similarity index 99% rename from cpp/src/utilities/pinned_memory.cpp rename to cpp/src/utilities/host_memory.cpp index feba66d6e8c..92bbff13c7f 100644 --- a/cpp/src/utilities/pinned_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 
9c76c344157..1ac68b859e9 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include @@ -222,7 +222,8 @@ std::string to_records_orient(std::vector> co } template -struct JsonFixedPointReaderTest : public JsonReaderTest {}; +struct JsonFixedPointReaderTest : public JsonReaderTest { +}; template struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest { @@ -1139,7 +1140,7 @@ TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) "-33333333333333333333333", "-444444444444444444444444"}; std::vector greater_uint64_max = { - "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; + "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; std::vector less_int64_min = { "-9223372036854775807", "-9223372036854775808", "-9223372036854775809", "-9223372036854775810"}; std::vector mixed_range = { @@ -1369,10 +1370,10 @@ TEST_F(JsonReaderTest, JsonLongString) "", // null "", // null "கார்த்தி", - "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF - "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF - "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF - "𰾑𱔈𲍉", // 30000-3FFFF + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", R"( \\\\\\\\\\\\\\\\)", R"(\\\\\\\\\\\\\\\\)", diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..5b81930b2c7 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 5842a980fc4..706e478842d 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include From ce58c4636b2e3fb47488f4200387fc4486eaf5b6 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 17:00:39 -0700 Subject: [PATCH 46/75] lines --- cpp/src/utilities/cuda_memcpy.cu | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index c73bd9b7799..5a32f73f236 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -53,9 +53,15 @@ void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { switch (kind) { - case host_memory_kind::PINNED: copy_pinned(dst, src, size, stream); break; + case host_memory_kind::PINNED: { + copy_pinned(dst, src, size, stream); + break; + } case host_memory_kind::PAGEABLE: - default: copy_pageable(dst, src, size, stream); break; + default: { + copy_pageable(dst, src, size, stream); + break; + } } } From 3739c47789fff7891bc2b51ef677166ae75f64fa Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:08:39 -0700 Subject: [PATCH 47/75] get_host_memory_resource --- cpp/include/cudf/utilities/host_memory.hpp | 8 +++ cpp/src/utilities/host_memory.cpp | 70 ++++++++++++++++++++++ 2 files changed, 78 insertions(+) diff --git a/cpp/include/cudf/utilities/host_memory.hpp b/cpp/include/cudf/utilities/host_memory.hpp index 7a9e48f443c..1db747b12a3 100644 --- 
a/cpp/include/cudf/utilities/host_memory.hpp +++ b/cpp/include/cudf/utilities/host_memory.hpp @@ -87,4 +87,12 @@ void set_allocate_host_as_pinned_threshold(size_t threshold); */ size_t get_allocate_host_as_pinned_threshold(); +/** + * @brief Get the rmm resource to be used for host memory allocations. + * + * @param size The size of the allocation + * @return The rmm resource to be used for host memory allocations + */ +rmm::host_async_resource_ref get_host_memory_resource(size_t size); + } // namespace cudf diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 92bbff13c7f..8125c851b71 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -186,6 +186,70 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr() return mr_ref; } +class new_delete_memory_resource { + public: + void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) + { + try { + return rmm::detail::aligned_host_allocate( + bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { + return ::operator new(size); + }); + } catch (std::bad_alloc const& e) { + RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + } + } + + void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + void* allocate_async(std::size_t bytes, + std::size_t alignment, + [[maybe_unused]] cuda::stream_ref stream) + { + return allocate(bytes, alignment); + } + + void deallocate(void* ptr, + std::size_t bytes, + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + { + rmm::detail::aligned_host_deallocate( + ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); + } + + void deallocate_async(void* ptr, + std::size_t bytes, + std::size_t alignment, + cuda::stream_ref stream) noexcept + { + deallocate(ptr, bytes, alignment); + } + + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + { + deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); + } + + bool operator==(new_delete_memory_resource const& other) const { return true; } + + bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); } + + friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {} +}; + +static_assert(cuda::mr::resource_with, + "Pinned pool mr must be accessible from both host and device"); + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() +{ + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; +} + } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -239,4 +303,10 @@ void set_allocate_host_as_pinned_threshold(size_t threshold) size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } +rmm::host_async_resource_ref get_host_memory_resource(size_t size) +{ + if (size <= get_allocate_host_as_pinned_threshold()) { return get_pinned_memory_resource(); } + return get_pageable_memory_resource(); +} + } // namespace cudf From 49d65b86635e7487c8698c38e65177d646df7742 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:11:57 -0700 Subject: [PATCH 48/75] use if/else --- cpp/src/utilities/cuda_memcpy.cu | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu 
b/cpp/src/utilities/cuda_memcpy.cu index 5a32f73f236..3d0822d8545 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -52,16 +52,12 @@ void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_strea void cuda_memcpy_async( void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream) { - switch (kind) { - case host_memory_kind::PINNED: { - copy_pinned(dst, src, size, stream); - break; - } - case host_memory_kind::PAGEABLE: - default: { - copy_pageable(dst, src, size, stream); - break; - } + if (kind == host_memory_kind::PINNED) { + copy_pinned(dst, src, size, stream); + } else if (kind == host_memory_kind::PAGEABLE) { + copy_pageable(dst, src, size, stream); + } else { + CUDF_FAIL("Unsupported host memory kind"); } } From db45aa77b9c82eb61d42990eedd7b1a6a21f9263 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 26 Jun 2024 18:55:28 -0700 Subject: [PATCH 49/75] rename back :D --- cpp/benchmarks/fixture/nvbench_fixture.hpp | 2 +- cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp | 2 +- cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp | 2 +- cpp/include/cudf/detail/utilities/host_memory.hpp | 0 cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- .../cudf/utilities/{host_memory.hpp => pinned_memory.hpp} | 0 cpp/src/utilities/cuda_memcpy.cu | 2 +- cpp/src/utilities/host_memory.cpp | 2 +- cpp/tests/io/json_test.cpp | 2 +- cpp/tests/utilities_tests/pinned_memory_tests.cpp | 2 +- java/src/main/native/src/RmmJni.cpp | 2 +- 11 files changed, 9 insertions(+), 9 deletions(-) create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp rename cpp/include/cudf/utilities/{host_memory.hpp => pinned_memory.hpp} (100%) diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp index 699844afe62..df1492690bb 100644 --- a/cpp/benchmarks/fixture/nvbench_fixture.hpp +++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp index d3574985bc1..aa0ee39a179 100644 --- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp index 71ce265e066..b4c8ed78ed8 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 41ec6ae7e16..20cb55bb1c7 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/utilities/host_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp similarity index 100% rename from cpp/include/cudf/utilities/host_memory.hpp rename to 
cpp/include/cudf/utilities/pinned_memory.hpp diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index fca6385ffbf..3d0822d8545 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,7 +16,7 @@ #include #include -#include +#include #include diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 8125c851b71..b2f76e7dd78 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 1ac68b859e9..0ee139b4787 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index 5b81930b2c7..df9103640f4 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp index 706e478842d..5842a980fc4 100644 --- a/java/src/main/native/src/RmmJni.cpp +++ b/java/src/main/native/src/RmmJni.cpp @@ -16,7 +16,7 @@ #include "cudf_jni_apis.hpp" -#include +#include #include #include From 5a072cfdb12bbab499d4aebb8639d921b9ed798f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 27 Jun 2024 11:47:53 -0700 Subject: [PATCH 50/75] working make_host_vector --- .../cudf/detail/utilities/host_memory.hpp | 46 +++++++++++++++++++ .../cudf/detail/utilities/host_vector.hpp | 13 +++++- .../detail/utilities/vector_factories.hpp | 23 +++++++--- cpp/include/cudf/utilities/pinned_memory.hpp | 8 ---- cpp/src/utilities/host_memory.cpp | 20 ++++---- 5 files changed, 82 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index e69de29bb2d..b1a51ed660e 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cudf::detail { + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource(); + +/** + * @brief Get the rmm resource to be used for host memory allocations. 
+ * + * @param size The size of the allocation + * @return The rmm resource to be used for host memory allocations + */ +template +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) +{ + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), _stream}; + } + return {get_pageable_memory_resource(), _stream}; +} + +} // namespace cudf::detail \ No newline at end of file diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 2d14d0306cd..e688d90a760 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -100,8 +100,14 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ - rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream) - : mr(_mr), stream(_stream) + template + rmm_host_allocator(cuda::mr::async_resource_ref _mr, + rmm::cuda_stream_view _stream) + : mr(_mr), + stream(_stream), + _is_device_accessible{ + cuda::has_property, + cuda::mr::device_accessible>} { } @@ -173,9 +179,12 @@ class rmm_host_allocator { */ inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); } + bool is_device_accessible() const { return _is_device_accessible; } + private: rmm::host_async_resource_ref mr; rmm::cuda_stream_view stream; + bool _is_device_accessible; }; /** diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 20cb55bb1c7..306aa8e2f77 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -21,6 +21,8 @@ * @file vector_factories.hpp */ +#include +#include #include #include #include @@ -373,9 +375,16 @@ std::vector make_std_vector_sync(Container const * @return The data copied to the host */ template -thrust::host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + auto result = host_vector(v.size(), get_host_allocator(v.size(), stream)); + auto const is_pinned = result.get_allocator().is_device_accessible(); + cuda_memcpy_async(result.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return result; } /** @@ -394,8 +403,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_async( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_async(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_async(device_span{c}, stream); } @@ -412,7 +421,7 @@ thrust::host_vector make_host_vector_async( * @return The data copied to the host */ template -thrust::host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_view stream) { auto result = make_host_vector_async(v, stream); stream.synchronize(); @@ -435,8 +444,8 @@ template < typename Container, std::enable_if_t< std::is_convertible_v>>* = nullptr> -thrust::host_vector make_host_vector_sync( - Container const& c, rmm::cuda_stream_view stream) +host_vector make_host_vector_sync(Container const& c, + rmm::cuda_stream_view stream) { return make_host_vector_sync(device_span{c}, stream); } diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 1db747b12a3..7a9e48f443c 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -87,12 +87,4 @@ void set_allocate_host_as_pinned_threshold(size_t threshold); */ size_t get_allocate_host_as_pinned_threshold(); -/** - * @brief Get the rmm resource to be used for host memory allocations. - * - * @param size The size of the allocation - * @return The rmm resource to be used for host memory allocations - */ -rmm::host_async_resource_ref get_host_memory_resource(size_t size); - } // namespace cudf diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index b2f76e7dd78..b816b9f4e2e 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -243,13 +243,6 @@ class new_delete_memory_resource { static_assert(cuda::mr::resource_with, "Pinned pool mr must be accessible from both host and device"); -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() -{ - static new_delete_memory_resource mr{}; - static rmm::host_async_resource_ref mr_ref{mr}; - return mr_ref; -} - } // namespace rmm::host_device_async_resource_ref set_pinned_memory_resource( @@ -292,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 0; + static std::atomic threshold = 20; return threshold; } @@ -303,10 +296,15 @@ void set_allocate_host_as_pinned_threshold(size_t threshold) size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); } -rmm::host_async_resource_ref get_host_memory_resource(size_t size) +namespace detail { + +CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() { - if (size <= get_allocate_host_as_pinned_threshold()) { return get_pinned_memory_resource(); } - return get_pageable_memory_resource(); + static new_delete_memory_resource mr{}; + static rmm::host_async_resource_ref mr_ref{mr}; + return mr_ref; } +} // namespace detail + } // namespace cudf From dd93448238b329178e008a29fb70ecc2b0c40080 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 28 Jun 2024 11:23:38 -0700 Subject: [PATCH 51/75] auto --- 
cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 -- cpp/src/io/json/nested_json_gpu.cu | 6 ++---- cpp/src/lists/dremel.cu | 6 ++---- cpp/src/utilities/host_memory.cpp | 2 +- cpp/tests/io/json_tree.cpp | 6 ++---- cpp/tests/strings/integers_tests.cpp | 4 +--- 6 files changed, 8 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 306aa8e2f77..ea16ac0ef66 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -34,8 +34,6 @@ #include #include -#include - #include namespace cudf { diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 031edfde4f6..405084cc4ad 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1698,10 +1698,8 @@ void make_json_column(json_column& root_column, auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto tokens = cudf::detail::make_host_vector_async(d_tokens_gpu, stream); + auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index 5625e1bf05c..50f40924478 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col, }, stream); - thrust::host_vector column_offsets = - cudf::detail::make_host_vector_async(d_column_offsets, stream); - thrust::host_vector column_ends = - cudf::detail::make_host_vector_async(d_column_ends, stream); + auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream); + auto column_ends = cudf::detail::make_host_vector_async(d_column_ends, stream); stream.synchronize(); size_t max_vals_size = 0; diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index b816b9f4e2e..d3bcf7a085d 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -285,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 20; + static std::atomic threshold = 16 * 1024; return threshold; } diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index 7a72b77e1fb..8bcd5790e99 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu( { constexpr bool include_quote_char = true; // Copy the JSON tokens to the host - thrust::host_vector tokens = - cudf::detail::make_host_vector_async(tokens_gpu, stream); - thrust::host_vector token_indices = - cudf::detail::make_host_vector_async(token_indices_gpu1, stream); + auto tokens = cudf::detail::make_host_vector_async(tokens_gpu, stream); + auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream); // Make sure tokens have been copied to the host stream.synchronize(); diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp index 
51e9b3bd0a0..7a038fa6d75 100644 --- a/cpp/tests/strings/integers_tests.cpp +++ b/cpp/tests/strings/integers_tests.cpp @@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2)); h_integers.push_back(std::numeric_limits::min()); h_integers.push_back(std::numeric_limits::max()); - auto d_integers = cudf::detail::make_device_uvector_sync( + auto const d_integers = cudf::detail::make_device_uvector_sync( h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); auto integers = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id()}, (cudf::size_type)d_integers.size()); @@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger) // convert to strings auto results_strings = cudf::strings::from_integers(integers->view()); - // copy back to host - h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream()); std::vector h_strings; for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr) h_strings.push_back(std::to_string(*itr)); From 02e7bfb8fcc5a6138e80522402deb2bfabef312f Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Jul 2024 11:31:06 -0700 Subject: [PATCH 52/75] derive host_vector --- .../cudf/detail/utilities/host_vector.hpp | 6 +++- .../detail/utilities/vector_factories.hpp | 31 ++++++++++++++++++- cpp/include/cudf/utilities/span.hpp | 6 ++++ cpp/src/io/parquet/predicate_pushdown.cpp | 20 +++++++----- 4 files changed, 54 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index e688d90a760..71c5bc842c9 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -191,6 +191,10 @@ class rmm_host_allocator { * @brief A vector class with rmm host memory allocator */ template -using host_vector = thrust::host_vector>; +class host_vector : public thrust::host_vector> { + public: + using base = thrust::host_vector>; + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} +}; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index ea16ac0ef66..f4a421138f1 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -181,6 +181,21 @@ rmm::device_uvector make_device_uvector_async( device_span{c}, stream, mr); } +template +rmm::device_uvector make_device_uvector_async(host_vector const& v, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + rmm::device_uvector ret(v.size(), stream, mr); + auto const is_pinned = v.get_allocator().is_device_accessible(); + cuda_memcpy_async(ret.data(), + v.data(), + v.size() * sizeof(T), + is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); + return ret; +} + /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` @@ -361,6 +376,20 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +template +host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) +{ + return host_vector(size, get_host_allocator(size, stream)); +} + +template +host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) +{ + auto result = host_vector(0, get_host_allocator(capacity, stream)); + result.reserve(capacity); + return result; +} + /** * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` @@ -375,7 +404,7 @@ std::vector make_std_vector_sync(Container const template host_vector make_host_vector_async(device_span v, rmm::cuda_stream_view stream) { - auto result = host_vector(v.size(), get_host_allocator(v.size(), stream)); + auto result = make_host_vector(v.size(), stream); auto const is_pinned = result.get_allocator().is_device_accessible(); cuda_memcpy_async(result.data(), v.data(), diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 3b35e60e034..34e39d01a6a 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -16,6 +16,8 @@ #pragma once +#include + #include #include #include @@ -216,6 +218,10 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; +template +struct is_host_span_supported_container< // + cudf::detail::host_vector> : std::true_type {}; + template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp index 11f4a00ee8b..481c1e9fcdd 100644 --- a/cpp/src/io/parquet/predicate_pushdown.cpp +++ b/cpp/src/io/parquet/predicate_pushdown.cpp @@ -141,11 +141,11 @@ struct stats_caster { // Local struct to hold host columns struct host_column { // using thrust::host_vector because std::vector uses bitmap instead of byte per bool. - thrust::host_vector val; + cudf::detail::host_vector val; std::vector null_mask; cudf::size_type null_count = 0; - host_column(size_type total_row_groups) - : val(total_row_groups), + host_column(size_type total_row_groups, rmm::cuda_stream_view stream) + : val{cudf::detail::make_host_vector(total_row_groups, stream)}, null_mask( cudf::util::div_rounding_up_safe( cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)), @@ -170,8 +170,14 @@ struct stats_caster { rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::vector chars{}; - std::vector offsets(1, 0); + auto const total_char_count = std::accumulate( + host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) { + return sum + str.size_bytes(); + }); + auto chars = cudf::detail::make_empty_host_vector(total_char_count, stream); + auto offsets = + cudf::detail::make_empty_host_vector(host_strings.size() + 1, stream); + offsets.push_back(0); for (auto const& str : host_strings) { auto tmp = str.empty() ? 
std::string_view{} : std::string_view(str.data(), str.size_bytes()); @@ -206,8 +212,8 @@ struct stats_caster { null_count); } }; // local struct host_column - host_column min(total_row_groups); - host_column max(total_row_groups); + host_column min(total_row_groups, stream); + host_column max(total_row_groups, stream); size_type stats_idx = 0; for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { for (auto const rg_idx : row_group_indices[src_idx]) { From ef4e1de5402c4db1f428cd0436f214c41d00c3b5 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 3 Jul 2024 11:31:32 -0700 Subject: [PATCH 53/75] use host_vector pt2 --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/src/io/orc/writer_impl.cu | 23 ++++---- cpp/src/io/parquet/reader_impl_chunking.cu | 61 ++++++++++++---------- 3 files changed, 46 insertions(+), 40 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 5977c7341c1..04dbe0a9294 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - std::vector target_masks(target.size()); + auto target_masks = cudf::detail::make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index e9e031a407a..409bf91997a 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -1335,7 +1335,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer, if (num_file_blobs == 0) { return {}; } // Create empty file stats and merge groups - std::vector h_stat_chunks(num_file_blobs); + auto h_stat_chunks = cudf::detail::make_host_vector(num_file_blobs, stream); cudf::detail::hostdevice_vector stats_merge(num_file_blobs, stream); // Fill in stats_merge and stat_chunks on the host for (auto i = 0u; i < num_file_blobs; ++i) { @@ -1676,39 +1676,39 @@ struct pushdown_null_masks { // Owning vector for masks in device memory std::vector> data; // Pointers to pushdown masks in device memory. Can be same for multiple columns. - std::vector masks; + cudf::detail::host_vector masks; }; pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, rmm::cuda_stream_view stream) { - std::vector mask_ptrs; - mask_ptrs.reserve(orc_table.num_columns()); + auto mask_ptrs = + cudf::detail::make_empty_host_vector(orc_table.num_columns(), stream); std::vector> pd_masks; for (auto const& col : orc_table.columns) { // Leaf columns don't need pushdown masks if (col.num_children() == 0) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } auto const parent_pd_mask = col.is_child() ? 
mask_ptrs[col.parent_index()] : nullptr; auto const null_mask = col.null_mask(); if (null_mask == nullptr and parent_pd_mask == nullptr) { - mask_ptrs.emplace_back(nullptr); + mask_ptrs.push_back({nullptr}); continue; } if (col.orc_kind() == STRUCT) { if (null_mask != nullptr and parent_pd_mask == nullptr) { // Reuse own null mask - mask_ptrs.emplace_back(null_mask); + mask_ptrs.push_back(null_mask); } else if (null_mask == nullptr and parent_pd_mask != nullptr) { // Reuse parent's pushdown mask - mask_ptrs.emplace_back(parent_pd_mask); + mask_ptrs.push_back(parent_pd_mask); } else { // Both are nullable, allocate new pushdown mask pd_masks.emplace_back(num_bitmask_words(col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); thrust::transform(rmm::exec_policy(stream), null_mask, @@ -1723,7 +1723,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table, auto const child_col = orc_table.column(col.child_begin()[0]); // pushdown mask applies to child column(s); use the child column size pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream); - mask_ptrs.emplace_back(pd_masks.back().data()); + mask_ptrs.push_back({pd_masks.back().data()}); pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream); } } @@ -1814,8 +1814,7 @@ orc_table_view make_orc_table_view(table_view const& table, append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]); } - std::vector type_kinds; - type_kinds.reserve(orc_columns.size()); + auto type_kinds = cudf::detail::make_empty_host_vector(orc_columns.size(), stream); std::transform( orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) { return orc_column.orc_kind(); diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index d371ef5de93..5fba54ab309 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -804,10 +804,10 @@ std::vector compute_page_splits_by_row(device_span> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); + auto comp_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto comp_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any std::vector> copy_in; @@ -822,7 +822,6 @@ std::vector compute_page_splits_by_row(device_span compute_page_splits_by_row(device_span(page.compressed_page_size - offset)); - comp_out.emplace_back(dst_base + offset, - static_cast(page.uncompressed_page_size - offset)); + comp_in.push_back( + {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); + comp_out.push_back( + {dst_base + offset, static_cast(page.uncompressed_page_size - offset)}); page.page_data = dst_base; decomp_offset += page.uncompressed_page_size; }); + } + auto d_comp_in = cudf::detail::make_device_uvector_async( + comp_in, stream, rmm::mr::get_current_device_resource()); + auto d_comp_out = cudf::detail::make_device_uvector_async( + comp_out, stream, rmm::mr::get_current_device_resource()); + + int32_t start_pos = 0; + for (auto const& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + device_span const> d_comp_in_view{d_comp_in.data() + start_pos, + codec.num_pages}; + + device_span const> d_comp_out_view(d_comp_out.data() + start_pos, + codec.num_pages); - host_span const> 
comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async( - comp_in_view, stream, rmm::mr::get_current_device_resource()); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async( - comp_out_view, stream, rmm::mr::get_current_device_resource()); device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); switch (codec.compression_type) { case GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + gpuinflate( + d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream); break; case SNAPPY: if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream); } break; case ZSTD: nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, + d_comp_in_view, + d_comp_out_view, d_comp_res_view, codec.max_decompressed_size, codec.total_decomp_size, stream); break; case BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, + gpu_debrotli(d_comp_in_view, + d_comp_out_view, d_comp_res_view, debrotli_scratch.data(), debrotli_scratch.size(), @@ -893,8 +900,8 @@ std::vector compute_page_splits_by_row(device_span Date: Fri, 5 Jul 2024 11:48:59 -0700 Subject: [PATCH 54/75] include changes --- cpp/include/cudf/detail/gather.cuh | 2 +- cpp/include/cudf/detail/null_mask.cuh | 4 +++- cpp/include/cudf/detail/utilities/host_vector.hpp | 3 +++ cpp/include/cudf/detail/utilities/vector_factories.hpp | 2 +- cpp/include/cudf/io/text/detail/trie.hpp | 4 ++-- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh index 04dbe0a9294..d3e9fc4974d 100644 --- a/cpp/include/cudf/detail/gather.cuh +++ b/cpp/include/cudf/detail/gather.cuh @@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source, } // Make device array of target bitmask pointers - auto target_masks = cudf::detail::make_host_vector(target.size(), stream); + auto target_masks = make_host_vector(target.size(), stream); std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) { return col->mutable_view().null_mask(); }); diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh index e62675cbc8c..ae6db5409cc 100644 --- a/cpp/include/cudf/detail/null_mask.cuh +++ b/cpp/include/cudf/detail/null_mask.cuh @@ -430,7 +430,9 @@ std::vector segmented_count_bits(bitmask_type const* bitmask, if (num_segments == 0) { return std::vector{}; } // Construct a contiguous host buffer of indices and copy to device. 
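The replacement shown next follows the reserve-then-fill pattern enabled by the factories introduced earlier in this series. As a self-contained sketch under assumed names (fill_and_upload and num_items are illustrative, not from the patch):

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

rmm::device_uvector<cudf::size_type> fill_and_upload(cudf::size_type num_items,
                                                     rmm::cuda_stream_view stream)
{
  // Host memory is pinned or pageable depending on the allocate-host-as-pinned threshold.
  auto h_vals = cudf::detail::make_empty_host_vector<cudf::size_type>(num_items, stream);
  for (cudf::size_type i = 0; i < num_items; ++i) {
    h_vals.push_back(i);  // capacity was reserved up front, so no reallocation
  }
  // The host_vector overload checks is_device_accessible() to pick the copy path.
  return cudf::detail::make_device_uvector_async(
    h_vals, stream, rmm::mr::get_current_device_resource());
}
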
- auto const h_indices = std::vector(indices_begin, indices_end); + auto h_indices = make_empty_host_vector::value_type>( + std::distance(indices_begin, indices_end), stream); + std::copy(indices_begin, indices_end, std::back_inserter(h_indices)); auto const d_indices = make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index 71c5bc842c9..b99e79b2e88 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -194,6 +194,9 @@ template class host_vector : public thrust::host_vector> { public: using base = thrust::host_vector>; + + host_vector(rmm_host_allocator const& alloc) : base(alloc) {} + host_vector(size_t size, rmm_host_allocator const& alloc) : base(size, alloc) {} }; diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index f4a421138f1..3f29d9d7a33 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -385,7 +385,7 @@ host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) template host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) { - auto result = host_vector(0, get_host_allocator(capacity, stream)); + auto result = host_vector(get_host_allocator(capacity, stream)); result.reserve(capacity); return result; } diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp index e0b9c7635e3..28862d97ede 100644 --- a/cpp/include/cudf/io/text/detail/trie.hpp +++ b/cpp/include/cudf/io/text/detail/trie.hpp @@ -223,11 +223,11 @@ struct trie { match_length.emplace_back(0); - std::vector trie_nodes; auto token_counts = std::unordered_map(); + auto trie_nodes = cudf::detail::make_empty_host_vector(tokens.size(), stream); for (uint32_t i = 0; i < tokens.size(); i++) { - trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]}); + trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]}); token_counts[tokens[i]]++; } From 58900ddd3b3f98d4d699b35b52ff5e7ba5f1a4f4 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:47:38 -0700 Subject: [PATCH 55/75] orc --- cpp/src/io/orc/reader_impl_decode.cu | 10 ++++++++-- cpp/src/io/orc/stripe_enc.cu | 4 ++-- cpp/src/io/orc/writer_impl.cu | 25 +++++++++++++++---------- cpp/src/io/orc/writer_impl.hpp | 9 ++++----- 4 files changed, 29 insertions(+), 19 deletions(-) diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu index 72eb41b1360..ab3c54584cb 100644 --- a/cpp/src/io/orc/reader_impl_decode.cu +++ b/cpp/src/io/orc/reader_impl_decode.cu @@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector const& if (num_stripes == 0) return; auto const num_columns = chunks.size().second; - std::vector> prefix_sums_to_update; + auto const num_struct_cols = + std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) { + return chunk.type_kind == STRUCT; + }); + auto prefix_sums_to_update = + cudf::detail::make_empty_host_vector>(num_struct_cols, + stream); for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) { // Null counts sums are only needed for children of struct columns if (chunks[0][col_idx].type_kind == STRUCT) { - prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * 
col_idx); + prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx}); } } auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async( diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index b6fc4e3510f..8b06fd05cb0 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan rg_bounds, if (rg_bounds.count() == 0) return; // Convert map to a vector of views of the `elem_sizes` device buffers - std::vector h_sizes; - h_sizes.reserve(elem_sizes.size()); + auto h_sizes = + cudf::detail::make_empty_host_vector(elem_sizes.size(), stream); std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) { return decimal_column_element_sizes{p.first, p.second}; }); diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 409bf91997a..ba1a4eef99f 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -443,14 +443,15 @@ namespace { */ file_segmentation calculate_segmentation(host_span columns, hostdevice_2dvector&& rowgroup_bounds, - stripe_size_limits max_stripe_size) + stripe_size_limits max_stripe_size, + rmm::cuda_stream_view stream) { - std::vector infos; - auto const num_rowgroups = rowgroup_bounds.size().first; - size_t stripe_start = 0; - size_t stripe_bytes = 0; - size_type stripe_rows = 0; - for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { + auto infos = cudf::detail::make_empty_host_vector(1, stream); + size_type const num_rowgroups = rowgroup_bounds.size().first; + size_type stripe_start = 0; + size_t stripe_bytes = 0; + size_type stripe_rows = 0; + for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) { auto const rowgroup_total_bytes = std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) { auto const rows = rowgroup_bounds[rg_idx][col.index()].size(); @@ -469,7 +470,9 @@ file_segmentation calculate_segmentation(host_span column // Check if adding the current rowgroup to the stripe will make the stripe too large or long if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes || stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) { - infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(rg_idx - stripe_start)}); stripe_start = rg_idx; stripe_bytes = 0; stripe_rows = 0; @@ -478,7 +481,9 @@ file_segmentation calculate_segmentation(host_span column stripe_bytes += rowgroup_total_bytes; stripe_rows += rowgroup_rows_max; if (rg_idx + 1 == num_rowgroups) { - infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start); + infos.push_back(stripe_rowgroups{static_cast(infos.size()), + stripe_start, + static_cast(num_rowgroups - stripe_start)}); } } @@ -2297,7 +2302,7 @@ auto convert_table_to_orc_data(table_view const& input, // Decide stripe boundaries based on rowgroups and char counts auto segmentation = - calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size); + calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream); auto stripe_dicts = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream); auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp index 
bd082befe0c..f5f8b3cfed9 100644 --- a/cpp/src/io/orc/writer_impl.hpp +++ b/cpp/src/io/orc/writer_impl.hpp @@ -78,10 +78,9 @@ struct orc_table_view { * Provides a container-like interface to iterate over rowgroup indices. */ struct stripe_rowgroups { - uint32_t id; // stripe id - uint32_t first; // first rowgroup in the stripe - uint32_t size; // number of rowgroups in the stripe - stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {} + size_type id; // stripe id + size_type first; // first rowgroup in the stripe + size_type size; // number of rowgroups in the stripe [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); } [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); } }; @@ -125,7 +124,7 @@ class orc_streams { */ struct file_segmentation { hostdevice_2dvector rowgroups; - std::vector stripes; + cudf::detail::host_vector stripes; auto num_rowgroups() const noexcept { return rowgroups.size().first; } auto num_stripes() const noexcept { return stripes.size(); } From 395898a82abd2f5cf1eb7599037ef35cc884d934 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:51:55 -0700 Subject: [PATCH 56/75] copying --- cpp/src/copying/concatenate.cu | 6 +++--- cpp/src/copying/contiguous_split.cu | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu index 47e74a5cb48..b66e5cab333 100644 --- a/cpp/src/copying/concatenate.cu +++ b/cpp/src/copying/concatenate.cu @@ -73,8 +73,8 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi }); // Assemble contiguous array of device views - auto device_views = thrust::host_vector(); - device_views.reserve(views.size()); + auto device_views = + cudf::detail::make_empty_host_vector(views.size(), stream); std::transform(device_view_owners.cbegin(), device_view_owners.cend(), std::back_inserter(device_views), @@ -84,7 +84,7 @@ auto create_device_views(host_span views, rmm::cuda_stream_vi make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource()); // Compute the partition offsets - auto offsets = thrust::host_vector(views.size() + 1); + auto offsets = cudf::detail::make_host_vector(views.size() + 1, stream); thrust::transform_inclusive_scan( thrust::host, device_views.cbegin(), diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index 37db2c74790..95544742fb7 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -1539,7 +1539,8 @@ std::unique_ptr chunk_iteration_state::create( std::vector num_batches_per_iteration; std::vector size_of_batches_per_iteration; - std::vector accum_size_per_iteration; + auto accum_size_per_iteration = + cudf::detail::make_empty_host_vector(h_offsets.size(), stream); std::size_t accum_size = 0; { auto current_offset_it = h_offsets.begin(); From be916f9c2e100cec2e25ec9d68e177f35c404402 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 13:55:52 -0700 Subject: [PATCH 57/75] few more --- cpp/src/datetime/timezone.cpp | 6 ++---- cpp/src/dictionary/detail/concatenate.cu | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 1b0d201501b..070b2f1a77e 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -485,14 +485,12 @@ std::unique_ptr make_timezone_transition_table(std::optional ttimes_typed; - 
ttimes_typed.reserve(transition_times.size()); + auto ttimes_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(transition_times.cbegin(), transition_times.cend(), std::back_inserter(ttimes_typed), [](auto ts) { return timestamp_s{duration_s{ts}}; }); - std::vector offsets_typed; - offsets_typed.reserve(offsets.size()); + auto offsets_typed = make_empty_host_vector(transition_times.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu index fdc3d9d0ecf..72828309425 100644 --- a/cpp/src/dictionary/detail/concatenate.cu +++ b/cpp/src/dictionary/detail/concatenate.cu @@ -105,7 +105,7 @@ struct compute_children_offsets_fn { */ rmm::device_uvector create_children_offsets(rmm::cuda_stream_view stream) { - std::vector offsets(columns_ptrs.size()); + auto offsets = cudf::detail::make_host_vector(columns_ptrs.size(), stream); thrust::transform_exclusive_scan( thrust::host, columns_ptrs.begin(), From 2225e3b0d28aa4492303ffb16c1e4e12f0b95724 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 14:04:32 -0700 Subject: [PATCH 58/75] partial IO --- cpp/src/io/avro/reader_impl.cu | 8 ++-- cpp/src/io/csv/reader_impl.cu | 67 ++++++++++++++++++++-------------- cpp/src/io/json/json_column.cu | 4 +- cpp/src/io/json/read_json.cu | 3 +- 4 files changed, 48 insertions(+), 34 deletions(-) diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu index 814efe2b5a1..69a0e982a5b 100644 --- a/cpp/src/io/avro/reader_impl.cu +++ b/cpp/src/io/avro/reader_impl.cu @@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr&& source, auto d_global_dict_data = rmm::device_uvector(0, stream); if (total_dictionary_entries > 0) { - auto h_global_dict = std::vector(total_dictionary_entries); - auto h_global_dict_data = std::vector(dictionary_data_size); - size_t dict_pos = 0; + auto h_global_dict = + cudf::detail::make_host_vector(total_dictionary_entries, stream); + auto h_global_dict_data = + cudf::detail::make_host_vector(dictionary_data_size, stream); + size_t dict_pos = 0; for (size_t i = 0; i < column_types.size(); ++i) { auto const col_idx = selected_columns[i].first; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 05faded651d..9a3d777593b 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -506,7 +506,7 @@ void get_data_types_from_column_names(std::map const& us } void infer_column_types(parse_options const& parse_opts, - host_span column_flags, + cudf::detail::host_vector const& column_flags, device_span data, device_span row_offsets, int32_t num_records, @@ -566,17 +566,18 @@ void infer_column_types(parse_options const& parse_opts, } } -std::vector decode_data(parse_options const& parse_opts, - std::vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - host_span column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::vector decode_data( + parse_options const& parse_opts, + cudf::detail::host_vector const& column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + cudf::detail::host_vector const& column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + 
rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; @@ -592,8 +593,8 @@ std::vector decode_data(parse_options const& parse_opts, } } - thrust::host_vector h_data(num_active_columns); - thrust::host_vector h_valid(num_active_columns); + auto h_data = cudf::detail::make_host_vector(num_active_columns, stream); + auto h_valid = cudf::detail::make_host_vector(num_active_columns, stream); for (int i = 0; i < num_active_columns; ++i) { h_data[i] = out_buffers[i].data(); @@ -622,14 +623,16 @@ std::vector decode_data(parse_options const& parse_opts, return out_buffers; } -std::vector determine_column_types(csv_reader_options const& reader_opts, - parse_options const& parse_opts, - host_span column_names, - device_span data, - device_span row_offsets, - int32_t num_records, - host_span column_flags, - rmm::cuda_stream_view stream) +cudf::detail::host_vector determine_column_types( + csv_reader_options const& reader_opts, + parse_options const& parse_opts, + host_span column_names, + device_span data, + device_span row_offsets, + int32_t num_records, + cudf::detail::host_vector& column_flags, + cudf::size_type num_active_columns, + rmm::cuda_stream_view stream) { std::vector column_types(column_flags.size()); @@ -653,7 +656,8 @@ std::vector determine_column_types(csv_reader_options const& reader_o stream); // compact column_types to only include active columns - std::vector active_col_types; + auto active_col_types = + cudf::detail::make_empty_host_vector(num_active_columns, stream); std::copy_if(column_types.cbegin(), column_types.cend(), std::back_inserter(active_col_types), @@ -697,8 +701,10 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const num_actual_columns = static_cast(column_names.size()); auto num_active_columns = num_actual_columns; - auto column_flags = std::vector( - num_actual_columns, column_parse::enabled | column_parse::inferred); + auto column_flags = + cudf::detail::make_host_vector(num_actual_columns, stream); + std::fill( + column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred); // User did not pass column names to override names in the file // Process names from the file to remove empty and duplicated strings @@ -842,8 +848,15 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Exclude the end-of-data row from number of rows with actual data auto const num_records = std::max(row_offsets.size(), 1ul) - 1; - auto const column_types = determine_column_types( - reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); + auto const column_types = determine_column_types(reader_opts, + parse_opts, + column_names, + data, + row_offsets, + num_records, + column_flags, + num_active_columns, + stream); auto metadata = table_metadata{}; auto out_columns = std::vector>(); diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 3e587768b11..17fa7abdffe 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -622,7 +622,7 @@ void make_device_json_column(device_span input, // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - std::vector ignore_vals(num_columns, 0); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); std::vector 
is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); columns.try_emplace(parent_node_sentinel, std::ref(root)); @@ -812,7 +812,7 @@ void make_device_json_column(device_span input, return thrust::get<1>(a) < thrust::get<1>(b); }); // move columns data to device. - std::vector columns_data(num_columns); + auto columns_data = cudf::detail::make_host_vector(num_columns, stream); for (auto& [col_id, col_ref] : columns) { if (col_id == parent_node_sentinel) continue; auto& col = col_ref.get(); diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 74001e5e01a..0f486457452 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -78,10 +78,9 @@ device_span ingest_raw_input(device_span buffer, auto constexpr num_delimiter_chars = 1; if (compression == compression_type::NONE) { - std::vector delimiter_map{}; + auto delimiter_map = cudf::detail::make_empty_host_vector(sources.size(), stream); std::vector prefsum_source_sizes(sources.size()); std::vector> h_buffers; - delimiter_map.reserve(sources.size()); size_t bytes_read = 0; std::transform_inclusive_scan(sources.begin(), sources.end(), From 0446d345f636975045d333507633ff6658a0129c Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 15:38:53 -0700 Subject: [PATCH 59/75] parquet --- cpp/src/io/parquet/reader_impl_chunking.cu | 17 ++++++++--------- cpp/src/io/parquet/reader_impl_preprocess.cu | 10 +++++----- cpp/src/io/parquet/writer_impl.cu | 9 ++++++--- 3 files changed, 19 insertions(+), 17 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu index 5fba54ab309..2c560049e45 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.cu +++ b/cpp/src/io/parquet/reader_impl_chunking.cu @@ -810,10 +810,10 @@ std::vector compute_page_splits_by_row(device_span>(num_comp_pages, stream); // vectors to save v2 def and rep level data, if any - std::vector> copy_in; - copy_in.reserve(num_comp_pages); - std::vector> copy_out; - copy_out.reserve(num_comp_pages); + auto copy_in = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); + auto copy_out = + cudf::detail::make_empty_host_vector>(num_comp_pages, stream); rmm::device_uvector comp_res(num_comp_pages, stream); thrust::fill(rmm::exec_policy_nosync(stream), @@ -835,8 +835,8 @@ std::vector compute_page_splits_by_row(device_span(offset)}); + copy_out.push_back({dst_base, static_cast(offset)}); } comp_in.push_back( {page.page_data + offset, static_cast(page.compressed_page_size - offset)}); @@ -1134,9 +1134,8 @@ void include_decompression_scratch_size(device_span chunk decomp_sum{}); // retrieve to host so we can call nvcomp to get compression scratch sizes - std::vector h_decomp_info = - cudf::detail::make_std_vector_sync(decomp_info, stream); - std::vector temp_cost(pages.size()); + auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream); + auto temp_cost = cudf::detail::make_host_vector(pages.size(), stream); thrust::transform(thrust::host, h_decomp_info.begin(), h_decomp_info.end(), diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index f28a7311ccb..18290432aca 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -370,7 +370,7 @@ void fill_in_page_info(host_span chunks, rmm::cuda_stream_view stream) { auto const num_pages = pages.size(); - std::vector page_indexes(num_pages); + auto page_indexes = 
cudf::detail::make_host_vector(num_pages, stream); for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { auto const& chunk = chunks[c]; @@ -1031,8 +1031,8 @@ struct get_page_num_rows { }; struct input_col_info { - int const schema_idx; - size_type const nesting_depth; + int schema_idx; + size_type nesting_depth; }; /** @@ -1512,8 +1512,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num // compute output column sizes by examining the pages of the -input- columns if (has_lists) { - std::vector h_cols_info; - h_cols_info.reserve(_input_columns.size()); + auto h_cols_info = + cudf::detail::make_empty_host_vector(_input_columns.size(), _stream); std::transform(_input_columns.cbegin(), _input_columns.cend(), std::back_inserter(h_cols_info), diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index bed4dbc5a66..c26622db047 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1135,7 +1135,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector& f * @param stream CUDA stream used for device memory operations and kernel launches */ void calculate_page_fragments(device_span frag, - host_span frag_sizes, + cudf::detail::host_vector const& frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( @@ -1737,7 +1737,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type max_page_fragment_size = max_page_fragment_size_opt.value_or(default_max_page_fragment_size); - std::vector column_frag_size(num_columns, max_page_fragment_size); + auto column_frag_size = cudf::detail::make_host_vector(num_columns, stream); + std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size); if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) { std::vector column_sizes; @@ -1793,7 +1794,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta, size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end()); - std::vector part_frag_offset; // Store the idx of the first fragment in each partition + auto part_frag_offset = + cudf::detail::make_empty_host_vector(num_frag_in_part.size() + 1, stream); + // Store the idx of the first fragment in each partition std::exclusive_scan( num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0); part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back()); From 6a7ff7345336e75639fa7b5ea337f93b72a0d17b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Fri, 5 Jul 2024 21:58:48 -0700 Subject: [PATCH 60/75] rest of it --- cpp/include/cudf/lists/detail/dremel.hpp | 8 ++++---- cpp/src/strings/combine/join.cu | 6 ++++-- cpp/src/strings/convert/convert_datetime.cu | 2 +- cpp/src/strings/copying/concatenate.cu | 2 +- cpp/src/strings/filter_chars.cu | 2 +- cpp/src/strings/replace/multi_re.cu | 2 +- cpp/src/strings/translate.cu | 2 +- cpp/src/table/row_operators.cu | 5 ++++- 8 files changed, 17 insertions(+), 12 deletions(-) diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index d36a4091947..11f641a3fce 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -31,8 +31,8 @@ struct dremel_device_view { size_type const* offsets; uint8_t const* rep_levels; uint8_t const* def_levels; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + 
uint8_t max_def_level; }; /** @@ -45,8 +45,8 @@ struct dremel_data { rmm::device_uvector rep_level; rmm::device_uvector def_level; - size_type const leaf_data_size; - uint8_t const max_def_level; + size_type leaf_data_size; + uint8_t max_def_level; operator dremel_device_view() const { diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu index c4cc0dbe09d..b534e9b2e5b 100644 --- a/cpp/src/strings/combine/join.cu +++ b/cpp/src/strings/combine/join.cu @@ -169,8 +169,10 @@ std::unique_ptr join_strings(strings_column_view const& input, // build the offsets: single string output has offsets [0,chars-size] auto offsets_column = [&] { - auto offsets = cudf::detail::make_device_uvector_async( - std::vector({0, static_cast(chars.size())}), stream, mr); + auto h_offsets = cudf::detail::make_host_vector(2, stream); + h_offsets[0] = 0; + h_offsets[1] = chars.size(); + auto offsets = cudf::detail::make_device_uvector_async(h_offsets, stream, mr); return std::make_unique(std::move(offsets), rmm::device_buffer{}, 0); }(); diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu index 2f4ebf97264..64a2107e17a 100644 --- a/cpp/src/strings/convert/convert_datetime.cu +++ b/cpp/src/strings/convert/convert_datetime.cu @@ -123,7 +123,7 @@ struct format_compiler { : format(fmt), d_items(0, stream) { specifiers.insert(extra_specifiers.begin(), extra_specifiers.end()); - std::vector items; + auto items = cudf::detail::make_empty_host_vector(format.length(), stream); auto str = format.data(); auto length = format.length(); while (length > 0) { diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu index 7622e39e735..352e0f9f41a 100644 --- a/cpp/src/strings/copying/concatenate.cu +++ b/cpp/src/strings/copying/concatenate.cu @@ -79,7 +79,7 @@ auto create_strings_device_views(host_span views, rmm::cuda_s // Compute the partition offsets and size of offset column // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type - auto input_offsets = std::vector(views.size() + 1); + auto input_offsets = cudf::detail::make_host_vector(views.size() + 1, stream); auto offset_it = std::next(input_offsets.begin()); thrust::transform( thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t { diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu index a34828fa97e..48620af8cad 100644 --- a/cpp/src/strings/filter_chars.cu +++ b/cpp/src/strings/filter_chars.cu @@ -129,7 +129,7 @@ std::unique_ptr filter_characters( // convert input table for copy to device memory size_type table_size = static_cast(characters_to_filter.size()); - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform( characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) { return char_range{entry.first, entry.second}; diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu index cd60a4296b9..31234ea42ec 100644 --- a/cpp/src/strings/replace/multi_re.cu +++ b/cpp/src/strings/replace/multi_re.cu @@ -171,7 +171,7 @@ std::unique_ptr replace_re(strings_column_view const& input, auto d_buffer = rmm::device_buffer(buffer_size, stream); // copy all the reprog_device instances to a device memory array - std::vector progs; + auto progs = cudf::detail::make_empty_host_vector(h_progs.size(), stream); std::transform(h_progs.begin(), h_progs.end(), std::back_inserter(progs), 
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu index 16b22d0de4c..a242b008a54 100644 --- a/cpp/src/strings/translate.cu +++ b/cpp/src/strings/translate.cu @@ -97,7 +97,7 @@ std::unique_ptr translate(strings_column_view const& strings, size_type table_size = static_cast(chars_table.size()); // convert input table - thrust::host_vector htable(table_size); + auto htable = cudf::detail::make_host_vector(table_size, stream); std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) { return translate_table{entry.first, entry.second}; }); diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu index 13c31e8ae4c..2969557c78f 100644 --- a/cpp/src/table/row_operators.cu +++ b/cpp/src/table/row_operators.cu @@ -308,7 +308,10 @@ auto decompose_structs(table_view table, auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream) { std::vector dremel_data; - std::vector dremel_device_views; + auto const num_list_columns = std::count_if( + table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; }); + auto dremel_device_views = + cudf::detail::make_empty_host_vector(num_list_columns, stream); for (auto const& col : table) { if (col.type().id() == type_id::LIST) { dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream)); From a0a6caac3782951788b96a6360eefa433d1e2015 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 8 Jul 2024 10:42:38 -0700 Subject: [PATCH 61/75] style --- cpp/include/cudf/detail/utilities/host_memory.hpp | 2 +- cpp/include/cudf/lists/detail/dremel.hpp | 2 +- cpp/tests/io/json_test.cpp | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index b1a51ed660e..f2500659d5f 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -43,4 +43,4 @@ rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view return {get_pageable_memory_resource(), _stream}; } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 11f641a3fce..53448424827 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 0ee139b4787..9c76c344157 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -222,8 +222,7 @@ std::string to_records_orient(std::vector> co } template -struct JsonFixedPointReaderTest : public JsonReaderTest { -}; +struct JsonFixedPointReaderTest : public JsonReaderTest {}; template struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest { @@ -1140,7 +1139,7 @@ TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers) "-33333333333333333333333", "-444444444444444444444444"}; std::vector greater_uint64_max = { - "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; + "18446744073709551615", "18446744073709551616", "18446744073709551617", "18446744073709551618"}; std::vector less_int64_min = { "-9223372036854775807", "-9223372036854775808", "-9223372036854775809", "-9223372036854775810"}; std::vector mixed_range = { @@ -1370,10 +1369,10 @@ TEST_F(JsonReaderTest, JsonLongString) "", // null "", // null "கார்த்தி", - "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF - "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF - "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF - "𰾑𱔈𲍉", // 30000-3FFFF + "CႮ≪ㇳ䍏凹沦王辿龸ꁗ믜스폶ﴠ", // 0000-FFFF + "𐀀𑿪𒐦𓃰𔙆 𖦆𗿿𘳕𚿾[↳] 𜽆𝓚𞤁🄰", // 10000-1FFFF + "𠘨𡥌𢗉𣇊𤊩𥅽𦉱𧴱𨁲𩁹𪐢𫇭𬬭𭺷𮊦屮", // 20000-2FFFF + "𰾑𱔈𲍉", // 30000-3FFFF R"("$€ \u0024\u20ac \\u0024\\u20ac \\\u0024\\\u20ac \\\\u0024\\\\u20ac)", R"( \\\\\\\\\\\\\\\\)", R"(\\\\\\\\\\\\\\\\)", From 7789e39a8308049b3882633cf1f52b0394071f15 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 9 Jul 2024 00:54:17 -0700 Subject: [PATCH 62/75] improve docs --- .../cudf/detail/utilities/host_memory.hpp | 14 ++-- .../detail/utilities/vector_factories.hpp | 69 +++++++++++++------ cpp/src/io/orc/writer_impl.cu | 2 + 3 files changed, 59 insertions(+), 26 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index f2500659d5f..fd82b584c7e 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -25,14 +25,18 @@ #include namespace cudf::detail { - -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource(); +/** + * @brief Get the memory resource to be used for pageable memory allocations. + * + * @return Reference to the pageable memory resource + */ +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); /** - * @brief Get the rmm resource to be used for host memory allocations. + * @brief Get the memory resource to be used for the host memory allocation. 
* - * @param size The size of the allocation - * @return The rmm resource to be used for host memory allocations + * @param size The number of elements of type T to allocate + * @return The memory resource to be used for the host memory allocation */ template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 3f29d9d7a33..26712369b7d 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -180,7 +180,18 @@ rmm::device_uvector make_device_uvector_async( return make_device_uvector_async( device_span{c}, stream, mr); } - +/** + * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a + * `host_vector` + * + * @note This function does not synchronize `stream` after the copy. + * + * @tparam T The type of the data to copy + * @param v The host_vector of data to deep copy + * @param stream The stream on which to allocate memory and perform the copy + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing the copied data + */ template rmm::device_uvector make_device_uvector_async(host_vector const& v, rmm::cuda_stream_view stream, @@ -286,21 +297,11 @@ rmm::device_uvector make_device_uvector_sync( return make_device_uvector_sync(device_span{c}, stream, mr); } -// Utility function template to allow copying to either a thrust::host_vector or std::vector -template -OutContainer make_vector_async(device_span v, rmm::cuda_stream_view stream) -{ - OutContainer result(v.size()); - CUDF_CUDA_TRY(cudaMemcpyAsync( - result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); - return result; -} - /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -310,14 +311,17 @@ OutContainer make_vector_async(device_span v, rmm::cuda_stream_view str template std::vector make_std_vector_async(device_span v, rmm::cuda_stream_view stream) { - return make_vector_async>(v, stream); + std::vector result(v.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync( + result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value())); + return result; } /** * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -339,7 +343,7 @@ std::vector make_std_vector_async(Container cons * @brief Synchronously construct a `std::vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -376,12 +380,32 @@ std::vector make_std_vector_sync(Container const return make_std_vector_sync(device_span{c}, stream); } +/** + * @brief Construct a `thrust::host_vector` of the given size. 
+ * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param size The number of elements in the created vector + * @param stream The stream on which to allocate memory + * @return A host_vector of the given size + */ template host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) { return host_vector(size, get_host_allocator(size, stream)); } +/** + * @brief Construct an empty `thrust::host_vector` with the given capacity. + * + * @note The returned vector may be using a pinned memory resource. + * + * @tparam T The type of the vector data + * @param capacity Initial capacity of the vector + * @param stream The stream on which to allocate memory + * @return A host_vector with the given capacity + */ template host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream) { @@ -394,7 +418,8 @@ host_vector make_empty_host_vector(size_t capacity, rmm::cuda_stream_view str * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -418,7 +443,8 @@ host_vector make_host_vector_async(device_span v, rmm::cuda_stream_v * @brief Asynchronously construct a `std::vector` containing a copy of data from a device * container * - * @note This function does not synchronize `stream`. + * @note This function does not synchronize `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -440,7 +466,8 @@ host_vector make_host_vector_async(Container con * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a * `device_span` * - * @note This function does a synchronize on `stream`. + * @note This function does a synchronize on `stream` after the copy. The returned vector may be + * using a pinned memory resource. * * @tparam T The type of the data to copy * @param source_data The device data to copy @@ -459,7 +486,7 @@ host_vector make_host_vector_sync(device_span v, rmm::cuda_stream_vi * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device * container * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. * * @tparam Container The type of the container to copy from * @tparam T The type of the data to copy @@ -480,7 +507,7 @@ host_vector make_host_vector_sync(Container cons /** * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function may not synchronize `stream`. + * @note This function may not synchronize `stream` after the copy. * * @tparam T The type of the vector data * @param size The number of elements in the created vector @@ -496,7 +523,7 @@ host_vector make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea /** * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size * - * @note This function synchronizes `stream`. + * @note This function synchronizes `stream` after the copy. 
* * @tparam T The type of the vector data * @param size The number of elements in the created vector diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index ba1a4eef99f..1aed3b8da7c 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -446,6 +446,8 @@ file_segmentation calculate_segmentation(host_span column stripe_size_limits max_stripe_size, rmm::cuda_stream_view stream) { + // Number of stripes is not known in advance. Only reserve a single element to use pinned memory + // resource if at all enabled. auto infos = cudf::detail::make_empty_host_vector(1, stream); size_type const num_rowgroups = rowgroup_bounds.size().first; size_type stripe_start = 0; From d55fb39098a7b55c21c1dfbc3e5ccb2598d85aeb Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 10 Jul 2024 12:45:16 -0700 Subject: [PATCH 63/75] add missing overload --- .../detail/utilities/vector_factories.hpp | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 26712369b7d..493e7f788b8 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -180,6 +180,7 @@ rmm::device_uvector make_device_uvector_async( return make_device_uvector_async( device_span{c}, stream, mr); } + /** * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a * `host_vector` @@ -207,6 +208,28 @@ rmm::device_uvector make_device_uvector_async(host_vector const& v, return ret; } +/** + * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a + * `host_vector` + * + * @note This function synchronizes `stream` after the copy. + * + * @tparam T The type of the data to copy + * @param v The host_vector of data to deep copy + * @param stream The stream on which to allocate memory and perform the copy + * @param mr The memory resource to use for allocating the returned device_uvector + * @return A device_uvector containing the copied data + */ +template +rmm::device_uvector make_device_uvector_sync(host_vector const& v, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto ret = make_device_uvector_async(v, stream, mr); + stream.synchronize(); + return ret; +} + /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` From d8f0e58e795d5ace560a051b3caf84d0de88569d Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 10 Jul 2024 17:56:51 -0700 Subject: [PATCH 64/75] typo fixes; clean up --- cpp/include/cudf/detail/utilities/host_vector.hpp | 8 +++++--- cpp/include/cudf/utilities/pinned_memory.hpp | 6 +++--- cpp/src/datetime/timezone.cpp | 2 +- cpp/src/utilities/host_memory.cpp | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index b99e79b2e88..f4e5f718da4 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -61,6 +61,10 @@ class rmm_host_allocator { }; }; +template +inline constexpr bool contains_property = + (cuda::std::is_same_v || ... || false); + /*! \p rmm_host_allocator is a CUDA-specific host memory allocator * that employs \c `rmm::host_async_resource_ref` for allocation. 
* @@ -105,9 +109,7 @@ class rmm_host_allocator { rmm::cuda_stream_view _stream) : mr(_mr), stream(_stream), - _is_device_accessible{ - cuda::has_property, - cuda::mr::device_accessible>} + _is_device_accessible{contains_property} { } diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp index 7a9e48f443c..fa7e1b35327 100644 --- a/cpp/include/cudf/utilities/pinned_memory.hpp +++ b/cpp/include/cudf/utilities/pinned_memory.hpp @@ -74,9 +74,9 @@ size_t get_kernel_pinned_copy_threshold(); /** * @brief Set the threshold size for allocating host memory as pinned memory. * - * @param threshold The threshold size in bytes. If the size of the allocation is less than this - * threshold, the memory will be allocated as pinned memory. If the size is greater than or equal - * to this threshold, the memory will be allocated as pageable memory. + * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to + * this threshold, the memory will be allocated as pinned memory. If the size is greater than this + * threshold, the memory will be allocated as pageable memory. */ void set_allocate_host_as_pinned_threshold(size_t threshold); diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp index 070b2f1a77e..7ca1b51df98 100644 --- a/cpp/src/datetime/timezone.cpp +++ b/cpp/src/datetime/timezone.cpp @@ -490,7 +490,7 @@ std::unique_ptr
make_timezone_transition_table(std::optional(transition_times.size(), stream); + auto offsets_typed = make_empty_host_vector(offsets.size(), stream); std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) { return duration_s{ts}; }); diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index d3bcf7a085d..53bcb00edc5 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -241,7 +241,7 @@ class new_delete_memory_resource { }; static_assert(cuda::mr::resource_with, - "Pinned pool mr must be accessible from both host and device"); + "Pageable pool mr must be accessible from the host"); } // namespace @@ -285,7 +285,7 @@ size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold( CUDF_EXPORT auto& allocate_host_as_pinned_threshold() { // use pageable memory for all host allocations - static std::atomic threshold = 16 * 1024; + static std::atomic threshold = 0; return threshold; } From b94d26c11f1503b5c6950f162fbb790cc1e9a420 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 15 Jul 2024 16:12:34 +0000 Subject: [PATCH 65/75] fix return type --- cpp/src/utilities/host_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 53bcb00edc5..23793641dc5 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -298,7 +298,7 @@ size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_ namespace detail { -CUDF_EXPORT rmm::host_async_resource_ref& get_pageable_memory_resource() +CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource() { static new_delete_memory_resource mr{}; static rmm::host_async_resource_ref mr_ref{mr}; From 0dfaee48439337789a6a4e6ebf370ffb1cdc8684 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 15 Jul 2024 16:37:30 +0000 Subject: [PATCH 66/75] remove noexcept on deallocates --- cpp/src/utilities/host_memory.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 23793641dc5..98b1edc1c4e 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) { pool_->deallocate_async(ptr, bytes, alignment, stream); @@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource { } } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream); } void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { deallocate_async(ptr, bytes, alignment, stream_); stream_.wait(); @@ -214,7 +214,7 @@ class new_delete_memory_resource { void deallocate(void* ptr, std::size_t bytes, - std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept + std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { rmm::detail::aligned_host_deallocate( ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); @@ -223,12 
+223,12 @@ class new_delete_memory_resource { void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) noexcept + cuda::stream_ref stream) { deallocate(ptr, bytes, alignment); } - void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept + void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) { deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT); } From 66da0018aa8cd0db80bf51517d96c1781bf379a7 Mon Sep 17 00:00:00 2001 From: vukasin Date: Wed, 17 Jul 2024 12:12:21 +0000 Subject: [PATCH 67/75] tests --- .../utilities_tests/pinned_memory_tests.cpp | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp index df9103640f4..93259fd63ee 100644 --- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp +++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp @@ -18,16 +18,33 @@ #include #include +#include #include +#include #include #include #include #include -class PinnedMemoryTest : public cudf::test::BaseFixture {}; +class PinnedMemoryTest : public cudf::test::BaseFixture { + size_t prev_copy_threshold; + size_t prev_alloc_threshold; -TEST(PinnedMemoryTest, MemoryResourceGetAndSet) + public: + PinnedMemoryTest() + : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()}, + prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()} + { + } + ~PinnedMemoryTest() override + { + cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold); + cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold); + } +}; + +TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet) { // Global environment for temporary files auto const temp_env = static_cast( @@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet) // reset memory resource back cudf::set_pinned_memory_resource(last_mr); } + +TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet) +{ + cudf::set_kernel_pinned_copy_threshold(12345); + EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet) +{ + cudf::set_allocate_host_as_pinned_threshold(12345); + EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345); +} + +TEST_F(PinnedMemoryTest, MakePinnedVector) +{ + cudf::set_allocate_host_as_pinned_threshold(0); + + // should always use pinned memory + { + auto const vec = cudf::detail::make_pinned_vector_async(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } +} + +TEST_F(PinnedMemoryTest, MakeHostVector) +{ + cudf::set_allocate_host_as_pinned_threshold(7); + + // allocate smaller than the threshold + { + auto const vec = cudf::detail::make_host_vector(1, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate the same size as the threshold + { + auto const vec = cudf::detail::make_host_vector(7, cudf::get_default_stream()); + EXPECT_TRUE(vec.get_allocator().is_device_accessible()); + } + + // allocate larger than the threshold + { + auto const vec = cudf::detail::make_host_vector(2, cudf::get_default_stream()); + EXPECT_FALSE(vec.get_allocator().is_device_accessible()); + } +} From bbf5f2968fdd6d20b5e5dcc461a1f7d56f7b401d Mon Sep 17 00:00:00 2001 From: vukasin Date: Wed, 17 Jul 2024 15:13:04 +0000 Subject: [PATCH 68/75] avoid copy_n --- cpp/src/utilities/cuda_memcpy.cu | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) 
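Rationale for this change: thrust::copy_n can dispatch to cudaMemcpyAsync internally, so the sub-threshold branch in copy_pinned did not reliably perform a kernel-based copy. Replacing it with a simple element-wise kernel guarantees that pinned transfers smaller than get_kernel_pinned_copy_threshold() are copied by a kernel rather than the copy engine. A minimal caller-side sketch of how the rest of the series consumes this API; the header path, namespace qualification of host_memory_kind, and the helper's name are illustrative assumptions, not part of this patch:

  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  // assumed location of cuda_memcpy_async / host_memory_kind in this series
  #include <cudf/detail/utilities/cuda_memcpy.hpp>
  #include <cudf/detail/utilities/host_vector.hpp>

  // Hypothetical helper: copy a detail::host_vector to device, picking the memory kind
  // from the allocator so pinned buffers take the copy_pinned path below.
  template <typename T>
  void copy_host_to_device(rmm::device_uvector<T>& dst,
                           cudf::detail::host_vector<T> const& src,
                           rmm::cuda_stream_view stream)
  {
    // Pinned source: copy_pinned uses the kernel below under the threshold,
    // and falls back to cudaMemcpyAsync above it.
    auto const kind = src.get_allocator().is_device_accessible()
                        ? cudf::detail::host_memory_kind::PINNED
                        : cudf::detail::host_memory_kind::PAGEABLE;
    cudf::detail::cuda_memcpy_async(dst.data(), src.data(), src.size() * sizeof(T), kind, stream);
  }
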
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index 3d0822d8545..ccfc7542c80 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "cudf/detail/utilities/integer_utils.hpp" + #include #include #include @@ -26,15 +28,21 @@ namespace cudf::detail { namespace { +__global__ void copy_kernel(char const* src, char* dst, size_t n) +{ + auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { dst[idx] = src[idx]; } +} + void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream) { if (size == 0) return; if (size < get_kernel_pinned_copy_threshold()) { - thrust::copy_n(rmm::exec_policy_nosync(stream), - static_cast(src), - size, - static_cast(dst)); + const int block_size = 256; + auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + copy_kernel<<>>( + static_cast(src), static_cast(dst), size); } else { CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream)); } From 6e39c35a6b24ef780c5930eb84c9f26dd53f4a60 Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 18:38:13 +0000 Subject: [PATCH 69/75] add is_device_accessible to span --- .../detail/utilities/vector_factories.hpp | 60 ++----------------- cpp/include/cudf/utilities/span.hpp | 34 +++++++++-- cpp/src/io/csv/reader_impl.cu | 4 +- 3 files changed, 38 insertions(+), 60 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 493e7f788b8..9e3b0fb0152 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -100,11 +100,12 @@ rmm::device_uvector make_device_uvector_async(host_span source_data, rmm::device_async_resource_ref mr) { rmm::device_uvector ret(source_data.size(), stream, mr); - CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(), - source_data.data(), - source_data.size() * sizeof(T), - cudaMemcpyDefault, - stream.value())); + auto const is_pinned = source_data.is_device_accessible(); + cuda_memcpy_async(ret.data(), + source_data.data(), + source_data.size() * sizeof(T), + is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE, + stream); return ret; } @@ -181,55 +182,6 @@ rmm::device_uvector make_device_uvector_async( device_span{c}, stream, mr); } -/** - * @brief Asynchronously construct a `device_uvector` containing a deep copy of data from a - * `host_vector` - * - * @note This function does not synchronize `stream` after the copy. - * - * @tparam T The type of the data to copy - * @param v The host_vector of data to deep copy - * @param stream The stream on which to allocate memory and perform the copy - * @param mr The memory resource to use for allocating the returned device_uvector - * @return A device_uvector containing the copied data - */ -template -rmm::device_uvector make_device_uvector_async(host_vector const& v, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - rmm::device_uvector ret(v.size(), stream, mr); - auto const is_pinned = v.get_allocator().is_device_accessible(); - cuda_memcpy_async(ret.data(), - v.data(), - v.size() * sizeof(T), - is_pinned ? 
host_memory_kind::PINNED : host_memory_kind::PAGEABLE, - stream); - return ret; -} - -/** - * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a - * `host_vector` - * - * @note This function synchronizes `stream` after the copy. - * - * @tparam T The type of the data to copy - * @param v The host_vector of data to deep copy - * @param stream The stream on which to allocate memory and perform the copy - * @param mr The memory resource to use for allocating the returned device_uvector - * @return A device_uvector containing the copied data - */ -template -rmm::device_uvector make_device_uvector_sync(host_vector const& v, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto ret = make_device_uvector_async(v, stream, mr); - stream.synchronize(); - return ret; -} - /** * @brief Synchronously construct a `device_uvector` containing a deep copy of data from a * `host_span` diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index 34e39d01a6a..c5054c733a7 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -218,10 +218,6 @@ template struct is_host_span_supported_container< // thrust::host_vector> : std::true_type {}; -template -struct is_host_span_supported_container< // - cudf::detail::host_vector> : std::true_type {}; - template struct is_host_span_supported_container< // std::basic_string, Alloc>> : std::true_type {}; @@ -263,6 +259,26 @@ struct host_span : public cudf::detail::span_base>* = nullptr> + constexpr host_span(cudf::detail::host_vector& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + + /// Constructor from a const host_vector + /// @param in The host_vector to construct the span from + template >* = nullptr> + constexpr host_span(cudf::detail::host_vector const& in) + : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()} + { + } + // Copy construction to support const conversion /// @param other The span to copy template const& us } void infer_column_types(parse_options const& parse_opts, - cudf::detail::host_vector const& column_flags, + host_span column_flags, device_span data, device_span row_offsets, int32_t num_records, @@ -630,7 +630,7 @@ cudf::detail::host_vector determine_column_types( device_span data, device_span row_offsets, int32_t num_records, - cudf::detail::host_vector& column_flags, + host_span column_flags, cudf::size_type num_active_columns, rmm::cuda_stream_view stream) { From c262c30231a5165ff83ff57d4ea3ac826b571a74 Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 19:04:41 +0000 Subject: [PATCH 70/75] pass host_span --- cpp/src/io/csv/reader_impl.cu | 23 +++++++++++------------ cpp/src/io/parquet/writer_impl.cu | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 9a4ecfdb0ab..40d4372ae9d 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -566,18 +566,17 @@ void infer_column_types(parse_options const& parse_opts, } } -std::vector decode_data( - parse_options const& parse_opts, - cudf::detail::host_vector const& column_flags, - std::vector const& column_names, - device_span data, - device_span row_offsets, - cudf::detail::host_vector const& column_types, - int32_t num_records, - int32_t num_actual_columns, - int32_t num_active_columns, - rmm::cuda_stream_view stream, - 
rmm::device_async_resource_ref mr) +std::vector decode_data(parse_options const& parse_opts, + host_span column_flags, + std::vector const& column_names, + device_span data, + device_span row_offsets, + host_span column_types, + int32_t num_records, + int32_t num_actual_columns, + int32_t num_active_columns, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { // Alloc output; columns' data memory is still expected for empty dataframe std::vector out_buffers; diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 39dcd5debab..2df71b77301 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1135,7 +1135,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector& f * @param stream CUDA stream used for device memory operations and kernel launches */ void calculate_page_fragments(device_span frag, - cudf::detail::host_vector const& frag_sizes, + host_span frag_sizes, rmm::cuda_stream_view stream) { auto d_frag_sz = cudf::detail::make_device_uvector_async( From 6cd16b5013f9501b9dcae119af4244e69ffe7a0d Mon Sep 17 00:00:00 2001 From: vukasin Date: Thu, 18 Jul 2024 19:35:19 +0000 Subject: [PATCH 71/75] address review --- cpp/src/utilities/host_memory.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 98b1edc1c4e..7c3cea42023 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -192,11 +192,9 @@ class new_delete_memory_resource { { try { return rmm::detail::aligned_host_allocate( - bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { - return ::operator new(size); - }); + bytes, alignment, [](std::size_t size) { return ::operator new(size); }); } catch (std::bad_alloc const& e) { - RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); + CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory); } } @@ -217,13 +215,13 @@ class new_delete_memory_resource { std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) { rmm::detail::aligned_host_deallocate( - ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, [](void* ptr) { ::operator delete(ptr); }); + ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); }); } void deallocate_async(void* ptr, std::size_t bytes, std::size_t alignment, - cuda::stream_ref stream) + [[maybe_unused]] cuda::stream_ref stream) { deallocate(ptr, bytes, alignment); } From 044836a7bacb7cd3960f5686a78076a2d345da71 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 13:47:03 +0000 Subject: [PATCH 72/75] reviews --- cpp/src/utilities/cuda_memcpy.cu | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu index ccfc7542c80..0efb881eb3e 100644 --- a/cpp/src/utilities/cuda_memcpy.cu +++ b/cpp/src/utilities/cuda_memcpy.cu @@ -16,6 +16,7 @@ #include "cudf/detail/utilities/integer_utils.hpp" +#include #include #include #include @@ -28,9 +29,10 @@ namespace cudf::detail { namespace { -__global__ void copy_kernel(char const* src, char* dst, size_t n) +// Simple kernel to copy between device buffers +CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n) { - auto const idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx < n) { dst[idx] = src[idx]; } } @@ -41,6 +43,8 @@ void copy_pinned(void* dst, void 
const* src, std::size_t size, rmm::cuda_stream_ if (size < get_kernel_pinned_copy_threshold()) { const int block_size = 256; auto const grid_size = cudf::util::div_rounding_up_safe(size, block_size); + // We are explicitly launching the kernel here instead of calling a thrust function because the + // thrust function can potentially call cudaMemcpyAsync instead of using a kernel copy_kernel<<>>( static_cast(src), static_cast(dst), size); } else { From 32c7b725cc8e237734fa5e4c7b11360be0fadd6e Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 22 Jul 2024 06:54:01 -0700 Subject: [PATCH 73/75] review suggestion Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/detail/utilities/host_memory.hpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index fd82b584c7e..9f9a89c91fe 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -41,10 +41,9 @@ CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) { - if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { - return {get_pinned_memory_resource(), _stream}; - } - return {get_pageable_memory_resource(), _stream}; + return { size * sizeof(T) <= get_allocate_host_as_pinned_threshold() ? + get_pinned_memory_resource() : get_pageable_memory_resource(), + _stream}; } } // namespace cudf::detail From cecb289c89d380131c582fbf989d638463043b82 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 14:09:01 +0000 Subject: [PATCH 74/75] fix docs --- cpp/include/cudf/detail/utilities/host_memory.hpp | 14 ++++++++------ .../cudf/detail/utilities/vector_factories.hpp | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index 9f9a89c91fe..3975e694559 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -33,17 +33,19 @@ namespace cudf::detail { CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); /** - * @brief Get the memory resource to be used for the host memory allocation. + * @brief Get the allocator to be used for the host memory allocation. * * @param size The number of elements of type T to allocate - * @return The memory resource to be used for the host memory allocation + * @param stream The stream to use for the allocation + * @return The allocator to be used for the host memory allocation */ template -rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view _stream) +rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) { - return { size * sizeof(T) <= get_allocate_host_as_pinned_threshold() ? - get_pinned_memory_resource() : get_pageable_memory_resource(), - _stream}; + return {size * sizeof(T) <= get_allocate_host_as_pinned_threshold() + ? 
get_pinned_memory_resource() + : get_pageable_memory_resource(), + stream}; } } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index 9e3b0fb0152..45dc839c9bd 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -356,7 +356,7 @@ std::vector make_std_vector_sync(Container const } /** - * @brief Construct a `thrust::host_vector` of the given size. + * @brief Construct a `cudf::detail::host_vector` of the given size. * * @note The returned vector may be using a pinned memory resource. * @@ -372,7 +372,7 @@ host_vector make_host_vector(size_t size, rmm::cuda_stream_view stream) } /** - * @brief Construct an empty `thrust::host_vector` with the given capacity. + * @brief Construct an empty `cudf::detail::host_vector` with the given capacity. * * @note The returned vector may be using a pinned memory resource. * From 5d15a4d1345a88612ee04464b5493a035dd3f915 Mon Sep 17 00:00:00 2001 From: vukasin Date: Mon, 22 Jul 2024 19:07:01 +0000 Subject: [PATCH 75/75] revert to fix get_host_allocator --- cpp/include/cudf/detail/utilities/host_memory.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp index 3975e694559..c6775a950c9 100644 --- a/cpp/include/cudf/detail/utilities/host_memory.hpp +++ b/cpp/include/cudf/detail/utilities/host_memory.hpp @@ -42,10 +42,10 @@ CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource(); template rmm_host_allocator get_host_allocator(std::size_t size, rmm::cuda_stream_view stream) { - return {size * sizeof(T) <= get_allocate_host_as_pinned_threshold() - ? get_pinned_memory_resource() - : get_pageable_memory_resource(), - stream}; + if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) { + return {get_pinned_memory_resource(), stream}; + } + return {get_pageable_memory_resource(), stream}; } } // namespace cudf::detail
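
For context, the sketch below shows how the pieces touched in this series fit together; it is illustrative only and not part of any patch. `copy_to_device` is a made-up helper name, and the headers and namespace-qualified calls are assumed to match the files changed above (`vector_factories.hpp`, `span.hpp`, `host_memory.hpp`); everything under `cudf::detail` remains an internal API, so treat this as a sketch rather than a supported usage pattern.

#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/span.hpp>

#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Illustrative sketch: allocate a host staging buffer and copy it to the device.
rmm::device_uvector<std::uint8_t> copy_to_device(std::size_t num_bytes)
{
  auto const stream = cudf::get_default_stream();

  // Backed by the pinned resource when num_bytes is at or below
  // get_allocate_host_as_pinned_threshold(), by pageable memory otherwise;
  // the vector's rmm_host_allocator records which resource was used.
  auto host_data = cudf::detail::make_host_vector<std::uint8_t>(num_bytes, stream);
  std::fill(host_data.begin(), host_data.end(), std::uint8_t{0});

  // The new host_span constructors propagate is_device_accessible(), so the
  // H2D copy below selects the PINNED or PAGEABLE path in cuda_memcpy_async
  // without the caller having to state which kind of host memory it holds.
  cudf::host_span<std::uint8_t const> const h_span{host_data};
  return cudf::detail::make_device_uvector_async(
    h_span, stream, rmm::mr::get_current_device_resource());
}

Passing `host_data` directly to `make_device_uvector_async` should also work through the generic container overload, since the new span constructors make `host_vector` convertible to `host_span`; that is what allows the dedicated `host_vector` overloads to be removed earlier in this series.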