Merge remote-tracking branch 'upstream/dev' into mixin-docs

NVIDIA · Jan 22, 2025 · f5fad85 · f5fad85
2 parents 6bf276e + deab579
commit f5fad85
Show file tree

Hide file tree

Showing 5 changed files with 58 additions and 37 deletions.
diff --git a/include/cuco/bloom_filter.cuh b/include/cuco/bloom_filter.cuh
@@ -82,9 +82,9 @@ class bloom_filter {
   using size_type   = typename extent_type::value_type;  ///< Underlying type of the extent type
   using word_type =
     typename ref_type<>::word_type;  ///< Underlying word/segment type of a filter block
-  using allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<word_type>;  ///< Allocator
-                                                                                  ///< type
+  using allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
+    typename ref_type<>::filter_block_type>;  ///< Allocator
+                                              ///< type
 
   bloom_filter(bloom_filter const&) = delete;  ///< Copy constructor is not available
   bloom_filter& operator=(bloom_filter const&) =
@@ -349,7 +349,8 @@ class bloom_filter {
 
  private:
   allocator_type allocator_;  ///< Allocator used to allocate device-accessible storage
-  std::unique_ptr<word_type, detail::custom_deleter<std::size_t, allocator_type>>
+  std::unique_ptr<typename ref_type<>::filter_block_type,
+                  detail::custom_deleter<std::size_t, allocator_type>>
     data_;          ///< Storage of the current `bloom_filter` object
   ref_type<> ref_;  ///< Device ref of the current `bloom_filter` object
 };

diff --git a/include/cuco/bloom_filter_ref.cuh b/include/cuco/bloom_filter_ref.cuh
@@ -49,13 +49,29 @@ class bloom_filter_ref {
   using size_type   = typename extent_type::value_type;  ///< Underlying type of the extent type
   using word_type =
     typename impl_type::word_type;  ///< Underlying word/segment type of a filter block
+  using filter_block_type =
+    typename impl_type::filter_block_type;  ///< Opaque type of a filter block
 
   /**
    * @brief Constructs the ref object from existing storage.
    *
    * @note The storage span starting at `data` must have an extent of at least `num_blocks`
-   * elements.
-   * @note `data` must be aligned to at least `sizeof(word_type) * words_per_block`.
+   * elements of type `filter_block_type`.
+   *
+   * @param data Pointer to the storage span of the filter
+   * @param num_blocks Number of sub-filters or blocks
+   * @param scope The scope in which operations will be performed
+   * @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
+   */
+  __host__ __device__ explicit constexpr bloom_filter_ref(filter_block_type* data,
+                                                          Extent num_blocks,
+                                                          cuda_thread_scope<Scope> scope,
+                                                          Policy const& policy);
+
+  /**
+   * @brief Constructs the ref object from existing storage.
+   *
+   * @note This overload is deprecated and will be removed in the near future.
    *
    * @param data Pointer to the storage span of the filter
    * @param num_blocks Number of sub-filters or blocks

diff --git a/include/cuco/detail/bloom_filter/bloom_filter.inl b/include/cuco/detail/bloom_filter/bloom_filter.inl
@@ -22,7 +22,7 @@
 #include <cuda/atomic>
 #include <cuda/stream_ref>
 
-#include <cstdint>
+#include <cstddef>
 
 namespace cuco {
 
@@ -34,9 +34,8 @@ __host__ constexpr bloom_filter<Key, Extent, Scope, Policy, Allocator>::bloom_fi
   Allocator const& alloc,
   cuda::stream_ref stream)
   : allocator_{alloc},
-    data_{allocator_.allocate(num_blocks * words_per_block),
-          detail::custom_deleter<std::size_t, allocator_type>{num_blocks * words_per_block,
-                                                              allocator_}},
+    data_{allocator_.allocate(num_blocks),
+          detail::custom_deleter<std::size_t, allocator_type>{num_blocks, allocator_}},
     ref_{data_.get(), num_blocks, {}, policy}
 {
   this->clear_async(stream);

diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
@@ -36,7 +36,6 @@
 #include <thrust/iterator/constant_iterator.h>
 
 #include <cstdint>
-#include <nv/target>
 
 namespace cuco::detail {
 
@@ -52,6 +51,21 @@ class bloom_filter_impl {
   static constexpr auto thread_scope    = Scope;
   static constexpr auto words_per_block = policy_type::words_per_block;
 
+ private:
+  __host__ __device__ static constexpr size_t max_vec_bytes() noexcept
+  {
+    constexpr auto word_bytes  = sizeof(word_type);
+    constexpr auto block_bytes = word_bytes * words_per_block;
+    return cuda::std::min(cuda::std::max(word_bytes, 32ul),
+                          block_bytes);  // aiming for 2xLDG128 -> 1 sector per thread
+  }
+
+ public:
+  struct alignas(max_vec_bytes()) filter_block_type {
+   private:
+    word_type data_[words_per_block];
+  };
+
   static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32,
                 "Number of words per block must be a power-of-two and less than or equal to 32");
 
@@ -64,23 +78,20 @@ class bloom_filter_impl {
                                   cuda::std::memory_order>,
     "Invalid word type");
 
+  __host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter,
+                                                           Extent num_blocks,
+                                                           cuda_thread_scope<Scope>,
+                                                           Policy policy) noexcept
+    : words_{reinterpret_cast<word_type*>(filter)}, num_blocks_{num_blocks}, policy_{policy}
+  {
+  }
+
   __host__ __device__ explicit constexpr bloom_filter_impl(word_type* filter,
                                                            Extent num_blocks,
                                                            cuda_thread_scope<Scope>,
-                                                           Policy policy)
+                                                           Policy policy) noexcept
     : words_{filter}, num_blocks_{num_blocks}, policy_{policy}
   {
-    auto const alignment =
-      1ull << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(filter));
-
-    NV_DISPATCH_TARGET(
-      NV_IS_HOST,
-      (CUCO_EXPECTS(alignment >= required_alignment(), "Invalid memory alignment");
-       CUCO_EXPECTS(num_blocks_ > 0, "Number of blocks cannot be zero");),
-      NV_IS_DEVICE,
-      (if (alignment < required_alignment() or num_blocks_ == 0) {
-        __trap();  // TODO this kills the kernel and corrupts the CUDA context. Not ideal.
-      }))
   }
 
   template <class CG>
@@ -333,15 +344,7 @@ class bloom_filter_impl {
   __device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
   {
     return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
-      words_ + index, cuda::std::min(sizeof(word_type) * NumWords, required_alignment())));
-  }
-
-  __host__ __device__ static constexpr size_t max_vec_bytes() noexcept
-  {
-    constexpr auto word_bytes  = sizeof(word_type);
-    constexpr auto block_bytes = word_bytes * words_per_block;
-    return cuda::std::min(cuda::std::max(word_bytes, 32ul),
-                          block_bytes);  // aiming for 2xLDG128 -> 1 sector per thread
+      words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes())));
   }
 
   [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
@@ -356,11 +359,6 @@ class bloom_filter_impl {
     return block_bytes / max_vec_bytes();  // one vector load per thread
   }
 
-  __host__ __device__ static constexpr size_t required_alignment() noexcept
-  {
-    return cuda::std::min(sizeof(word_type) * words_per_block, max_vec_bytes());
-  }
-
   word_type* words_;
   extent_type num_blocks_;
   policy_type policy_;

diff --git a/include/cuco/detail/bloom_filter/bloom_filter_ref.inl b/include/cuco/detail/bloom_filter/bloom_filter_ref.inl
@@ -23,6 +23,13 @@
 
 namespace cuco {
 
+template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
+__host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
+  filter_block_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)
+  : impl_{data, num_blocks, {}, policy}
+{
+}
+
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
 __host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
   word_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)