Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/dev' into mixin-docs
Browse files Browse the repository at this point in the history
  • Loading branch information
sleeepyjack committed Jan 22, 2025
2 parents 6bf276e + deab579 commit f5fad85
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 37 deletions.
9 changes: 5 additions & 4 deletions include/cuco/bloom_filter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,9 @@ class bloom_filter {
using size_type = typename extent_type::value_type; ///< Underlying type of the extent type
using word_type =
typename ref_type<>::word_type; ///< Underlying word/segment type of a filter block
using allocator_type =
typename std::allocator_traits<Allocator>::template rebind_alloc<word_type>; ///< Allocator
///< type
using allocator_type = typename std::allocator_traits<Allocator>::template rebind_alloc<
typename ref_type<>::filter_block_type>; ///< Allocator
///< type

bloom_filter(bloom_filter const&) = delete; ///< Copy constructor is not available
bloom_filter& operator=(bloom_filter const&) =
Expand Down Expand Up @@ -349,7 +349,8 @@ class bloom_filter {

private:
allocator_type allocator_; ///< Allocator used to allocate device-accessible storage
std::unique_ptr<word_type, detail::custom_deleter<std::size_t, allocator_type>>
std::unique_ptr<typename ref_type<>::filter_block_type,
detail::custom_deleter<std::size_t, allocator_type>>
data_; ///< Storage of the current `bloom_filter` object
ref_type<> ref_; ///< Device ref of the current `bloom_filter` object
};
Expand Down
20 changes: 18 additions & 2 deletions include/cuco/bloom_filter_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,29 @@ class bloom_filter_ref {
using size_type = typename extent_type::value_type; ///< Underlying type of the extent type
using word_type =
typename impl_type::word_type; ///< Underlying word/segment type of a filter block
using filter_block_type =
typename impl_type::filter_block_type; ///< Opaque type of a filter block

/**
* @brief Constructs the ref object from existing storage.
*
* @note The storage span starting at `data` must have an extent of at least `num_blocks`
* elements.
* @note `data` must be aligned to at least `sizeof(word_type) * words_per_block`.
* elements of type `filter_block_type`.
*
* @param data Pointer to the storage span of the filter
* @param num_blocks Number of sub-filters or blocks
* @param scope The scope in which operations will be performed
* @param policy Fingerprint generation policy (see `cuco/bloom_filter_policies.cuh`)
*/
__host__ __device__ explicit constexpr bloom_filter_ref(filter_block_type* data,
Extent num_blocks,
cuda_thread_scope<Scope> scope,
Policy const& policy);

/**
* @brief Constructs the ref object from existing storage.
*
* @note This overload is deprecated and will be removed in the near future.
*
* @param data Pointer to the storage span of the filter
* @param num_blocks Number of sub-filters or blocks
Expand Down
7 changes: 3 additions & 4 deletions include/cuco/detail/bloom_filter/bloom_filter.inl
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <cuda/atomic>
#include <cuda/stream_ref>

#include <cstdint>
#include <cstddef>

namespace cuco {

Expand All @@ -34,9 +34,8 @@ __host__ constexpr bloom_filter<Key, Extent, Scope, Policy, Allocator>::bloom_fi
Allocator const& alloc,
cuda::stream_ref stream)
: allocator_{alloc},
data_{allocator_.allocate(num_blocks * words_per_block),
detail::custom_deleter<std::size_t, allocator_type>{num_blocks * words_per_block,
allocator_}},
data_{allocator_.allocate(num_blocks),
detail::custom_deleter<std::size_t, allocator_type>{num_blocks, allocator_}},
ref_{data_.get(), num_blocks, {}, policy}
{
this->clear_async(stream);
Expand Down
52 changes: 25 additions & 27 deletions include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
#include <thrust/iterator/constant_iterator.h>

#include <cstdint>
#include <nv/target>

namespace cuco::detail {

Expand All @@ -52,6 +51,21 @@ class bloom_filter_impl {
static constexpr auto thread_scope = Scope;
static constexpr auto words_per_block = policy_type::words_per_block;

private:
__host__ __device__ static constexpr size_t max_vec_bytes() noexcept
{
constexpr auto word_bytes = sizeof(word_type);
constexpr auto block_bytes = word_bytes * words_per_block;
return cuda::std::min(cuda::std::max(word_bytes, 32ul),
block_bytes); // aiming for 2xLDG128 -> 1 sector per thread
}

public:
struct alignas(max_vec_bytes()) filter_block_type {
private:
word_type data_[words_per_block];
};

static_assert(cuda::std::has_single_bit(words_per_block) and words_per_block <= 32,
"Number of words per block must be a power-of-two and less than or equal to 32");

Expand All @@ -64,23 +78,20 @@ class bloom_filter_impl {
cuda::std::memory_order>,
"Invalid word type");

__host__ __device__ explicit constexpr bloom_filter_impl(filter_block_type* filter,
Extent num_blocks,
cuda_thread_scope<Scope>,
Policy policy) noexcept
: words_{reinterpret_cast<word_type*>(filter)}, num_blocks_{num_blocks}, policy_{policy}
{
}

__host__ __device__ explicit constexpr bloom_filter_impl(word_type* filter,
Extent num_blocks,
cuda_thread_scope<Scope>,
Policy policy)
Policy policy) noexcept
: words_{filter}, num_blocks_{num_blocks}, policy_{policy}
{
auto const alignment =
1ull << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(filter));

NV_DISPATCH_TARGET(
NV_IS_HOST,
(CUCO_EXPECTS(alignment >= required_alignment(), "Invalid memory alignment");
CUCO_EXPECTS(num_blocks_ > 0, "Number of blocks cannot be zero");),
NV_IS_DEVICE,
(if (alignment < required_alignment() or num_blocks_ == 0) {
__trap(); // TODO this kills the kernel and corrupts the CUDA context. Not ideal.
}))
}

template <class CG>
Expand Down Expand Up @@ -333,15 +344,7 @@ class bloom_filter_impl {
__device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
{
return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
words_ + index, cuda::std::min(sizeof(word_type) * NumWords, required_alignment())));
}

__host__ __device__ static constexpr size_t max_vec_bytes() noexcept
{
constexpr auto word_bytes = sizeof(word_type);
constexpr auto block_bytes = word_bytes * words_per_block;
return cuda::std::min(cuda::std::max(word_bytes, 32ul),
block_bytes); // aiming for 2xLDG128 -> 1 sector per thread
words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes())));
}

[[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
Expand All @@ -356,11 +359,6 @@ class bloom_filter_impl {
return block_bytes / max_vec_bytes(); // one vector load per thread
}

__host__ __device__ static constexpr size_t required_alignment() noexcept
{
return cuda::std::min(sizeof(word_type) * words_per_block, max_vec_bytes());
}

word_type* words_;
extent_type num_blocks_;
policy_type policy_;
Expand Down
7 changes: 7 additions & 0 deletions include/cuco/detail/bloom_filter/bloom_filter_ref.inl
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@

namespace cuco {

template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
__host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
filter_block_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)
: impl_{data, num_blocks, {}, policy}
{
}

template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
__host__ __device__ constexpr bloom_filter_ref<Key, Extent, Scope, Policy>::bloom_filter_ref(
word_type* data, Extent num_blocks, cuda_thread_scope<Scope>, Policy const& policy)
Expand Down

0 comments on commit f5fad85

Please sign in to comment.