diff --git a/benchmarks/hash_bench.cu b/benchmarks/hash_bench.cu index 58c6ee770..973f6976d 100644 --- a/benchmarks/hash_bench.cu +++ b/benchmarks/hash_bench.cu @@ -77,7 +77,7 @@ void hash_eval(nvbench::state& state, nvbench::type_list) state.add_element_count(num_keys); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + state.exec([&](nvbench::launch& launch) { hash_bench_kernel<<>>( Hash{}, num_keys, hash_values.begin(), materialize_result); }); diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh index ce5ab9d56..a12143523 100644 --- a/include/cuco/detail/hash_functions/murmurhash3.cuh +++ b/include/cuco/detail/hash_functions/murmurhash3.cuh @@ -16,7 +16,12 @@ #pragma once +#include +#include + +#include #include +#include namespace cuco::detail { @@ -31,15 +36,15 @@ template struct MurmurHash3_fmix32 { static_assert(sizeof(Key) == 4, "Key type must be 4 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix32(uint32_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix32(std::uint32_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -49,7 +54,7 @@ struct MurmurHash3_fmix32 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint32_t h = static_cast(key) ^ seed_; + std::uint32_t h = static_cast(key) ^ seed_; h ^= h >> 16; h *= 0x85ebca6b; h ^= h >> 13; @@ -59,7 +64,7 @@ struct MurmurHash3_fmix32 { } private: - uint32_t seed_; + std::uint32_t seed_; }; /** @@ -73,15 +78,15 @@ template struct MurmurHash3_fmix64 { static_assert(sizeof(Key) == 8, "Key type must be 8 bytes in size."); - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint64_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint64_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_fmix64 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_fmix64(uint64_t seed = 0) : seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_fmix64(std::uint64_t seed = 0) : seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. @@ -91,7 +96,7 @@ struct MurmurHash3_fmix64 { */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - uint64_t h = static_cast(key) ^ seed_; + std::uint64_t h = static_cast(key) ^ seed_; h ^= h >> 33; h *= 0xff51afd7ed558ccd; h ^= h >> 33; @@ -101,7 +106,7 @@ struct MurmurHash3_fmix64 { } private: - uint64_t seed_; + std::uint64_t seed_; }; /** @@ -121,36 +126,50 @@ struct MurmurHash3_fmix64 { */ template struct MurmurHash3_32 { - using argument_type = Key; ///< The type of the values taken as argument - using result_type = uint32_t; ///< The type of the hash values produced + using argument_type = Key; ///< The type of the values taken as argument + using result_type = std::uint32_t; ///< The type of the hash values produced /** * @brief Constructs a MurmurHash3_32 hash function with the given `seed`. * * @param seed A custom number to randomize the resulting hash value */ - __host__ __device__ constexpr MurmurHash3_32(uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} + __host__ __device__ constexpr MurmurHash3_32(std::uint32_t seed = 0) : fmix32_{0}, seed_{seed} {} /** * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - constexpr int len = sizeof(argument_type); - const uint8_t* const data = (const uint8_t*)&key; - constexpr int nblocks = len / 4; + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } - uint32_t h1 = seed_; - constexpr uint32_t c1 = 0xcc9e2d51; - constexpr uint32_t c2 = 0x1b873593; + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { + auto const nblocks = size / 4; + + std::uint32_t h1 = seed_; + constexpr std::uint32_t c1 = 0xcc9e2d51; + constexpr std::uint32_t c2 = 0x1b873593; //---------- // body - const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4); - for (int i = -nblocks; i; i++) { - uint32_t k1 = blocks[i]; // getblock32(blocks,i); + for (std::remove_const_t i = 0; size >= 4 && i < nblocks; i++) { + std::uint32_t k1 = load_chunk(bytes, i); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -160,13 +179,12 @@ struct MurmurHash3_32 { } //---------- // tail - const uint8_t* tail = (const uint8_t*)(data + nblocks * 4); - uint32_t k1 = 0; - switch (len & 3) { - case 3: k1 ^= tail[2] << 16; - case 2: k1 ^= tail[1] << 8; + std::uint32_t k1 = 0; + switch (size & 3) { + case 3: k1 ^= std::to_integer(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]]; + case 2: k1 ^= std::to_integer(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]]; case 1: - k1 ^= tail[0]; + k1 ^= std::to_integer(bytes[nblocks * 4 + 0]); k1 *= c1; k1 = rotl32(k1, 15); k1 *= c2; @@ -174,19 +192,18 @@ struct MurmurHash3_32 { }; //---------- // finalization - h1 ^= len; + h1 ^= size; h1 = fmix32_(h1); return h1; } private: - constexpr __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const noexcept + constexpr __host__ __device__ std::uint32_t rotl32(std::uint32_t x, std::int8_t r) const noexcept { return (x << r) | (x >> (32 - r)); } - MurmurHash3_fmix32 fmix32_; - uint32_t seed_; + MurmurHash3_fmix32 fmix32_; + std::uint32_t seed_; }; - } // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh new file mode 100644 index 000000000..a50779f23 --- /dev/null +++ b/include/cuco/detail/hash_functions/utils.cuh @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cuco::detail { + +template +constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept +{ + auto const chunks = reinterpret_cast(data); + return chunks[index]; +} + +}; // namespace cuco::detail \ No newline at end of file diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh index 2ea67fd8a..c686f3b82 100644 --- a/include/cuco/detail/hash_functions/xxhash.cuh +++ b/include/cuco/detail/hash_functions/xxhash.cuh @@ -16,6 +16,10 @@ #pragma once +#include +#include + +#include #include namespace cuco::detail { @@ -81,40 +85,51 @@ struct XXHash_32 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks = - reinterpret_cast(&key); ///< 4-byte word access + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { std::size_t offset = 0; std::uint32_t h32; // data can be processed in 16-byte chunks - if constexpr (nbytes >= 16) { - constexpr auto limit = nbytes - 16; - std::uint32_t v1 = seed_ + prime1 + prime2; - std::uint32_t v2 = seed_ + prime2; - std::uint32_t v3 = seed_; - std::uint32_t v4 = seed_ - prime1; + if (size >= 16) { + auto const limit = size - 16; + std::uint32_t v1 = seed_ + prime1 + prime2; + std::uint32_t v2 = seed_ + prime2; + std::uint32_t v3 = seed_; + std::uint32_t v4 = seed_ - prime1; do { // pipeline 4*4byte computations auto const pipeline_offset = offset / 4; - v1 += blocks[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 13); v1 *= prime1; - v2 += blocks[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 13); v2 *= prime1; - v3 += blocks[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 13); v3 *= prime1; - v4 += blocks[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 13); v4 *= prime1; offset += 16; @@ -125,20 +140,20 @@ struct XXHash_32 { h32 = seed_ + prime5; } - h32 += nbytes; + h32 += size; // remaining data can be processed in 4-byte chunks - if constexpr ((nbytes % 16) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h32 += blocks[offset / 4] * prime3; + if ((size % 16) >= 4) { + for (; offset <= size - 4; offset += 4) { + h32 += load_chunk(bytes, offset / 4) * prime3; h32 = rotl(h32, 17) * prime4; } } // the following loop is only needed if the size of the key is not a multiple of the block size - if constexpr (nbytes % 4) { - while (offset < nbytes) { - h32 += (bytes[offset] & 255) * prime5; + if (size % 4) { + while (offset < size) { + h32 += (std::to_integer(bytes[offset]) & 255) * prime5; h32 = rotl(h32, 11) * prime1; ++offset; } @@ -232,42 +247,51 @@ struct XXHash_64 { * @brief Returns a hash value for its argument, as a value of type `result_type`. * * @param key The input argument to hash - * @return A resulting hash value for `key` + * @return The resulting hash value for `key` */ constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept { - // TODO do we need to add checks/hints for alignment? - constexpr auto nbytes = sizeof(Key); - [[maybe_unused]] auto const bytes = reinterpret_cast(&key); ///< per-byte access - [[maybe_unused]] auto const blocks4 = - reinterpret_cast(&key); ///< 4-byte word access - [[maybe_unused]] auto const blocks8 = - reinterpret_cast(&key); ///< 8-byte word access + return compute_hash(reinterpret_cast(&key), + cuco::experimental::extent{}); + } + /** + * @brief Returns a hash value for its argument, as a value of type `result_type`. + * + * @tparam Extent The extent type + * + * @param bytes The input argument to hash + * @param size The extent of the data in bytes + * @return The resulting hash value + */ + template + constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes, + Extent size) const noexcept + { std::size_t offset = 0; std::uint64_t h64; // data can be processed in 32-byte chunks - if constexpr (nbytes >= 32) { - constexpr auto limit = nbytes - 32; - std::uint64_t v1 = seed_ + prime1 + prime2; - std::uint64_t v2 = seed_ + prime2; - std::uint64_t v3 = seed_; - std::uint64_t v4 = seed_ - prime1; + if (size >= 32) { + auto const limit = size - 32; + std::uint64_t v1 = seed_ + prime1 + prime2; + std::uint64_t v2 = seed_ + prime2; + std::uint64_t v3 = seed_; + std::uint64_t v4 = seed_ - prime1; do { // pipeline 4*8byte computations auto const pipeline_offset = offset / 8; - v1 += blocks8[pipeline_offset] * prime2; + v1 += load_chunk(bytes, pipeline_offset + 0) * prime2; v1 = rotl(v1, 31); v1 *= prime1; - v2 += blocks8[pipeline_offset + 1] * prime2; + v2 += load_chunk(bytes, pipeline_offset + 1) * prime2; v2 = rotl(v2, 31); v2 *= prime1; - v3 += blocks8[pipeline_offset + 2] * prime2; + v3 += load_chunk(bytes, pipeline_offset + 2) * prime2; v3 = rotl(v3, 31); v3 *= prime1; - v4 += blocks8[pipeline_offset + 3] * prime2; + v4 += load_chunk(bytes, pipeline_offset + 3) * prime2; v4 = rotl(v4, 31); v4 *= prime1; offset += 32; @@ -302,12 +326,12 @@ struct XXHash_64 { h64 = seed_ + prime5; } - h64 += nbytes; + h64 += size; // remaining data can be processed in 8-byte chunks - if constexpr ((nbytes % 32) >= 8) { - for (; offset <= nbytes - 8; offset += 8) { - std::uint64_t k1 = blocks8[offset / 8] * prime2; + if ((size % 32) >= 8) { + for (; offset <= size - 8; offset += 8) { + std::uint64_t k1 = load_chunk(bytes, offset / 8) * prime2; k1 = rotl(k1, 31) * prime1; h64 ^= k1; h64 = rotl(h64, 27) * prime1 + prime4; @@ -315,18 +339,18 @@ struct XXHash_64 { } // remaining data can be processed in 4-byte chunks - if constexpr (((nbytes % 32) % 8) >= 4) { - for (; offset <= nbytes - 4; offset += 4) { - h64 ^= (blocks4[offset / 4] & 0xffffffffull) * prime1; + if (((size % 32) % 8) >= 4) { + for (; offset <= size - 4; offset += 4) { + h64 ^= (load_chunk(bytes, offset / 4) & 0xffffffffull) * prime1; h64 = rotl(h64, 23) * prime2 + prime3; } } // the following loop is only needed if the size of the key is not a multiple of a previous // block size - if constexpr (nbytes % 4) { - while (offset < nbytes) { - h64 ^= (bytes[offset] & 0xff) * prime5; + if (size % 4) { + while (offset < size) { + h64 ^= (std::to_integer(bytes[offset]) & 0xff) * prime5; h64 = rotl(h64, 11) * prime1; ++offset; } diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu index 5e518669e..3e8880860 100644 --- a/tests/utility/hash_test.cu +++ b/tests/utility/hash_test.cu @@ -21,8 +21,11 @@ #include +#include #include +#include + template struct large_key { constexpr __host__ __device__ large_key(int32_t value) noexcept @@ -170,4 +173,26 @@ TEST_CASE("Test cuco::xxhash_32", "") CHECK(cuco::test::all_of(result.begin(), result.end(), [] __device__(bool v) { return v; })); } +} + +TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test", + "", + ((typename Hash), Hash), + (cuco::murmurhash3_32), + (cuco::murmurhash3_32), + (cuco::xxhash_32), + (cuco::xxhash_32), + (cuco::xxhash_64), + (cuco::xxhash_64)) +{ + using key_type = typename Hash::argument_type; + + Hash hash; + key_type key = 42; + + SECTION("Identical keys with static and dynamic key size should have the same hash value.") + { + CHECK(hash(key) == + hash.compute_hash(reinterpret_cast(&key), sizeof(key_type))); + } } \ No newline at end of file