From 70d2fbb83eb5d3061c75a14f68c1d57fe4abd044 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Thu, 11 Nov 2021 13:40:36 -0800 Subject: [PATCH 1/4] Add unique-by-key --- cub/agent/agent_select_if.cuh | 9 +- cub/agent/agent_unique_by_key.cuh | 570 ++++++++++++++ cub/device/device_select.cuh | 89 ++- cub/device/dispatch/dispatch_merge_sort.cuh | 21 +- cub/device/dispatch/dispatch_scan.cuh | 2 +- cub/device/dispatch/dispatch_select_if.cuh | 4 +- .../dispatch/dispatch_unique_by_key.cuh | 523 ++++++++++++ cub/util_math.cuh | 16 + test/test_device_select_unique.cu | 16 +- test/test_device_select_unique_by_key.cu | 741 ++++++++++++++++++ 10 files changed, 1953 insertions(+), 38 deletions(-) create mode 100644 cub/agent/agent_unique_by_key.cuh create mode 100644 cub/device/dispatch/dispatch_unique_by_key.cuh create mode 100644 test/test_device_select_unique_by_key.cu diff --git a/cub/agent/agent_select_if.cuh b/cub/agent/agent_select_if.cuh index f88d2f384f..807ea18c6d 100644 --- a/cub/agent/agent_select_if.cuh +++ b/cub/agent/agent_select_if.cuh @@ -93,7 +93,7 @@ template < typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type typename InputIteratorT, ///< Random-access input iterator type for selection items typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) - typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectedOutputIteratorT, ///< Random-access output iterator type for selection_flags items typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) typename OffsetT, ///< Signed integer type for global offsets @@ -511,7 +511,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeFirstTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor { @@ -570,7 +570,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeSubsequentTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor @@ -632,7 +632,7 @@ struct AgentSelectIf */ template __device__ __forceinline__ OffsetT ConsumeTile( - int num_tile_items, ///< Number of input items comprising this tile + int num_tile_items, ///< Number of input items comprising this tile int tile_idx, ///< Tile index OffsetT tile_offset, ///< Tile offset ScanTileStateT& tile_state) ///< Global tile state descriptor @@ -688,4 +688,3 @@ struct AgentSelectIf CUB_NAMESPACE_END - diff --git a/cub/agent/agent_unique_by_key.cuh b/cub/agent/agent_unique_by_key.cuh new file mode 100644 index 0000000000..659c8786e6 --- /dev/null +++ b/cub/agent/agent_unique_by_key.cuh @@ -0,0 +1,570 @@ +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key. + */ + +#pragma once + +#include + +#include "../thread/thread_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_scan.cuh" +#include "../agent/single_pass_scan_operators.cuh" +#include "../block/block_discontinuity.cuh" + +CUB_NAMESPACE_BEGIN + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentUniqueByKey + */ +template +struct AgentUniqueByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, + }; + static const cub::BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; + static const cub::CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; + static const cub::BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentUniqueByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide unique-by-key + */ +template < + typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< Equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentUniqueByKey +{ + 
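+    // Per-tile algorithm (summary): each agent (thread block) compacts one tile of
+    // BLOCK_THREADS * ITEMS_PER_THREAD key-value pairs. Keys and values are read with
+    // BlockLoad, head flags are produced with BlockDiscontinuity, the flags are
+    // exclusive-summed into scatter indices with BlockScan (chained across tiles via
+    // the decoupled look-back ScanTileState), and the selected pairs are staged
+    // through shared memory before being written to the output iterators.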
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input key and value type + using KeyT = typename std::iterator_traits::value_type; + using ValueT = typename std::iterator_traits::value_type; + + // Tile status descriptor interface type + using ScanTileStateT = ScanTileState; + + // Constants + enum + { + BLOCK_THREADS = AgentUniqueByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentUniqueByKeyPolicyT::ITEMS_PER_THREAD, + ITEMS_PER_TILE = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + using WrappedKeyInputIteratorT = typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeyInputIteratorT>::Type; // Directly use the supplied input iterator type + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + using WrappedValueInputIteratorT = typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValueInputIteratorT>::Type; // Directly use the supplied input iterator type + + // Parameterized BlockLoad type for input data + using BlockLoadKeys = BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; + + // Parameterized BlockLoad type for flags + using BlockLoadValues = BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentUniqueByKeyPolicyT::LOAD_ALGORITHM>; + + // Parameterized BlockDiscontinuity type for items + using BlockDiscontinuityKeys = cub::BlockDiscontinuity; + + // Parameterized BlockScan type + using BlockScanT = cub::BlockScan; + + // Parameterized BlockDiscontinuity type for items + using TilePrefixCallback = cub::TilePrefixCallbackOp; + + // Key exchange type + using KeyExchangeT = KeyT[ITEMS_PER_TILE]; + + // Value exchange type + using ValueExchangeT = ValueT[ITEMS_PER_TILE]; + + // Shared memory type for this thread block + union _TempStorage + { + struct ScanStorage + { + typename BlockScanT::TempStorage scan; + typename TilePrefixCallback::TempStorage prefix; + typename BlockDiscontinuityKeys::TempStorage discontinuity; + } scan_storage; + + // Smem needed for loading keys + typename BlockLoadKeys::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValues::TempStorage load_values; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized shared_keys; + Uninitialized shared_values; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; + WrappedKeyInputIteratorT d_keys_in; + WrappedValueInputIteratorT d_values_in; + KeyOutputIteratorT d_keys_out; + ValueOutputIteratorT d_values_out; + cub::InequalityWrapper inequality_op; + OffsetT num_items; + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentUniqueByKey( + TempStorage &temp_storage_, + WrappedKeyInputIteratorT d_keys_in_, + WrappedValueInputIteratorT d_values_in_, + KeyOutputIteratorT 
d_keys_out_, + ValueOutputIteratorT d_values_out_, + EqualityOpT equality_op_, + OffsetT num_items_) + : + temp_storage(temp_storage_.Alias()), + d_keys_in(d_keys_in_), + d_values_in(d_values_in_), + d_keys_out(d_keys_out_), + d_values_out(d_values_out_), + inequality_op(equality_op_), + num_items(num_items_) + {} + + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + struct key_tag {}; + struct value_tag {}; + + __device__ __forceinline__ + KeyExchangeT &get_shared(key_tag) + { + return temp_storage.shared_keys.Alias(); + } + __device__ __forceinline__ + ValueExchangeT &get_shared(value_tag) + { + return temp_storage.shared_values.Alias(); + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + template + __device__ __forceinline__ void Scatter( + Tag tag, + OutputIt items_out, + T (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, + int num_tile_selections, + OffsetT num_selections_prefix, + OffsetT /*num_selections*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - + num_selections_prefix; + if (selection_flags[ITEM]) + { + get_shared(tag)[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; + item < num_tile_selections; + item += BLOCK_THREADS) + { + items_out[num_selections_prefix + item] = get_shared(tag)[item]; + } + + CTA_SYNC(); + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyT keys[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_idx[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadKeys(temp_storage.load_keys) + .Load(d_keys_in + tile_offset, + keys, + num_tile_items, + *(d_keys_in + tile_offset)); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + } + + + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, + values, + num_tile_items, + *(d_values_in + tile_offset)); + } + else + { + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, values); + } + + CTA_SYNC(); + + BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) + .FlagHeads(selection_flags, keys, inequality_op); + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + + CTA_SYNC(); + + + OffsetT num_tile_selections = 0; + OffsetT num_selections = 0; + OffsetT num_selections_prefix = 0; + + BlockScanT(temp_storage.scan_storage.scan) + .ExclusiveSum(selection_flags, + selection_idx, + num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Do not count any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = ITEMS_PER_TILE - num_tile_items; + num_tile_selections -= num_discount; + } + num_selections = num_tile_selections; + + CTA_SYNC(); + + Scatter(key_tag(), + d_keys_out, + keys, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + CTA_SYNC(); + + Scatter(value_tag(), + d_values_out, + values, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + return num_selections; + } + + /** + * Process subsequent tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyT keys[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_idx[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadKeys(temp_storage.load_keys) + .Load(d_keys_in + tile_offset, + keys, + num_tile_items, + *(d_keys_in + tile_offset)); + } + else + { + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + } + + + CTA_SYNC(); + + ValueT values[ITEMS_PER_THREAD]; + if (IS_LAST_TILE) + { + // Fill last elements with the first element + // because collectives are not suffix guarded + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, + values, + num_tile_items, + *(d_values_in + tile_offset)); + } + else + { + BlockLoadValues(temp_storage.load_values) + .Load(d_values_in + tile_offset, values); + } + + CTA_SYNC(); + + KeyT tile_predecessor = d_keys_in[tile_offset - 1]; + BlockDiscontinuityKeys(temp_storage.scan_storage.discontinuity) + .FlagHeads(selection_flags, keys, inequality_op, tile_predecessor); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + + CTA_SYNC(); + + + OffsetT num_tile_selections = 0; + OffsetT num_selections = 0; + OffsetT num_selections_prefix = 0; + + TilePrefixCallback prefix_cb(tile_state, + temp_storage.scan_storage.prefix, + cub::Sum(), + tile_idx); + BlockScanT(temp_storage.scan_storage.scan) + .ExclusiveSum(selection_flags, + selection_idx, + prefix_cb); + + num_selections = prefix_cb.GetInclusivePrefix(); + num_tile_selections = prefix_cb.GetBlockAggregate(); + num_selections_prefix = prefix_cb.GetExclusivePrefix(); + + if (IS_LAST_TILE) + { + int num_discount = ITEMS_PER_TILE - num_tile_items; + num_tile_selections -= num_discount; + num_selections -= num_discount; + } + + CTA_SYNC(); + + Scatter(key_tag(), + d_keys_out, + keys, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + CTA_SYNC(); + + Scatter(value_tag(), + d_values_out, + values, + selection_flags, + selection_idx, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int 
num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * ITEMS_PER_TILE; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + ConsumeTile(ITEMS_PER_TILE, + tile_idx, + tile_offset, + tile_state); + } + else + { + int num_remaining = static_cast(num_items - tile_offset); + OffsetT num_selections = ConsumeTile(num_remaining, + tile_idx, + tile_offset, + tile_state); + if (threadIdx.x == 0) + { + *d_num_selected_out = num_selections; + } + } + } +}; + + + +CUB_NAMESPACE_END diff --git a/cub/device/device_select.cuh b/cub/device/device_select.cuh index ed896a68d2..f9ed6d0a25 100644 --- a/cub/device/device_select.cuh +++ b/cub/device/device_select.cuh @@ -1,7 +1,7 @@ /****************************************************************************** * Copyright (c) 2011, Duane Merrill. All rights reserved. - * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: @@ -38,6 +38,7 @@ #include #include "dispatch/dispatch_select_if.cuh" +#include "dispatch/dispatch_unique_by_key.cuh" #include "../config.cuh" CUB_NAMESPACE_BEGIN @@ -323,7 +324,7 @@ struct DeviceSelect typename NumSelectedIteratorT> CUB_RUNTIME_FUNCTION __forceinline__ static cudaError_t Unique( - void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items @@ -332,7 +333,7 @@ struct DeviceSelect cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. { - typedef int OffsetT; // Signed integer type for global offsets + typedef int OffsetT; // Signed integer type for global offsets typedef NullType* FlagIterator; // FlagT iterator type (not used) typedef NullType SelectOp; // Selection op (not used) typedef Equality EqualityOp; // Default == operator @@ -351,6 +352,88 @@ struct DeviceSelect debug_synchronous); } + + /** + * \brief Given an input sequence \p d_keys_in and \p d_values_in with runs of key-value pairs with consecutive equal-valued keys, only the first key and its value from each run is selectively copied to \p d_keys_out and \p d_values_out. The total number of items selected is written to \p d_num_selected_out. 
![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * int *d_keys_out; // e.g., [ , , , , , , , ] + * int *d_values_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items); + * + * // d_keys_out <-- [0, 2, 9, 5, 8] + * // d_values_out <-- [1, 2, 4, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam KeyInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam ValueInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam KeyOutputIteratorT [inferred] Random-access output iterator type for writing selected keys \iterator + * \tparam ValueOutputIteratorT [inferred] Random-access output iterator type for writing selected values \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyOutputIteratorT, + typename ValueOutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t UniqueByKey( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected keys + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected values + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
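+        // (UniqueByKey forwards to DispatchUniqueByKey below, using int offsets and the
+        // default cub::Equality functor to delimit runs of equal keys.)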
+ { + using OffsetT = int; + using EqualityOp = Equality; + + return DispatchUniqueByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } }; /** diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index 8f4f0be638..ee2b790d6c 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -488,21 +488,6 @@ struct DispatchMergeSort : SelectedPolicy bool debug_synchronous; int ptx_version; - CUB_RUNTIME_FUNCTION __forceinline__ std::size_t - vshmem_size(std::size_t max_shmem, - std::size_t shmem_per_block, - std::size_t num_blocks) - { - if (shmem_per_block > max_shmem) - { - return shmem_per_block * num_blocks; - } - else - { - return 0; - } - } - // Constructor CUB_RUNTIME_FUNCTION __forceinline__ DispatchMergeSort(void *d_temp_storage, @@ -637,9 +622,9 @@ struct DispatchMergeSort : SelectedPolicy static_cast(max_shmem); virtual_shared_memory_size = - vshmem_size(static_cast(max_shmem), - (cub::max)(block_sort_shmem_size, merge_shmem_size), - static_cast(num_tiles)); + VshmemSize(static_cast(max_shmem), + (cub::max)(block_sort_shmem_size, merge_shmem_size), + static_cast(num_tiles)); } diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh index 2aac82a262..c2d04588be 100644 --- a/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/device/dispatch/dispatch_scan.cuh @@ -348,7 +348,7 @@ struct DispatchScan: // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Run grids in epochs (in case number of tiles exceeds max x-dimension int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); diff --git a/cub/device/dispatch/dispatch_select_if.cuh b/cub/device/dispatch/dispatch_select_if.cuh index a052252317..94a17419a8 100644 --- a/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/device/dispatch/dispatch_select_if.cuh @@ -333,7 +333,7 @@ struct DispatchSelectIf // Get max x-dimension of grid int max_dim_x; - if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; // Get grid size for scanning tiles dim3 scan_grid_size; @@ -446,5 +446,3 @@ struct DispatchSelectIf CUB_NAMESPACE_END - - diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh new file mode 100644 index 0000000000..824e8f8433 --- /dev/null +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -0,0 +1,523 @@ + +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. + */ + +// TODO: remove this note: +// copy-pasted from https://github.com/NVIDIA/thrust/blob/main/thrust/system/cuda/detail/unique_by_key.h + +#include "../../agent/agent_unique_by_key.cuh" +#include "../../util_math.cuh" +#include "../../util_macro.cuh" + +#include "dispatch_scan.cuh" + +CUB_NAMESPACE_BEGIN + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Unique by key kernel entry point (multi-block) + */ +template < + typename AgentUniqueByKeyPolicyT, ///< Parameterized AgentUniqueByKeyPolicy tuning policy type + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< Equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceUniqueByKeySweepKernel( + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + ScanTileStateT tile_state, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + 
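+    // (One agent per tile; ConsumeRange derives the tile index from the 2-D grid as
+    // blockIdx.x * gridDim.y + blockIdx.y.)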
using AgentUniqueByKeyT = AgentUniqueByKey< + typename AgentUniqueByKeyPolicyT::UniqueByKeyPolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + EqualityOpT, + OffsetT>; + + // Shared memory for AgentSelectIf + __shared__ typename AgentUniqueByKeyT::TempStorage temp_storage; + + // Process tiles + AgentUniqueByKeyT(temp_storage, d_keys_in, d_values_in, d_keys_out, d_values_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_state, + d_num_selected_out); +} + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template +struct DeviceUniqueByKeyPolicy +{ + using KeyT = typename std::iterator_traits::value_type; + + // SM350 + struct Policy350 : ChainedPolicy<350, Policy350, Policy350> { + const static int INPUT_SIZE = sizeof(KeyT); + enum + { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), + }; + + using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<128, + ITEMS_PER_THREAD, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS>; + }; + + // SM520 + struct Policy520 : ChainedPolicy<520, Policy520, Policy350> + { + const static int INPUT_SIZE = sizeof(KeyT); + enum + { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), + }; + + using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<64, + ITEMS_PER_THREAD, + cub::BLOCK_LOAD_WARP_TRANSPOSE, + cub::LOAD_LDG, + cub::BLOCK_SCAN_WARP_SCANS>; + }; + + /// MaxPolicy + using MaxPolicy = Policy520; +}; + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename KeyInputIteratorT, ///< Random-access input iterator type for keys + typename ValueInputIteratorT, ///< Random-access input iterator type for values + typename KeyOutputIteratorT, ///< Random-access output iterator type for keys + typename ValueOutputIteratorT, ///< Random-access output iterator type for values + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename EqualityOpT, ///< Equality operator type + typename OffsetT, ///< Signed integer type for global offsets + typename SelectedPolicy = DeviceUniqueByKeyPolicy> +struct DispatchUniqueByKey: SelectedPolicy +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // The input key and value type + using KeyT = typename std::iterator_traits::value_type; + using ValueT = typename std::iterator_traits::value_type; + + // Tile status descriptor interface type + using ScanTileStateT = ScanTileState; + + + void* d_temp_storage; ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in; ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in; ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out; ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out; ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out; ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op; ///< [in] Equality operator + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream; ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; + + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchUniqueByKey( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_keys_in or \p d_values_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. 
+ bool debug_synchronous + ): + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys_in(d_keys_in), + d_values_in(d_values_in), + d_keys_out(d_keys_out), + d_values_out(d_values_out), + d_num_selected_out(d_num_selected_out), + equality_op(equality_op), + num_items(num_items), + stream(stream), + debug_synchronous(debug_synchronous) + {} + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + template + CUB_RUNTIME_FUNCTION __host__ __forceinline__ + cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) + { +#ifndef CUB_RUNTIME_ENABLED + + (void)init_kernel; + (void)scan_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; + using UniqueByKeyAgentT = AgentUniqueByKey; + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Number of input tiles + int tile_size = Policy::BLOCK_THREADS * Policy::ITEMS_PER_THREAD; + int num_tiles = static_cast(cub::DivideAndRoundUp(num_items, tile_size)); + + // Size of virtual shared memory + int max_shmem = 0; + if (CubDebug( + error = cudaDeviceGetAttribute(&max_shmem, + cudaDevAttrMaxSharedMemoryPerBlock, + device_ordinal))) + { + break; + } + std::size_t vshmem_size = VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles); + + // Specify temporary storage allocation requirements + size_t allocation_sizes[2] = {0, vshmem_size}; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void *allocations[2] = {NULL, NULL}; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + num_tiles = CUB_MAX(1, num_tiles); + int init_grid_size = cub::DivideAndRoundUp(num_tiles, INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + init_grid_size, INIT_KERNEL_THREADS, 0, stream + ).doit(init_kernel, tile_state, num_tiles, d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = cub::DivideAndRoundUp(num_tiles, max_dim_x); + scan_grid_size.x 
= CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) + { + // Get SM occupancy for unique_by_key_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy(scan_sm_occupancy, // out + scan_kernel, + Policy::BLOCK_THREADS))) + { + break; + } + + _CubLog("Invoking unique_by_key_kernel<<<{%d,%d,%d}, %d, 0, " + "%lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, + scan_grid_size.y, + scan_grid_size.z, + Policy::BLOCK_THREADS, + (long long)stream, + Policy::ITEMS_PER_THREAD, + scan_sm_occupancy); + } + + // Invoke select_if_kernel + THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron( + scan_grid_size, Policy::BLOCK_THREADS, 0, stream + ).doit(scan_kernel, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + tile_state, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while(0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + template + CUB_RUNTIME_FUNCTION __host__ __forceinline__ + cudaError_t Invoke() + { + // Ensure kernels are instantiated. + return Invoke( + DeviceCompactInitKernel, + DeviceUniqueByKeySweepKernel< + ActivePolicyT, + KeyInputIteratorT, + ValueInputIteratorT, + KeyOutputIteratorT, + ValueOutputIteratorT, + NumSelectedIteratorT, + ScanTileStateT, + EqualityOpT, + OffsetT> + ); + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values + KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items + ValueOutputIteratorT d_values_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_keys_out or \p d_values_out) + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
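+        // (Dispatch reads the target's PTX version and invokes the matching chained
+        // tuning policy, which constructs this functor and calls Invoke<ActivePolicyT>().)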
+ { + using MaxPolicyT = typename DispatchUniqueByKey::MaxPolicy; + + cudaError_t error; + do + { + // Get PTX version + int ptx_version = 0; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchUniqueByKey dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_values_in, + d_keys_out, + d_values_out, + d_num_selected_out, + equality_op, + num_items, + stream, + debug_synchronous + ); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + +// template +// THRUST_RUNTIME_FUNCTION +// pair +// unique_by_key(execution_policy& policy, +// KeyInputIteratorT keys_first, +// KeyInputIteratorT keys_last, +// ValueInputIteratorT values_first, +// KeyOutputIteratorT keys_result, +// ValueOutputIteratorT values_result, +// EqualityOpT binary_pred) +// { + +// typedef int size_type; + +// size_type num_items +// = static_cast(thrust::distance(keys_first, keys_last)); + +// size_t temp_storage_bytes = 0; +// cudaStream_t stream = cuda_cub::stream(policy); +// bool debug_sync = THRUST_DEBUG_SYNC_FLAG; + +// cudaError_t status; +// status = __unique_by_key::doit_step(NULL, +// temp_storage_bytes, +// keys_first, +// values_first, +// keys_result, +// values_result, +// binary_pred, +// reinterpret_cast(NULL), +// num_items, +// stream, +// debug_sync); +// cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); + +// size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes}; +// void * allocations[2] = {NULL, NULL}; + +// size_t storage_size = 0; +// status = core::alias_storage(NULL, +// storage_size, +// allocations, +// allocation_sizes); +// cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage"); + +// // Allocate temporary storage. +// thrust::detail::temporary_array +// tmp(policy, storage_size); +// void *ptr = static_cast(tmp.data().get()); + +// status = core::alias_storage(ptr, +// storage_size, +// allocations, +// allocation_sizes); +// cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage"); + +// size_type* d_num_selected_out +// = thrust::detail::aligned_reinterpret_cast(allocations[0]); + +// status = __unique_by_key::doit_step(allocations[1], +// temp_storage_bytes, +// keys_first, +// values_first, +// keys_result, +// values_result, +// binary_pred, +// d_num_selected_out, +// num_items, +// stream, +// debug_sync); +// cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); + +// status = cuda_cub::synchronize(policy); +// cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize"); + +// size_type num_selected = get_value(policy, d_num_selected_out); + +// return thrust::make_pair( +// keys_result + num_selected, +// values_result + num_selected +// ); +// } + + +CUB_NAMESPACE_END diff --git a/cub/util_math.cuh b/cub/util_math.cuh index 3c5406fa46..72d8c816f2 100644 --- a/cub/util_math.cuh +++ b/cub/util_math.cuh @@ -67,6 +67,22 @@ DivideAndRoundUp(NumeratorT n, DenominatorT d) return static_cast(n / d + (n % d != 0 ? 
1 : 0)); } + +__host__ __device__ __forceinline__ constexpr std::size_t +VshmemSize(std::size_t max_shmem, + std::size_t shmem_per_block, + std::size_t num_blocks) +{ + if (shmem_per_block > max_shmem) + { + return shmem_per_block * num_blocks; + } + else + { + return 0; + } +} + constexpr __device__ __host__ int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { diff --git a/test/test_device_select_unique.cu b/test/test_device_select_unique.cu index d33e4895ac..497b43fc36 100644 --- a/test/test_device_select_unique.cu +++ b/test/test_device_select_unique.cu @@ -78,11 +78,11 @@ cudaError_t Dispatch( size_t */*d_temp_storage_bytes*/, cudaError_t */*d_cdp_error*/, - void* d_temp_storage, + void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) @@ -147,11 +147,11 @@ cudaError_t Dispatch( size_t *d_temp_storage_bytes, cudaError_t *d_cdp_error, - void* d_temp_storage, + void* d_temp_storage, size_t &temp_storage_bytes, InputIteratorT d_in, - OutputIteratorT d_out, - NumSelectedIteratorT d_num_selected_out, + OutputIteratorT d_out, + NumSelectedIteratorT d_num_selected_out, OffsetT num_items, cudaStream_t stream, bool debug_synchronous) @@ -275,8 +275,8 @@ void Test( int num_items) { // Allocate device output array and num selected - T *d_out = NULL; - int *d_num_selected_out = NULL; + T *d_out = NULL; + int *d_num_selected_out = NULL; CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(T) * num_items)); CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu new file mode 100644 index 0000000000..504c6bb2bc --- /dev/null +++ b/test/test_device_select_unique_by_key.cu @@ -0,0 +1,741 @@ +/****************************************************************************** + * Copyright (c) NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Test of DeviceSelect::Unique utilities + ******************************************************************************/ + +// Ensure printing of CUDA runtime errors to console +#define CUB_STDERR + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include "test_util.h" + +using namespace cub; + + +//--------------------------------------------------------------------- +// Globals, constants and typedefs +//--------------------------------------------------------------------- + +bool g_verbose = false; +int g_timing_iterations = 0; +int g_repeat = 0; +float g_device_giga_bandwidth; +CachingDeviceAllocator g_allocator(true); + +// Dispatch types +enum Backend +{ + CUB, // CUB method + THRUST, // Thrust method + CDP, // GPU-based (dynamic parallelism) dispatch to CUB method +}; + + +//--------------------------------------------------------------------- +// Dispatch to different CUB DeviceSelect entrypoints +//--------------------------------------------------------------------- + + +/** + * Dispatch to unique entrypoint + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t Dispatch( + Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t */*d_temp_storage_bytes*/, + cudaError_t */*d_cdp_error*/, + + void* d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + cudaError_t error = cudaSuccess; + for (int i = 0; i < timing_timing_iterations; ++i) + { + error = DeviceSelect::UniqueByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, stream, debug_synchronous); + } + return error; +} + + +//--------------------------------------------------------------------- +// Dispatch to different Thrust entrypoints +//--------------------------------------------------------------------- + + +/** + * Dispatch to unique entrypoint + */ +template +__host__ __forceinline__ +cudaError_t Dispatch( + Int2Type /*dispatch_to*/, + int timing_timing_iterations, + size_t */*d_temp_storage_bytes*/, + cudaError_t */*d_cdp_error*/, + + void *d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t /*stream*/, + bool /*debug_synchronous*/) +{ + // The input types + typedef typename std::iterator_traits::value_type InputKeyT; + typedef typename std::iterator_traits::value_type InputValueT; + + // The output types + using OutputKeyT = typename 
If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type; // ... else the output iterator's value type + using OutputValueT = typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type; // ... else the output iterator's value type + + if (d_temp_storage == 0) + { + temp_storage_bytes = 1; + } + else + { + THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); + THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); + using OutputKeyIterT = THRUST_NS_QUALIFIER::device_ptr; + using OutputValueIterT = THRUST_NS_QUALIFIER::device_ptr; + OutputKeyIterT d_keys_out_wrapper(d_keys_out); + OutputValueIterT d_values_out_wrapper(d_values_out); + THRUST_NS_QUALIFIER::pair d_out_wrapper_end; + for (int i = 0; i < timing_timing_iterations; ++i) + { + d_out_wrapper_end = THRUST_NS_QUALIFIER::unique_by_key_copy(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_keys_out_wrapper, d_values_out_wrapper); + } + + OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_keys_out_wrapper); + CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); + + } + + return cudaSuccess; +} + + + +//--------------------------------------------------------------------- +// CUDA Nested Parallelism Test Kernel +//--------------------------------------------------------------------- + +/** + * Simple wrapper kernel to invoke DeviceSelect + */ +template +__global__ void CnpDispatchKernel( + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void* d_temp_storage, + size_t temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + bool debug_synchronous) +{ + +#ifndef CUB_CDP + (void)timing_timing_iterations; + (void)d_temp_storage_bytes; + (void)d_cdp_error; + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_values_in; + (void)d_keys_out; + (void)d_values_out; + (void)d_num_selected_out; + (void)num_items; + (void)debug_synchronous; + *d_cdp_error = cudaErrorNotSupported; +#else + *d_cdp_error = Dispatch(Int2Type(), timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, + d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, debug_synchronous); + *d_temp_storage_bytes = temp_storage_bytes; +#endif +} + + +/** + * Dispatch to CDP kernel + */ +template +cudaError_t Dispatch( + Int2Type dispatch_to, + int timing_timing_iterations, + size_t *d_temp_storage_bytes, + cudaError_t *d_cdp_error, + + void* d_temp_storage, + size_t &temp_storage_bytes, + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyOutputIteratorT d_keys_out, + ValueOutputIteratorT d_values_out, + NumSelectedIteratorT d_num_selected_out, + OffsetT num_items, + cudaStream_t stream, + bool debug_synchronous) +{ + // Invoke kernel to invoke device-side dispatch + CnpDispatchKernel<<<1,1>>>(timing_timing_iterations, d_temp_storage_bytes, d_cdp_error, + d_temp_storage, temp_storage_bytes, d_keys_in, 
d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, debug_synchronous); + + // Copy out temp_storage_bytes + CubDebugExit(cudaMemcpy(&temp_storage_bytes, d_temp_storage_bytes, sizeof(size_t) * 1, cudaMemcpyDeviceToHost)); + + // Copy out error + cudaError_t retval; + CubDebugExit(cudaMemcpy(&retval, d_cdp_error, sizeof(cudaError_t) * 1, cudaMemcpyDeviceToHost)); + return retval; +} + + + +//--------------------------------------------------------------------- +// Test generation +//--------------------------------------------------------------------- + + +/** + * Initialize problem + */ +template +void Initialize( + int entropy_reduction, + T *h_in, + int num_items, + int max_segment) +{ + unsigned int max_int = (unsigned int) -1; + + int key = 0; + int i = 0; + while (i < num_items) + { + // Select number of repeating occurrences for the current run + int repeat; + if (max_segment < 0) + { + repeat = num_items; + } + else if (max_segment < 2) + { + repeat = 1; + } + else + { + RandomBits(repeat, entropy_reduction); + repeat = (int) ((double(repeat) * double(max_segment)) / double(max_int)); + repeat = CUB_MAX(1, repeat); + } + + int j = i; + while (j < CUB_MIN(i + repeat, num_items)) + { + InitValue(INTEGER_SEED, h_in[j], key); + j++; + } + + i = j; + key++; + } + + if (g_verbose) + { + printf("Input:\n"); + DisplayResults(h_in, num_items); + printf("\n\n"); + } +} + + +/** + * Solve unique problem + */ +template < + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyT, + typename ValueT> +int Solve( + KeyInputIteratorT h_keys_in, + ValueInputIteratorT h_values_in, + KeyT *h_keys_reference, + ValueT *h_values_reference, + int num_items) +{ + int num_selected = 0; + if (num_items > 0) + { + h_keys_reference[num_selected] = h_keys_in[0]; + h_values_reference[num_selected] = h_values_in[0]; + num_selected++; + } + + for (int i = 1; i < num_items; ++i) + { + if (h_keys_in[i] != h_keys_in[i - 1]) + { + h_keys_reference[num_selected] = h_keys_in[i]; + h_values_reference[num_selected] = h_values_in[i]; + num_selected++; + } + } + + return num_selected; +} + + + +/** + * Test DeviceSelect for a given problem input + */ +template < + Backend BACKEND, + typename KeyInputIteratorT, + typename ValueInputIteratorT, + typename KeyT, + typename ValueT> +void Test( + KeyInputIteratorT d_keys_in, + ValueInputIteratorT d_values_in, + KeyT *h_keys_reference, + ValueT *h_values_reference, + int num_selected, + int num_items) +{ + // Allocate device output array and num selected + KeyT *d_keys_out = NULL; + ValueT *d_values_out = NULL; + int *d_num_selected_out = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_out, sizeof(KeyT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_out, sizeof(ValueT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_num_selected_out, sizeof(int))); + + // Allocate CDP device arrays + size_t *d_temp_storage_bytes = NULL; + cudaError_t *d_cdp_error = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_temp_storage_bytes, sizeof(size_t) * 1)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_cdp_error, sizeof(cudaError_t) * 1)); + + // Allocate temporary storage + void *d_temp_storage = NULL; + size_t temp_storage_bytes = 0; + CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true)); + 
CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes)); + + // Clear device output array + CubDebugExit(cudaMemset(d_keys_out, 0, sizeof(KeyT) * num_items)); + CubDebugExit(cudaMemset(d_values_out, 0, sizeof(ValueT) * num_items)); + CubDebugExit(cudaMemset(d_num_selected_out, 0, sizeof(int))); + + // Run warmup/correctness iteration + CubDebugExit(Dispatch(Int2Type(), 1, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, true)); + + // Check for correctness (and display results, if specified) + int compare11 = CompareDeviceResults(h_keys_reference, d_keys_out, num_selected, true, g_verbose); + int compare12 = CompareDeviceResults(h_values_reference, d_values_out, num_selected, true, g_verbose); + int compare1 = compare11 && compare12; + printf("\t Data %s ", compare1 ? "FAIL" : "PASS"); + + int compare2 = CompareDeviceResults(&num_selected, d_num_selected_out, 1, true, g_verbose); + printf("\t Count %s ", compare2 ? "FAIL" : "PASS"); + + // Flush any stdout/stderr + fflush(stdout); + fflush(stderr); + + // Performance + GpuTimer gpu_timer; + gpu_timer.Start(); + CubDebugExit(Dispatch(Int2Type(), g_timing_iterations, d_temp_storage_bytes, d_cdp_error, d_temp_storage, temp_storage_bytes, d_keys_in, d_values_in, d_keys_out, d_values_out, d_num_selected_out, num_items, 0, false)); + gpu_timer.Stop(); + float elapsed_millis = gpu_timer.ElapsedMillis(); + + // Display performance + if (g_timing_iterations > 0) + { + float avg_millis = elapsed_millis / g_timing_iterations; + float giga_rate = float(num_items) / avg_millis / 1000.0f / 1000.0f; + float giga_bandwidth = float((num_items + num_selected) * (sizeof(KeyT) + sizeof(ValueT))) / avg_millis / 1000.0f / 1000.0f; + printf(", %.3f avg ms, %.3f billion items/s, %.3f logical GB/s, %.1f%% peak", avg_millis, giga_rate, giga_bandwidth, giga_bandwidth / g_device_giga_bandwidth * 100.0); + } + printf("\n\n"); + + // Flush any stdout/stderr + fflush(stdout); + fflush(stderr); + + // Cleanup + if (d_keys_out) CubDebugExit(g_allocator.DeviceFree(d_keys_out)); + if (d_values_out) CubDebugExit(g_allocator.DeviceFree(d_values_out)); + if (d_num_selected_out) CubDebugExit(g_allocator.DeviceFree(d_num_selected_out)); + if (d_temp_storage_bytes) CubDebugExit(g_allocator.DeviceFree(d_temp_storage_bytes)); + if (d_cdp_error) CubDebugExit(g_allocator.DeviceFree(d_cdp_error)); + if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage)); + + // Correctness asserts + AssertEquals(0, compare1 | compare2); +} + + +/** + * Test DeviceSelect on pointer type + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void TestPointer( + int num_items, + int entropy_reduction, + int max_segment) +{ + // Allocate host arrays + KeyT* h_keys_in = new KeyT[num_items]; + ValueT* h_values_in = new ValueT[num_items]; + KeyT* h_keys_reference = new KeyT[num_items]; + ValueT* h_values_reference = new ValueT[num_items]; + + // Initialize problem and solution + Initialize(entropy_reduction, h_keys_in, num_items, max_segment); + Initialize(entropy_reduction, h_values_in, num_items, max_segment); + int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); + + printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", + (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? 
"Thrust" : "CUB", + num_items, num_selected, float(num_items) / num_selected, + typeid(KeyT).name(), + (int) sizeof(KeyT), + entropy_reduction); + fflush(stdout); + + // Allocate problem device arrays + KeyT *d_keys_in = NULL; + ValueT *d_values_in = NULL; + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_keys_in, sizeof(KeyT) * num_items)); + CubDebugExit(g_allocator.DeviceAllocate((void**)&d_values_in, sizeof(ValueT) * num_items)); + + // Initialize device input + CubDebugExit(cudaMemcpy(d_keys_in, h_keys_in, sizeof(KeyT) * num_items, cudaMemcpyHostToDevice)); + CubDebugExit(cudaMemcpy(d_values_in, h_values_in, sizeof(ValueT) * num_items, cudaMemcpyHostToDevice)); + + // Run Test + Test(d_keys_in, d_values_in, h_keys_reference, h_values_reference, num_selected, num_items); + + // Cleanup + if (h_keys_in) delete[] h_keys_in; + if (h_values_in) delete[] h_values_in; + if (h_keys_reference) delete[] h_keys_reference; + if (h_values_reference) delete[] h_values_reference; + if (d_keys_in) CubDebugExit(g_allocator.DeviceFree(d_keys_in)); + if (d_values_in) CubDebugExit(g_allocator.DeviceFree(d_values_in)); +} + + +/** + * Test DeviceSelect on iterator type + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void TestIterator( + int num_items) +{ + // Use a counting iterator as the input + CountingInputIterator h_keys_in(0); + CountingInputIterator h_values_in(0); + + // Allocate host arrays + KeyT* h_keys_reference = new KeyT[num_items]; + ValueT* h_values_reference = new ValueT[num_items]; + + // Initialize problem and solution + int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); + + printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", + (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? 
"Thrust" : "CUB", + num_items, num_selected, float(num_items) / num_selected, + typeid(KeyT).name(), + (int) sizeof(ValueT)); + fflush(stdout); + + // Run Test + Test(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_selected, num_items); + + // Cleanup + if (h_keys_reference) delete[] h_keys_reference; + if (h_values_reference) delete[] h_values_reference; +} + + +/** + * Test different gen modes + */ +template < + Backend BACKEND, + typename KeyT, + typename ValueT> +void Test( + int num_items) +{ + for (int max_segment = 1; ((max_segment > 0) && (max_segment < num_items)); max_segment *= 11) + { + TestPointer(num_items, 0, max_segment); + TestPointer(num_items, 2, max_segment); + TestPointer(num_items, 7, max_segment); + } +} + + +/** + * Test different dispatch + */ +template < + typename KeyT, + typename ValueT> +void TestOp( + int num_items) +{ + Test(num_items); +#ifdef CUB_CDP + Test(num_items); +#endif +} + + +/** + * Test different input sizes + */ +template < + typename KeyT, + typename ValueT> +void Test( + int num_items) +{ + if (num_items < 0) + { + TestOp(0); + TestOp(1); + TestOp(100); + TestOp(10000); + TestOp(1000000); + } + else + { + TestOp(num_items); + } +} + + + +//--------------------------------------------------------------------- +// Main +//--------------------------------------------------------------------- + +/** + * Main + */ +int main(int argc, char** argv) +{ + int num_items = -1; + int entropy_reduction = 0; + int maxseg = 1000; + + // Initialize command line + CommandLineArgs args(argc, argv); + g_verbose = args.CheckCmdLineFlag("v"); + args.GetCmdLineArgument("n", num_items); + args.GetCmdLineArgument("i", g_timing_iterations); + args.GetCmdLineArgument("repeat", g_repeat); + args.GetCmdLineArgument("maxseg", maxseg); + args.GetCmdLineArgument("entropy", entropy_reduction); + + // Print usage + if (args.CheckCmdLineFlag("help")) + { + printf("%s " + "[--n= " + "[--i= " + "[--device=] " + "[--maxseg=]" + "[--entropy=]" + "[--repeat=]" + "[--v] " + "[--cdp]" + "\n", argv[0]); + exit(0); + } + + // Initialize device + CubDebugExit(args.DeviceInit()); + g_device_giga_bandwidth = args.device_giga_bandwidth; + printf("\n"); + +#ifdef CUB_TEST_MINIMAL + + // Compile/run basic CUB test + if (num_items < 0) num_items = 32000000; + TestPointer( num_items, entropy_reduction, maxseg); + +#elif defined(CUB_TEST_BENCHMARK) + + // Get device ordinal + int device_ordinal; + CubDebugExit(cudaGetDevice(&device_ordinal)); + + // Get device SM version + int sm_version = 0; + CubDebugExit(SmVersion(sm_version, device_ordinal)); + + // Compile/run quick tests + if (num_items < 0) num_items = 32000000; + + printf("-- Iterator ----------------------------\n"); + TestIterator( num_items); + TestIterator( num_items); + + printf("----------------------------\n"); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); + + printf("----------------------------\n"); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); + TestPointer( num_items * ((sm_version <= 130) ? 
1 : 2), entropy_reduction, maxseg); + + printf("----------------------------\n"); + TestPointer( num_items, entropy_reduction, maxseg); + TestPointer( num_items, entropy_reduction, maxseg); + TestPointer( num_items, entropy_reduction, maxseg); + TestPointer( num_items, entropy_reduction, maxseg); + + printf("----------------------------\n"); + TestPointer( num_items / 2, entropy_reduction, maxseg); + TestPointer( num_items / 2, entropy_reduction, maxseg); + TestPointer( num_items / 2, entropy_reduction, maxseg); + TestPointer( num_items / 2, entropy_reduction, maxseg); + + printf("----------------------------\n"); + TestPointer( num_items / 4, entropy_reduction, maxseg); + TestPointer( num_items / 4, entropy_reduction, maxseg); + TestPointer( num_items / 4, entropy_reduction, maxseg); + TestPointer( num_items / 4, entropy_reduction, maxseg); + TestPointer( num_items / 4, entropy_reduction, maxseg); + TestPointer(num_items / 4, entropy_reduction, maxseg); + +#else + + // Compile/run thorough tests + for (int i = 0; i <= g_repeat; ++i) + { + // Test different input types + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + } + +#endif + + return 0; +} + + + From 8986eef8c41a90510b368777ce7528e0b1deac46 Mon Sep 17 00:00:00 2001 From: Xiang Gao Date: Fri, 14 Jan 2022 10:39:33 -0800 Subject: [PATCH 2/4] Resolve review comments --- cub/agent/agent_unique_by_key.cuh | 28 +-- cub/device/dispatch/dispatch_merge_sort.cuh | 2 +- .../dispatch/dispatch_unique_by_key.cuh | 107 +-------- cub/util_math.cuh | 31 ++- test/test_device_select_unique_by_key.cu | 211 +++--------------- 5 files changed, 73 insertions(+), 306 deletions(-) diff --git a/cub/agent/agent_unique_by_key.cuh b/cub/agent/agent_unique_by_key.cuh index 659c8786e6..7ca8771033 100644 --- a/cub/agent/agent_unique_by_key.cuh +++ b/cub/agent/agent_unique_by_key.cuh @@ -108,14 +108,14 @@ struct AgentUniqueByKey }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys - using WrappedKeyInputIteratorT = typename If::VALUE, + using WrappedKeyInputIteratorT = typename std::conditional::VALUE, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - KeyInputIteratorT>::Type; // Directly use the supplied input iterator type + KeyInputIteratorT>::type; // Directly use the supplied input iterator type // Cache-modified Input iterator wrapper type (for applying cache modifier) for values - using WrappedValueInputIteratorT = typename If::VALUE, + using WrappedValueInputIteratorT = typename std::conditional::VALUE, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator - ValueInputIteratorT>::Type; // Directly use the supplied input iterator type + ValueInputIteratorT>::type; // Directly use the supplied input iterator type // Parameterized BlockLoad type for input data using BlockLoadKeys = BlockLoad< @@ -214,16 +214,16 @@ struct AgentUniqueByKey // Utility functions 
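    // A minimal standalone sketch of the tag-dispatch idiom used by the
    // utilities below (all names in this sketch are placeholders, not part of
    // the patch): the empty tag structs exist only so that one templated
    // routine can ask an overloaded getter for the matching staging buffer.
    //
    //   #include <cstdio>
    //
    //   struct KeyTag   {};
    //   struct ValueTag {};
    //
    //   struct Stage
    //   {
    //       int   keys[4];
    //       float values[4];
    //
    //       int   *Get(KeyTag)   { return keys;   }  // overload chosen by tag type
    //       float *Get(ValueTag) { return values; }
    //
    //       template <typename Tag, typename T>
    //       void Put(Tag tag, int i, T v) { Get(tag)[i] = v; }  // one routine, two buffers
    //   };
    //
    //   int main()
    //   {
    //       Stage s;
    //       s.Put(KeyTag(),   0, 7);     // writes s.keys[0]
    //       s.Put(ValueTag(), 0, 2.5f);  // writes s.values[0]
    //       std::printf("%d %.1f\n", s.keys[0], s.values[0]);
    //   }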
//--------------------------------------------------------------------- - struct key_tag {}; - struct value_tag {}; + struct KeyTagT {}; + struct ValueTagT {}; __device__ __forceinline__ - KeyExchangeT &get_shared(key_tag) + KeyExchangeT &GetShared(KeyTagT) { return temp_storage.shared_keys.Alias(); } __device__ __forceinline__ - ValueExchangeT &get_shared(value_tag) + ValueExchangeT &GetShared(ValueTagT) { return temp_storage.shared_values.Alias(); } @@ -253,7 +253,7 @@ struct AgentUniqueByKey num_selections_prefix; if (selection_flags[ITEM]) { - get_shared(tag)[local_scatter_offset] = items[ITEM]; + GetShared(tag)[local_scatter_offset] = items[ITEM]; } } @@ -263,7 +263,7 @@ struct AgentUniqueByKey item < num_tile_selections; item += BLOCK_THREADS) { - items_out[num_selections_prefix + item] = get_shared(tag)[item]; + items_out[num_selections_prefix + item] = GetShared(tag)[item]; } CTA_SYNC(); @@ -364,7 +364,7 @@ struct AgentUniqueByKey CTA_SYNC(); - Scatter(key_tag(), + Scatter(KeyTagT(), d_keys_out, keys, selection_flags, @@ -376,7 +376,7 @@ struct AgentUniqueByKey CTA_SYNC(); - Scatter(value_tag(), + Scatter(ValueTagT(), d_values_out, values, selection_flags, @@ -481,7 +481,7 @@ struct AgentUniqueByKey CTA_SYNC(); - Scatter(key_tag(), + Scatter(KeyTagT(), d_keys_out, keys, selection_flags, @@ -493,7 +493,7 @@ struct AgentUniqueByKey CTA_SYNC(); - Scatter(value_tag(), + Scatter(ValueTagT(), d_values_out, values, selection_flags, diff --git a/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/device/dispatch/dispatch_merge_sort.cuh index ee2b790d6c..0b480899c5 100644 --- a/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/device/dispatch/dispatch_merge_sort.cuh @@ -622,7 +622,7 @@ struct DispatchMergeSort : SelectedPolicy static_cast(max_shmem); virtual_shared_memory_size = - VshmemSize(static_cast(max_shmem), + detail::VshmemSize(static_cast(max_shmem), (cub::max)(block_sort_shmem_size, merge_shmem_size), static_cast(num_tiles)); } diff --git a/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/device/dispatch/dispatch_unique_by_key.cuh index 824e8f8433..8597e3c821 100644 --- a/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -31,9 +31,6 @@ * cub::DeviceSelect::UniqueByKey provides device-wide, parallel operations for selecting unique items by key from sequences of data items residing within device-accessible memory. 
*/ -// TODO: remove this note: -// copy-pasted from https://github.com/NVIDIA/thrust/blob/main/thrust/system/cuda/detail/unique_by_key.h - #include "../../agent/agent_unique_by_key.cuh" #include "../../util_math.cuh" #include "../../util_macro.cuh" @@ -126,7 +123,7 @@ struct DeviceUniqueByKeyPolicy NOMINAL_4B_ITEMS_PER_THREAD = 11, ITEMS_PER_THREAD = Nominal4BItemsToItems(NOMINAL_4B_ITEMS_PER_THREAD), }; - + using UniqueByKeyPolicyT = AgentUniqueByKeyPolicy<64, ITEMS_PER_THREAD, cub::BLOCK_LOAD_WARP_TRANSPOSE, @@ -223,13 +220,13 @@ struct DispatchUniqueByKey: SelectedPolicy cudaError_t Invoke(InitKernel init_kernel, ScanKernel scan_kernel) { #ifndef CUB_RUNTIME_ENABLED - + (void)init_kernel; (void)scan_kernel; // Kernel launch not supported from this device return CubDebug(cudaErrorNotSupported); - + #else using Policy = typename ActivePolicyT::UniqueByKeyPolicyT; @@ -261,7 +258,7 @@ struct DispatchUniqueByKey: SelectedPolicy { break; } - std::size_t vshmem_size = VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles); + std::size_t vshmem_size = detail::VshmemSize(max_shmem, sizeof(typename UniqueByKeyAgentT::TempStorage), num_tiles); // Specify temporary storage allocation requirements size_t allocation_sizes[2] = {0, vshmem_size}; @@ -355,10 +352,10 @@ struct DispatchUniqueByKey: SelectedPolicy while(0); return error; - + #endif // CUB_RUNTIME_ENABLED } - + template CUB_RUNTIME_FUNCTION __host__ __forceinline__ cudaError_t Invoke() @@ -379,7 +376,7 @@ struct DispatchUniqueByKey: SelectedPolicy ); } - + /** * Internal dispatch routine */ @@ -430,94 +427,4 @@ struct DispatchUniqueByKey: SelectedPolicy } }; - -// template -// THRUST_RUNTIME_FUNCTION -// pair -// unique_by_key(execution_policy& policy, -// KeyInputIteratorT keys_first, -// KeyInputIteratorT keys_last, -// ValueInputIteratorT values_first, -// KeyOutputIteratorT keys_result, -// ValueOutputIteratorT values_result, -// EqualityOpT binary_pred) -// { - -// typedef int size_type; - -// size_type num_items -// = static_cast(thrust::distance(keys_first, keys_last)); - -// size_t temp_storage_bytes = 0; -// cudaStream_t stream = cuda_cub::stream(policy); -// bool debug_sync = THRUST_DEBUG_SYNC_FLAG; - -// cudaError_t status; -// status = __unique_by_key::doit_step(NULL, -// temp_storage_bytes, -// keys_first, -// values_first, -// keys_result, -// values_result, -// binary_pred, -// reinterpret_cast(NULL), -// num_items, -// stream, -// debug_sync); -// cuda_cub::throw_on_error(status, "unique_by_key: failed on 1st step"); - -// size_t allocation_sizes[2] = {sizeof(size_type), temp_storage_bytes}; -// void * allocations[2] = {NULL, NULL}; - -// size_t storage_size = 0; -// status = core::alias_storage(NULL, -// storage_size, -// allocations, -// allocation_sizes); -// cuda_cub::throw_on_error(status, "unique_by_key failed on 1st alias_storage"); - -// // Allocate temporary storage. 
-// thrust::detail::temporary_array -// tmp(policy, storage_size); -// void *ptr = static_cast(tmp.data().get()); - -// status = core::alias_storage(ptr, -// storage_size, -// allocations, -// allocation_sizes); -// cuda_cub::throw_on_error(status, "unique_by_key failed on 2nd alias_storage"); - -// size_type* d_num_selected_out -// = thrust::detail::aligned_reinterpret_cast(allocations[0]); - -// status = __unique_by_key::doit_step(allocations[1], -// temp_storage_bytes, -// keys_first, -// values_first, -// keys_result, -// values_result, -// binary_pred, -// d_num_selected_out, -// num_items, -// stream, -// debug_sync); -// cuda_cub::throw_on_error(status, "unique_by_key: failed on 2nd step"); - -// status = cuda_cub::synchronize(policy); -// cuda_cub::throw_on_error(status, "unique_by_key: failed to synchronize"); - -// size_type num_selected = get_value(policy, d_num_selected_out); - -// return thrust::make_pair( -// keys_result + num_selected, -// values_result + num_selected -// ); -// } - - CUB_NAMESPACE_END diff --git a/cub/util_math.cuh b/cub/util_math.cuh index 72d8c816f2..46470cfba2 100644 --- a/cub/util_math.cuh +++ b/cub/util_math.cuh @@ -47,6 +47,21 @@ using is_integral_or_enum = std::integral_constant::value || std::is_enum::value>; +__host__ __device__ __forceinline__ constexpr std::size_t +VshmemSize(std::size_t max_shmem, + std::size_t shmem_per_block, + std::size_t num_blocks) +{ + if (shmem_per_block > max_shmem) + { + return shmem_per_block * num_blocks; + } + else + { + return 0; + } +} + } /** @@ -67,22 +82,6 @@ DivideAndRoundUp(NumeratorT n, DenominatorT d) return static_cast(n / d + (n % d != 0 ? 1 : 0)); } - -__host__ __device__ __forceinline__ constexpr std::size_t -VshmemSize(std::size_t max_shmem, - std::size_t shmem_per_block, - std::size_t num_blocks) -{ - if (shmem_per_block > max_shmem) - { - return shmem_per_block * num_blocks; - } - else - { - return 0; - } -} - constexpr __device__ __host__ int Nominal4BItemsToItemsCombined(int nominal_4b_items_per_thread, int combined_bytes) { diff --git a/test/test_device_select_unique_by_key.cu b/test/test_device_select_unique_by_key.cu index 504c6bb2bc..d8db2f7935 100644 --- a/test/test_device_select_unique_by_key.cu +++ b/test/test_device_select_unique_by_key.cu @@ -35,16 +35,10 @@ #include #include -#include -#include - #include #include #include -#include -#include - #include "test_util.h" using namespace cub; @@ -64,7 +58,6 @@ CachingDeviceAllocator g_allocator(true); enum Backend { CUB, // CUB method - THRUST, // Thrust method CDP, // GPU-based (dynamic parallelism) dispatch to CUB method }; @@ -104,74 +97,6 @@ cudaError_t Dispatch( return error; } - -//--------------------------------------------------------------------- -// Dispatch to different Thrust entrypoints -//--------------------------------------------------------------------- - - -/** - * Dispatch to unique entrypoint - */ -template -__host__ __forceinline__ -cudaError_t Dispatch( - Int2Type /*dispatch_to*/, - int timing_timing_iterations, - size_t */*d_temp_storage_bytes*/, - cudaError_t */*d_cdp_error*/, - - void *d_temp_storage, - size_t &temp_storage_bytes, - KeyInputIteratorT d_keys_in, - ValueInputIteratorT d_values_in, - KeyOutputIteratorT d_keys_out, - ValueOutputIteratorT d_values_out, - NumSelectedIteratorT d_num_selected_out, - OffsetT num_items, - cudaStream_t /*stream*/, - bool /*debug_synchronous*/) -{ - // The input types - typedef typename std::iterator_traits::value_type InputKeyT; - typedef typename 
std::iterator_traits::value_type InputValueT; - - // The output types - using OutputKeyT = typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type; // ... else the output iterator's value type - using OutputValueT = typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? - typename std::iterator_traits::value_type, // ... then the input iterator's value type, - typename std::iterator_traits::value_type>::Type; // ... else the output iterator's value type - - if (d_temp_storage == 0) - { - temp_storage_bytes = 1; - } - else - { - THRUST_NS_QUALIFIER::device_ptr d_keys_in_wrapper(d_keys_in); - THRUST_NS_QUALIFIER::device_ptr d_values_in_wrapper(d_values_in); - using OutputKeyIterT = THRUST_NS_QUALIFIER::device_ptr; - using OutputValueIterT = THRUST_NS_QUALIFIER::device_ptr; - OutputKeyIterT d_keys_out_wrapper(d_keys_out); - OutputValueIterT d_values_out_wrapper(d_values_out); - THRUST_NS_QUALIFIER::pair d_out_wrapper_end; - for (int i = 0; i < timing_timing_iterations; ++i) - { - d_out_wrapper_end = THRUST_NS_QUALIFIER::unique_by_key_copy(d_keys_in_wrapper, d_keys_in_wrapper + num_items, d_values_in_wrapper, d_keys_out_wrapper, d_values_out_wrapper); - } - - OffsetT num_selected = OffsetT(d_out_wrapper_end.first - d_keys_out_wrapper); - CubDebugExit(cudaMemcpy(d_num_selected_out, &num_selected, sizeof(OffsetT), cudaMemcpyHostToDevice)); - - } - - return cudaSuccess; -} - - - //--------------------------------------------------------------------- // CUDA Nested Parallelism Test Kernel //--------------------------------------------------------------------- @@ -466,7 +391,7 @@ void TestPointer( int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); printf("\nPointer %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements, entropy_reduction %d\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? "CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(KeyT).name(), (int) sizeof(KeyT), @@ -518,7 +443,7 @@ void TestIterator( int num_selected = Solve(h_keys_in, h_values_in, h_keys_reference, h_values_reference, num_items); printf("\nIterator %s cub::DeviceSelect::Unique %d items, %d selected (avg run length %.3f), %s %d-byte elements\n", - (BACKEND == CDP) ? "CDP CUB" : (BACKEND == THRUST) ? "Thrust" : "CUB", + (BACKEND == CDP) ? 
"CDP CUB" : "CUB", num_items, num_selected, float(num_items) / num_selected, typeid(KeyT).name(), (int) sizeof(ValueT)); @@ -624,7 +549,6 @@ int main(int argc, char** argv) "[--device=] " "[--maxseg=]" "[--entropy=]" - "[--repeat=]" "[--v] " "[--cdp]" "\n", argv[0]); @@ -636,103 +560,40 @@ int main(int argc, char** argv) g_device_giga_bandwidth = args.device_giga_bandwidth; printf("\n"); -#ifdef CUB_TEST_MINIMAL - - // Compile/run basic CUB test - if (num_items < 0) num_items = 32000000; - TestPointer( num_items, entropy_reduction, maxseg); - -#elif defined(CUB_TEST_BENCHMARK) - - // Get device ordinal - int device_ordinal; - CubDebugExit(cudaGetDevice(&device_ordinal)); - - // Get device SM version - int sm_version = 0; - CubDebugExit(SmVersion(sm_version, device_ordinal)); - - // Compile/run quick tests - if (num_items < 0) num_items = 32000000; - - printf("-- Iterator ----------------------------\n"); - TestIterator( num_items); - TestIterator( num_items); - - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 4), entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - TestPointer( num_items * ((sm_version <= 130) ? 1 : 2), entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items, entropy_reduction, maxseg); - TestPointer( num_items, entropy_reduction, maxseg); - TestPointer( num_items, entropy_reduction, maxseg); - TestPointer( num_items, entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items / 2, entropy_reduction, maxseg); - TestPointer( num_items / 2, entropy_reduction, maxseg); - TestPointer( num_items / 2, entropy_reduction, maxseg); - TestPointer( num_items / 2, entropy_reduction, maxseg); - - printf("----------------------------\n"); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer( num_items / 4, entropy_reduction, maxseg); - TestPointer(num_items / 4, entropy_reduction, maxseg); - -#else - - // Compile/run thorough tests - for (int i = 0; i <= g_repeat; ++i) - { - // Test different input types - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - Test(num_items); - } - -#endif + // Test different input types + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + 
Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); + Test(num_items); return 0; } From d4c948bf6cfc8b6b19ef31863a703547be652a3f Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Wed, 19 Jan 2022 17:30:02 -0800 Subject: [PATCH 3/4] Fix build on C++11 --- cub/util_math.cuh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/cub/util_math.cuh b/cub/util_math.cuh index 46470cfba2..d69fc2ee2d 100644 --- a/cub/util_math.cuh +++ b/cub/util_math.cuh @@ -52,14 +52,7 @@ VshmemSize(std::size_t max_shmem, std::size_t shmem_per_block, std::size_t num_blocks) { - if (shmem_per_block > max_shmem) - { - return shmem_per_block * num_blocks; - } - else - { - return 0; - } + return shmem_per_block > max_shmem ? shmem_per_block * num_blocks : 0; } } From d1604544c0ee83ccc8c128c028fb1f185d1230eb Mon Sep 17 00:00:00 2001 From: "Gao, Xiang" Date: Thu, 20 Jan 2022 22:11:51 -0800 Subject: [PATCH 4/4] fix deprecated --- cub/agent/agent_unique_by_key.cuh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cub/agent/agent_unique_by_key.cuh b/cub/agent/agent_unique_by_key.cuh index 7ca8771033..d5925c43be 100644 --- a/cub/agent/agent_unique_by_key.cuh +++ b/cub/agent/agent_unique_by_key.cuh @@ -33,6 +33,7 @@ #pragma once #include +#include #include "../thread/thread_operators.cuh" #include "../block/block_load.cuh" @@ -108,12 +109,12 @@ struct AgentUniqueByKey }; // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys - using WrappedKeyInputIteratorT = typename std::conditional::VALUE, + using WrappedKeyInputIteratorT = typename std::conditional::value, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator KeyInputIteratorT>::type; // Directly use the supplied input iterator type // Cache-modified Input iterator wrapper type (for applying cache modifier) for values - using WrappedValueInputIteratorT = typename std::conditional::VALUE, + using WrappedValueInputIteratorT = typename std::conditional::value, CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator ValueInputIteratorT>::type; // Directly use the supplied input iterator type
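For reference, a minimal host-side sketch of the entry point this series adds, using the standard CUB two-pass temporary-storage idiom exercised by the test above. The array names, element types, and lack of error handling are illustrative assumptions, not taken from the patch; only the cub::DeviceSelect::UniqueByKey call itself mirrors the dispatch used in test_device_select_unique_by_key.cu.

#include <cub/device/device_select.cuh>
#include <cuda_runtime.h>

// Hypothetical inputs: d_keys_in/d_values_in hold num_items device elements;
// d_keys_out/d_values_out have room for num_items; d_num_selected_out is one int.
void UniqueByKeyExample(int   *d_keys_in,  float *d_values_in,
                        int   *d_keys_out, float *d_values_out,
                        int   *d_num_selected_out,
                        int    num_items)
{
    void   *d_temp_storage     = NULL;
    size_t  temp_storage_bytes = 0;

    // First pass: d_temp_storage is NULL, so only temp_storage_bytes is written.
    cub::DeviceSelect::UniqueByKey(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_values_in, d_keys_out, d_values_out,
        d_num_selected_out, num_items,
        0, false);   // default stream, no debug synchronization

    cudaMalloc(&d_temp_storage, temp_storage_bytes);

    // Second pass: keeps the first (key, value) pair of every run of equal keys
    // and writes the number of runs to *d_num_selected_out.
    cub::DeviceSelect::UniqueByKey(
        d_temp_storage, temp_storage_bytes,
        d_keys_in, d_values_in, d_keys_out, d_values_out,
        d_num_selected_out, num_items,
        0, false);

    cudaFree(d_temp_storage);
}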