From d84db8ad03e47377ad4ce767760be883bc52deb7 Mon Sep 17 00:00:00 2001 From: dumerrill Date: Tue, 11 Oct 2016 00:47:53 -0400 Subject: [PATCH] Fixes for rle test failures Former-commit-id: fa35717193b7f11bcc918b4aa5b6b87d462ed4bc --- README.md | 2 +- cub/agent/agent_rle.cuh | 8 ++--- cub/block/block_scan.cuh | 70 ++++++++++++++++++++++++++++++++++++++++ test/test_util.h | 4 +-- 4 files changed, 77 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a2aa79712a..42410bf571 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

About CUB

-Current release: v1.5.2 (03/21/2016) +Current release: v1.5.3 (10/11/2016) We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples. diff --git a/cub/agent/agent_rle.cuh b/cub/agent/agent_rle.cuh index 400e8b8486..68a633f8f0 100644 --- a/cub/agent/agent_rle.cuh +++ b/cub/agent/agent_rle.cuh @@ -367,7 +367,7 @@ struct AgentRle LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) { // Perform warpscans - int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); int lane_id = LaneId(); LengthOffsetPair identity; @@ -422,7 +422,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - unsigned int lane_id = LaneId(); + int lane_id = LaneId(); // Locally compact items within the warp (first warp) if (warp_id == 0) @@ -479,7 +479,7 @@ struct AgentRle Int2Type is_warp_time_slice) { unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); - unsigned int lane_id = LaneId(); + int lane_id = LaneId(); // Unzip OffsetT run_offsets[ITEMS_PER_THREAD]; @@ -733,7 +733,7 @@ struct AgentRle // First warp computes tile prefix in lane 0 TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.prefix, Sum(), tile_idx); - int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); if (warp_id == 0) { prefix_op(tile_aggregate); diff --git a/cub/block/block_scan.cuh b/cub/block/block_scan.cuh index 426c85ecfa..4bc180faa8 100644 --- a/cub/block/block_scan.cuh +++ b/cub/block/block_scan.cuh @@ -1115,6 +1115,7 @@ public: //@} end member group #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + /******************************************************************//** * \name Exclusive prefix scan operations (no initial value, single datum per thread) *********************************************************************/ @@ -1161,10 +1162,79 @@ public: InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); } + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = ThreadReduce(input, scan_op); + + // Exclusive threadblock-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } //@} end member group #endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + /******************************************************************//** * \name Inclusive prefix sum operations *********************************************************************/ diff --git a/test/test_util.h b/test/test_util.h index b35194269d..f29595f23a 100644 --- a/test/test_util.h +++ b/test/test_util.h @@ -1430,9 +1430,9 @@ void DisplayResults( /** * Print the contents of a host array */ -template <typename T> +template <typename InputIteratorT> void DisplayResults( - T *h_data, + InputIteratorT h_data, size_t num_items) { // Display data