Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Support future value for initial value for device scan #305

Merged
merged 1 commit into from
Oct 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 37 additions & 4 deletions cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,13 @@ struct DeviceScan
// Initial value
OutputT init_value = 0;

return DispatchScan<InputIteratorT, OutputIteratorT, Sum, OutputT, OffsetT>::Dispatch(
return DispatchScan<InputIteratorT, OutputIteratorT, Sum, detail::InputValue<OutputT>, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
Sum(),
init_value,
detail::InputValue<OutputT>(init_value),
num_items,
stream,
debug_synchronous);
Expand Down Expand Up @@ -249,13 +249,46 @@ struct DeviceScan
// Signed integer type for global offsets
typedef int OffsetT;

return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, InitValueT, OffsetT>::Dispatch(
return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
init_value,
detail::InputValue<InitValueT>(init_value),
num_items,
stream,
debug_synchronous);
}

template <
typename InputIteratorT,
typename OutputIteratorT,
typename ScanOpT,
typename InitValueT,
typename InitValueIterT=InitValueT*>
CUB_RUNTIME_FUNCTION
static cudaError_t ExclusiveScan(
void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items
OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items
ScanOpT scan_op, ///< [in] Binary scan functor
FutureValue<InitValueT, InitValueIterT> init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out)
int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in)
cudaStream_t stream = 0, ///< [in] <b>[optional]</b> CUDA stream to launch kernels within. Default is stream<sub>0</sub>.
bool debug_synchronous = false) ///< [in] <b>[optional]</b> Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false.
{
// Signed integer type for global offsets
typedef int OffsetT;

return DispatchScan<InputIteratorT, OutputIteratorT, ScanOpT, detail::InputValue<InitValueT>, OffsetT>::Dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
scan_op,
detail::InputValue<InitValueT>(init_value),
num_items,
stream,
debug_synchronous);
Expand Down
16 changes: 9 additions & 7 deletions cub/device/dispatch/dispatch_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@

CUB_NAMESPACE_BEGIN


/******************************************************************************
* Kernel entry points
*****************************************************************************/
Expand Down Expand Up @@ -108,6 +107,7 @@ __global__ void DeviceScanKernel(
InitValueT init_value, ///< Initial value to seed the exclusive scan
OffsetT num_items) ///< Total number of scan items for the entire problem
{
using RealInitValueT = typename InitValueT::value_type;
typedef typename ChainedPolicyT::ActivePolicy::ScanPolicyT ScanPolicyT;

// Thread block type for scanning input tiles
Expand All @@ -116,14 +116,16 @@ __global__ void DeviceScanKernel(
InputIteratorT,
OutputIteratorT,
ScanOpT,
InitValueT,
RealInitValueT,
OffsetT> AgentScanT;

// Shared memory for AgentScan
__shared__ typename AgentScanT::TempStorage temp_storage;

RealInitValueT real_init_value = init_value;

// Process tiles
AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange(
AgentScanT(temp_storage, d_in, d_out, scan_op, real_init_value).ConsumeRange(
num_items,
tile_state,
start_tile);
Expand Down Expand Up @@ -205,13 +207,13 @@ template <
typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator
typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator
typename ScanOpT, ///< Binary scan functor type having member <tt>T operator()(const T &a, const T &b)</tt>
typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans)
typename OffsetT, ///< Signed integer type for global offsets
typename SelectedPolicy = DeviceScanPolicy<
// Accumulator type.
typename If<Equals<InitValueT, NullType>::VALUE,
typename std::iterator_traits<InputIteratorT>::value_type,
InitValueT>::Type>>
typename InitValueT::value_type>::Type>>
struct DispatchScan:
SelectedPolicy
{
Expand All @@ -228,10 +230,10 @@ struct DispatchScan:
using InputT = typename std::iterator_traits<InputIteratorT>::value_type;

// The output value type -- used as the intermediate accumulator
// Per https://wg21.link/P0571, use InitValueT if provided, otherwise the
// Per https://wg21.link/P0571, use InitValueT::value_type if provided, otherwise the
// input iterator's value type.
using OutputT =
typename If<Equals<InitValueT, NullType>::VALUE, InputT, InitValueT>::Type;
typename If<Equals<InitValueT, NullType>::VALUE, InputT, typename InitValueT::value_type>::Type;

void* d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
size_t& temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation
Expand Down
78 changes: 77 additions & 1 deletion cub/util_type.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ struct RemoveQualifiers<Tp, const volatile Up>
*/
struct NullType
{
using value_type = NullType;
#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

template <typename T>
Expand All @@ -273,9 +274,84 @@ struct NullType
template <int A>
struct Int2Type
{
enum {VALUE = A};
enum {VALUE = A};
};

/**
* \brief Allows algorithms that take a value as input to take a future value that is not computed yet at launch time.
*
* Note that it is user's responsibility to ensure that the result will be ready before use via external synchronization
* or stream-ordering dependencies.
zasdfgbnm marked this conversation as resolved.
Show resolved Hide resolved
*
* \code
* int *d_intermediate_result;
* allocator.DeviceAllocate((void **)&d_intermediate_result, sizeof(int));
* compute_intermediate_result<<<blocks, threads>>>(
* d_intermediate_result, // output
* arg1, // input
* arg2); // input
* cub::FutureValue<int> init_value(d_intermediate_result);
* cub::DeviceScan::ExclusiveScan(
* d_temp_storage,
* temp_storage_bytes,
* d_in,
* d_out,
* cub::Sum(),
* init_value,
* num_items);
* allocator.DeviceFree(d_intermediate_result);
* \endcode
*/
template <typename T, typename IterT = T*>
struct FutureValue
{
using value_type = T;
using iterator_type = IterT;
explicit __host__ __device__ __forceinline__ FutureValue(IterT iter):m_iter(iter) {}
__host__ __device__ __forceinline__ operator T() {
return *m_iter;
}

private:
IterT m_iter;
};

namespace detail {

/**
* \brief Allows algorithms to instantiate a single kernel to support both immediate value and future value.
*/
template <typename T, typename IterT = T*>
zasdfgbnm marked this conversation as resolved.
Show resolved Hide resolved
struct InputValue
{
using value_type = T;
using iterator_type = IterT;
__host__ __device__ __forceinline__ operator T() {
if (m_is_future) {
return m_future_value;
}
return m_immediate_value;
}
explicit __host__ __device__ __forceinline__ InputValue(T immediate_value): m_is_future(false), m_immediate_value(immediate_value) {}
explicit __host__ __device__ __forceinline__ InputValue(FutureValue<T, IterT> future_value): m_is_future(true), m_future_value(future_value) {}
__host__ __device__ __forceinline__ InputValue(const InputValue &other): m_is_future(other.m_is_future) {
if (m_is_future) {
m_future_value = other.m_future_value;
} else {
m_immediate_value = other.m_immediate_value;
}
}

private:
bool m_is_future;
union
{
FutureValue<T, IterT> m_future_value;
T m_immediate_value;
};
};

} // namespace detail

#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document

Expand Down
88 changes: 88 additions & 0 deletions test/test_device_scan.cu
Original file line number Diff line number Diff line change
Expand Up @@ -737,6 +737,92 @@ void Test(
AssertEquals(0, compare);
}

template <typename InitialValueT>
__global__ void FillInitValue(InitialValueT *ptr, InitialValueT initial_value) {
*ptr = initial_value;
}

template <
Backend BACKEND,
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename std::enable_if<!std::is_same<InitialValueT, cub::NullType>::value && BACKEND != THRUST>::type
TestFutureInitValue(
DeviceInputIteratorT d_in,
OutputT *h_reference,
int num_items,
ScanOpT scan_op,
InitialValueT initial_value)
{
// Allocate device initial_value
InitialValueT *d_initial_value = NULL;
CubDebugExit(g_allocator.DeviceAllocate((void**)&d_initial_value, sizeof(InitialValueT)));
FillInitValue<<<1, 1>>>(d_initial_value, initial_value);

// Run test
auto future_init_value = cub::FutureValue<InitialValueT>(d_initial_value);
Test<BACKEND>(d_in, h_reference, num_items, scan_op, future_init_value);

// Cleanup
if (d_initial_value) CubDebugExit(g_allocator.DeviceFree(d_initial_value));
}

template <
Backend BACKEND,
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename std::enable_if<std::is_same<InitialValueT, cub::NullType>::value || BACKEND == THRUST>::type
TestFutureInitValue(
DeviceInputIteratorT,
OutputT *,
int,
ScanOpT,
InitialValueT)
{
// cub::NullType does not have device pointer, so nothing to do here
}

template <
Backend BACKEND,
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename std::enable_if<!std::is_same<InitialValueT, cub::NullType>::value && BACKEND != THRUST>::type
TestFutureInitValueIter(
DeviceInputIteratorT d_in,
OutputT *h_reference,
int num_items,
ScanOpT scan_op,
InitialValueT initial_value)
{
using IterT = cub::ConstantInputIterator<InitialValueT>;
IterT iter(initial_value);
auto future_init_value = cub::FutureValue<InitialValueT, IterT>(iter);
Test<BACKEND>(d_in, h_reference, num_items, scan_op, future_init_value);
}

template <
Backend BACKEND,
typename DeviceInputIteratorT,
typename OutputT,
typename ScanOpT,
typename InitialValueT>
typename std::enable_if<std::is_same<InitialValueT, cub::NullType>::value || BACKEND == THRUST>::type
TestFutureInitValueIter(
DeviceInputIteratorT,
OutputT *,
int,
ScanOpT,
InitialValueT)
{
// cub::NullType does not have device pointer, so nothing to do here
}

template <
Backend BACKEND,
typename DeviceInputIteratorT,
Expand Down Expand Up @@ -825,6 +911,7 @@ void TestPointer(

// Run Test
Test<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
TestFutureInitValue<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);
TestInplace<BACKEND>(d_in, h_reference, num_items, scan_op, initial_value);

// Cleanup
Expand Down Expand Up @@ -868,6 +955,7 @@ void TestIterator(

// Run Test
Test<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);
TestFutureInitValueIter<BACKEND>(h_in, h_reference, num_items, scan_op, initial_value);

// Cleanup
if (h_reference) delete[] h_reference;
Expand Down