Skip to content

Commit

Permalink
Internalize triple_chevron (#3648)
Browse files Browse the repository at this point in the history
  • Loading branch information
bernhardmgruber authored Feb 3, 2025
1 parent 711ee01 commit e586c3b
Show file tree
Hide file tree
Showing 22 changed files with 90 additions and 87 deletions.
4 changes: 2 additions & 2 deletions cub/cub/detail/launcher/cuda_runtime.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@ namespace detail

struct TripleChevronFactory
{
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron operator()(
CUB_RUNTIME_FUNCTION THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron operator()(
dim3 grid, dim3 block, _CUDA_VSTD::size_t shared_mem, cudaStream_t stream, bool dependent_launch = false) const
{
return THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(grid, block, shared_mem, stream, dependent_launch);
return THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(grid, block, shared_mem, stream, dependent_launch);
}

CUB_RUNTIME_FUNCTION cudaError_t PtxVersion(int& version)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_adjacent_difference.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ struct DispatchAdjacentDifference
reinterpret_cast<long long>(stream));
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, init_block_size, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, init_block_size, 0, stream)
.doit(detail::adjacent_difference::
DeviceAdjacentDifferenceInitKernel<AgentDifferenceInitT, InputIteratorT, InputT, OffsetT>,
d_input,
Expand Down Expand Up @@ -234,7 +234,7 @@ struct DispatchAdjacentDifference
reinterpret_cast<long long>(stream));
#endif // CUB_DEBUG_LOG

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_tiles, AdjacentDifferencePolicyT::BLOCK_THREADS, 0, stream)
.doit(detail::adjacent_difference::DeviceAdjacentDifferenceDifferenceKernel<
typename PolicyHub::MaxPolicy,
Expand Down
6 changes: 3 additions & 3 deletions cub/cub/device/dispatch/dispatch_batch_memcpy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -548,7 +548,7 @@ struct DispatchBatchMemcpy
#endif

// Invoke init_kernel to initialize buffer prefix sum-tile descriptors
error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_scan_states_kernel, buffer_scan_tile_state, block_scan_tile_state, num_tiles);

// Check for failure to launch
Expand Down Expand Up @@ -578,7 +578,7 @@ struct DispatchBatchMemcpy
// Invoke kernel to copy small buffers and put the larger ones into a queue that will get picked
// up by next kernel
error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
batch_memcpy_grid_size, ActivePolicyT::AgentSmallBufferPolicyT::BLOCK_THREADS, 0, stream)
.doit(batch_memcpy_non_blev_kernel,
input_buffer_it,
Expand Down Expand Up @@ -615,7 +615,7 @@ struct DispatchBatchMemcpy
#endif

error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(batch_memcpy_blev_grid_size, BLEV_BLOCK_THREADS, 0, stream)
.doit(multi_block_memcpy_kernel,
d_blev_src_buffers,
d_blev_dst_buffers,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ struct dispatch_t
static_cast<int>(items_per_thread));
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
.doit(detail::for_each::dynamic_kernel<max_policy_t, OffsetT, OpT>, num_items, op);
error = CubDebug(error);
Expand Down Expand Up @@ -150,7 +150,7 @@ struct dispatch_t
static_cast<int>(items_per_thread));
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<unsigned int>(num_tiles), static_cast<unsigned int>(block_threads), 0, stream)
.doit(detail::for_each::static_kernel<typename PolicyHubT::MaxPolicy, OffsetT, OpT>, num_items, op);
error = CubDebug(error);
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_for_each_in_extents.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ public:
items_per_thread);
# endif
auto status =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
.doit(detail::for_each_in_extents::
static_kernel<max_policy_t, OpType, ExtentsType, decltype(sub_sizes_div_array), Ranks...>,
_op,
Expand Down Expand Up @@ -160,7 +160,7 @@ public:
_stream,
items_per_thread);
# endif
status = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_cta, block_threads, 0, _stream)
status = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_cta, block_threads, 0, _stream)
.doit(kernel, _op, _ext, sub_sizes_div_array, extents_div_array);
_CUB_RETURN_IF_ERROR(status)
_CUB_RETURN_IF_STREAM_ERROR(_stream)
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_histogram.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ struct dispatch_histogram
#endif // CUB_DEBUG_LOG

// Invoke histogram_init_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
histogram_init_grid_dims, histogram_init_block_threads, 0, stream)
.doit(histogram_init_kernel, num_output_bins_wrapper, d_output_histograms_wrapper, tile_queue);

Expand All @@ -467,7 +467,7 @@ struct dispatch_histogram
#endif // CUB_DEBUG_LOG

// Invoke histogram_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(sweep_grid_dims, block_threads, 0, stream)
.doit(histogram_sweep_kernel,
d_samples,
num_output_bins_wrapper,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_merge.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ struct dispatch_t
const int partition_grid_size = static_cast<int>(::cuda::ceil_div(num_partitions, threads_per_partition_block));

auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
partition_grid_size, threads_per_partition_block, 0, stream)
.doit(device_partition_merge_path_kernel<
max_policy_t,
Expand Down Expand Up @@ -253,7 +253,7 @@ struct dispatch_t
{
auto vshmem_ptr = vsmem_t{allocations[1]};
auto error = CubDebug(
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
static_cast<int>(num_tiles), static_cast<int>(agent_t::policy::BLOCK_THREADS), 0, stream)
.doit(
device_merge_kernel<max_policy_t, KeyIt1, ValueIt1, KeyIt2, ValueIt2, KeyIt3, ValueIt3, Offset, CompareOp>,
Expand Down
17 changes: 8 additions & 9 deletions cub/cub/device/dispatch/dispatch_radix_sort.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -986,8 +986,7 @@ struct DispatchRadixSort
#endif

// Invoke upsweep_kernel with same grid size as downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream)
.doit(single_tile_kernel,
d_keys.Current(),
d_keys.Alternate(),
Expand Down Expand Up @@ -1060,7 +1059,7 @@ struct DispatchRadixSort
int pass_spine_length = pass_config.even_share.grid_size * pass_config.radix_digits;

// Invoke upsweep_kernel with same grid size as downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, 0, stream)
.doit(pass_config.upsweep_kernel,
d_keys_in,
Expand Down Expand Up @@ -1095,7 +1094,7 @@ struct DispatchRadixSort
#endif

// Invoke scan_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(1, pass_config.scan_config.block_threads, 0, stream)
.doit(pass_config.scan_kernel, d_spine, pass_spine_length);

// Check for failure to launch
Expand Down Expand Up @@ -1123,7 +1122,7 @@ struct DispatchRadixSort
#endif

// Invoke downsweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, 0, stream)
.doit(pass_config.downsweep_kernel,
d_keys_in,
Expand Down Expand Up @@ -1325,7 +1324,7 @@ struct DispatchRadixSort
ActivePolicyT::HistogramPolicy::RADIX_BITS);
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
histo_blocks_per_sm * num_sms, HISTO_BLOCK_THREADS, 0, stream)
.doit(histogram_kernel, d_bins, d_keys.Current(), num_items, begin_bit, end_bit, decomposer);
error = CubDebug(error);
Expand All @@ -1352,7 +1351,7 @@ struct DispatchRadixSort
ActivePolicyT::ExclusiveSumPolicy::RADIX_BITS);
#endif

error = THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
error = THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_passes, SCAN_BLOCK_THREADS, 0, stream)
.doit(detail::radix_sort::DeviceRadixSortExclusiveSumKernel<max_policy_t, OffsetT>, d_bins);
error = CubDebug(error);
if (cudaSuccess != error)
Expand Down Expand Up @@ -1416,7 +1415,7 @@ struct DispatchRadixSort
DecomposerT>;

error =
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(num_blocks, ONESWEEP_BLOCK_THREADS, 0, stream)
.doit(onesweep_kernel,
d_lookback,
d_ctrs + portion * num_passes + pass,
Expand Down Expand Up @@ -2026,7 +2025,7 @@ struct DispatchSegmentedRadixSort
pass_bits);
#endif

THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_segments, pass_config.segmented_config.block_threads, 0, stream)
.doit(pass_config.segmented_kernel,
d_keys_in,
Expand Down
2 changes: 1 addition & 1 deletion cub/cub/device/dispatch/dispatch_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -896,7 +896,7 @@ struct DispatchSegmentedReduce
#endif // CUB_DEBUG_LOG

// Invoke DeviceReduceKernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(
num_segments, ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, 0, stream)
.doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, num_segments, reduction_op, init);

Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_reduce_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,7 @@ struct DispatchReduceByKey
#endif // CUB_DEBUG_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_kernel, tile_state, num_tiles, d_num_runs_out);

// Check for failure to launch
Expand Down Expand Up @@ -408,7 +408,7 @@ struct DispatchReduceByKey
#endif // CUB_DEBUG_LOG

// Invoke reduce_by_key_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
.doit(reduce_by_key_kernel,
d_keys_in,
d_unique_out,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_rle.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ struct DeviceRleDispatch
#endif // CUB_DEBUG_LOG

// Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(device_scan_init_kernel, tile_status, num_tiles, d_num_runs_out);

// Check for failure to launch
Expand Down Expand Up @@ -422,7 +422,7 @@ struct DeviceRleDispatch
#endif // CUB_DEBUG_LOG

// Invoke device_rle_sweep_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, block_threads, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, block_threads, 0, stream)
.doit(device_rle_sweep_kernel,
d_in,
d_offsets_out,
Expand Down
4 changes: 2 additions & 2 deletions cub/cub/device/dispatch/dispatch_scan_by_key.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -415,7 +415,7 @@ struct DispatchScanByKey
#endif // CUB_DEBUG_LOG

// Invoke init_kernel to initialize tile descriptors
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(init_grid_size, INIT_KERNEL_THREADS, 0, stream)
.doit(init_kernel, tile_state, d_keys_in, d_keys_prev_in, static_cast<OffsetT>(tile_size), num_tiles);

// Check for failure to launch
Expand Down Expand Up @@ -456,7 +456,7 @@ struct DispatchScanByKey
#endif // CUB_DEBUG_LOG

// Invoke scan_kernel
THRUST_NS_QUALIFIER::cuda_cub::launcher::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
THRUST_NS_QUALIFIER::cuda_cub::detail::triple_chevron(scan_grid_size, Policy::BLOCK_THREADS, 0, stream)
.doit(scan_kernel,
d_keys_in,
d_keys_prev_in,
Expand Down
Loading

0 comments on commit e586c3b

Please sign in to comment.