diff --git a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh index 51a88179331..3dad811958e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_histogram.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_histogram.cuh @@ -138,7 +138,7 @@ struct sm100_tuning; // even template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct sm100_tuning { // ipt_12.tpb_928.rle_0.ws_0.mem_1.ld_2.laid_0.vec_2 1.033332 0.940517 1.031835 1.195876 static constexpr int items = 12; @@ -148,30 +148,14 @@ struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_CA; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 2; + static constexpr int vec_size = 1 << 2; }; -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; - -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; - -// same as base -template -struct sm100_tuning<1, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks // range template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_1> +struct sm100_tuning { // ipt_12.tpb_448.rle_0.ws_0.mem_1.ld_1.laid_0.vec_2 1.078987 0.985542 1.085118 1.175637 static constexpr int items = 12; @@ -181,108 +165,12 @@ struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, s static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; static constexpr CacheLoadModifier load_modifier = LOAD_LDG; static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 2; -}; - -// same as base -template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; - -template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_4> -{ - // ipt_9.tpb_1024.rle_1.ws_0.mem_1.ld_0.laid_1.vec_0 1.358537 1.001009 1.373329 2.614104 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_DEFAULT; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 0; -}; - -template -struct sm100_tuning<0, SampleT, 1, 1, counter_size::_4, primitive_sample::yes, sample_size::_8> -{ - // ipt_7.tpb_544.rle_1.ws_0.mem_1.ld_1.laid_0.vec_0 1.105331 0.934888 1.108557 1.391657 - static constexpr int items = 7; - static constexpr int threads = 544; - static constexpr bool rle_compress = true; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_DIRECT; - static constexpr int tune_vec_size = 1 << 0; -}; - -// multi.even -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> -{ - // ipt_9.tpb_1024.rle_0.ws_0.mem_1.ld_1.laid_1.vec_0 1.629591 0.997416 1.570900 2.772504 - static constexpr int items = 9; - static constexpr int threads = 1024; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 0; -}; - -// same as base -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; - -// same as base -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; - -// same as base -template -struct sm100_tuning<1, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; - -// multi.range -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_1> -{ - // ipt_7.tpb_160.rle_0.ws_0.mem_1.ld_1.laid_1.vec_1 1.210837 0.99556 1.189049 1.939584 - static constexpr int items = 7; - static constexpr int threads = 160; - static constexpr bool rle_compress = false; - static constexpr bool work_stealing = false; - static constexpr BlockHistogramMemoryPreference mem_preference = SMEM; - static constexpr CacheLoadModifier load_modifier = LOAD_LDG; - static constexpr BlockLoadAlgorithm load_algorithm = BLOCK_LOAD_WARP_TRANSPOSE; - static constexpr int tune_vec_size = 1 << 1; + static constexpr int vec_size = 1 << 2; }; -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_2> - : sm90_tuning -{}; - -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_4> - : sm90_tuning -{}; +// sample_size 2/4/8 showed no benefit over SM90 during verification benchmarks -// same as base -template -struct sm100_tuning<0, SampleT, 4, 3, counter_size::_4, primitive_sample::yes, sample_size::_8> - : sm90_tuning -{}; +// multi.even and multi.range: none of the found tunings surpassed the SM90 tuning during verification benchmarks template struct policy_hub