From 8d1000c2531f8c59ee4de4a482c1d25f51cb60d7 Mon Sep 17 00:00:00 2001 From: Wayne Franz Date: Thu, 29 Aug 2024 22:53:07 -0400 Subject: [PATCH 01/13] Remove website URL from comments (#600) Referencing or using code from some websites is prohibited in this repository. This change removes an informational reference in the comments. --- scripts/copyright-date/check-copyright.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/copyright-date/check-copyright.sh b/scripts/copyright-date/check-copyright.sh index 04d941b96..fccbdb9ac 100755 --- a/scripts/copyright-date/check-copyright.sh +++ b/scripts/copyright-date/check-copyright.sh @@ -61,7 +61,6 @@ if $forkdiff; then source_commit="remotes/$remote/HEAD" # don't use fork-point for finding fork point (lol) - # see: https://stackoverflow.com/a/53981615 diff_hash="$(git merge-base "$source_commit" "$branch")" fi From 72fac4a8419b0e71ebb47a284a04c8152492f561 Mon Sep 17 00:00:00 2001 From: Nara Date: Tue, 3 Sep 2024 11:03:22 -0700 Subject: [PATCH 02/13] Fix rare memory access faults when using internal serial merge (#597) * test: add tests for internal serial merge function * refactor(detail/merge_path.hpp): removed code duplication * fix(detail/merge_path.hpp): stricter boundary checking in serial merge * fix(detail/block_sort_merge.hpp): fix missing block-wide sync During a previous refactor, serial_merge does no longer do a block sync. This has now been re-added. * feat: add unsafe variant of serial merge * fix: use bounded version for serial merge to fix rare page faults * test(test_internal_merge_path): clean up internal merge path tests * style: standardize range_t<> construction * fix(detail/merge_path.hpp): fix 'range_t<>::count1()' and 'range_t<>::count2()' return types to be same as encapsulated type * perf(detail/merge_path.hpp): use const ref in function parameters * refactor(detail/merge_path.hpp): replace redundant use of 'OffsetT' with 'unsigned int' * chore: update changelog * fix: restore missing thread sync This got removed during a rebase. --- CHANGELOG.md | 6 + .../rocprim/block/detail/block_sort_merge.hpp | 33 ++- rocprim/include/rocprim/detail/merge_path.hpp | 222 +++++++++++------- .../rocprim/device/detail/device_merge.hpp | 65 ++--- .../detail/device_merge_sort_mergepath.hpp | 32 ++- .../rocprim/warp/detail/warp_sort_stable.hpp | 19 +- test/rocprim/CMakeLists.txt | 3 + test/rocprim/internal/CMakeLists.txt | 23 ++ .../internal/test_internal_merge_path.cpp | 64 +++++ 9 files changed, 298 insertions(+), 169 deletions(-) create mode 100644 test/rocprim/internal/CMakeLists.txt create mode 100644 test/rocprim/internal/test_internal_merge_path.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 972b2edca..619dbe2a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,12 @@ Documentation for rocPRIM is available at * Fixed a bug for `rocprim::merge_path_search` where using `unsigned` offsets would output wrong results. * Fixed a bug for `rocprim::thread_load` and `rocprim::thread_store` where `float` and `double` were not casted to the correct type resulting in wrong results. * Fix tests failing when compiling with `-D_GLIBCXX_ASSERTIONS=ON`. +* Fixed a bug for algorithms that use an internal serial merge routine that causes a memory access fault. This may result in a performance drop when using: + * block sort, + * device merge sort (block merge), + * device merge, + * device partial sort, and/or + * device sort (merge sort). ### Deprecations diff --git a/rocprim/include/rocprim/block/detail/block_sort_merge.hpp b/rocprim/include/rocprim/block/detail/block_sort_merge.hpp index 015d67cf0..251844411 100644 --- a/rocprim/include/rocprim/block/detail/block_sort_merge.hpp +++ b/rocprim/include/rocprim/block/detail/block_sort_merge.hpp @@ -25,7 +25,6 @@ #include "../../detail/merge_path.hpp" #include "../../detail/various.hpp" #include "../../warp/detail/warp_sort_stable.hpp" -#include "../../warp/warp_sort.hpp" BEGIN_ROCPRIM_NAMESPACE @@ -385,10 +384,14 @@ class block_sort_merge diag0_local, compare_function); const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; - range_t range_local - = {keys1_beg_local + keys1_beg, keys1_end, keys2_beg_local + keys1_end, keys2_end}; - serial_merge(keys_shared, thread_keys, range_local, compare_function); + range_t<> range_local{keys1_beg_local + keys1_beg, + keys1_end, + keys2_beg_local + keys1_end, + keys2_end}; + + serial_merge(keys_shared, thread_keys, range_local, compare_function); + ::rocprim::syncthreads(); } } @@ -426,15 +429,19 @@ class block_sort_merge diag0_local, compare_function); const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; - range_t range_local - = {keys1_beg_local + keys1_beg, keys1_end, keys2_beg_local + keys1_end, keys2_end}; - - serial_merge(keys_shared, - thread_keys, - values_shared, - thread_values, - range_local, - compare_function); + + range_t<> range_local{keys1_beg_local + keys1_beg, + keys1_end, + keys2_beg_local + keys1_end, + keys2_end}; + + serial_merge(keys_shared, + thread_keys, + values_shared, + thread_values, + range_local, + compare_function); + ::rocprim::syncthreads(); } } }; diff --git a/rocprim/include/rocprim/detail/merge_path.hpp b/rocprim/include/rocprim/detail/merge_path.hpp index 25194290a..8cdd51d33 100644 --- a/rocprim/include/rocprim/detail/merge_path.hpp +++ b/rocprim/include/rocprim/detail/merge_path.hpp @@ -21,10 +21,9 @@ #ifndef ROCPRIM_DETAIL_MERGE_PATH_HPP_ #define ROCPRIM_DETAIL_MERGE_PATH_HPP_ -#include "../intrinsics/thread.hpp" - #include "../config.hpp" +#include #include BEGIN_ROCPRIM_NAMESPACE @@ -32,19 +31,24 @@ BEGIN_ROCPRIM_NAMESPACE namespace detail { +template struct range_t { - unsigned int begin1; - unsigned int end1; - unsigned int begin2; - unsigned int end2; - - ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int count1() const + OffsetT begin1; + OffsetT end1; + OffsetT begin2; + OffsetT end2; + + /// \brief Number of elements in first range. + ROCPRIM_DEVICE ROCPRIM_INLINE + constexpr OffsetT count1() const { return end1 - begin1; } - ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int count2() const + /// \brief Number of elements in second range. + ROCPRIM_DEVICE ROCPRIM_INLINE + constexpr OffsetT count2() const { return end2 - begin2; } @@ -83,108 +87,148 @@ ROCPRIM_HOST_DEVICE ROCPRIM_INLINE OffsetT merge_path(KeysInputIterator1 keys_in return begin; } -template -ROCPRIM_DEVICE ROCPRIM_INLINE void serial_merge(KeyType* keys_shared, - KeyType (&outputs)[ItemsPerThread], - unsigned int (&index)[ItemsPerThread], - range_t range, - BinaryFunction compare_function) +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void serial_merge(KeyType* keys_shared, + const range_t& range, + BinaryFunction compare_function, + OutputFunction output_function) { - KeyType a = keys_shared[range.begin1]; - KeyType b = keys_shared[range.begin2]; + // Pre condition, we're including some edge cases too. + assert(range.begin1 <= range.end1); + assert(range.begin2 <= range.end2); + + // More descriptive names for ranges: + auto idx_a = range.begin1; + auto idx_b = range.begin2; + const auto end_a = range.end1; + const auto end_b = range.end2; + + // Pre-loaded keys so we don't have to re-fetch multiple times from memory. + // These will be updated every iteration. + KeyType key_a; + KeyType key_b; + + // Only load valid keys, otherwise might be out of bounds! + // If we allow unsafe, this check is not done. + if(AllowUnsafe || idx_a < end_a) + { + key_a = keys_shared[idx_a]; + } + if(AllowUnsafe || idx_b < end_b) + { + key_b = keys_shared[idx_b]; + } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { - bool compare = (range.begin2 >= range.end2) - || ((range.begin1 < range.end1) && !compare_function(b, a)); - unsigned int x = compare ? range.begin1 : range.begin2; + // If we don't have any in b, we always take from a. Then, if we don't + // have any in a, we take from b. Otherwise we take the smallest item. + const bool take_a + = (idx_b >= end_b) || ((idx_a < end_a) && !compare_function(key_b, key_a)); - outputs[i] = compare ? a : b; - index[i] = x; + // Retrieve info about the smallest key. + const auto idx = take_a ? idx_a : idx_b; + const auto end = take_a ? end_a : end_b; + const auto key = take_a ? key_a : key_b; - KeyType c = keys_shared[++x]; - if(compare) - { - a = c; - range.begin1 = x; - } - else - { - b = c; - range.begin2 = x; - } - } - ::rocprim::syncthreads(); -} + // Output results. + output_function(i, key, idx); -template -ROCPRIM_DEVICE ROCPRIM_INLINE void serial_merge(KeyType* keys_shared, - KeyType (&outputs)[ItemsPerThread], - range_t range, - BinaryFunction compare_function) -{ - KeyType a = keys_shared[range.begin1]; - KeyType b = keys_shared[range.begin2]; + // Get the next idx, if we allow unsafe we may access out-of-bounds elements. + const auto next_idx = idx + 1; - ROCPRIM_UNROLL - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - bool compare = (range.begin2 >= range.end2) - || ((range.begin1 < range.end1) && !compare_function(b, a)); - unsigned int x = compare ? range.begin1 : range.begin2; + // Load the next item. The compiler *should* be smart enough to optimize + // away the case where we don't have any items to read. + const auto next_key = keys_shared[AllowUnsafe ? next_idx : min(next_idx, end - 1)]; - outputs[i] = compare ? a : b; - - KeyType c = keys_shared[++x]; - if(compare) + // Store the info about the next key. + if(take_a) { - a = c; - range.begin1 = x; + idx_a = next_idx; + key_a = next_key; } else { - b = c; - range.begin2 = x; + idx_b = next_idx; + key_b = next_key; } } - ::rocprim::syncthreads(); + + // We don't finish with a block sync since this may be used on thread or + // warp granularity! } -template -ROCPRIM_DEVICE ROCPRIM_INLINE void serial_merge(KeyType* keys_shared, - KeyType (&outputs)[ItemsPerThread], - ValueType* values_shared, - ValueType (&values)[ItemsPerThread], - range_t range, - BinaryFunction compare_function) +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void serial_merge(KeyType* keys_shared, + KeyType (&outputs)[ItemsPerThread], + unsigned int (&indices)[ItemsPerThread], + const range_t& range, + BinaryFunction compare_function) { - KeyType a = keys_shared[range.begin1]; - KeyType b = keys_shared[range.begin2]; - - ROCPRIM_UNROLL - for(unsigned int i = 0; i < ItemsPerThread; ++i) - { - bool compare = (range.begin2 >= range.end2) - || ((range.begin1 < range.end1) && !compare_function(b, a)); - unsigned int x = compare ? range.begin1 : range.begin2; + serial_merge( + keys_shared, + range, + compare_function, + [&](const unsigned int& i, const KeyType& key, const OffsetT& index) + { + outputs[i] = key; + indices[i] = index; + }); +} - outputs[i] = compare ? a : b; - values[i] = values_shared[x]; +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void serial_merge(KeyType* keys_shared, + KeyType (&outputs)[ItemsPerThread], + const range_t& range, + BinaryFunction compare_function) +{ + serial_merge( + keys_shared, + range, + compare_function, + [&](const unsigned int& i, const KeyType& key, const OffsetT&) { outputs[i] = key; }); +} - KeyType c = keys_shared[++x]; - if(compare) - { - a = c; - range.begin1 = x; - } - else +template +ROCPRIM_DEVICE ROCPRIM_INLINE +void serial_merge(KeyType* keys_shared, + KeyType (&outputs)[ItemsPerThread], + ValueType* values_shared, + ValueType (&values)[ItemsPerThread], + const range_t& range, + BinaryFunction compare_function) +{ + serial_merge( + keys_shared, + range, + compare_function, + [&](const unsigned int& i, const KeyType& key, const OffsetT& index) { - b = c; - range.begin2 = x; - } - } - ::rocprim::syncthreads(); + outputs[i] = key; + values[i] = values_shared[index]; + }); } } // end namespace detail diff --git a/rocprim/include/rocprim/device/detail/device_merge.hpp b/rocprim/include/rocprim/device/detail/device_merge.hpp index e2dae6df4..54abb8b41 100644 --- a/rocprim/include/rocprim/device/detail/device_merge.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge.hpp @@ -40,17 +40,17 @@ namespace detail { ROCPRIM_DEVICE ROCPRIM_INLINE -range_t compute_range(const unsigned int id, - const unsigned int size1, - const unsigned int size2, - const unsigned int spacing, - const unsigned int p1, - const unsigned int p2) +range_t<> compute_range(const unsigned int id, + const unsigned int size1, + const unsigned int size2, + const unsigned int spacing, + const unsigned int p1, + const unsigned int p2) { unsigned int diag1 = id * spacing; unsigned int diag2 = min(size1 + size2, diag1 + spacing); - return range_t{p1, p2, diag1 - p1, diag2 - p2}; + return range_t<>{p1, p2, diag1 - p1, diag2 - p2}; } template< @@ -121,22 +121,20 @@ void load(unsigned int flat_id, ::rocprim::syncthreads(); } -template< - unsigned int BlockSize, - class KeysInputIterator1, - class KeysInputIterator2, - class KeyType, - unsigned int ItemsPerThread, - class BinaryFunction -> +template ROCPRIM_DEVICE ROCPRIM_INLINE -void merge_keys(unsigned int flat_id, +void merge_keys(unsigned int flat_id, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeyType (&key_inputs)[ItemsPerThread], unsigned int (&index)[ItemsPerThread], - KeyType * keys_shared, - range_t range, + KeyType* keys_shared, + range_t<> range, BinaryFunction compare_function) { load( @@ -144,11 +142,7 @@ void merge_keys(unsigned int flat_id, keys_shared, range.count1(), range.count2() ); - range_t range_local = - range_t { - 0, range.count1(), range.count1(), - (range.count1() + range.count2()) - }; + range_t<> range_local{0, range.count1(), range.count1(), (range.count1() + range.count2())}; unsigned int diag = ItemsPerThread * flat_id; unsigned int partition = @@ -161,18 +155,12 @@ void merge_keys(unsigned int flat_id, compare_function ); - range_t range_partition = - range_t { - range_local.begin1 + partition, - range_local.end1, - range_local.begin2 + diag - partition, - range_local.end2 - }; - - serial_merge( - keys_shared, key_inputs, index, range_partition, - compare_function - ); + range_t<> range_partition{range_local.begin1 + partition, + range_local.end1, + range_local.begin2 + diag - partition, + range_local.end2}; + + serial_merge(keys_shared, key_inputs, index, range_partition, compare_function); } template< @@ -316,11 +304,8 @@ void merge_kernel_impl(IndexIterator indices, const unsigned int p1 = indices[rocprim::min(flat_block_id, partitions)]; const unsigned int p2 = indices[rocprim::min(flat_block_id + 1, partitions)]; - range_t range = - compute_range( - flat_block_id, input1_size, input2_size, items_per_block, - p1, p2 - ); + range_t<> range + = compute_range(flat_block_id, input1_size, input2_size, items_per_block, p1, p2); merge_keys( flat_id, keys_input1, keys_input2, input, index, diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp index e9eed52a3..adf552973 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp @@ -192,18 +192,16 @@ namespace detail const unsigned int keys1_end_local = num_keys1; const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; const unsigned int keys2_end_local = num_keys2; - range_t range_local = {keys1_beg_local, - keys1_end_local, - keys2_beg_local + keys1_end_local, - keys2_end_local + keys1_end_local}; + + range_t<> range_local{keys1_beg_local, + keys1_end_local, + keys2_beg_local + keys1_end_local, + keys2_end_local + keys1_end_local}; unsigned int indices[ItemsPerThread]; - serial_merge(keys_shared, - keys, - indices, - range_local, - compare_function); + serial_merge(keys_shared, keys, indices, range_local, compare_function); + rocprim::syncthreads(); if ROCPRIM_IF_CONSTEXPR(with_values){ reg_to_shared(values_shared, values); @@ -330,18 +328,16 @@ namespace detail const unsigned int keys1_end_local = num_keys1; const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; const unsigned int keys2_end_local = num_keys2; - range_t range_local = {keys1_beg_local, - keys1_end_local, - keys2_beg_local + keys1_end_local, - keys2_end_local + keys1_end_local}; + + range_t<> range_local{keys1_beg_local, + keys1_end_local, + keys2_beg_local + keys1_end_local, + keys2_end_local + keys1_end_local}; unsigned int indices[ItemsPerThread]; - serial_merge(keys_shared, - keys, - indices, - range_local, - compare_function); + serial_merge(keys_shared, keys, indices, range_local, compare_function); + rocprim::syncthreads(); if ROCPRIM_IF_CONSTEXPR(with_values) { diff --git a/rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp b/rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp index 0c2f42770..96ff48356 100644 --- a/rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp +++ b/rocprim/include/rocprim/warp/detail/warp_sort_stable.hpp @@ -24,6 +24,7 @@ #include #include "../../config.hpp" +#include "../../detail/merge_path.hpp" #include "../../detail/various.hpp" #include "../../functional.hpp" @@ -154,14 +155,14 @@ class warp_sort_stable const auto keys1_merge_begin = keys1_begin + partition; const auto keys2_merge_begin = keys2_begin + diag - partition; - const range_t range = { + const range_t<> range{ keys1_merge_begin, keys1_end, keys2_merge_begin, keys2_end, }; - serial_merge(shared_keys, thread_keys, range, compare_function); + serial_merge(shared_keys, thread_keys, range, compare_function); wave_barrier(); } @@ -215,19 +216,19 @@ class warp_sort_stable const auto keys1_merge_begin = keys1_begin + partition; const auto keys2_merge_begin = keys2_begin + diag - partition; - const range_t range = { + const range_t<> range{ keys1_merge_begin, keys1_end, keys2_merge_begin, keys2_end, }; - serial_merge(shared_keys, - thread_keys, - shared_values, - thread_values, - range, - compare_function); + serial_merge(shared_keys, + thread_keys, + shared_values, + thread_values, + range, + compare_function); wave_barrier(); } diff --git a/test/rocprim/CMakeLists.txt b/test/rocprim/CMakeLists.txt index 9d7f9a4b2..2385c9026 100644 --- a/test/rocprim/CMakeLists.txt +++ b/test/rocprim/CMakeLists.txt @@ -231,6 +231,9 @@ endfunction() # Tests # **************************************************************************** +# Internal test to check internal behaviour +add_rocprim_test("rocprim.internal_merge_path" "internal/test_internal_merge_path.cpp") + # HIP basic test, which also checks if there are no linkage problems when there are multiple sources add_rocprim_test("rocprim.basic_test" "test_basic.cpp;detail/get_rocprim_version.cpp") diff --git a/test/rocprim/internal/CMakeLists.txt b/test/rocprim/internal/CMakeLists.txt new file mode 100644 index 000000000..fa360ab3f --- /dev/null +++ b/test/rocprim/internal/CMakeLists.txt @@ -0,0 +1,23 @@ +# MIT License +# +# Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +add_rocprim_test("rocprim.internal.merge_path" test_internal_merge_path.hpp) diff --git a/test/rocprim/internal/test_internal_merge_path.cpp b/test/rocprim/internal/test_internal_merge_path.cpp new file mode 100644 index 000000000..9b0873444 --- /dev/null +++ b/test/rocprim/internal/test_internal_merge_path.cpp @@ -0,0 +1,64 @@ + +#include "../../common_test_header.hpp" +#include "../test_utils_assertions.hpp" +#include "../test_utils_data_generation.hpp" + +#include +#include + +template +__global__ +void merge_kernel(T* shared, rocprim::detail::range_t<> range, Op compare_function) +{ + T outputs[IPT]; + + rocprim::detail::serial_merge(shared, outputs, range, compare_function); + + rocprim::block_store_direct_blocked(0, shared, outputs, range.end2); +} + +template +void serial_merge(std::vector& input, + std::vector& output, + unsigned int mid, + OpT compare_function) +{ + static_assert(IPT >= N, "Kernel must be launched such that all items can be processed!"); + + size_t num_bytes = sizeof(T) * N; + T* device_data; + + HIP_CHECK(hipMalloc(&device_data, num_bytes)); + HIP_CHECK(hipMemcpy(device_data, input.data(), num_bytes, hipMemcpyHostToDevice)); + + merge_kernel + <<<1, 1>>>(device_data, rocprim::detail::range_t<>{0, mid, mid, N}, compare_function); + HIP_CHECK(hipGetLastError()); + + HIP_CHECK(hipMemcpy(output.data(), device_data, num_bytes, hipMemcpyDeviceToHost)); +} + +TEST(RocprimInternalMergePathTests, Basic) +{ + using T = int; + using OpT = rocprim::less; + + constexpr int n = 512; + constexpr int m = n / 3; + constexpr int ipt = 2 * n; + + std::vector x = test_utils::get_random_data(n, + std::numeric_limits::min(), + std::numeric_limits::max(), + 0); + std::vector y(n); + + std::sort(x.begin(), x.begin() + m); + std::sort(x.begin() + m, x.end()); + + serial_merge(x, y, m, OpT{}); + + std::sort(x.begin(), x.end()); + + test_utils::assert_eq(x, y); +} From 483386de2006589a47af1f4d5dc384eb397e8561 Mon Sep 17 00:00:00 2001 From: amd-garydeng Date: Thu, 5 Sep 2024 15:19:58 -0600 Subject: [PATCH 03/13] Add gfx1151 target (#601) (#603) Co-authored-by: Stanley Tsang --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e1f75e8e..cba75281c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,7 +94,7 @@ if(NOT USE_HIP_CPU) ) else() rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS - TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" + TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201" ) endif() From 95f09a24cf6ecef99faa29773702dd29506c20d3 Mon Sep 17 00:00:00 2001 From: Stanley Tsang Date: Fri, 11 Oct 2024 10:43:23 -0600 Subject: [PATCH 04/13] Merge back 6.2 hotfixes (#607) (#620) * Update dependency names for static builds (#557) This also removes the line setting `BUILD_SHARED_LIBS` to `ON`, which was previously required to get the correctly named packages when not specifically compiling for a static build. Updates to the ROCmCMakeBuildTools (rocm-cmake) should mean this is no longer necessary. * Fix BUILD_SHARED_LIBS for packaging (#558) * Fix the dependencies of the static packages (#563) * cmake: don't set CMAKE_C_COMPILER, as rocPRIM is a CXX project (#567) * add developer guidelines (#555) (#574) * Update Read the Docs config to Python 3.10 and latest rocm-docs-core (#564) (#579) * Cherry-pick: Optimize block_reduce_warp_reduce when block size is the same as warp size (#599) * Optimize block_reduce_warp_reduce when block size == warp size * Make conditional constexpr * Fix conflict in concepts.rst --------- Co-authored-by: Lauren Wrubleski Co-authored-by: Steve Leung Co-authored-by: randyh62 <42045079+randyh62@users.noreply.github.com> Co-authored-by: Nol Moonen Co-authored-by: Sam Wu <22262939+samjwu@users.noreply.github.com> --- CHANGELOG.md | 7 +- CMakeLists.txt | 15 ++++- docs/sphinx/requirements.txt | 10 +-- .../block/detail/block_reduce_warp_reduce.hpp | 66 +++++++++++-------- 4 files changed, 61 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 619dbe2a3..59f45f150 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,12 @@ Documentation for rocPRIM is available at * `rocprim::thread_load` and `rocprim::thread_store`, use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations. -## Unreleased rocPRIM-3.2.0 for ROCm 6.2.0 +## rocPRIM-3.2.1 for ROCm 6.2.1 + +### Optimizations +* Improved performance of block_reduce_warp_reduce when warp size == block size. + +## rocPRIM-3.2.0 for ROCm 6.2.0 ### Additions diff --git a/CMakeLists.txt b/CMakeLists.txt index cba75281c..a5b9b1274 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,11 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) +if(DEFINED BUILD_SHARED_LIBS) + set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS}) +else() + set(PKG_BUILD_SHARED_LIBS ON) +endif() set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared if(NOT USE_HIP_CPU) # Get dependencies (required here to get rocm-cmake) @@ -172,16 +177,22 @@ if(BUILD_DOCS AND NOT ONLY_INSTALL) add_subdirectory(docs) endif() +# set BUILD_SHARED_LIBS for packaging +set(BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS}) # Package if (ROCPRIM_PROJECT_IS_TOP_LEVEL) - set(BUILD_SHARED_LIBS ON) # Build as though shared library for naming + # add dependency on HIP runtime + set(HIP_RUNTIME_MINIMUM 4.5.0) if(BUILD_ADDRESS_SANITIZER) set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" ) else() set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" ) endif() - rocm_package_add_dependencies(DEPENDS "${DEPENDS_HIP_RUNTIME} >= 4.5.0") + rocm_package_add_dependencies(SHARED_DEPENDS "${DEPENDS_HIP_RUNTIME} >= ${HIP_RUNTIME_MINIMUM}") + rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}") + rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}") + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") set(CPACK_RPM_PACKAGE_LICENSE "MIT") diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt index 26260c116..f3f68ddfe 100644 --- a/docs/sphinx/requirements.txt +++ b/docs/sphinx/requirements.txt @@ -36,7 +36,7 @@ docutils==0.21.2 # myst-parser # pydata-sphinx-theme # sphinx -fastjsonschema==2.19.1 +fastjsonschema==2.20.0 # via rocm-docs-core gitdb==4.0.11 # via gitpython @@ -62,13 +62,13 @@ mdurl==0.1.2 # via markdown-it-py myst-parser==3.0.1 # via rocm-docs-core -packaging==24.0 +packaging==24.1 # via # pydata-sphinx-theme # sphinx pycparser==2.22 # via cffi -pydata-sphinx-theme==0.15.3 +pydata-sphinx-theme==0.15.4 # via # rocm-docs-core # sphinx-book-theme @@ -111,7 +111,7 @@ sphinx==7.3.7 # sphinx-design # sphinx-external-toc # sphinx-notfound-page -sphinx-book-theme==1.1.2 +sphinx-book-theme==1.1.3 # via rocm-docs-core sphinx-copybutton==0.5.2 # via rocm-docs-core @@ -135,7 +135,7 @@ sphinxcontrib-serializinghtml==1.1.10 # via sphinx tomli==2.0.1 # via sphinx -typing-extensions==4.12.0 +typing-extensions==4.12.2 # via # pydata-sphinx-theme # pygithub diff --git a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp index 11bf18cdb..2ffc7437d 100644 --- a/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp +++ b/rocprim/include/rocprim/block/detail/block_reduce_warp_reduce.hpp @@ -180,21 +180,25 @@ class block_reduce_warp_reduce input, output, num_valid, reduce_op ); - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + // Final reduction across warps is only required if there is more than 1 warp + if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1) { - storage_.warp_partials[warp_id] = output; - } - ::rocprim::syncthreads(); - - if(flat_tid < warps_no_) - { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; - - warp_reduce( - warp_partial, output, warps_no_, reduce_op - ); + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + warp_reduce( + warp_partial, output, warps_no_, reduce_op + ); + } } } @@ -246,22 +250,26 @@ class block_reduce_warp_reduce input, output, num_valid, reduce_op ); - // i-th warp will have its partial stored in storage_.warp_partials[i-1] - if(lane_id == 0) + // Final reduction across warps is only required if there is more than 1 warp + if ROCPRIM_IF_CONSTEXPR (warps_no_ > 1) { - storage_.warp_partials[warp_id] = output; - } - ::rocprim::syncthreads(); - - if(flat_tid < warps_no_) - { - // Use warp partial to calculate the final reduce results for every thread - auto warp_partial = storage_.warp_partials[lane_id]; - - unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; - warp_reduce_output_type().reduce( - warp_partial, output, valid_warps_no, reduce_op - ); + // i-th warp will have its partial stored in storage_.warp_partials[i-1] + if(lane_id == 0) + { + storage_.warp_partials[warp_id] = output; + } + ::rocprim::syncthreads(); + + if(flat_tid < warps_no_) + { + // Use warp partial to calculate the final reduce results for every thread + auto warp_partial = storage_.warp_partials[lane_id]; + + unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_; + warp_reduce_output_type().reduce( + warp_partial, output, valid_warps_no, reduce_op + ); + } } } }; From b9a2da94b7cee6488bff5f1a4ef17b231df64f56 Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Fri, 11 Oct 2024 12:35:20 -0600 Subject: [PATCH 05/13] Changed precondition for edge case in serial_merge to prevent assertion error (#622) * added std::min to ensure no out of bound acess * fixed typo keys->keys1 * updated changelog * reverted std::min * implemented suggested logic --- CHANGELOG.md | 1 + rocprim/include/rocprim/detail/merge_path.hpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59f45f150..f65a20eb9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -37,6 +37,7 @@ Documentation for rocPRIM is available at * device merge, * device partial sort, and/or * device sort (merge sort). +* Fixed an issue where on certain inputs to block_sort_merge, device_merge_sort_merge_path, device_merge, and warp_sort_stable would cause an assertion error during its call to serial_merge ### Deprecations diff --git a/rocprim/include/rocprim/detail/merge_path.hpp b/rocprim/include/rocprim/detail/merge_path.hpp index 8cdd51d33..eb61fb627 100644 --- a/rocprim/include/rocprim/detail/merge_path.hpp +++ b/rocprim/include/rocprim/detail/merge_path.hpp @@ -100,8 +100,8 @@ void serial_merge(KeyType* keys_shared, OutputFunction output_function) { // Pre condition, we're including some edge cases too. - assert(range.begin1 <= range.end1); - assert(range.begin2 <= range.end2); + if (!AllowUnsafe && range.begin1 > range.end1 && range.begin2 > range.end2) + return; // don't do anything, we have invalid inputs // More descriptive names for ranges: auto idx_a = range.begin1; From a8e5c791b233def1a651af3d1b1f994bcf1f8ab1 Mon Sep 17 00:00:00 2001 From: spolifroni-amd Date: Fri, 11 Oct 2024 18:36:24 -0400 Subject: [PATCH 06/13] edited to conform to standards (#618) --- CHANGELOG.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f65a20eb9..19ffc7c2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,8 @@ Documentation for rocPRIM is available at ## Unreleased rocPRIM-3.3.0 for ROCm 6.3.0 -### Additions +### Added + * Add --test smoke option in rtest.py. It will run a subset of tests such that the total test time is in 5 minutes. Use python3 ./rtest.py --test smoke or python3 ./rtest.py -t smoke to execute smoke test. * Option `--seed` to benchmarks to specify a seed for the generation of random inputs. The default behavior is to keep using a random seed per benchmark measurement. * Added configuration autotuning to device partition (`rocprim::partition`, `rocprim::partition_two_way`, and `rocprim::partition_three_way`), device select (`rocprim::select`, `rocprim::unique`, and `rocprim::unique_by_key`), and device reduce by key (`rocprim::reduce_by_key`) for improved performance on selected architectures. @@ -15,12 +16,13 @@ Documentation for rocPRIM is available at * Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance. * Added a parallel `partial_sort` and `partial_sort_copy` device function similar to `std::partial_sort` and `std::partial_sort_copy`, these functions rearranges elements such that the elements are the same as a sorted list up to and including the middle index. -### Changes +### Changed * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. -### Fixes +### Resolved issues + * Fixed an issue in rtest.py where if the build folder was made without release or debug directory it would crash the program * Fixed an issue where while running rtest.py on windows and passing in an absolute path to `--install_dir` causes a `FileNotFound` error. * rocPRIM functions are no longer forcefully inlined on Windows, significantly reducing the build @@ -39,9 +41,9 @@ Documentation for rocPRIM is available at * device sort (merge sort). * Fixed an issue where on certain inputs to block_sort_merge, device_merge_sort_merge_path, device_merge, and warp_sort_stable would cause an assertion error during its call to serial_merge -### Deprecations +### Upcoming changes -* `rocprim::thread_load` and `rocprim::thread_store`, use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations. +* `rocprim::thread_load` and `rocprim::thread_store` are deprecated. Use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations. ## rocPRIM-3.2.1 for ROCm 6.2.1 From c8744fa9fffedca18c3e915ee3882fcbfb6d8a3b Mon Sep 17 00:00:00 2001 From: Di Nguyen Date: Thu, 17 Oct 2024 15:10:38 -0600 Subject: [PATCH 07/13] Memory leak fix for multiple rocPRIM unit tests (#614) * fixed mem leak in test_config_dispatch.cpp * added missing hip free for method==4 in test_block_scan.kernels * added graphHelpeer class that does not cause memory leak due to using hipGraphCreate * replaced old hipGraph helpers with new class in device_bin_search * changed HIP_CHECK_NON_VOID to HIP_CHECK * fixed mem leak in device_bin_search * added additional functions * changed out old calls to hipGraphCrete to new GraphHelper class * added missing stream sync for hipgrag_algs * n * added missing hipFree and HIP_CHECK for lookback_reproducibility * added missing hipFree in test_discard_iterator * fixed test failures * removed extra hipFree * removed unused variables * updated change log * removed redundant function --------- Co-authored-by: Your Name Co-authored-by: root --- CHANGELOG.md | 1 + test/hipgraph/test_hipgraph_algs.cpp | 14 +-- test/rocprim/test_block_scan.kernels.hpp | 2 + test/rocprim/test_config_dispatch.cpp | 2 + .../test_device_adjacent_difference.cpp | 18 ++- test/rocprim/test_device_binary_search.cpp | 27 ++-- test/rocprim/test_device_histogram.cpp | 36 +++--- test/rocprim/test_device_merge.cpp | 27 ++-- test/rocprim/test_device_merge_sort.cpp | 34 +++--- test/rocprim/test_device_nth_element.cpp | 9 +- test/rocprim/test_device_partial_sort.cpp | 18 ++- test/rocprim/test_device_partition.cpp | 79 ++++++------ test/rocprim/test_device_radix_sort.hpp | 54 ++++---- test/rocprim/test_device_reduce.cpp | 60 ++++----- test/rocprim/test_device_reduce_by_key.cpp | 30 ++--- test/rocprim/test_device_scan.cpp | 90 +++++++------- test/rocprim/test_device_segmented_reduce.cpp | 20 +-- test/rocprim/test_device_segmented_scan.cpp | 40 +++--- test/rocprim/test_device_select.cpp | 81 ++++++------ test/rocprim/test_device_transform.cpp | 30 ++--- test/rocprim/test_discard_iterator.cpp | 1 + .../rocprim/test_lookback_reproducibility.cpp | 23 ++-- test/rocprim/test_utils_hipgraphs.hpp | 115 +++++++++--------- test/rocprim/test_warp_scan.hpp | 1 + 24 files changed, 402 insertions(+), 410 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 19ffc7c2b..f4adc45a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,6 +39,7 @@ Documentation for rocPRIM is available at * device merge, * device partial sort, and/or * device sort (merge sort). +* Fixed memory leaks in unit tests that were due to missing hipFree calls and incorrect use of hipGraphs * Fixed an issue where on certain inputs to block_sort_merge, device_merge_sort_merge_path, device_merge, and warp_sort_stable would cause an assertion error during its call to serial_merge ### Upcoming changes diff --git a/test/hipgraph/test_hipgraph_algs.cpp b/test/hipgraph/test_hipgraph_algs.cpp index ae98add4f..125ab6c44 100644 --- a/test/hipgraph/test_hipgraph_algs.cpp +++ b/test/hipgraph/test_hipgraph_algs.cpp @@ -149,7 +149,8 @@ TEST(TestHipGraphAlgs, SortAndSearch) HIP_CHECK(hipDeviceSynchronize()); // Begin graph capture - hipGraph_t graph = test_utils::createGraphHelper(stream); + test_utils::GraphHelper gHelper; + gHelper.startStreamCapture(stream); // Launch merge_sort HIP_CHECK( @@ -180,8 +181,8 @@ TEST(TestHipGraphAlgs, SortAndSearch) ); // End graph capture, but do not execute the graph yet. - hipGraphExec_t graph_instance; - graph_instance = test_utils::endCaptureGraphHelper(graph, stream); + gHelper.endStreamCapture(stream); + gHelper.createGraph(); std::vector sort_input; std::vector search_needles(search_needle_size); @@ -203,7 +204,7 @@ TEST(TestHipGraphAlgs, SortAndSearch) HIP_CHECK(hipMemcpy(d_search_needles, search_needles.data(), search_needle_size * sizeof(key_type), hipMemcpyHostToDevice)); // Launch the graph - test_utils::launchGraphHelper(graph_instance, stream, true); + gHelper.launchGraph(stream, true); // Copy output back to host HIP_CHECK(hipMemcpy(device_output.data(), d_search_output, search_needle_size * sizeof(key_type), hipMemcpyDeviceToHost)); @@ -218,9 +219,8 @@ TEST(TestHipGraphAlgs, SortAndSearch) HIP_CHECK(hipFree(d_search_output)); HIP_CHECK(hipFree(d_search_needles)); HIP_CHECK(hipFree(d_temp_storage)); - - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } - + \ No newline at end of file diff --git a/test/rocprim/test_block_scan.kernels.hpp b/test/rocprim/test_block_scan.kernels.hpp index 70f07ff85..acde7be9b 100644 --- a/test/rocprim/test_block_scan.kernels.hpp +++ b/test/rocprim/test_block_scan.kernels.hpp @@ -989,6 +989,8 @@ auto test_block_scan_input_arrays() // Validating results ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output_reductions, expected_reductions)); + HIP_CHECK(hipFree(device_output)); + HIP_CHECK(hipFree(device_output_reductions)); } } diff --git a/test/rocprim/test_config_dispatch.cpp b/test/rocprim/test_config_dispatch.cpp index b5dd0a201..62ebda06b 100644 --- a/test/rocprim/test_config_dispatch.cpp +++ b/test/rocprim/test_config_dispatch.cpp @@ -60,6 +60,8 @@ TEST(RocprimConfigDispatchTests, HostMatchesDevice) ASSERT_NE(host_arch, target_arch::invalid); ASSERT_EQ(host_arch, device_arch); + + HIP_CHECK(hipFree(device_arch_ptr)) } TEST(RocprimConfigDispatchTests, ParseCommonArches) diff --git a/test/rocprim/test_device_adjacent_difference.cpp b/test/rocprim/test_device_adjacent_difference.cpp index a2b7c6f82..00b82256f 100644 --- a/test/rocprim/test_device_adjacent_difference.cpp +++ b/test/rocprim/test_device_adjacent_difference.cpp @@ -369,8 +369,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size)); - hipGraph_t graph; - hipGraphExec_t graph_instance; + test_utils::GraphHelper gHelper; // We might call the API multiple times, with almost the same parameter // (in-place and out-of-place) @@ -380,7 +379,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference) { if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -398,7 +397,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference) if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } // input_type for in-place, output_type for out of place @@ -425,7 +424,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceTests, AdjacentDifference) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } }; @@ -676,10 +675,10 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceLargeTests, LargeIndices) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Capture the memset in the graph so that relaunching will have expected result @@ -698,10 +697,9 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceLargeTests, LargeIndices) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } // Copy output to host @@ -722,7 +720,7 @@ TYPED_TEST(RocprimDeviceAdjacentDifferenceLargeTests, LargeIndices) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } diff --git a/test/rocprim/test_device_binary_search.cpp b/test/rocprim/test_device_binary_search.cpp index dfe36a607..1a5f8b362 100644 --- a/test/rocprim/test_device_binary_search.cpp +++ b/test/rocprim/test_device_binary_search.cpp @@ -174,10 +174,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::lower_bound(d_temporary_storage, @@ -191,10 +191,9 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(needles_size); @@ -213,7 +212,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, LowerBound) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -320,10 +319,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::upper_bound(d_temporary_storage, @@ -337,10 +336,9 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(needles_size); @@ -359,7 +357,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, UpperBound) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); @@ -467,10 +465,10 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::binary_search(d_temporary_storage, @@ -484,10 +482,9 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(needles_size); @@ -506,7 +503,7 @@ TYPED_TEST(RocprimDeviceBinarySearch, BinarySearch) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); diff --git a/test/rocprim/test_device_histogram.cpp b/test/rocprim/test_device_histogram.cpp index 3cb086a6d..ffa46216a 100644 --- a/test/rocprim/test_device_histogram.cpp +++ b/test/rocprim/test_device_histogram.cpp @@ -307,10 +307,10 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } if(rows == 1) @@ -338,10 +338,9 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) ); } - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector histogram(bins); @@ -364,7 +363,7 @@ TYPED_TEST(RocprimDeviceHistogramEven, Even) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -585,10 +584,10 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } if(rows == 1) @@ -618,10 +617,9 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) debug_synchronous)); } - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector histogram(bins); @@ -645,7 +643,7 @@ TYPED_TEST(RocprimDeviceHistogramRange, Range) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -873,10 +871,10 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } if(rows == 1) @@ -910,10 +908,9 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) debug_synchronous))); } - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector histogram[active_channels]; @@ -945,7 +942,7 @@ TYPED_TEST(RocprimDeviceHistogramMultiEven, MultiEven) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1189,10 +1186,10 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } if(rows == 1) @@ -1220,10 +1217,9 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) )); } - hipGraphExec_t graph_instance; if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector histogram[active_channels]; @@ -1246,7 +1242,7 @@ TYPED_TEST(RocprimDeviceHistogramMultiRange, MultiRange) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } for(unsigned int channel = 0; channel < active_channels; channel++) diff --git a/test/rocprim/test_device_merge.cpp b/test/rocprim/test_device_merge.cpp index 1369d0b98..d028dcf14 100644 --- a/test/rocprim/test_device_merge.cpp +++ b/test/rocprim/test_device_merge.cpp @@ -212,10 +212,10 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) // allocate temporary storage HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -229,10 +229,9 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) ) ); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -258,7 +257,7 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKey) hipFree(d_temp_storage); if (TestFixture::use_graphs) - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } @@ -418,10 +417,10 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) // allocate temporary storage HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -437,10 +436,9 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) ) ); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -483,7 +481,7 @@ TYPED_TEST(RocprimDeviceMergeTests, MergeKeyValue) hipFree(d_temp_storage); if (TestFixture::use_graphs) - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } @@ -554,10 +552,10 @@ void testMergeMismatchedIteratorTypes() void* d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::merge(d_temp_storage, @@ -571,10 +569,9 @@ void testMergeMismatchedIteratorTypes() hipStreamDefault, debug_synchronous)); - hipGraphExec_t graph_instance; if(UseGraphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector keys_output(expected_keys_output.size()); @@ -591,7 +588,7 @@ void testMergeMismatchedIteratorTypes() if (UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_merge_sort.cpp b/test/rocprim/test_device_merge_sort.cpp index 4a1288230..8f424360a 100644 --- a/test/rocprim/test_device_merge_sort.cpp +++ b/test/rocprim/test_device_merge_sort.cpp @@ -165,10 +165,10 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream);; } // Run @@ -180,10 +180,9 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) ) ); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -202,16 +201,16 @@ TYPED_TEST(RocprimDeviceSortTests, SortKey) // Check if output values are as expected ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(output, expected)); - hipFree(d_input); + HIP_CHECK(hipFree(d_input)); if(!in_place) { - hipFree(d_output); + HIP_CHECK(hipFree(d_output)); } - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -335,10 +334,10 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream);; } // Run @@ -351,10 +350,9 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) ) ); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -389,18 +387,18 @@ TYPED_TEST(RocprimDeviceSortTests, SortKeyValue) ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(keys_output, expected_key)); ASSERT_NO_FATAL_FAILURE(test_utils::assert_eq(values_output, expected_value)); - hipFree(d_keys_input); - hipFree(d_values_input); + HIP_CHECK(hipFree(d_keys_input)); + HIP_CHECK(hipFree(d_values_input)); if(!in_place) { - hipFree(d_keys_output); - hipFree(d_values_output); + HIP_CHECK(hipFree(d_keys_output)); + HIP_CHECK(hipFree(d_values_output)); } - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_nth_element.cpp b/test/rocprim/test_device_nth_element.cpp index 0e6519941..9e74abd3d 100644 --- a/test/rocprim/test_device_nth_element.cpp +++ b/test/rocprim/test_device_nth_element.cpp @@ -292,10 +292,10 @@ TYPED_TEST(RocprimDeviceNthelementTests, NthelementKey) // allocate temporary storage HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } if(in_place) @@ -324,10 +324,9 @@ TYPED_TEST(RocprimDeviceNthelementTests, NthelementKey) debug_synchronous)); } - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -353,7 +352,7 @@ TYPED_TEST(RocprimDeviceNthelementTests, NthelementKey) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_partial_sort.cpp b/test/rocprim/test_device_partial_sort.cpp index 077d1fc59..f5a873c40 100644 --- a/test/rocprim/test_device_partial_sort.cpp +++ b/test/rocprim/test_device_partial_sort.cpp @@ -253,10 +253,10 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSort) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::partial_sort(d_temp_storage, temp_storage_size_bytes, @@ -269,10 +269,9 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSort) HIP_CHECK(hipGetLastError()); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(size); @@ -288,7 +287,7 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSort) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -474,10 +473,10 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSortCopy) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::partial_sort_copy(d_temp_storage, @@ -492,10 +491,9 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSortCopy) HIP_CHECK(hipGetLastError()); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(size); @@ -512,7 +510,7 @@ TYPED_TEST(RocprimDevicePartialSortTests, PartialSortCopy) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_partition.cpp b/test/rocprim/test_device_partition.cpp index 0468de782..902cd4ccb 100644 --- a/test/rocprim/test_device_partition.cpp +++ b/test/rocprim/test_device_partition.cpp @@ -170,10 +170,10 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) void* d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -188,10 +188,9 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } // Check if number of selected value is as expected_selected @@ -226,7 +225,7 @@ TYPED_TEST(RocprimDevicePartitionTests, Flagged) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -293,10 +292,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateEmptyInput) void* d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -310,10 +309,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateEmptyInput) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -335,7 +334,7 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateEmptyInput) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -425,10 +424,10 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) void* d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -443,10 +442,10 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -482,7 +481,7 @@ TYPED_TEST(RocprimDevicePartitionTests, Predicate) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -585,10 +584,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateTwoWay) void* d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -604,10 +603,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateTwoWay) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -645,7 +644,7 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateTwoWay) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -779,10 +778,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateThreeWay) void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -801,10 +800,10 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateThreeWay) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -858,7 +857,7 @@ TYPED_TEST(RocprimDevicePartitionTests, PredicateThreeWay) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1167,10 +1166,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartition) ASSERT_NE(0, temporary_storage_size); HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_size)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::partition(d_temporary_storage, @@ -1183,10 +1182,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartition) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } size_t count_output{}; @@ -1214,7 +1213,7 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartition) if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } @@ -1288,10 +1287,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionTwoWay) ASSERT_NE(0, temporary_storage_size); HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_size)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::partition_two_way(d_temporary_storage, @@ -1305,10 +1304,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionTwoWay) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } size_t count_output{}; @@ -1344,7 +1343,7 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionTwoWay) if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } @@ -1417,10 +1416,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay) ASSERT_NE(0, temporary_storage_size); HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_size)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::partition_three_way(d_temporary_storage, @@ -1436,10 +1435,10 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } size_t count_output[2]{}; @@ -1471,7 +1470,7 @@ TEST_P(RocprimDevicePartitionLargeInputTests, LargeInputPartitionThreeWay) if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } diff --git a/test/rocprim/test_device_radix_sort.hpp b/test/rocprim/test_device_radix_sort.hpp index cf6a80ef8..93f7a8d09 100644 --- a/test/rocprim/test_device_radix_sort.hpp +++ b/test/rocprim/test_device_radix_sort.hpp @@ -317,10 +317,10 @@ void sort_keys() HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK((invoke_sort_keys(d_temporary_storage, @@ -333,10 +333,10 @@ void sort_keys() stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } auto keys_output = std::make_unique(size); @@ -354,7 +354,7 @@ void sort_keys() if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_bit_eq(keys_output.get(), @@ -632,11 +632,11 @@ void sort_pairs() 4>, 1024 * 512>; - hipGraph_t graph; - hipGraphExec_t graph_instance; + test_utils::GraphHelper gHelper;; + if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } void* d_temporary_storage = nullptr; @@ -655,7 +655,7 @@ void sort_pairs() if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } ASSERT_GT(temporary_storage_bytes, 0); @@ -665,7 +665,7 @@ void sort_pairs() if(TestFixture::params::use_graphs) { - test_utils::resetGraphHelper(graph, graph_instance, stream); + gHelper.resetGraphHelper(stream); } HIP_CHECK((invoke_sort_pairs(d_temporary_storage, @@ -682,7 +682,7 @@ void sort_pairs() if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } auto keys_output = std::make_unique(size); @@ -708,7 +708,7 @@ void sort_pairs() if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_bit_eq(keys_output.get(), @@ -923,10 +923,10 @@ void sort_keys_double_buffer() HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK( @@ -939,10 +939,10 @@ void sort_keys_double_buffer() stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -958,7 +958,7 @@ void sort_keys_double_buffer() if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_bit_eq(keys_output.get(), @@ -1210,10 +1210,10 @@ void sort_pairs_double_buffer() HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK( @@ -1227,10 +1227,10 @@ void sort_pairs_double_buffer() stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -1254,7 +1254,7 @@ void sort_pairs_double_buffer() if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_NO_FATAL_FAILURE(test_utils::assert_bit_eq(keys_output.get(), @@ -1342,10 +1342,10 @@ void sort_keys_over_4g() void* d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::radix_sort_keys(d_temporary_storage, @@ -1358,10 +1358,10 @@ void sort_keys_over_4g() stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } std::vector output(keys_input.size()); @@ -1386,7 +1386,7 @@ void sort_keys_over_4g() if (UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_reduce.cpp b/test/rocprim/test_device_reduce.cpp index b4ef724e9..589dfc06a 100644 --- a/test/rocprim/test_device_reduce.cpp +++ b/test/rocprim/test_device_reduce.cpp @@ -187,10 +187,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceEmptyInput) void * d_temp_storage = nullptr; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -204,10 +204,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceEmptyInput) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -227,7 +227,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceEmptyInput) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -315,10 +315,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceSum) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -331,10 +331,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceSum) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -360,7 +360,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceSum) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -467,10 +467,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -483,10 +483,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -512,7 +512,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceArgMinimum) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -569,10 +569,10 @@ void testLargeIndices() HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -585,10 +585,10 @@ void testLargeIndices() stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -610,7 +610,7 @@ void testLargeIndices() if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -711,10 +711,10 @@ TYPED_TEST(RocprimDeviceReducePrecisionTests, ReduceSumInputEqualExponentFunctio HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -728,10 +728,10 @@ TYPED_TEST(RocprimDeviceReducePrecisionTests, ReduceSumInputEqualExponentFunctio stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -751,7 +751,7 @@ TYPED_TEST(RocprimDeviceReducePrecisionTests, ReduceSumInputEqualExponentFunctio if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -833,10 +833,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -858,10 +858,10 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -880,7 +880,7 @@ TYPED_TEST(RocprimDeviceReduceTests, ReduceMinimum) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_reduce_by_key.cpp b/test/rocprim/test_device_reduce_by_key.cpp index 20dffcfb5..c7ff2936e 100644 --- a/test/rocprim/test_device_reduce_by_key.cpp +++ b/test/rocprim/test_device_reduce_by_key.cpp @@ -296,10 +296,10 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK((invoke_reduce_by_key(d_temporary_storage, @@ -315,10 +315,10 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -356,7 +356,7 @@ TYPED_TEST(RocprimDeviceReduceByKey, ReduceByKey) if (TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } @@ -443,10 +443,10 @@ void large_indices_reduce_by_key() HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(invoke_reduce_by_key(d_temporary_storage, @@ -462,10 +462,10 @@ void large_indices_reduce_by_key() stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -492,7 +492,7 @@ void large_indices_reduce_by_key() if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } ASSERT_EQ(unique_count_output[0], unique_count_expected); @@ -598,10 +598,10 @@ void large_segment_count_reduce_by_key() HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(invoke_reduce_by_key(d_temporary_storage, @@ -617,10 +617,10 @@ void large_segment_count_reduce_by_key() stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -636,7 +636,7 @@ void large_segment_count_reduce_by_key() ASSERT_EQ(unique_count_output, unique_count_expected); if (use_graphs) - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } if (use_graphs) diff --git a/test/rocprim/test_device_scan.cpp b/test/rocprim/test_device_scan.cpp index c86138995..b62746a48 100644 --- a/test/rocprim/test_device_scan.cpp +++ b/test/rocprim/test_device_scan.cpp @@ -290,10 +290,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) // allocate temporary storage HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -306,10 +306,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -322,7 +322,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanEmptyInput) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -431,10 +431,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -448,10 +448,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) stream, TestFixture::debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -477,7 +477,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScan) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -593,10 +593,10 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -611,10 +611,10 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -640,7 +640,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScan) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -771,10 +771,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -789,10 +789,10 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -819,7 +819,7 @@ TYPED_TEST(RocprimDeviceScanTests, InclusiveScanByKey) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -954,10 +954,10 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -973,10 +973,10 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) stream, debug_synchronous))); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -1003,7 +1003,7 @@ TYPED_TEST(RocprimDeviceScanTests, ExclusiveScanByKey) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -1139,10 +1139,10 @@ void testLargeIndicesInclusiveScan() HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -1155,10 +1155,10 @@ void testLargeIndicesInclusiveScan() ) ); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -1182,7 +1182,7 @@ void testLargeIndicesInclusiveScan() if(UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1268,10 +1268,10 @@ void testLargeIndicesExclusiveScan() HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -1285,10 +1285,10 @@ void testLargeIndicesExclusiveScan() ) ); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -1315,7 +1315,7 @@ void testLargeIndicesExclusiveScan() if(UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1519,10 +1519,10 @@ void large_indices_scan_by_key_test(ScanByKeyFun scan_by_key_fun) ASSERT_GT(temp_storage_size_bytes, 0); HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(scan_by_key_fun(d_temp_storage, @@ -1536,10 +1536,10 @@ void large_indices_scan_by_key_test(ScanByKeyFun scan_by_key_fun) debug_synchronous, seed_value)); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -1557,7 +1557,7 @@ void large_indices_scan_by_key_test(ScanByKeyFun scan_by_key_fun) if (UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -1781,10 +1781,10 @@ TYPED_TEST(RocprimDeviceScanFutureTests, ExclusiveScan) &d_temp_storage, temp_storage_size_bytes + temp_storage_reduce)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Fill initial value on the device @@ -1809,10 +1809,10 @@ TYPED_TEST(RocprimDeviceScanFutureTests, ExclusiveScan) debug_synchronous))); HIP_CHECK(hipGetLastError()); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } // Copy output to host @@ -1831,7 +1831,7 @@ TYPED_TEST(RocprimDeviceScanFutureTests, ExclusiveScan) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } diff --git a/test/rocprim/test_device_segmented_reduce.cpp b/test/rocprim/test_device_segmented_reduce.cpp index fee4de509..cf00db384 100644 --- a/test/rocprim/test_device_segmented_reduce.cpp +++ b/test/rocprim/test_device_segmented_reduce.cpp @@ -268,10 +268,10 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::segmented_reduce( @@ -287,10 +287,10 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipFree(d_temporary_storage)); @@ -307,7 +307,7 @@ TYPED_TEST(RocprimDeviceSegmentedReduce, Reduce) if (TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } SCOPED_TRACE(testing::Message() << "with seed = " << seed); @@ -413,10 +413,10 @@ void testLargeIndices() HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -432,10 +432,10 @@ void testLargeIndices() stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -458,7 +458,7 @@ void testLargeIndices() if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } diff --git a/test/rocprim/test_device_segmented_scan.cpp b/test/rocprim/test_device_segmented_scan.cpp index 5f79a7287..f19236fd7 100644 --- a/test/rocprim/test_device_segmented_scan.cpp +++ b/test/rocprim/test_device_segmented_scan.cpp @@ -211,10 +211,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK( @@ -229,10 +229,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -257,7 +257,7 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScan) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -398,10 +398,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) void * d_temporary_storage; HIP_CHECK(test_common_utils::hipMallocHelper(&d_temporary_storage, temporary_storage_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK( @@ -416,10 +416,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -444,7 +444,7 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScan) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -587,10 +587,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -603,10 +603,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -631,7 +631,7 @@ TYPED_TEST(RocprimDeviceSegmentedScan, InclusiveScanUsingHeadFlags) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -776,10 +776,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::params::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -791,10 +791,10 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::params::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -811,7 +811,7 @@ TYPED_TEST(RocprimDeviceSegmentedScan, ExclusiveScanUsingHeadFlags) if(TestFixture::params::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } HIP_CHECK(hipDeviceSynchronize()); diff --git a/test/rocprim/test_device_select.cpp b/test/rocprim/test_device_select.cpp index ffda63d63..79f31eef2 100644 --- a/test/rocprim/test_device_select.cpp +++ b/test/rocprim/test_device_select.cpp @@ -168,10 +168,10 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -189,10 +189,10 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -229,7 +229,7 @@ TYPED_TEST(RocprimDeviceSelectTests, Flagged) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -331,10 +331,10 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -352,10 +352,10 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -391,7 +391,7 @@ TYPED_TEST(RocprimDeviceSelectTests, SelectOp) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -509,10 +509,10 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -530,10 +530,10 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -569,7 +569,7 @@ TYPED_TEST(RocprimDeviceSelectTests, Unique) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -717,10 +717,10 @@ void testUniqueGuardedOperator() HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -738,10 +738,10 @@ void testUniqueGuardedOperator() ) ); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -778,7 +778,7 @@ void testUniqueGuardedOperator() if(UseGraphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -969,10 +969,10 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKey) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -992,10 +992,10 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKey) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -1042,7 +1042,7 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKey) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1175,10 +1175,10 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKeyAlias) test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -1195,11 +1195,10 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKeyAlias) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance - = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipDeviceSynchronize()); @@ -1237,7 +1236,7 @@ TYPED_TEST(RocprimDeviceUniqueByKeyTests, UniqueByKeyAlias) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } } @@ -1346,10 +1345,10 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputFlagged) HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -1367,10 +1366,10 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputFlagged) ) ); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipDeviceSynchronize()); @@ -1404,7 +1403,7 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputFlagged) if(use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } @@ -1468,10 +1467,10 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputUnique) ASSERT_GT(temp_storage_size_bytes, 0); HIP_CHECK(test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper;; if(use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } HIP_CHECK(rocprim::unique(d_temp_storage, @@ -1484,10 +1483,10 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputUnique) stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } size_t unique_count_output{}; @@ -1513,7 +1512,7 @@ TEST_P(RocprimDeviceSelectLargeInputTests, LargeInputUnique) HIP_CHECK(hipFree(d_temp_storage)); if (use_graphs) - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } if (use_graphs) diff --git a/test/rocprim/test_device_transform.cpp b/test/rocprim/test_device_transform.cpp index 5d400927d..e9b60aac1 100644 --- a/test/rocprim/test_device_transform.cpp +++ b/test/rocprim/test_device_transform.cpp @@ -158,10 +158,10 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) std::vector expected(input.size()); std::transform(input.begin(), input.end(), expected.begin(), transform()); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -173,10 +173,10 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -201,7 +201,7 @@ TYPED_TEST(RocprimDeviceTransformTests, Transform) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -281,10 +281,10 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) expected.begin(), binary_transform() ); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -296,10 +296,10 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) ) ); - hipGraphExec_t graph_instance; + if(TestFixture::use_graphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -325,7 +325,7 @@ TYPED_TEST(RocprimDeviceTransformTests, BinaryTransform) if (TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } @@ -390,20 +390,20 @@ void testLargeIndices() return 0; }; - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(UseGraphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run HIP_CHECK( rocprim::transform(input, output, size, flag_expected, stream, debug_synchronous)); - hipGraphExec_t graph_instance; + if(UseGraphs) { - graph_instance = graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, false); + gHelper.createAndLaunchGraph(stream, true, false); } HIP_CHECK(hipGetLastError()); @@ -419,7 +419,7 @@ void testLargeIndices() HIP_CHECK(hipFree(d_flag)); if (UseGraphs) - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); } } diff --git a/test/rocprim/test_discard_iterator.cpp b/test/rocprim/test_discard_iterator.cpp index e37861151..65ee91ebe 100644 --- a/test/rocprim/test_discard_iterator.cpp +++ b/test/rocprim/test_discard_iterator.cpp @@ -180,4 +180,5 @@ TEST(RocprimDiscardIteratorTests, ReduceByKey) HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_aggregates_output)); + HIP_CHECK(hipFree(d_temporary_storage)); } diff --git a/test/rocprim/test_lookback_reproducibility.cpp b/test/rocprim/test_lookback_reproducibility.cpp index b16be5c01..2c50818f5 100644 --- a/test/rocprim/test_lookback_reproducibility.cpp +++ b/test/rocprim/test_lookback_reproducibility.cpp @@ -106,6 +106,8 @@ void test_reproducibility(S scan_op, F run_test) auto second = run_test(eepy_scan_op); // We want the result to be bitwise equal, even if the inputs/outputs are floats. ASSERT_NO_FATAL_FAILURE(test_utils::assert_bit_eq(first, second)); + + HIP_CHECK(hipFree(d_enable_sleep)); } template @@ -199,12 +201,12 @@ TYPED_TEST(RocprimLookbackReproducibilityTests, Scan) d_output, output.size() * sizeof(T), hipMemcpyDeviceToHost)); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); return output; }); - hipFree(d_input); - hipFree(d_output); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); } } } @@ -296,12 +298,12 @@ TYPED_TEST(RocprimLookbackReproducibilityTests, ScanByKey) d_output, output.size() * sizeof(V), hipMemcpyDeviceToHost)); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); return output; }); - hipFree(d_input); - hipFree(d_output); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_input)); } } } @@ -409,13 +411,14 @@ TYPED_TEST(RocprimLookbackReproducibilityTests, ReduceByKey) d_output, output.size() * sizeof(V), hipMemcpyDeviceToHost)); - hipFree(d_temp_storage); + HIP_CHECK(hipFree(d_temp_storage)); return output; }); - hipFree(d_input); - hipFree(d_output); - hipFree(d_unique_count_output); + HIP_CHECK(hipFree(d_keys)); + HIP_CHECK(hipFree(d_input)); + HIP_CHECK(hipFree(d_output)); + HIP_CHECK(hipFree(d_unique_count_output)); } } } diff --git a/test/rocprim/test_utils_hipgraphs.hpp b/test/rocprim/test_utils_hipgraphs.hpp index 9d20b0ed7..82df35dc5 100644 --- a/test/rocprim/test_utils_hipgraphs.hpp +++ b/test/rocprim/test_utils_hipgraphs.hpp @@ -18,6 +18,7 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. + #ifndef ROCPRIM_TEST_UTILS_HIPGRAPHS_HPP #define ROCPRIM_TEST_UTILS_HIPGRAPHS_HPP @@ -27,65 +28,65 @@ // Helper functions for testing with hipGraph stream capture. // Note: graphs will not work on the default stream. -namespace test_utils -{ -inline hipGraph_t createGraphHelper(hipStream_t& stream, const bool beginCapture=true) -{ - // Create a new graph - hipGraph_t graph; - HIP_CHECK(hipGraphCreate(&graph, 0)); - - // Optionally begin stream capture - if (beginCapture) - HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); - - return graph; -} - -inline void cleanupGraphHelper(hipGraph_t& graph, hipGraphExec_t& instance) -{ - HIP_CHECK(hipGraphDestroy(graph)); - HIP_CHECK(hipGraphExecDestroy(instance)); -} - -inline void resetGraphHelper(hipGraph_t& graph, hipGraphExec_t& instance, hipStream_t& stream, const bool beginCapture=true) -{ - // Destroy the old graph and instance - cleanupGraphHelper(graph, instance); - - // Create a new graph and optionally begin capture - graph = createGraphHelper(stream, beginCapture); -} - -inline hipGraphExec_t endCaptureGraphHelper(hipGraph_t& graph, hipStream_t& stream, const bool launchGraph=false, const bool sync=false) -{ - // End the capture - HIP_CHECK(hipStreamEndCapture(stream, &graph)); - - // Instantiate the graph - hipGraphExec_t instance; - HIP_CHECK(hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0)); - - // Optionally launch the graph - if (launchGraph) - HIP_CHECK(hipGraphLaunch(instance, stream)); - - // Optionally synchronize the stream when we're done - if (sync) - HIP_CHECK(hipStreamSynchronize(stream)); - - return instance; -} - -inline void launchGraphHelper(hipGraphExec_t& instance, hipStream_t& stream, const bool sync=false) +namespace test_utils { - HIP_CHECK(hipGraphLaunch(instance, stream)); - - // Optionally sync after the launch - if (sync) - HIP_CHECK(hipStreamSynchronize(stream)); -} + class GraphHelper{ + private: + hipGraph_t graph; + hipGraphExec_t graph_instance; + public: + + inline void startStreamCapture(hipStream_t & stream){ + HIP_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); + } + + inline void endStreamCapture(hipStream_t & stream){ + HIP_CHECK(hipStreamEndCapture(stream, &graph)); + } + + inline void createGraph(){ + HIP_CHECK(hipGraphInstantiate(&graph_instance, graph, nullptr, nullptr, 0)); + } + + inline void launchGraph(hipStream_t & stream, const bool sync=false){ + HIP_CHECK(hipGraphLaunch(graph_instance, stream)); + + if (sync) + HIP_CHECK(hipStreamSynchronize(stream)); + } + + inline void createAndLaunchGraph(hipStream_t & stream, const bool launchGraph=true, const bool sync=true){ + + endStreamCapture(stream); + + HIP_CHECK(hipGraphInstantiate(&graph_instance, graph, nullptr, nullptr, 0)); + + // Optionally launch the graph + if (launchGraph) + HIP_CHECK(hipGraphLaunch(graph_instance, stream)); + + // Optionally synchronize the stream when we're done + if (sync) + HIP_CHECK(hipStreamSynchronize(stream)); + } + + inline void cleanupGraphHelper() + { + HIP_CHECK(hipGraphDestroy(this->graph)); + HIP_CHECK(hipGraphExecDestroy(this->graph_instance)); + } + + inline void resetGraphHelper(hipStream_t& stream, const bool beginCapture=true) + { + // Destroy the old graph and instance + cleanupGraphHelper(); + + if(beginCapture) + startStreamCapture(stream); + + } + }; } // end namespace test_utils diff --git a/test/rocprim/test_warp_scan.hpp b/test/rocprim/test_warp_scan.hpp index 69d45506a..c1f9c1e57 100644 --- a/test/rocprim/test_warp_scan.hpp +++ b/test/rocprim/test_warp_scan.hpp @@ -1212,6 +1212,7 @@ typed_test_def(RocprimWarpScanTests, name_suffix, ScanReduce) HIP_CHECK(hipFree(device_input)); HIP_CHECK(hipFree(device_inclusive_output)); HIP_CHECK(hipFree(device_exclusive_output)); + HIP_CHECK(hipFree(device_output_reductions)); } } From 968620eeeeffdafcc2a00fbe0f6710b34aa0e8a7 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 30 Oct 2024 21:11:59 +0000 Subject: [PATCH 08/13] changed to using gHelper --- test/rocprim/test_device_find_first_of.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/rocprim/test_device_find_first_of.cpp b/test/rocprim/test_device_find_first_of.cpp index ca3c7b9ce..dd22ec7ff 100644 --- a/test/rocprim/test_device_find_first_of.cpp +++ b/test/rocprim/test_device_find_first_of.cpp @@ -253,10 +253,10 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) HIP_CHECK( test_common_utils::hipMallocHelper(&d_temp_storage, temp_storage_size_bytes)); - hipGraph_t graph; + test_utils::GraphHelper gHelper; if(TestFixture::use_graphs) { - graph = test_utils::createGraphHelper(stream); + gHelper.startStreamCapture(stream); } // Run @@ -271,10 +271,9 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) stream, debug_synchronous)); - hipGraphExec_t graph_instance; if(TestFixture::use_graphs) { - graph_instance = test_utils::endCaptureGraphHelper(graph, stream, true, true); + gHelper.createAndLaunchGraph(stream); } HIP_CHECK(hipGetLastError()); @@ -302,7 +301,7 @@ TYPED_TEST(RocprimDeviceFindFirstOfTests, FindFirstOf) if(TestFixture::use_graphs) { - test_utils::cleanupGraphHelper(graph, graph_instance); + gHelper.cleanupGraphHelper(); HIP_CHECK(hipStreamDestroy(stream)); } } From 6fa3378e0c936a30ae942bbc0467497acf479758 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 30 Oct 2024 22:47:29 +0000 Subject: [PATCH 09/13] merged 6.3 version of merge_sort_merge_path (failing large indices test) --- .../detail/device_merge_sort_mergepath.hpp | 719 +++++++++--------- 1 file changed, 352 insertions(+), 367 deletions(-) diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp index ec80a2197..6fae39024 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp @@ -33,440 +33,425 @@ #include "../../detail/various.hpp" -#include "device_merge.hpp" #include "device_merge_sort.hpp" +#include "device_merge.hpp" BEGIN_ROCPRIM_NAMESPACE namespace detail { -// Load items from input1 and input2 from global memory -template -ROCPRIM_DEVICE ROCPRIM_INLINE -void gmem_to_reg(KeyT (&output)[ItemsPerThread], - InputIterator input1, - InputIterator input2, - unsigned int count1, - unsigned int count2, - bool is_incomplete_tile) -{ - if(is_incomplete_tile) + // Load items from input1 and input2 from global memory + template + ROCPRIM_DEVICE ROCPRIM_INLINE + void gmem_to_reg(KeyT (&output)[ItemsPerThread], + InputIterator input1, + InputIterator input2, + unsigned int count1, + unsigned int count2, + bool IsLastTile) { - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + if(IsLastTile) { - unsigned int idx = BlockSize * item + threadIdx.x; - if(idx < count1 + count2) + ROCPRIM_UNROLL + for (unsigned int item = 0; item < ItemsPerThread; ++item) + { + unsigned int idx = rocprim::flat_block_size() * item + threadIdx.x; + if (idx < count1 + count2) + { + output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; + } + } + + } + else + { + ROCPRIM_UNROLL + for (unsigned int item = 0; item < ItemsPerThread; ++item) { + unsigned int idx = rocprim::flat_block_size() * item + threadIdx.x; output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; } } } - else + + template + ROCPRIM_DEVICE ROCPRIM_INLINE + void reg_to_shared(OutputIterator output, + KeyT (&input)[ItemsPerThread]) { ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + for (unsigned int item = 0; item < ItemsPerThread; ++item) { unsigned int idx = BlockSize * item + threadIdx.x; - output[item] = (idx < count1) ? input1[idx] : input2[idx - count1]; + output[idx] = input[item]; } } -} -template -ROCPRIM_DEVICE ROCPRIM_INLINE -void reg_to_shared(OutputIterator output, KeyT (&input)[ItemsPerThread]) -{ - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + template::value_type> + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto block_merge_process_tile(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const OffsetT input_size, + const OffsetT sorted_block_size, + const unsigned int num_blocks, + BinaryFunction compare_function, + const OffsetT* merge_partitions) + -> std::enable_if_t<(!std::is_trivially_copyable::value + || rocprim::is_floating_point::value + || std::is_integral::value), + void> { - unsigned int idx = BlockSize * item + threadIdx.x; - output[idx] = input[item]; - } -} - -template::value_type> -ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE -auto block_merge_process_tile(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const OffsetT input_size, - const OffsetT sorted_block_size, - const unsigned int num_blocks, - BinaryFunction compare_function, - const OffsetT* merge_partitions) - -> std::enable_if_t<(!std::is_trivially_copyable::value - || rocprim::is_floating_point::value - || std::is_integral::value), - void> -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - constexpr bool with_values = !std::is_same::value; - constexpr unsigned int items_per_tile = BlockSize * ItemsPerThread; + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + constexpr bool with_values = !std::is_same::value; + constexpr unsigned int items_per_tile = BlockSize * ItemsPerThread; + + using block_store = block_store_impl; + + using keys_storage_ = key_type[items_per_tile + 1]; + using values_storage_ = value_type[items_per_tile + 1]; + + ROCPRIM_SHARED_MEMORY union { + typename block_store::storage_type store; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH + detail::raw_storage keys; + detail::raw_storage values; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP + } storage; + + auto& keys_shared = storage.keys.get(); + auto& values_shared = storage.values.get(); + + const unsigned short flat_id = block_thread_id<0>(); + const unsigned int flat_block_id = ::rocprim::flat_block_id(); + if(flat_block_id >= num_blocks) + { + return; + } + const bool IsIncompleteTile = flat_block_id == (input_size / items_per_tile); - using block_store - = block_store_impl; + const OffsetT partition_beg = merge_partitions[flat_block_id]; + const OffsetT partition_end = merge_partitions[flat_block_id + 1]; - using keys_storage_ = key_type[items_per_tile + 1]; - using values_storage_ = value_type[items_per_tile + 1]; + const unsigned int merged_tiles_number = sorted_block_size / items_per_tile; + const unsigned int target_merged_tiles_number = merged_tiles_number * 2; + const unsigned int mask = target_merged_tiles_number - 1; + const unsigned int tilegroup_start_id = ~mask & flat_block_id; + const OffsetT tilegroup_start = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here + const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; - ROCPRIM_SHARED_MEMORY union - { - typename block_store::storage_type store; - ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH - detail::raw_storage keys; - detail::raw_storage values; - ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP - } storage; - - auto& keys_shared = storage.keys.get(); - auto& values_shared = storage.values.get(); - - const unsigned int flat_block_id = ::rocprim::flat_block_id(); - if(flat_block_id >= num_blocks) - { - return; - } + const OffsetT keys1_beg = partition_beg; + OffsetT keys1_end = partition_end; + const OffsetT keys2_beg = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); + OffsetT keys2_end = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag + items_per_tile - partition_end); - const bool is_incomplete_tile = flat_block_id == (input_size / items_per_tile); + if (mask == (mask & flat_block_id)) // If last tile in the tile-group + { + keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size); + keys2_end = rocprim::min(input_size, tilegroup_start + sorted_block_size * 2); + } - const OffsetT partition_beg = merge_partitions[flat_block_id]; - const OffsetT partition_end = merge_partitions[flat_block_id + 1]; + // Number of keys per tile + const unsigned int num_keys1 = static_cast(keys1_end - keys1_beg); + const unsigned int num_keys2 = static_cast(keys2_end - keys2_beg); + // Load keys1 & keys2 + key_type keys[ItemsPerThread]; + gmem_to_reg(keys, + keys_input + keys1_beg, + keys_input + keys2_beg, + num_keys1, + num_keys2, + IsIncompleteTile); + // Load keys into shared memory + reg_to_shared(keys_shared, keys); + + value_type values[ItemsPerThread]; + if ROCPRIM_IF_CONSTEXPR(with_values){ + gmem_to_reg(values, + values_input + keys1_beg, + values_input + keys2_beg, + num_keys1, + num_keys2, + IsIncompleteTile); + } + rocprim::syncthreads(); - const unsigned int merged_tiles_number = sorted_block_size / items_per_tile; - const unsigned int target_merged_tiles_number = merged_tiles_number * 2; - const unsigned int mask = target_merged_tiles_number - 1; - const unsigned int tilegroup_start_id = ~mask & flat_block_id; + const unsigned int diag0_local = rocprim::min(num_keys1 + num_keys2, ItemsPerThread * flat_id); - const OffsetT tilegroup_start - = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here - const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; + const unsigned int keys1_beg_local = merge_path(keys_shared, + &keys_shared[num_keys1], + num_keys1, + num_keys2, + diag0_local, + compare_function); + const unsigned int keys1_end_local = num_keys1; + const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; + const unsigned int keys2_end_local = num_keys2; - const OffsetT keys1_beg = partition_beg; - OffsetT keys1_end = partition_end; - const OffsetT keys2_beg - = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); - OffsetT keys2_end = rocprim::min(input_size, - 2 * tilegroup_start + sorted_block_size + diag + items_per_tile - - partition_end); + range_t<> range_local{keys1_beg_local, + keys1_end_local, + keys2_beg_local + keys1_end_local, + keys2_end_local + keys1_end_local}; - if(mask == (mask & flat_block_id)) // If last tile in the tile-group - { - keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size); - keys2_end = rocprim::min(input_size, tilegroup_start + sorted_block_size * 2); - } - - // Number of keys per tile - const unsigned int num_keys1 = static_cast(keys1_end - keys1_beg); - const unsigned int num_keys2 = static_cast(keys2_end - keys2_beg); - // Load keys1 & keys2 - key_type keys[ItemsPerThread]; - gmem_to_reg(keys, - keys_input + keys1_beg, - keys_input + keys2_beg, - num_keys1, - num_keys2, - is_incomplete_tile); - // Load keys into shared memory - reg_to_shared(keys_shared, keys); - - value_type values[ItemsPerThread]; - if ROCPRIM_IF_CONSTEXPR(with_values) - { - gmem_to_reg(values, - values_input + keys1_beg, - values_input + keys2_beg, - num_keys1, - num_keys2, - is_incomplete_tile); - } - rocprim::syncthreads(); - - const unsigned int flat_id = block_thread_id<0>(); - const unsigned int diag0_local = rocprim::min(num_keys1 + num_keys2, ItemsPerThread * flat_id); - - const unsigned int keys1_beg_local = merge_path(keys_shared, - &keys_shared[num_keys1], - num_keys1, - num_keys2, - diag0_local, - compare_function); - const unsigned int keys1_end_local = num_keys1; - const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; - const unsigned int keys2_end_local = num_keys2; - range_t range_local = {keys1_beg_local, - keys1_end_local, - keys2_beg_local + keys1_end_local, - keys2_end_local + keys1_end_local}; - - unsigned int indices[ItemsPerThread]; - - serial_merge(keys_shared, keys, indices, range_local, compare_function); - - if ROCPRIM_IF_CONSTEXPR(with_values) - { - reg_to_shared(values_shared, values); + unsigned int indices[ItemsPerThread]; + serial_merge(keys_shared, keys, indices, range_local, compare_function); rocprim::syncthreads(); - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) - { - values[item] = values_shared[indices[item]]; + if ROCPRIM_IF_CONSTEXPR(with_values){ + reg_to_shared(values_shared, values); + + rocprim::syncthreads(); + + ROCPRIM_UNROLL + for (unsigned int item = 0; item < ItemsPerThread; ++item) + { + values[item] = values_shared[indices[item]]; + } + + rocprim::syncthreads(); } - rocprim::syncthreads(); + const OffsetT offset = flat_block_id * items_per_tile; + block_store().store(offset, + input_size - offset, + IsIncompleteTile, + keys_output, + values_output, + keys, + values, + storage.store); } - const OffsetT offset = static_cast(flat_block_id) * items_per_tile; - block_store().store(offset, - input_size - offset, - is_incomplete_tile, - keys_output, - values_output, - keys, - values, - storage.store); -} - -// The specialization below exists because the compiler creates slow code for -// ValueTypes with misaligned datastructures in them (e.g. custom_char_double) -// when storing/loading those ValueTypes to/from registers. -// Thus this is a temporary workaround. -template::value_type> -ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE -auto block_merge_process_tile(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const OffsetT input_size, - const OffsetT sorted_block_size, - const unsigned int num_blocks, - BinaryFunction compare_function, - const OffsetT* merge_partitions) - -> std::enable_if_t<(std::is_trivially_copyable::value - && !rocprim::is_floating_point::value - && !std::is_integral::value), - void> -{ - using key_type = typename std::iterator_traits::value_type; - using value_type = typename std::iterator_traits::value_type; - constexpr bool with_values = !std::is_same::value; - constexpr unsigned int items_per_tile = BlockSize * ItemsPerThread; + // The specialization below exists because the compiler creates slow code for + // ValueTypes with misaligned datastructures in them (e.g. custom_char_double) + // when storing/loading those ValueTypes to/from registers. + // Thus this is a temporary workaround. + template::value_type> + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE auto block_merge_process_tile(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const OffsetT input_size, + const OffsetT sorted_block_size, + const unsigned int num_blocks, + BinaryFunction compare_function, + const OffsetT* merge_partitions) + -> std::enable_if_t<(std::is_trivially_copyable::value + && !rocprim::is_floating_point::value + && !std::is_integral::value), + void> + { + using key_type = typename std::iterator_traits::value_type; + using value_type = typename std::iterator_traits::value_type; + constexpr bool with_values = !std::is_same::value; + constexpr unsigned int items_per_tile = BlockSize * ItemsPerThread; + + using block_store = block_store_impl; + + using keys_storage_ = key_type[items_per_tile]; + using values_storage_ = value_type[items_per_tile]; + + ROCPRIM_SHARED_MEMORY union { + typename block_store::storage_type store; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH + detail::raw_storage keys; + detail::raw_storage values; + ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP + } storage; + + auto& keys_shared = storage.keys.get(); + auto& values_shared = storage.values.get(); + + const unsigned short flat_id = block_thread_id<0>(); + const unsigned int flat_block_id = block_id<0>(); + if(flat_block_id >= num_blocks) + { + return; + } - using block_store = block_store_impl; + const bool is_incomplete_tile = flat_block_id == (input_size / items_per_tile); + const OffsetT partition_beg = merge_partitions[flat_block_id]; + const OffsetT partition_end = merge_partitions[flat_block_id + 1]; - using keys_storage_ = key_type[items_per_tile]; - using values_storage_ = value_type[items_per_tile]; + const unsigned int merged_tiles_number = sorted_block_size / items_per_tile; + const unsigned int target_merged_tiles_number = merged_tiles_number * 2; + const unsigned int mask = target_merged_tiles_number - 1; + const unsigned int tilegroup_start_id = ~mask & flat_block_id; + const OffsetT tilegroup_start = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here + const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; - ROCPRIM_SHARED_MEMORY union - { - typename block_store::storage_type store; - ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_WITH_PUSH - detail::raw_storage keys; - detail::raw_storage values; - ROCPRIM_DETAIL_SUPPRESS_DEPRECATION_POP - } storage; - - auto& keys_shared = storage.keys.get(); - auto& values_shared = storage.values.get(); - - const unsigned int flat_block_id = ::rocprim::flat_block_id(); - if(flat_block_id >= num_blocks) - { - return; - } + const OffsetT keys1_beg = partition_beg; + OffsetT keys1_end = partition_end; + const OffsetT keys2_beg = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); + OffsetT keys2_end = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag + items_per_tile - partition_end); - const bool is_incomplete_tile = flat_block_id == (input_size / items_per_tile); + if (mask == (mask & flat_block_id)) // If last tile in the tile-group + { + keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size); + keys2_end = rocprim::min(input_size, tilegroup_start + sorted_block_size * 2); + } - const OffsetT partition_beg = merge_partitions[flat_block_id]; - const OffsetT partition_end = merge_partitions[flat_block_id + 1]; + // Number of keys per tile + const unsigned int num_keys1 = static_cast(keys1_end - keys1_beg); + const unsigned int num_keys2 = static_cast(keys2_end - keys2_beg); + // Load keys1 & keys2 + key_type keys[ItemsPerThread]; + gmem_to_reg(keys, + keys_input + keys1_beg, + keys_input + keys2_beg, + num_keys1, + num_keys2, + is_incomplete_tile); + // Load keys into shared memory + reg_to_shared(keys_shared, keys); - const unsigned int merged_tiles_number = sorted_block_size / items_per_tile; - const unsigned int target_merged_tiles_number = merged_tiles_number * 2; - const unsigned int mask = target_merged_tiles_number - 1; - const unsigned int tilegroup_start_id = ~mask & flat_block_id; + rocprim::syncthreads(); - const OffsetT tilegroup_start - = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here - const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; + const unsigned int diag0_local = rocprim::min(num_keys1 + num_keys2, ItemsPerThread * flat_id); - const OffsetT keys1_beg = partition_beg; - OffsetT keys1_end = partition_end; - const OffsetT keys2_beg - = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); - OffsetT keys2_end = rocprim::min(input_size, - 2 * tilegroup_start + sorted_block_size + diag + items_per_tile - - partition_end); + const unsigned int keys1_beg_local = merge_path(keys_shared, + &keys_shared[num_keys1], + num_keys1, + num_keys2, + diag0_local, + compare_function); + const unsigned int keys1_end_local = num_keys1; + const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; + const unsigned int keys2_end_local = num_keys2; - if(mask == (mask & flat_block_id)) // If last tile in the tile-group - { - keys1_end = rocprim::min(input_size, tilegroup_start + sorted_block_size); - keys2_end = rocprim::min(input_size, tilegroup_start + sorted_block_size * 2); - } + range_t<> range_local{keys1_beg_local, + keys1_end_local, + keys2_beg_local + keys1_end_local, + keys2_end_local + keys1_end_local}; - // Number of keys per tile - const unsigned int num_keys1 = static_cast(keys1_end - keys1_beg); - const unsigned int num_keys2 = static_cast(keys2_end - keys2_beg); - // Load keys1 & keys2 - key_type keys[ItemsPerThread]; - gmem_to_reg(keys, - keys_input + keys1_beg, - keys_input + keys2_beg, - num_keys1, - num_keys2, - is_incomplete_tile); - // Load keys into shared memory - reg_to_shared(keys_shared, keys); - - rocprim::syncthreads(); - - const unsigned int flat_id = block_thread_id<0>(); - const unsigned int diag0_local = rocprim::min(num_keys1 + num_keys2, ItemsPerThread * flat_id); - - const unsigned int keys1_beg_local = merge_path(keys_shared, - &keys_shared[num_keys1], - num_keys1, - num_keys2, - diag0_local, - compare_function); - const unsigned int keys1_end_local = num_keys1; - const unsigned int keys2_beg_local = diag0_local - keys1_beg_local; - const unsigned int keys2_end_local = num_keys2; - - range_t range_local = {keys1_beg_local, - keys1_end_local, - keys2_beg_local + keys1_end_local, - keys2_end_local + keys1_end_local}; - - unsigned int indices[ItemsPerThread]; - - serial_merge(keys_shared, keys, indices, range_local, compare_function); - - if ROCPRIM_IF_CONSTEXPR(with_values) - { - const ValuesInputIterator input1 = values_input + keys1_beg; - const ValuesInputIterator input2 = values_input + keys2_beg; - if(is_incomplete_tile) + unsigned int indices[ItemsPerThread]; + + serial_merge(keys_shared, keys, indices, range_local, compare_function); + rocprim::syncthreads(); + + if ROCPRIM_IF_CONSTEXPR(with_values) { - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + const ValuesInputIterator input1 = values_input + keys1_beg; + const ValuesInputIterator input2 = values_input + keys2_beg; + if(is_incomplete_tile) { - const unsigned int idx = BlockSize * item + threadIdx.x; - if(idx < num_keys1) + ROCPRIM_UNROLL + for (unsigned int item = 0; item < ItemsPerThread; ++item) { - values_shared[idx] = input1[idx]; - } - else if(idx - num_keys1 < num_keys2) - { - values_shared[idx] = input2[idx - num_keys1]; + const unsigned int idx = BlockSize * item + threadIdx.x; + if(idx < num_keys1) + { + values_shared[idx] = input1[idx]; + } + else if(idx - num_keys1 < num_keys2) + { + values_shared[idx] = input2[idx - num_keys1]; + } } } - } - else - { - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + else { - const unsigned int idx = BlockSize * item + threadIdx.x; - if(idx < num_keys1) - { - values_shared[idx] = input1[idx]; - } - else + ROCPRIM_UNROLL + for (unsigned int item = 0; item < ItemsPerThread; ++item) { - values_shared[idx] = input2[idx - num_keys1]; + const unsigned int idx = BlockSize * item + threadIdx.x; + if(idx < num_keys1) + { + values_shared[idx] = input1[idx]; + } + else + { + values_shared[idx] = input2[idx - num_keys1]; + } } } - } - rocprim::syncthreads(); - const OffsetT thread_offset - = static_cast(flat_block_id) * items_per_tile + flat_id * ItemsPerThread; - if(is_incomplete_tile) - { - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + rocprim::syncthreads(); + const OffsetT thread_offset = static_cast(flat_block_id) * items_per_tile + flat_id * ItemsPerThread; + if(is_incomplete_tile) { - if(flat_id * ItemsPerThread + item < num_keys1 + num_keys2) + ROCPRIM_UNROLL + for(unsigned int item = 0; item < ItemsPerThread; ++item) { - values_output[thread_offset + item] = values_shared[indices[item]]; + if(flat_id * ItemsPerThread + item < num_keys1 + num_keys2) + { + values_output[thread_offset + item] = values_shared[indices[item]]; + } } } - } - else - { - ROCPRIM_UNROLL - for(unsigned int item = 0; item < ItemsPerThread; ++item) + else { - values_output[thread_offset + item] = values_shared[indices[item]]; + ROCPRIM_UNROLL + for(unsigned int item = 0; item < ItemsPerThread; ++item) + { + values_output[thread_offset + item] = values_shared[indices[item]]; + } } + + rocprim::syncthreads(); } - rocprim::syncthreads(); + const OffsetT offset = static_cast(flat_block_id) * items_per_tile; + value_type values[ItemsPerThread]; + block_store().store(offset, + input_size - offset, + is_incomplete_tile, + keys_output, + values_output, + keys, + values, + storage.store); } - const OffsetT offset = static_cast(flat_block_id) * items_per_tile; - value_type values[ItemsPerThread]; - block_store().store(offset, - input_size - offset, - is_incomplete_tile, - keys_output, - values_output, - keys, - values, - storage.store); -} - -template -ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE -void block_merge_mergepath_kernel(KeysInputIterator keys_input, - KeysOutputIterator keys_output, - ValuesInputIterator values_input, - ValuesOutputIterator values_output, - const OffsetT input_size, - const OffsetT sorted_block_size, - const unsigned int num_blocks, - BinaryFunction compare_function, - const OffsetT* merge_partitions) -{ - block_merge_process_tile(keys_input, - keys_output, - values_input, - values_output, - input_size, - sorted_block_size, - num_blocks, - compare_function, - merge_partitions); -} - -} // namespace detail + template + ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void + block_merge_mergepath_kernel(KeysInputIterator keys_input, + KeysOutputIterator keys_output, + ValuesInputIterator values_input, + ValuesOutputIterator values_output, + const OffsetT input_size, + const OffsetT sorted_block_size, + const unsigned int num_blocks, + BinaryFunction compare_function, + const OffsetT* merge_partitions) + { + block_merge_process_tile(keys_input, + keys_output, + values_input, + values_output, + input_size, + sorted_block_size, + num_blocks, + compare_function, + merge_partitions); + } + +} // end of detail namespace END_ROCPRIM_NAMESPACE From 27734f0a3fbd84c069451f0f4bcf3a2aab89bc60 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Wed, 30 Oct 2024 23:56:58 +0000 Subject: [PATCH 10/13] updated mergepath to have fix for large indices --- .../detail/device_merge_sort_mergepath.hpp | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp index 6fae39024..c04105d55 100644 --- a/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp +++ b/rocprim/include/rocprim/device/detail/device_merge_sort_mergepath.hpp @@ -137,7 +137,8 @@ namespace detail { return; } - const bool IsIncompleteTile = flat_block_id == (input_size / items_per_tile); + + const bool is_incomplete_tile = flat_block_id == (input_size / items_per_tile); const OffsetT partition_beg = merge_partitions[flat_block_id]; const OffsetT partition_end = merge_partitions[flat_block_id + 1]; @@ -146,9 +147,12 @@ namespace detail const unsigned int target_merged_tiles_number = merged_tiles_number * 2; const unsigned int mask = target_merged_tiles_number - 1; const unsigned int tilegroup_start_id = ~mask & flat_block_id; - const OffsetT tilegroup_start = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here + + const OffsetT tilegroup_start + = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; + const OffsetT keys1_beg = partition_beg; OffsetT keys1_end = partition_end; const OffsetT keys2_beg = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); @@ -170,7 +174,7 @@ namespace detail keys_input + keys2_beg, num_keys1, num_keys2, - IsIncompleteTile); + is_incomplete_tile); // Load keys into shared memory reg_to_shared(keys_shared, keys); @@ -181,7 +185,7 @@ namespace detail values_input + keys2_beg, num_keys1, num_keys2, - IsIncompleteTile); + is_incomplete_tile); } rocprim::syncthreads(); @@ -221,10 +225,10 @@ namespace detail rocprim::syncthreads(); } - const OffsetT offset = flat_block_id * items_per_tile; + const OffsetT offset = static_cast(flat_block_id) * items_per_tile; block_store().store(offset, input_size - offset, - IsIncompleteTile, + is_incomplete_tile, keys_output, values_output, keys, @@ -281,13 +285,14 @@ namespace detail auto& values_shared = storage.values.get(); const unsigned short flat_id = block_thread_id<0>(); - const unsigned int flat_block_id = block_id<0>(); + const unsigned int flat_block_id = ::rocprim::flat_block_id(); if(flat_block_id >= num_blocks) { return; } const bool is_incomplete_tile = flat_block_id == (input_size / items_per_tile); + const OffsetT partition_beg = merge_partitions[flat_block_id]; const OffsetT partition_end = merge_partitions[flat_block_id + 1]; @@ -295,9 +300,11 @@ namespace detail const unsigned int target_merged_tiles_number = merged_tiles_number * 2; const unsigned int mask = target_merged_tiles_number - 1; const unsigned int tilegroup_start_id = ~mask & flat_block_id; - const OffsetT tilegroup_start = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here + const OffsetT tilegroup_start + = static_cast(tilegroup_start_id) * items_per_tile; // Tile-group starts here const OffsetT diag = static_cast(flat_block_id) * items_per_tile - tilegroup_start; + const OffsetT keys1_beg = partition_beg; OffsetT keys1_end = partition_end; const OffsetT keys2_beg = rocprim::min(input_size, 2 * tilegroup_start + sorted_block_size + diag - partition_beg); @@ -385,7 +392,7 @@ namespace detail } rocprim::syncthreads(); - const OffsetT thread_offset = static_cast(flat_block_id) * items_per_tile + flat_id * ItemsPerThread; + const OffsetT thread_offset = items_per_tile * static_cast(flat_block_id) + ItemsPerThread * flat_id; if(is_incomplete_tile) { ROCPRIM_UNROLL From b4f8c462192cc7f7299e2c9f3ab695624156d146 Mon Sep 17 00:00:00 2001 From: spolifroni-amd Date: Thu, 31 Oct 2024 17:21:04 -0400 Subject: [PATCH 11/13] updated the changelog for 6.3 (#632) --- CHANGELOG.md | 53 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4adc45a1..49615b2c8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,50 +1,43 @@ # Changelog for rocPRIM -Documentation for rocPRIM is available at -[https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). +Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). -## Unreleased rocPRIM-3.3.0 for ROCm 6.3.0 +## rocPRIM 3.3.0 for ROCm 6.3.0 ### Added -* Add --test smoke option in rtest.py. It will run a subset of tests such that the total test time is in 5 minutes. Use python3 ./rtest.py --test smoke or python3 ./rtest.py -t smoke to execute smoke test. -* Option `--seed` to benchmarks to specify a seed for the generation of random inputs. The default behavior is to keep using a random seed per benchmark measurement. -* Added configuration autotuning to device partition (`rocprim::partition`, `rocprim::partition_two_way`, and `rocprim::partition_three_way`), device select (`rocprim::select`, `rocprim::unique`, and `rocprim::unique_by_key`), and device reduce by key (`rocprim::reduce_by_key`) for improved performance on selected architectures. -* Added `rocprim::uninitialized_array` which provides uninitialized storage in local memory for user-defined types. +* The `--test smoke` option has been added to `rtest.py`. When `rtest.py` is called with this option it runs a subset of tests such that the total test time is 5 minutes. Use `python3 ./rtest.py --test smoke` or `python3 ./rtest.py -t smoke` to run the smoke test. +* The `--seed` option has been added to `run_benchmarks.py`. The `--seed` option specifies a seed for the generation of random inputs. When the option is omitted, the default behavior is to use a random seed for each benchmark measurement. +* Added configuration autotuning to device partition (`rocprim::partition`, `rocprim::partition_two_way`, and `rocprim::partition_three_way`), to device select (`rocprim::select`, `rocprim::unique`, and `rocprim::unique_by_key`), and to device reduce by key (`rocprim::reduce_by_key`) to improve performance on selected architectures. +* Added `rocprim::uninitialized_array` to provide uninitialized storage in local memory for user-defined types. * Added large segment support for `rocprim:segmented_reduce`. -* Added a parallel `nth_element` device function similar to `std::nth_element`, this function rearranges elements smaller than the n-th before and bigger than the n-th after the n-th element. +* Added a parallel `nth_element` device function similar to `std::nth_element`. `nth_element` places elements that are smaller than the nth element before the nth element, and elements that are bigger than the nth element after the nth element. * Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance. -* Added a parallel `partial_sort` and `partial_sort_copy` device function similar to `std::partial_sort` and `std::partial_sort_copy`, these functions rearranges elements such that the elements are the same as a sorted list up to and including the middle index. +* Added a parallel `partial_sort` and `partial_sort_copy` device functions similar to `std::partial_sort` and `std::partial_sort_copy`. `partial_sort` and `partial_sort_copy` arrange elements such that the elements are in the same order as a sorted list up to and including the middle index. ### Changed * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. +### Removed + +* `rocprim::thread_load()` and `rocprim::thread_store()` have been deprecated. Use `dereference()` instead. + ### Resolved issues -* Fixed an issue in rtest.py where if the build folder was made without release or debug directory it would crash the program -* Fixed an issue where while running rtest.py on windows and passing in an absolute path to `--install_dir` causes a `FileNotFound` error. -* rocPRIM functions are no longer forcefully inlined on Windows, significantly reducing the build - time in debug builds. -* `block_load`, `block_store`, `block_shuffle`, `block_exchange` and `warp_exchange` now use placement `new` instead of copy - assignment (`operator=`) when writing to local memory. This fixes the behavior of custom types with non-trivial copy assignments. +* Resolved an issue in `rtest.py` where it crashed if the `build` folder was created without `release` or `debug` subdirectories. +* Resolved an issue with `rtest.py` on Windows where passing an absolute path to `--install_dir` caused a `FileNotFound` error. +* rocPRIM functions are no longer forcefully inlined on Windows. This significantly reduces the build + time of debug builds. +* `block_load`, `block_store`, `block_shuffle`, `block_exchange`, and `warp_exchange` now use placement `new` instead of copy assignment (`operator=`) when writing to local memory. This fixes the behavior of custom types with non-trivial copy assignments. * Fixed a bug in the generation of input data for benchmarks, which caused incorrect performance to be reported in specific cases. It may affect the reported performance for one-byte types (`uint8_t` and `int8_t`) and instantiations of `custom_type`. Specifically, device binary search, device histogram, device merge and warp sort are affected. -* Fixed a bug for `rocprim::merge_path_search` where using `unsigned` offsets would output wrong results. -* Fixed a bug for `rocprim::thread_load` and `rocprim::thread_store` where `float` and `double` were not casted to the correct type resulting in wrong results. -* Fix tests failing when compiling with `-D_GLIBCXX_ASSERTIONS=ON`. -* Fixed a bug for algorithms that use an internal serial merge routine that causes a memory access fault. This may result in a performance drop when using: - * block sort, - * device merge sort (block merge), - * device merge, - * device partial sort, and/or - * device sort (merge sort). -* Fixed memory leaks in unit tests that were due to missing hipFree calls and incorrect use of hipGraphs -* Fixed an issue where on certain inputs to block_sort_merge, device_merge_sort_merge_path, device_merge, and warp_sort_stable would cause an assertion error during its call to serial_merge - -### Upcoming changes - -* `rocprim::thread_load` and `rocprim::thread_store` are deprecated. Use dereference instead. Not all of those functions are available on every device architecture, and their usage can hurt performance, because inline assembly inhibits optimizations. +* Fixed a bug for `rocprim::merge_path_search` where using `unsigned` offsets would produce incorrect results. +* Fixed a bug for `rocprim::thread_load` and `rocprim::thread_store` where `float` and `double` were not cast to the correct type, resulting in incorrect results. +* Resolved an issue where tests where failing when they were compiled with `-D_GLIBCXX_ASSERTIONS=ON`. +* Resolved an issue where algorithms that used an internal serial merge routine caused a memory access fault that resulted in potential performance drops when using block sort, device merge sort (block merge), device merge, device partial sort, and device sort (merge sort). +* Fixed memory leaks in unit tests due to missing calls to `hipFree()` and the incorrect use of hipGraphs. +* Fixed an issue where certain inputs to `block_sort_merge()`, `device_merge_sort_merge_path()`, `device_merge()`, and `warp_sort_stable()` caused an assertion error during the call to `serial_merge()`. ## rocPRIM-3.2.1 for ROCm 6.2.1 From f01fdda9d71970ce798ccc24334b6a70179a1b38 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 1 Nov 2024 16:07:44 +0000 Subject: [PATCH 12/13] updated changelog for test_utils_hipgraph change --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16364ceb6..ec83fd323 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. +* Changed `test_utils_hipgraphs.hpp` to be a class `graphHelper` with internal graph and graph instances ### Removed From 02aea06050463de1ac91ba43df9ba18f6f308e30 Mon Sep 17 00:00:00 2001 From: NguyenNhuDi Date: Fri, 1 Nov 2024 16:08:53 +0000 Subject: [PATCH 13/13] updated changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ec83fd323..9fe7ddc16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,7 +35,7 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. -* Changed `test_utils_hipgraphs.hpp` to be a class `graphHelper` with internal graph and graph instances +* Changed `test_utils_hipgraphs.hpp` to be a class `GraphHelper` with internal graph and graph instances ### Removed