From 1c0e3bf3fefa016405bbfa92bc8f0a35f79ceed8 Mon Sep 17 00:00:00 2001 From: Seyong Lee Date: Mon, 8 May 2023 09:00:36 -0400 Subject: [PATCH] Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team (#6072) * Update the OpenACC parallel_reduce() constructs with Range/MDRange/Team Policy to support reductions on device data. * Update as suggested by the code review. * Add comments as suggested by the code review. * Undo the unit test CMake change. * Update the OpenACC parallel_reduce() implementations to correctly handle the cases where the number of iterations is zero. Update reduction-related unit tests to disable unsupported tests for the OpenACC backend. Update CMakeLists.txt in the unit test to enable reduction-related unit tests supported by the OpenACC backend. * Re-enabled supported unit tests. * Disable TestOpenACC_Reducers_a.cpp since it fails when compiled by NVHPC V22.5 or older * Disable unsupported unit test. --- .../Kokkos_OpenACC_ParallelReduce_MDRange.hpp | 33 +++++++++++++--- .../Kokkos_OpenACC_ParallelReduce_Range.hpp | 33 +++++++++++++--- .../Kokkos_OpenACC_ParallelReduce_Team.hpp | 38 ++++++++++++++---- core/unit_test/CMakeLists.txt | 39 +------------------ core/unit_test/TestReduce.hpp | 6 +++ core/unit_test/TestReducers.hpp | 18 +++++++++ 6 files changed, 112 insertions(+), 55 deletions(-) diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 0ebd8b219f..2c7793dc11 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -51,6 +51,7 @@ class Kokkos::Impl::ParallelReduce @@ -58,22 +59,32 @@ class Kokkos::Impl::ParallelReduce::accessible) {} void execute() const { static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); constexpr int rank = Policy::rank; + ValueType val; + const ReducerType& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); + for (int i = 0; i < rank; ++i) { if (m_policy.m_lower[i] >= m_policy.m_upper[i]) { + if (m_result_ptr_on_device) { + acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType)); + } else { + *m_result_ptr = val; + } return; } } - ValueType val; - const ReducerType& reducer = m_functor_reducer.get_reducer(); - reducer.init(&val); + int const async_arg = m_policy.space().acc_async_queue(); Kokkos::Experimental::Impl::OpenACCParallelReduceMDRangeHelper( Kokkos::Experimental::Impl::FunctorAdapter< @@ -85,8 +96,20 @@ class Kokkos::Impl::ParallelReduce, typename ReducerType::functor_type>(val), m_policy); + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. reducer.final(&val); - *m_result_ptr = val; + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. + if (m_result_ptr_on_device) { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType), + async_arg); + acc_wait(async_arg); + } else { + acc_wait(async_arg); + *m_result_ptr = val; + } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp index e70b8997f0..b61a05a8ee 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Range.hpp @@ -52,6 +52,7 @@ class Kokkos::Impl::ParallelReduce @@ -59,19 +60,29 @@ class Kokkos::Impl::ParallelReduce::accessible) {} void execute() const { auto const begin = m_policy.begin(); auto const end = m_policy.end(); + ValueType val; + ReducerType const& reducer = m_functor_reducer.get_reducer(); + reducer.init(&val); + if (end <= begin) { + if (m_result_ptr_on_device == false) { + *m_result_ptr = val; + } else { + acc_memcpy_to_device(m_result_ptr, &val, sizeof(ValueType)); + } return; } - ValueType val; - ReducerType const& reducer = m_functor_reducer.get_reducer(); - reducer.init(&val); + int const async_arg = m_policy.space().acc_async_queue(); Kokkos::Experimental::Impl::OpenACCParallelReduceHelper( Kokkos::Experimental::Impl::FunctorAdapter< @@ -83,8 +94,20 @@ class Kokkos::Impl::ParallelReduce, typename ReducerType::functor_type>(val), m_policy); + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. reducer.final(&val); - *m_result_ptr = val; + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. + if (m_result_ptr_on_device == false) { + acc_wait(async_arg); + *m_result_ptr = val; + } else { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(ValueType), + async_arg); + acc_wait(async_arg); + } } }; diff --git a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index d572072aba..3223ce3f9a 100644 --- a/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -63,6 +63,7 @@ class Kokkos::Impl::ParallelReduce, - Sum, typename ReducerType::functor_type>(tmp), + Sum, typename ReducerType::functor_type>(val), m_policy); - reducer.final(&tmp); - - m_result_ptr[0] = tmp; + // OpenACC backend supports only built-in Reducer types; thus + // reducer.final() below is a no-op. + reducer.final(&val); + // acc_wait(async_arg) in the below if-else statements is needed because the + // above OpenACC compute kernel can be executed asynchronously and val is a + // local host variable. + if (m_result_ptr_on_device == false) { + acc_wait(async_arg); + *m_result_ptr = val; + } else { + acc_memcpy_to_device_async(m_result_ptr, &val, sizeof(value_type), + async_arg); + acc_wait(async_arg); + } } template @@ -93,7 +114,10 @@ class Kokkos::Impl::ParallelReduce::accessible) {} }; namespace Kokkos { diff --git a/core/unit_test/CMakeLists.txt b/core/unit_test/CMakeLists.txt index 0b48eba9ea..2c4262e3ff 100644 --- a/core/unit_test/CMakeLists.txt +++ b/core/unit_test/CMakeLists.txt @@ -367,23 +367,16 @@ if(Kokkos_ENABLE_OPENACC) list(REMOVE_ITEM OpenACC_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexdouble.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicOperations_complexfloat.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Crs.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_JoinBackwardCompatibility.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_LocalDeepCopy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Other.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamCombinedReducers.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamMDRange.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamReductionScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScan.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_e.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewCopy_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_subview.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewOfClass.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_WorkGraph.cpp @@ -492,61 +485,31 @@ IF(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Atomics.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_AtomicViews.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_BlockSizeDeduction.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyConstructors.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_CommonPolicyInterface.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_DeepCopyAlignment.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtr.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_HostSharedPtrAccessOnDevice.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions1.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions2.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MathematicalFunctions3.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_f.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRange_g.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_MDRangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_NumericTraits.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicy.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_RangePolicyRequire.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reduce.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_c.cpp + ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_a.cpp #fails if NVHPC V22.5 or lower. ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reducers_e.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_Reductions_DeviceView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_a.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c01.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c02.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c03.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c04.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c05.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c06.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c07.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c08.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c09.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c10.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c11.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c12.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_SubView_c13.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamBasic.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamPolicyConstructors.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamScratch.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamTeamSize.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_TeamVectorRange.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_UniqueToken.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_a.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_b.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_c.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_d.cpp - ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewAPI_f.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewMapping_b.cpp ${CMAKE_CURRENT_BINARY_DIR}/openacc/TestOpenACC_ViewResize.cpp ) diff --git a/core/unit_test/TestReduce.hpp b/core/unit_test/TestReduce.hpp index 4cf30f6fbe..e1aa851f10 100644 --- a/core/unit_test/TestReduce.hpp +++ b/core/unit_test/TestReduce.hpp @@ -369,7 +369,10 @@ class TestReduceDynamic { TestReduceDynamic(const size_type nwork) { run_test_dynamic(nwork); +#ifndef KOKKOS_ENABLE_OPENACC + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. run_test_dynamic_minmax(nwork); +#endif run_test_dynamic_final(nwork); } @@ -542,6 +545,8 @@ TEST(TEST_CATEGORY, int64_t_reduce_dynamic_view) { // FIXME_OPENMPTARGET: Not yet implemented. #ifndef KOKKOS_ENABLE_OPENMPTARGET +// FIXME_OPENACC: Not yet implemented. +#ifndef KOKKOS_ENABLE_OPENACC TEST(TEST_CATEGORY, int_combined_reduce) { using functor_type = CombinedReduceFunctorSameType; constexpr uint64_t nw = 1000; @@ -619,4 +624,5 @@ TEST(TEST_CATEGORY, int_combined_reduce_mixed) { } } #endif +#endif } // namespace Test diff --git a/core/unit_test/TestReducers.hpp b/core/unit_test/TestReducers.hpp index 633b203afe..621cb28c9e 100644 --- a/core/unit_test/TestReducers.hpp +++ b/core/unit_test/TestReducers.hpp @@ -982,14 +982,23 @@ struct TestReducers { test_sum(10001); test_prod(35); test_min(10003); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); +#endif test_max(10007); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); +#endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. #else +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); +#endif #endif } @@ -1000,14 +1009,23 @@ struct TestReducers { test_sum(10001); test_prod(sizeof(Scalar) > 4 ? 35 : 19); // avoid int overflow (see above) test_min(10003); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minloc(10003); +#endif test_max(10007); +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_maxloc(10007); +#endif #if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_CLANG) && \ (KOKKOS_COMPILER_CLANG < 1300) // FIXME_OPENMPTARGET - The minmaxloc test fails llvm <= 13 version. #else +#if !defined(KOKKOS_ENABLE_OPENACC) + // FIXME_OPENACC - OpenACC (V3.3) does not support custom reductions. test_minmaxloc(10007); +#endif #endif test_BAnd(35); test_BOr(35);