From 25876cfccb5afd7f403edd44272f54a7567c4e75 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?=
Date: Mon, 26 Sep 2022 17:55:28 +0200
Subject: [PATCH] Port Custom Reduction tests

---
 core/perf_test/CMakeLists.txt               |   5 +-
 core/perf_test/PerfTest_CustomReduction.cpp | 116 +++++++++++++-------
 2 files changed, 77 insertions(+), 44 deletions(-)

diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt
index 40b3f89397..e633c21385 100644
--- a/core/perf_test/CMakeLists.txt
+++ b/core/perf_test/CMakeLists.txt
@@ -11,9 +11,9 @@ IF(KOKKOS_ENABLE_TESTS)
   SET(SOURCES
     PerfTestMain.cpp
     PerfTestGramSchmidt.cpp
-    PerfTest_CustomReduction.cpp
     PerfTest_ExecSpacePartitioning.cpp
-  )
+    )
+
 
   IF(Kokkos_ENABLE_OPENMPTARGET)
   # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction
@@ -176,6 +176,7 @@ SET(
   BENCHMARK_SOURCES
   BenchmarkMain.cpp
   Benchmark_Context.cpp
+  PerfTest_CustomReduction.cpp
   PerfTestHexGrad.cpp
   PerfTest_ViewAllocate.cpp
   PerfTest_ViewCopy_a123.cpp
diff --git a/core/perf_test/PerfTest_CustomReduction.cpp b/core/perf_test/PerfTest_CustomReduction.cpp
index 049301f9a7..2fdab006e9 100644
--- a/core/perf_test/PerfTest_CustomReduction.cpp
+++ b/core/perf_test/PerfTest_CustomReduction.cpp
@@ -15,14 +15,16 @@
 //@HEADER
 
 #include
-#include
+#include
+#include "Benchmark_Context.hpp"
 #include
 #include
+#include
 
 #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
 namespace Test {
 template
-void custom_reduction_test(int N, int R, int num_trials) {
+std::pair custom_reduction_test(int N, int R) {
   Kokkos::Random_XorShift64_Pool<> rand_pool(183291);
   Kokkos::View a("A", N);
   Kokkos::fill_random(a, rand_pool, 1.0);
@@ -62,49 +64,79 @@ void custom_reduction_test(int N, int R, int num_trials) {
 
   // Timing
   Kokkos::Timer timer;
-  for (int r = 0; r < num_trials; r++) {
-    Kokkos::parallel_reduce(
-        Kokkos::TeamPolicy<>(N / 1024, team_size),
-        KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
-                      Scalar& lmax) {
-          Scalar team_max = Scalar(0);
-          for (int rr = 0; rr < R; rr++) {
-            int i = team.league_rank();
-            Kokkos::parallel_reduce(
-                Kokkos::TeamThreadRange(team, 32),
-                [&](const int& j, Scalar& thread_max) {
-                  Scalar t_max = Scalar(0);
-                  Kokkos::parallel_reduce(
-                      Kokkos::ThreadVectorRange(team, 32),
-                      [&](const int& k, Scalar& max_) {
-                        const Scalar val = a((i * 32 + j) * 32 + k);
-                        if (val > max_) max_ = val;
-                        if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
-                      },
-                      Kokkos::Max(t_max));
-                  if (t_max > thread_max) thread_max = t_max;
-                },
-                Kokkos::Max(team_max));
-          }
-          if (team_max > lmax) lmax = team_max;
-        },
-        Kokkos::Max(max));
-  }
-  double time = timer.seconds();
-  printf("%e %e %e\n", time,
-         1.0 * N * R * num_trials * sizeof(Scalar) / time / 1024 / 1024 / 1024,
-         max);
+  Kokkos::parallel_reduce(
+      Kokkos::TeamPolicy<>(N / 1024, team_size),
+      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team,
+                    Scalar& lmax) {
+        Scalar team_max = Scalar(0);
+        for (int rr = 0; rr < R; rr++) {
+          int i = team.league_rank();
+          Kokkos::parallel_reduce(
+              Kokkos::TeamThreadRange(team, 32),
+              [&](const int& j, Scalar& thread_max) {
+                Scalar t_max = Scalar(0);
+                Kokkos::parallel_reduce(
+                    Kokkos::ThreadVectorRange(team, 32),
+                    [&](const int& k, Scalar& max_) {
+                      const Scalar val = a((i * 32 + j) * 32 + k);
+                      if (val > max_) max_ = val;
+                      if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5;
+                    },
+                    Kokkos::Max(t_max));
+                if (t_max > thread_max) thread_max = t_max;
+              },
+              Kokkos::Max(team_max));
+        }
+        if (team_max > lmax) lmax = team_max;
+      },
+      Kokkos::Max(max));
+
+  return std::make_pair(timer.seconds(), max);
 }
 
-TEST(default_exec, custom_reduction) {
-  int N = 100000;
-  int R = 1000;
-  int num_trials = 1;
+// Value of N: command-line argument 1 when one was passed, otherwise the
+// benchmark's registered "N" argument (range 0).
+int get_N(benchmark::State& state) {
+  return (Test::command_line_num_args() > 1)
+             ? std::stoi(Test::command_line_arg(1))
+             : state.range(0);
+}
 
-  if (command_line_num_args() > 1) N = std::stoi(command_line_arg(1));
-  if (command_line_num_args() > 2) R = std::stoi(command_line_arg(2));
-  if (command_line_num_args() > 3) num_trials = std::stoi(command_line_arg(3));
-  custom_reduction_test(N, R, num_trials);
+// Value of R: command-line argument 2 when one was passed, otherwise the
+// benchmark's registered "R" argument (range 1).
+int get_R(benchmark::State& state) {
+  return (Test::command_line_num_args() > 2)
+             ? std::stoi(Test::command_line_arg(2))
+             : state.range(1);
 }
+
+// Runs custom_reduction_test once per benchmark iteration, reports the time it
+// measured itself as the manual iteration time, and exposes throughput and the
+// reduced maximum as counters.
+template
+static void CustomReduction(benchmark::State& state) {
+  int N = get_N(state);
+  int R = get_R(state);
+
+  for (auto _ : state) {
+    auto results = custom_reduction_test(N, R);
+    // Gigabytes processed per iteration. Evaluated in floating point so that
+    // N * R cannot overflow int, and not divided by the elapsed time here:
+    // kIsIterationInvariantRate already turns the amount into a rate.
+    const double data_processed =
+        1.0 * N * R * sizeof(Scalar) / 1'000'000'000;
+
+    state.SetIterationTime(results.first);
+    state.counters[KokkosBenchmark::benchmark_fom("GB/s")] = benchmark::Counter(
+        data_processed, benchmark::Counter::kIsIterationInvariantRate);
+    state.counters["Max"] = benchmark::Counter(results.second);
+  }
+}
+
+BENCHMARK(CustomReduction)
+    ->ArgNames({"N", "R"})
+    ->Args({100'000, 1'000})
+    ->UseManualTime();
+
 }  // namespace Test
 #endif