From b1d26ed2e98f1dd33f5c8584c41ec3a405499b3d Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Feb 2021 14:20:42 -0700 Subject: [PATCH 01/47] perf_test/blas: - Add GFLOP/s output - Add support for separate batch_size option - Update step option to add step size --- perf_test/blas/blas/KokkosBlas_common.hpp | 3 +- perf_test/blas/blas/KokkosBlas_perf_test.cpp | 17 +- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 129 ++++++++++----- .../blas/blas3/KokkosBlas3_perf_test.cpp | 2 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 150 ++++++++++++------ 5 files changed, 210 insertions(+), 91 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_common.hpp b/perf_test/blas/blas/KokkosBlas_common.hpp index a6f9c65d8b..54e79647bf 100644 --- a/perf_test/blas/blas/KokkosBlas_common.hpp +++ b/perf_test/blas/blas/KokkosBlas_common.hpp @@ -56,6 +56,7 @@ #define DEFAULT_STEP 3 #define DEFAULT_WARM_UP_N 100 #define DEFAULT_N 100 +#define DEFAULT_K 10 #define DEFAULT_OUT &std::cout #define DEFAULT_BLAS_ROUTINES "trtri," @@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"}; * @var n: Number of columns. */ struct matrix_dim { - int m, n; + int k, m, n; }; typedef struct matrix_dim matrix_dim_t; diff --git a/perf_test/blas/blas/KokkosBlas_perf_test.cpp b/perf_test/blas/blas/KokkosBlas_perf_test.cpp index 46e89d5abb..803286f266 100644 --- a/perf_test/blas/blas/KokkosBlas_perf_test.cpp +++ b/perf_test/blas/blas/KokkosBlas_perf_test.cpp @@ -57,6 +57,7 @@ static struct option long_options[] = { {"matrix_size_step", required_argument, 0, 's'}, {"warm_up_loop", required_argument, 0, 'w'}, {"iter", required_argument, 0, 'i'}, + {"batch_size", required_argument, 0, 'k'}, {"csv", required_argument, 0, 'c'}, {"routines", required_argument, 0, 'r'}, {"trtri_options", required_argument, 0, 'o'}, @@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() { "(default: %d)\n\n", DEFAULT_N); + printf("\t-k, --batch_size=LEN\n"); + printf("\t\tBatch size. Adds third dimension to matrices A and B.\n"); + printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", + DEFAULT_K); + printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( @@ -166,12 +172,16 @@ int main(int argc, char **argv) { /* set default options */ options.test = DEFAULT_TEST; options.loop = DEFAULT_LOOP; + options.start.a.k = DEFAULT_K; options.start.a.m = DEFAULT_MATRIX_START; options.start.a.n = DEFAULT_MATRIX_START; + options.stop.a.k = DEFAULT_K; options.stop.a.m = DEFAULT_MATRIX_STOP; options.stop.a.n = DEFAULT_MATRIX_STOP; + options.start.b.k = DEFAULT_K; options.start.b.m = DEFAULT_MATRIX_START; options.start.b.n = DEFAULT_MATRIX_START; + options.stop.b.k = DEFAULT_K; options.stop.b.m = DEFAULT_MATRIX_STOP; options.stop.b.n = DEFAULT_MATRIX_STOP; options.step = DEFAULT_STEP; @@ -182,7 +192,7 @@ int main(int argc, char **argv) { options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options, + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas_perf_test(); return 0; @@ -255,6 +265,11 @@ int main(int argc, char **argv) { case 's': options.step = atoi(optarg); break; case 'w': options.warm_up_n = atoi(optarg); break; case 'i': options.n = atoi(optarg); break; + case 'k': + options.start.a.k = options.stop.a.k = + options.start.b.k = options.stop.b.k = + atoi(optarg); + break; case 'c': out_file = optarg; options.out_file = std::string(out_file); diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6b7b825a7..34c0237871 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -78,6 +78,21 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_TRTRI_ARGS "UU" + /** + * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks + * of the A matrix. a_m subblocks are selected. + */ +static inline int trtri_flop_count(int a_m, int a_n) { + int flop_count = 0; + + for (int i = 0; i < a_m; i++) { + flop_count++; // 1 / A[i,j] + flop_count += (i * (i + 1)); // TRMM FLOPS + flop_count += i; // SCAL FLOPS + } + return flop_count; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -87,18 +102,25 @@ struct trtri_args { typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = - "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s)"; + "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," + "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { + double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trtri.trtri_args << "," - << loop_e_str[options.loop] << "," << trtri_args.A.extent(1) + << loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1) << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << average_time << "," + << gflops << "," + << gflops / average_time + << std::endl; } static void __print_trtri_perf_test_options(options_t options) { @@ -133,19 +155,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -164,19 +193,26 @@ void __do_trtri_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trtri::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrtri::invoke(A); + SerialTrtri::invoke(A); + } + // Fence after each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -241,16 +277,22 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_blas_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBlasTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -287,16 +329,23 @@ void __do_trtri_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trtri_functor); - Kokkos::fence(); + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", - Kokkos::RangePolicy(0, n), - parallel_batched_trtri_functor); - Kokkos::fence(); + + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrtri", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trtri_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trtri_output_csv_row(options, trtri_args, timer.seconds()); return; @@ -345,7 +394,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { trtri_args.uplo = options.blas_args.trtri.trtri_args.c_str()[0]; trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1]; - trtri_args.A = vta("trtri_args.A", options.n, dim.a.m, dim.a.n); + trtri_args.A = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n); host_A = Kokkos::create_mirror_view(trtri_args.A); Kokkos::fill_random(trtri_args.A, rand_pool, @@ -355,7 +404,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -367,7 +416,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -378,7 +427,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trtri_args.diag == 'U' || trtri_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -408,8 +457,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trtri_args = __do_setup( options, cur_dims); fn(options, trtri_args); diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index b493c244d8..6c95960e25 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -122,7 +122,7 @@ static void __print_help_blas3_perf_test() { printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", - DEFAULT_VECTOR_LEN); + DEFAULT_K); printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 70f7664679..79b58dc7d8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -72,6 +72,26 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { #define DEFAULT_TRMM_ARGS "LUNU" #define DEFAULT_TRMM_ALPHA 1.0 +/** + * The KokkosBatched::SerialTrmm implementation performs dot products on + * non-zero elements of the triangular matrices. The flop calculation below + * assumes KokkosBatched::SerialTrmm is being used. Since the dot products + * do a multiply and add we can calculate the flops for any element in the last + * column of the LHS to be 2*columns_LHS, any element in the last-1 column of + * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the LHS + * giving us this flop count: + * flops = columns_LHS * (columns_LHS + 1) + * flops = (flops / 2) * 2 + * flops = flops * rows_LHS + */ +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + if (side == 'L' || side == 'l') { + return (a_n * (a_n + 1)) * a_m; + } else { + return (b_n * (b_n + 1)) * b_m; + } +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -83,19 +103,28 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { + double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, + trmm_args.B.extent(1), trmm_args.B.extent(2), + trmm_args.A.extent(1), trmm_args.A.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(1) + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) + << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << std::endl; + << time_in_seconds / options.n << "," + << gflops << "," + << gflops / average_time + << std::endl; } static void __print_trmm_perf_test_options(options_t options) { @@ -131,24 +160,30 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { STATUS; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, + &trmm_args.diag, trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -167,21 +202,28 @@ void __do_trmm_serial_batched_template(options_t options, Kokkos::Timer timer; using tag = Algo::Trmm::Unblocked; - for (uint32_t i = 0; i < warm_up_n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < warm_up_n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); + for (uint32_t j = 0; j < n; ++j) { + for (int i = 0; i < options.start.a.k; ++i) { + auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - SerialTrmm::invoke(trmm_args.alpha, A, B); + SerialTrmm::invoke(trmm_args.alpha, A, B); + } + // Fence after submitting each batch operation + Kokkos::fence(); } - Kokkos::fence(); __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -316,16 +358,22 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_blas_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for("parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); #else std::cerr << std::string(__func__) @@ -368,16 +416,22 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < warm_up_n; ++j) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } timer.reset(); - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, n), - parallel_batched_trmm_functor); - Kokkos::fence(); + for (uint32_t j = 0; j < n; ++j) { + Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); + // Fence after each batch operation + Kokkos::fence(); + } __trmm_output_csv_row(options, trmm_args, timer.seconds()); return; @@ -498,8 +552,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.uplo = options.blas_args.trmm.trmm_args.c_str()[1]; trmm_args.trans = options.blas_args.trmm.trmm_args.c_str()[2]; trmm_args.diag = options.blas_args.trmm.trmm_args.c_str()[3]; - trmm_args.A = vta("trmm_args.A", options.n, dim.a.m, dim.a.n); - trmm_args.B = vtb("trmm_args.B", options.n, dim.b.m, dim.b.n); + trmm_args.A = vta("trmm_args.A", dim.a.k, dim.a.m, dim.a.n); + trmm_args.B = vtb("trmm_args.B", dim.b.k, dim.b.m, dim.b.n); trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); @@ -510,7 +564,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 1; i < dim.a.m; i++) { for (int j = 0; j < i; j++) { @@ -522,7 +576,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { // Make A lower triangular // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int& // i) { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < dim.a.m - 1; i++) { for (int j = i + 1; j < dim.a.n; j++) { @@ -533,7 +587,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } if (trmm_args.diag == 'U' || trmm_args.diag == 'u') { - for (uint32_t k = 0; k < options.n; ++k) { + for (int k = 0; k < dim.a.k; ++k) { auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL()); for (int i = 0; i < min_dim; i++) { A(i, i) = scalar_type(1); @@ -566,8 +620,8 @@ void __do_loop_and_invoke(options_t options, for (cur_dims = options.start; cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step) { trmm_args = __do_setup( options, cur_dims); From 3211987c766583f587d3ad9bffca32e3e59d3d18 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 08:41:57 -0700 Subject: [PATCH 02/47] perf_test: Account for complex flop counts --- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 21 ++++++++++++++++--- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 17 +++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 34c0237871..3cacc73739 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -84,12 +84,27 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { */ static inline int trtri_flop_count(int a_m, int a_n) { int flop_count = 0; + int flops_per_div, flops_per_mul, flops_per_add; + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_div = 1; + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply or divide. + flops_per_div = 6; + flops_per_mul = 6; + flops_per_add = 2; + } for (int i = 0; i < a_m; i++) { - flop_count++; // 1 / A[i,j] - flop_count += (i * (i + 1)); // TRMM FLOPS - flop_count += i; // SCAL FLOPS + flop_count += flops_per_div; // 1 / A[i,j] + flop_count += ((i * (i + 1)) / 2) * (flops_per_mul + flops_per_add); // TRMM FLOPS + flop_count += i * flops_per_mul; // SCAL FLOPS } + return flop_count; } diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 79b58dc7d8..077c5b3d80 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -85,11 +85,23 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * flops = flops * rows_LHS */ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + int flops; + if (side == 'L' || side == 'l') { - return (a_n * (a_n + 1)) * a_m; + flops = (a_n * (a_n + 1)) * a_m; } else { - return (b_n * (b_n + 1)) * b_m; + flops = (b_n * (b_n + 1)) * b_m; } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + return flops * 4; } using view_type_3d = @@ -348,6 +360,7 @@ struct parallel_blas_trmm { template void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { +// TODO: Note why this is disabled on CUDA and HIP #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; From 667fee39d51110ee2bcd3b7e1216f0d91eac9685 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:08:10 -0700 Subject: [PATCH 03/47] perf_test: Use flop counts from lapack note 41 --- .../blas/blas/KokkosBlas_trtri_perf_test.hpp | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 3cacc73739..de24a96254 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. */ -static inline int trtri_flop_count(int a_m, int a_n) { +static inline int trtri_impl_flop_count(int a_m, int a_n) { int flop_count = 0; int flops_per_div, flops_per_mul, flops_per_add; @@ -108,6 +108,34 @@ static inline int trtri_flop_count(int a_m, int a_n) { return flop_count; } +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trtri_flop_count(int a_m, int a_n) { + int flops; + int flops_per_mul; + int flops_per_add; + + if (a_m != a_n) { + fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); + exit(255); + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) { + flops_per_mul = 1; + flops_per_add = 1; + } else { + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + flops_per_mul = 6; + flops_per_add = 2; + } + + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; + + return flops; +} + using view_type_3d = Kokkos::View; struct trtri_args { @@ -118,7 +146,7 @@ typedef struct trtri_args trtri_args_t; static std::string trtri_csv_header_str = "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter," - "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; + "total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, @@ -133,7 +161,7 @@ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << average_time << "," - << gflops << "," + << flops << "," << gflops / average_time << std::endl; } From 973afc564f7ba479731773856b9a93f4ddcda647 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:08:18 -0700 Subject: [PATCH 04/47] perf_test: Update flop counts - Use flop counts from lapack note 41 - Fix impl flop counts for side == left --- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 29 ++++++++++++++++--- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 077c5b3d80..a35caad5dd 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -84,11 +84,11 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * flops = (flops / 2) * 2 * flops = flops * rows_LHS */ -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int flops; if (side == 'L' || side == 'l') { - flops = (a_n * (a_n + 1)) * a_m; + flops = (b_m * (b_m + 1)) * b_n; } else { flops = (b_n * (b_n + 1)) * b_m; } @@ -104,6 +104,27 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) return flops * 4; } +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { + int flops; + + if (side == 'L' || side == 'l') { + flops = b_m * b_m * b_n; + } else { + flops = b_n * b_n * b_m; + } + + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return flops; + + // Account for 6 additional flops when complex numbers are used. + // Above we have counted 1 flop for each add and 1 flop for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + return flops * 4; +} + using view_type_3d = Kokkos::View; struct trmm_args { @@ -115,7 +136,7 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, @@ -134,7 +155,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << time_in_seconds / options.n << "," - << gflops << "," + << flops << "," << gflops / average_time << std::endl; } From 8d2868740c2dd960f7bddaac061d3bd5edfd61a9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:27:36 -0700 Subject: [PATCH 05/47] perf_test: Update gemm to optionally use RangePolicy --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 46 ++++++++++++++++++- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f26fbb7287..b66f4c3bd0 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,6 +58,7 @@ #include "KokkosBatched_Util.hpp" //#define GEMM_PERF_TEST_DEBUG +#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); @@ -322,6 +323,24 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { return; } +template +struct parallel_batched_gemm_range_policy { + gemm_args_t gemm_args_; + + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } +}; + template struct parallel_batched_gemm { @@ -375,36 +394,59 @@ template ; + using functor_type = + parallel_batched_gemm_range_policy; +#else using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = parallel_batched_gemm; +#endif uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; +#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) auto league_size = options.start.c.k; +#endif Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); +#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) auto team_size = gemm_args.bp.team_size; auto vector_len = gemm_args.bp.vector_len; +#endif for (uint32_t i = 0; i < warm_up_n; i++) { +#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); +#else Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", policy_type(league_size, team_size, vector_len), parallel_batched_gemm_functor); +#endif + Kokkos::fence(); } - Kokkos::fence(); timer.reset(); for (uint32_t i = 0; i < n; i++) { +#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); +#else Kokkos::parallel_for("parallelBatchedTimedLoopGemm", policy_type(league_size, team_size, vector_len), parallel_batched_gemm_functor); +#endif + Kokkos::fence(); } - Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); From ccbbad3546c4cad67ba1419b35ab9b02f95b1043 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:39:54 -0700 Subject: [PATCH 06/47] perf_test: Update GEMM to output GFLOPs --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b66f4c3bd0..9792c3a061 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -128,15 +128,24 @@ typedef struct gemm_args gemm_args_t; static std::string gemm_csv_header_str = "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_" "dims,C_dims,warm_up_n," - "iter,total_time(s),average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ +// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int __gemm_flop_count(int a_m, int a_n, int b_k) { + return 2 * a_m * b_k * a_n; +} static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; if (experiment_name) algo_name = std::string(experiment_name); + double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), + gemm_args.B.extent(2)); + double gflops = flops / 10e9; + double average_time = time_in_seconds / options.n; + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size @@ -147,7 +156,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) << "," << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," << time_in_seconds / options.n + << time_in_seconds << "," + << time_in_seconds / options.n << "," + << flops << "," + << gflops / average_time << std::endl; } From 274e9289af0838c1b675eb0ec0bc31d2c506fc62 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Feb 2021 14:44:22 -0700 Subject: [PATCH 07/47] perf_test: Update gemm size step --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 9792c3a061..0f1b7f70b6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -959,9 +959,9 @@ void __do_loop_and_invoke(options_t options, cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n && cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n && cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n; - cur_dims.a.m *= options.step, cur_dims.a.n *= options.step, - cur_dims.b.m *= options.step, cur_dims.b.n *= options.step, - cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) { + cur_dims.a.m += options.step, cur_dims.a.n += options.step, + cur_dims.b.m += options.step, cur_dims.b.n += options.step, + cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); fn(options, gemm_args); From 6f4e05bd022a61c1b2c71b8f822fdb2a4165aab1 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Feb 2021 11:42:41 -0700 Subject: [PATCH 08/47] perf_test: Disable KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 0f1b7f70b6..59f5a84803 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,7 +58,7 @@ #include "KokkosBatched_Util.hpp" //#define GEMM_PERF_TEST_DEBUG -#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY +//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); From b066fa9b2a3170e62e33cc686a386749089942b3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Feb 2021 14:32:06 -0700 Subject: [PATCH 09/47] perf_test/blas: Fix GFLOP calculation --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 4 ++-- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index de24a96254..32626cfba5 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -129,7 +129,7 @@ static inline int trtri_flop_count(int a_m, int a_n) { flops_per_mul = 6; flops_per_add = 2; } - + flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul + (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add; @@ -152,7 +152,7 @@ static std::string trtri_csv_header_str = static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); - double gflops = flops / 10e9; + double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << test_e_str[options.test] << "," diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 59f5a84803..29fcace727 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -143,7 +143,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), gemm_args.B.extent(2)); - double gflops = flops / 10e9; + double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index a35caad5dd..9a7f7cc480 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -144,7 +144,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), trmm_args.B.extent(2), trmm_args.A.extent(1), trmm_args.A.extent(2)); - double gflops = flops / 10e9; + double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; options.out[0] << test_e_str[options.test] << "," From 63382d3722d2868258cefffd4b5639a475d35198 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 12 Feb 2021 20:38:12 -0700 Subject: [PATCH 10/47] perf_test/blas/blas3: Add bandwidth metric to trmm --- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 9a7f7cc480..a313eabbaf 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -136,7 +136,7 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, @@ -146,6 +146,23 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, trmm_args.A.extent(1), trmm_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + double min_memory_transactions, max_memory_transactions; + + // Assuming infinite cache size + // We have to read A and B into the cache once and then write + // B back out to main memory once. + min_memory_transactions = 3; + + // Assuming no register or real caching + // We have to go out to memory for every element we read from A and B as well as + // every element we write to B. + // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the + // write to B since this flop count is for one multiply and one add. + if (trmm_args.side == 'l' || trmm_args.side == 'L') + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.); + else + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.); options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," @@ -154,9 +171,11 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << time_in_seconds / options.n << "," + << average_time << "," << flops << "," - << gflops / average_time + << gflops / average_time << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time << "," + << (gbytes_in_matrix * max_memory_transactions) / average_time << std::endl; } From 898794eb3860f897a44a03b77211d06d4d74809a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 15 Feb 2021 16:20:06 -0700 Subject: [PATCH 11/47] perf_test: Handle complex numbers in flop count --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 29fcace727..d6572dfd34 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -133,7 +133,13 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __gemm_flop_count(int a_m, int a_n, int b_k) { - return 2 * a_m * b_k * a_n; + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_k * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + return (2 + 6) * a_m * b_k * a_n; } static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, From f11f9138e9c69a6387b9e8c67c0809d81be7f872 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 15 Feb 2021 16:35:38 -0700 Subject: [PATCH 12/47] perf_test/blas/blas3: Gemm perf_test_updates - Fix batched_serial to use RangePolicy instead of TeamPolicy - Add --use_auto option and optionally use Kokkos::AUTO for team_size and vector_len in gemm. --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 2 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++------ .../blas/blas3/KokkosBlas3_perf_test.cpp | 9 +- 3 files changed, 73 insertions(+), 31 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 4952a8e606..01e368e15c 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -61,6 +61,7 @@ #define DEFAULT_BLAS_ROUTINES "trmm,gemm," #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 +#define DEFAULT_USE_AUTO 0 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -83,6 +84,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; + bool use_auto; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d6572dfd34..b4d55d0e90 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -58,7 +58,6 @@ #include "KokkosBatched_Util.hpp" //#define GEMM_PERF_TEST_DEBUG -//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY // Forward declarations void do_gemm_serial_blas(options_t options); @@ -409,60 +408,32 @@ struct parallel_batched_gemm { template -void __do_gemm_parallel_batched_template(options_t options, - gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - printf("Using RangePolicy!\n"); using policy_type = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; -#else - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - using functor_type = - parallel_batched_gemm; -#endif uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; -#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - auto league_size = options.start.c.k; -#endif Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); -#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; -#endif for (uint32_t i = 0; i < warm_up_n; i++) { -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", policy_type(0, options.start.c.k), parallel_batched_gemm_functor); -#else - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); -#endif Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { -#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY) Kokkos::parallel_for("parallelBatchedTimedLoopGemm", policy_type(0, options.start.c.k), parallel_batched_gemm_functor); -#else - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); -#endif Kokkos::fence(); } @@ -471,6 +442,68 @@ void __do_gemm_parallel_batched_template(options_t options, return; } +template +void __do_gemm_parallel_batched_template(options_t options, + gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + using functor_type = + parallel_batched_gemm; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto league_size = options.start.c.k; + Kokkos::Timer timer; + + if (std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + } + + STATUS; + + functor_type parallel_batched_gemm_functor(gemm_args); + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; + + if (options.blas_args.use_auto) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds()); + + return; +} + template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 6c95960e25..0f1f2b5d07 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -119,6 +119,11 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_VECTOR_LEN); + printf("\t-u, --use_auto={0,1}\n"); + printf("\t\tWhether to use Kokkos::AUTO for vector_len and team_size (Heirarchical parallelism).\n"); + printf("\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size will be used. (default: %d)\n", + DEFAULT_USE_AUTO); + printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", @@ -238,6 +243,7 @@ int main(int argc, char **argv) { options.blas_routines = std::string(DEFAULT_BLAS_ROUTINES); options.blas_args.team_size = DEFAULT_TEAM_SIZE; options.blas_args.vector_len = DEFAULT_VECTOR_LEN; + options.blas_args.use_auto = DEFAULT_USE_AUTO; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; @@ -245,7 +251,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -363,6 +369,7 @@ int main(int argc, char **argv) { break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; + case 'u': options.blas_args.use_auto = atoi(optarg); break; case 'c': out_file = optarg; options.out_file = std::string(out_file); From fb41b4c01582cfab5c88cb39261660030af260c2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 Feb 2021 10:11:21 -0700 Subject: [PATCH 13/47] perf_test/blas/blas3: - Initialize options.blas_args.gemm.beta. - rename --gemm_alpha to --gemm_scalars and accept beta input arg. --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 3 ++- .../blas/blas3/KokkosBlas3_perf_test.cpp | 22 ++++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b4d55d0e90..06d854bc2a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -105,6 +105,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; @@ -963,7 +964,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); gemm_args.alpha = options.blas_args.gemm.alpha; - gemm_args.alpha = options.blas_args.gemm.beta; + gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 0f1f2b5d07..0ec88f42f7 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -63,7 +63,7 @@ static struct option long_options[] = { {"trmm_options", required_argument, 0, 'o'}, {"trmm_alpha", required_argument, 0, 'a'}, {"gemm_options", required_argument, 0, 'g'}, - {"gemm_alpha", required_argument, 0, 'p'}, + {"gemm_scalars", required_argument, 0, 'p'}, {"team_size", required_argument, 0, 'z'}, {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, @@ -104,10 +104,10 @@ static void __print_help_blas3_perf_test() { "%s)\n", DEFAULT_GEMM_ARGS); - printf("\t-p, --gemm_alpha=SCALAR_VALUE\n"); - printf("\t\tGEMM alpha value.\n"); - printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n", - DEFAULT_GEMM_ALPHA); + printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n"); + printf("\t\tGEMM alpha and beta values.\n"); + printf("\t\t\tThe value of alpha and beta in floating point. (default: %lf,%lf)\n", + DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA); printf("\t-z, --team_size=SIZE\n"); printf("\t\tKokkos team size.\n"); @@ -250,8 +250,9 @@ int main(int argc, char **argv) { options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS; options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; + options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -275,14 +276,19 @@ int main(int argc, char **argv) { break; case 'g': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - if (strlen(optarg) != 3) { + if (strlen(optarg) != 2) { __blas3_perf_test_input_error(argv, ret, optarg); } options.blas_args.gemm.gemm_args = optarg; break; case 'p': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); - options.blas_args.gemm.alpha = (default_scalar)atof(optarg); + double alpha, beta; + if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2) + __blas3_perf_test_input_error(argv, ret, optarg); + + options.blas_args.gemm.alpha = static_cast(alpha); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); From a91bd6c9d26f7c8dbddae2b06150f2c3f4bad579 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 16 Feb 2021 10:53:56 -0700 Subject: [PATCH 14/47] perf_test/blas/blas3: Update csv row for --use_auto --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 06d854bc2a..a5dcbbfb0f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -145,7 +145,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); if (experiment_name) algo_name = std::string(experiment_name); + if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), gemm_args.B.extent(2)); @@ -154,8 +157,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size - << "," << gemm_args.bp.vector_len << "," + << options.blas_args.gemm.beta << "," + << ts << "," + << vlen << "," << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) From 0d4fe93f72950903c138438738c0b1b2789679dd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Feb 2021 13:25:11 -0700 Subject: [PATCH 15/47] perf_test/blas/blas3: Add -d option for view allocation --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 3 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 85 ++++++++++++++----- .../blas/blas3/KokkosBlas3_perf_test.cpp | 9 +- 3 files changed, 76 insertions(+), 21 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index 01e368e15c..a2c1e6f6ae 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -62,6 +62,7 @@ #define DEFAULT_TEAM_SIZE 1 #define DEFAULT_VECTOR_LEN 1 #define DEFAULT_USE_AUTO 0 +#define DEFAULT_BATCH_SIZE_LAST_DIM 0 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -84,7 +85,7 @@ struct blas_args { // ADD MORE BLAS3 ROUTINES HERE int team_size; int vector_len; - bool use_auto; + bool use_auto, batch_size_last_dim; // ADD MORE COMMON BLAS3 OPTIONS HERE }; typedef struct blas_args blas_args_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index a5dcbbfb0f..7e86d04a4f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -82,6 +82,8 @@ struct TeamVectorTag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; +struct LastDimTag {}; +struct FirstDimTag {}; // gemm invoke table void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { @@ -150,11 +152,20 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; - double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), - gemm_args.B.extent(2)); - double gflops = flops / 1e9; + double flops; + double gflops; double average_time = time_in_seconds / options.n; + if (options.blas_args.batch_size_last_dim) { + flops = gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.B.extent(1)); + } else { + flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), + gemm_args.B.extent(2)); + } + + gflops = flops / 1e9; + options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," << options.blas_args.gemm.beta << "," @@ -353,7 +364,7 @@ struct parallel_batched_gemm_range_policy { parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { + void operator()(const FirstDimTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); @@ -361,6 +372,16 @@ struct parallel_batched_gemm_range_policy { KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const LastDimTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } }; template void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::RangePolicy; + using policy_type = Kokkos::RangePolicy; + using policy_type_last_dim = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; @@ -427,19 +449,38 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar functor_type parallel_batched_gemm_functor(gemm_args); - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); + if (options.blas_args.batch_size_last_dim) { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type_last_dim(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } } - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); + if (options.blas_args.batch_size_last_dim) { + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type_last_dim(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } + } else { + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); + } } __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -964,9 +1005,15 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k); + gemm_args.B = vtb("gemm_args.B", dim.b.m, dim.b.n, dim.b.k); + gemm_args.C = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); + gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); + gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 0ec88f42f7..72a92a32b1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -67,6 +67,7 @@ static struct option long_options[] = { {"team_size", required_argument, 0, 'z'}, {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, + {"batch_size_last_dim", required_argument, 0, 'd'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -129,6 +130,11 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K); + printf("\t-d, --batch_size_last_dim={0,1}\n"); + printf("\t\tHow to allocate the batch_size in the matrices.\n"); + printf("\t\t\t1 make the batch_size the last dimension, otherwise batch_size is the first dimension (default: %d)\n", + DEFAULT_BATCH_SIZE_LAST_DIM); + printf("\t-l, --loop_type=OPTION\n"); printf("\t\tLoop selection.\n"); printf("\t\t\tValid values for OPTION:\n"); @@ -252,7 +258,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -373,6 +379,7 @@ int main(int argc, char **argv) { options.stop.a.k = options.stop.b.k = options.stop.c.k = atoi(optarg); break; + case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'u': options.blas_args.use_auto = atoi(optarg); break; From 5c729bc9243b903059c2abd1422519079e655d07 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 17 Feb 2021 13:55:15 -0700 Subject: [PATCH 16/47] perf_test/blas/blas3: Update team and team_vector for -d --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 159 ++++++++++++------ 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 7e86d04a4f..3db8f0dc1a 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -77,13 +77,14 @@ void do_gemm_team_vector_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; +struct SerialBatchDim3Tag {}; struct TeamTag {}; +struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; +struct TeamVectorBatchDim3Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; -struct LastDimTag {}; -struct FirstDimTag {}; // gemm invoke table void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { @@ -364,7 +365,7 @@ struct parallel_batched_gemm_range_policy { parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION - void operator()(const FirstDimTag &, const int &i) const { + void operator()(const SerialTag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); @@ -374,7 +375,7 @@ struct parallel_batched_gemm_range_policy { } KOKKOS_INLINE_FUNCTION - void operator()(const LastDimTag &, const int &i) const { + void operator()(const SerialBatchDim3Tag &, const int &i) const { auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); @@ -382,6 +383,15 @@ struct parallel_batched_gemm_range_policy { KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamTag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorTag &, const int &i) const {} + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {} }; template ::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const MemberType &member) const { auto i = member.league_rank(); @@ -414,6 +435,18 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const TeamBatchDim3Tag &, const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, + svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const MemberType &member) const { auto team_idx = member.league_rank(); @@ -430,14 +463,30 @@ struct parallel_batched_gemm { svB, gemm_args_.beta, svC); } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const MemberType &member) const { + auto team_idx = member.league_rank(); + auto svA = + Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svB = + Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), team_idx); + auto svC = + Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), team_idx); + + KokkosBatched::TeamVectorGemm::invoke(member, + gemm_args_.alpha, svA, + svB, gemm_args_.beta, + svC); + } }; template void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::RangePolicy; - using policy_type_last_dim = Kokkos::RangePolicy; + using policy_type = Kokkos::RangePolicy; using functor_type = parallel_batched_gemm_range_policy; @@ -449,38 +498,19 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar functor_type parallel_batched_gemm_functor(gemm_args); - if (options.blas_args.batch_size_last_dim) { - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type_last_dim(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } - } else { - for (uint32_t i = 0; i < warm_up_n; i++) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } + for (uint32_t i = 0; i < warm_up_n; i++) { + Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); } - if (options.blas_args.batch_size_last_dim) { - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type_last_dim(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } - } else { - timer.reset(); - for (uint32_t i = 0; i < n; i++) { - Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); - Kokkos::fence(); - } + timer.reset(); + for (uint32_t i = 0; i < n; i++) { + Kokkos::parallel_for("parallelBatchedTimedLoopGemm", + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); + Kokkos::fence(); } __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -503,8 +533,8 @@ void __do_gemm_parallel_batched_template(options_t options, auto league_size = options.start.c.k; Kokkos::Timer timer; - if (std::is_same::value) { - return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + if (std::is_same::value || std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); } STATUS; @@ -1089,41 +1119,66 @@ void do_gemm_serial_batched_blocked(options_t options) { void do_gemm_serial_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_serial_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } void do_gemm_team_batched_blocked_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); return; } From 6da5a7b637552ba325a1d42c0561cf8b294b362a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 18 Feb 2021 13:35:09 -0700 Subject: [PATCH 17/47] perf_test/blas/blas3: Add simd gemm as experiment6. --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 98 ++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 3db8f0dc1a..f24a1091b7 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1023,6 +1023,99 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment6 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment6(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + auto i = member.league_rank(); + auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + + // Uses two serial for-loops internally + KokkosBatched::TeamVectorGemm::invoke( + member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + } +}; + +template +void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using vector_type = KokkosBatched::Vector, vl>; + using internal_vector_type = KokkosBatched::Vector, il>; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + using functor_type = + parallel_batched_gemm_experiment6; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + view_type A((scalar_type *)A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + internal_vector_view_type A_vector_internal(A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1)); + + vector_view_type B_vector("B_vector", simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + view_type B((scalar_type *)B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + internal_vector_view_type B_vector_internal(B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1)); + + vector_view_type C_vector("C_vector", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1)); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + functor_type experiment6_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment6Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); + return; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { @@ -1195,7 +1288,7 @@ void do_gemm_experiment_parallel(options_t options) { using TransBType = Trans::NoTranspose; using BlockingType = Algo::Gemm::Unblocked; - __do_loop_and_invoke( +/* __do_loop_and_invoke( options, __do_gemm_parallel_experiment1); __do_loop_and_invoke( @@ -1209,6 +1302,9 @@ void do_gemm_experiment_parallel(options_t options) { BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); */ + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment6); } From 441c4d4a6bfaf16b5e9ae28e8e0795ccea3b1c21 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 23 Feb 2021 11:45:10 -0700 Subject: [PATCH 18/47] perf_test/blas/blas3: Add experiment7 (Simd + TeamGemm) --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 105 +++++++++++++++++- 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f24a1091b7..86b46e5adb 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1116,6 +1116,106 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { return; } +template +class parallel_batched_gemm_experiment7 { + private: + SimdViewType &A, &B, &C; + gemm_args_t gemm_args; + + public: + parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B, + SimdViewType &_C, gemm_args_t _gemm_args) + : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) { + auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + }); + } +}; + +template +void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) { + using execution_space = typename device_type::execution_space; + using policy_type = Kokkos::TeamPolicy; + using member_type = typename policy_type::member_type; + + // Construct the vector type + using scalar_type = typename view_type_3d::value_type; + constexpr int vl = + KokkosBatched::DefaultVectorLength::value; + constexpr int il = + KokkosBatched::DefaultInternalVectorLength::value; + using vector_type = KokkosBatched::Vector, vl>; + using internal_vector_type = KokkosBatched::Vector, il>; + using view_type = Kokkos::View; + using vector_view_type = Kokkos::View; + using internal_vector_view_type = Kokkos::View; + + uint32_t warm_up_n = options.warm_up_n; + uint32_t n = options.n; + auto k = options.start.c.k; + Kokkos::Timer timer; + auto simd_batch_size = k / vl + (k % vl > 0); + STATUS; + + // Construct matrices + vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); + + vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + internal_vector_view_type B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); + + vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); + + uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); + Kokkos::fence(); + + using functor_type = + parallel_batched_gemm_experiment7; + functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); + + //using functor_type = + // parallel_batched_gemm_experiment7; + // functor_type experiment7_functor(A, B, C, gemm_args); + + for (uint32_t i = 0; i < warm_up_n; ++i) { + Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); + //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); + Kokkos::fence(); + } + + timer.reset(); + for (uint32_t i = 0; i < n; ++i) { + Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm", + policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); + //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); + Kokkos::fence(); + } + + __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7"); + return; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { @@ -1302,9 +1402,12 @@ void do_gemm_experiment_parallel(options_t options) { BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment5); */ + BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment6); */ + __do_loop_and_invoke( + options, __do_gemm_parallel_experiment7); } From 3c805868b780334ab037d2ebc47ce711f1246cc5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 12:51:37 -0700 Subject: [PATCH 19/47] perf_test/blas/blas3: replace experiment7 with batched_team_simd --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 5 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 395 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 19 +- 3 files changed, 261 insertions(+), 158 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index a2c1e6f6ae..b398ed62aa 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -137,6 +137,8 @@ typedef enum TEST { BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, BATCHED_TEAM_VECTOR_BLOCKED, + BATCHED_TEAM_SIMD, + BATCHED_TEAM_SIMD_BLOCKED, // ADD MORE TEST TYPES HERE EXPERIMENT, TEST_N @@ -145,7 +147,8 @@ typedef enum TEST { static std::string test_e_str[TEST_N]{ "blas", "batched_serial", "batched_serial_blocked", "batched_team", "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 86b46e5adb..91bf649fed 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -74,6 +74,8 @@ void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); void do_gemm_team_vector_batched_blocked_parallel(options_t options); +void do_gemm_team_simd_batched_parallel(options_t options); +void do_gemm_team_simd_batched_blocked_parallel(options_t options); void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; @@ -82,6 +84,10 @@ struct TeamTag {}; struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; struct TeamVectorBatchDim3Tag {}; +struct TeamSimdTag {}; +struct TeamSimdBatchDim4Tag {}; +// TODO: struct SerialSimdTag {}; +// TODO: struct SerialSimdBatchDim4Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -93,6 +99,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_serial_batched, do_gemm_serial_batched_blocked, // Serial NULL, NULL, // Team NULL, NULL, // TeamVector + NULL, NULL, // TeamSimd NULL // Serial Experiment }, { @@ -102,6 +109,8 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector + do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_blocked_parallel, // TeamSimd do_gemm_experiment_parallel // Parallel Experiment }}; @@ -112,6 +121,18 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { using view_type_3d = Kokkos::View; +using view_type_4d = Kokkos::View; + +// Construct the vector type +using memory_space = typename default_device::execution_space::memory_space; +constexpr int simd_vector_size = + KokkosBatched::DefaultVectorLength::value; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, simd_vector_size>; +using internal_vector_type = KokkosBatched::Vector, simd_internal_vector_size>; +using vector_view_type_3d = Kokkos::View; +using internal_vector_view_type_4d = Kokkos::View; struct batched_params { int team_size; @@ -119,12 +140,58 @@ struct batched_params { }; typedef struct batched_params batched_params_t; +/** + * @brief struct gemm_simd_args encapsulates the data types required + * for allocating and passing a single matrix to the KokkosBatched gemm + * kernels. To invoke gemm on a batch of matrices, three instances of this + * struct are required, one for each matrix, A, B, and C. + * + * @var vec_3d: 3-rank view type used for allocating the underlying data. + * A reference must be kept to this object to ensure the + * data is not free'd by the C++ runtime. + * @var mat_4d: 4-rank view type used for populating the simd view with + random values. + * @var ivec_4d: 4-rank view type used for passing to math kernels. This + * view type is used for leveraging simd instructions on + * both the host and device. + */ +struct gemm_simd_args { + vector_view_type_3d vec_3d; + view_type_4d mat_4d; + internal_vector_view_type_4d ivec_4d; +}; +typedef struct gemm_simd_args gemm_simd_args_t; + +/** + * @brief struct gemm_args are common arguments passed to + * both gemm implementations in the KokkosBlas and KokkosBatched + * namespaces throughout these performance tests. + * + * @var transA: transpose type for A matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var transB: transpose type for B matrix. + * supported types: 'n' - no transpose, 't' - transpose. + * unsupported types: 'c' - conjugate transpose. + * @var alpha: scalar applied to A matrix. + * @var beta: scalar applied to B matrix. + * @var A: 3-rank view type used in all non-simd tests. + * @var B: 3-rank view type used in all non-simd tests. + * @var C: 3-rank view type used in all non-simd tests. + * @var bp: team_size and vector_length for tests that use Kokkos::TeamPolicy. + * @var Av: 3-rank and 4-rank vector view types for simd tests. + * @var Bv: 3-rank and 4-rank vector view types for simd tests. + * @var Cv: 3-rank and 4-rank vector view types for simd tests. + */ struct gemm_args { char transA, transB; default_scalar alpha; default_scalar beta; view_type_3d A, B, C; batched_params_t bp; + // Below are matrices for simd tests + gemm_simd_args_t Av, Bv, Cv; + matrix_dims_t dims; }; typedef struct gemm_args gemm_args_t; @@ -135,15 +202,26 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __gemm_flop_count(int a_m, int a_n, int b_k) { +static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { if (std::is_same::value || std::is_same::value || std::is_same::value) - return 2 * a_m * b_k * a_n; + return 2 * a_m * b_n * a_n; else // For complex, we need to count 2 flops for each add and 6 flops for each multiply. - return (2 + 6) * a_m * b_k * a_n; + return (2 + 6) * a_m * b_n * a_n; } + +static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) { + std::string x = "x"; + std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); + + if (options.blas_args.batch_size_last_dim) + return ret + x + std::to_string(dim.k); + else + return std::to_string(dim.k) + x + ret; +} + static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { @@ -157,13 +235,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - if (options.blas_args.batch_size_last_dim) { - flops = gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1), - gemm_args.B.extent(1)); - } else { - flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2), - gemm_args.B.extent(2)); - } + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, + gemm_args.dims.b.n); gflops = flops / 1e9; @@ -172,12 +245,11 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, << options.blas_args.gemm.beta << "," << ts << "," << vlen << "," - << loop_e_str[options.loop] << "," << gemm_args.A.extent(0) - << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2) - << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1) - << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0) - << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2) - << "," << options.warm_up_n << "," << options.n << "," + << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," << time_in_seconds / options.n << "," << flops << "," @@ -385,13 +457,34 @@ struct parallel_batched_gemm_range_policy { } KOKKOS_INLINE_FUNCTION - void operator()(const TeamTag &, const int &i) const {} + void operator()(const TeamTag &, const int &i) const { + Kokkos::abort("TeamTag not supported using RangePolicy."); + } + KOKKOS_INLINE_FUNCTION - void operator()(const TeamBatchDim3Tag &, const int &i) const {} + void operator()(const TeamBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); + } + KOKKOS_INLINE_FUNCTION - void operator()(const TeamVectorTag &, const int &i) const {} + void operator()(const TeamVectorTag &, const int &i) const { + Kokkos::abort("TeamVectorTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamVectorBatchDim3Tag &, const int &i) const { + Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdTag &, const int &i) const { + Kokkos::abort("TeamSimdTag not supported using RangePolicy."); + } + KOKKOS_INLINE_FUNCTION - void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {} + void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { + Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); + } }; template ::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + }); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const { + auto i = member.league_rank(); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + }); + } }; template ::value || std::is_same::value) { return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); } + if (std::is_same::value || std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len? + } + STATUS; functor_type parallel_batched_gemm_functor(gemm_args); - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; if (options.blas_args.use_auto) { for (uint32_t i = 0; i < warm_up_n; i++) { @@ -965,7 +1087,7 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, vl>; + using simd_type = KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -1051,6 +1173,7 @@ class parallel_batched_gemm_experiment6 { template void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { +#if 0 using execution_space = typename device_type::execution_space; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; @@ -1061,8 +1184,6 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { KokkosBatched::DefaultVectorLength::value; constexpr int il = KokkosBatched::DefaultInternalVectorLength::value; - using vector_type = KokkosBatched::Vector, vl>; - using internal_vector_type = KokkosBatched::Vector, il>; using view_type = Kokkos::View; using vector_view_type = Kokkos::View; using internal_vector_view_type = Kokkos::View; @@ -1113,112 +1234,13 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { } __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6"); - return; -} - -template -class parallel_batched_gemm_experiment7 { - private: - SimdViewType &A, &B, &C; - gemm_args_t gemm_args; - - public: - parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B, - SimdViewType &_C, gemm_args_t _gemm_args) - : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const MemberType &member) const { - auto i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) { - auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - - KokkosBatched::TeamGemm::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); - }); - } -}; - -template -void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) { - using execution_space = typename device_type::execution_space; - using policy_type = Kokkos::TeamPolicy; - using member_type = typename policy_type::member_type; - - // Construct the vector type - using scalar_type = typename view_type_3d::value_type; - constexpr int vl = - KokkosBatched::DefaultVectorLength::value; - constexpr int il = - KokkosBatched::DefaultInternalVectorLength::value; - using vector_type = KokkosBatched::Vector, vl>; - using internal_vector_type = KokkosBatched::Vector, il>; - using view_type = Kokkos::View; - using vector_view_type = Kokkos::View; - using internal_vector_view_type = Kokkos::View; - - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - auto k = options.start.c.k; - Kokkos::Timer timer; - auto simd_batch_size = k / vl + (k % vl > 0); - STATUS; - - // Construct matrices - vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size); - - vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - internal_vector_view_type B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size); - - vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size); - - uint64_t seed = Kokkos::Impl::clock_tic(); - Kokkos::Random_XorShift64_Pool rand_pool(seed); - Kokkos::fill_random(A, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fill_random(B, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fill_random(C, rand_pool, Kokkos::rand, scalar_type>::max()); - Kokkos::fence(); - - using functor_type = - parallel_batched_gemm_experiment7; - functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args); - - //using functor_type = - // parallel_batched_gemm_experiment7; - // functor_type experiment7_functor(A, B, C, gemm_args); - - for (uint32_t i = 0; i < warm_up_n; ++i) { - Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm", - policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); - //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); - Kokkos::fence(); - } - - timer.reset(); - for (uint32_t i = 0; i < n; ++i) { - Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm", - policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor); - //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor); - Kokkos::fence(); - } - - __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7"); +#endif return; } /*************************** Internal setup fns **************************/ template -gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { +gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { using execution_space = typename device_type::execution_space; gemm_args_t gemm_args; @@ -1226,32 +1248,83 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; + gemm_args.dims = dims; gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - if (options.blas_args.batch_size_last_dim) { - gemm_args.A = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k); - gemm_args.B = vtb("gemm_args.B", dim.b.m, dim.b.n, dim.b.k); - gemm_args.C = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k); + if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) { + // Calculate the batch size for simd views + auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + + // Reference gemm simd arguments for allocating A, B, and C matrices + gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; + + if (options.blas_args.batch_size_last_dim) { + // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views) + A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size); + A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); + + B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size); + B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); + + C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size); + C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + + } else { + // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views) + A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n); + A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n); + B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n); + C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size); + } + + // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments + Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool, + Kokkos::rand, + scalar_type>::max()); } else { - gemm_args.A = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n); - gemm_args.B = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n); - gemm_args.C = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n); + if (options.blas_args.batch_size_last_dim) { + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + } else { + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + } + + Kokkos::fill_random(gemm_args.A, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.B, rand_pool, + Kokkos::rand, + scalar_type>::max()); + Kokkos::fill_random(gemm_args.C, rand_pool, + Kokkos::rand, + scalar_type>::max()); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fill_random(gemm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, - Kokkos::rand, - scalar_type>::max()); - return gemm_args; } @@ -1265,7 +1338,8 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() << std::endl; + << ", DEVICE:" << typeid(default_device).name() + << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1375,6 +1449,34 @@ void do_gemm_team_vector_batched_parallel(options_t options) { return; } +void do_gemm_team_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_team_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + + +// Blocked algo not yet implemented for TeamVectorGemm. /* void do_gemm_team_vector_batched_blocked_parallel(options_t options) { STATUS; __do_loop_and_invoke( @@ -1388,7 +1490,7 @@ void do_gemm_experiment_parallel(options_t options) { using TransBType = Trans::NoTranspose; using BlockingType = Algo::Gemm::Unblocked; -/* __do_loop_and_invoke( + __do_loop_and_invoke( options, __do_gemm_parallel_experiment1); __do_loop_and_invoke( @@ -1405,9 +1507,6 @@ void do_gemm_experiment_parallel(options_t options) { BlockingType, default_device>); __do_loop_and_invoke( options, __do_gemm_parallel_experiment6); */ - __do_loop_and_invoke( - options, __do_gemm_parallel_experiment7); } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 72a92a32b1..17aac3d526 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -74,7 +74,7 @@ static void __print_help_blas3_perf_test() { printf("Options:\n"); printf("\t-h, --help\n"); - printf("\t\tPrint this help menu.\n\n"); + printf("\t\tPrint this help menu.\n"); printf("\t-t, --test=OPTION\n"); printf("\t\tAlgorithm selection.\n"); @@ -145,7 +145,7 @@ static void __print_help_blas3_perf_test() { printf("%c[1m", 27); printf("\t\t\t\tparallel:"); printf("%c[0m", 27); - printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n"); + printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n"); printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n"); printf( @@ -153,7 +153,7 @@ static void __print_help_blas3_perf_test() { "(start)\n"); printf( "\t\t\tValid values for M and N are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START); @@ -163,7 +163,7 @@ static void __print_help_blas3_perf_test() { "(stop)\n"); printf( "\t\t\tValid dimension values are any non-negative 32-bit integers. " - "(default: %dx%d,%dx%d,%dx%d)\n\n", + "(default: %dx%d,%dx%d,%dx%d)\n", DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP); @@ -171,34 +171,34 @@ static void __print_help_blas3_perf_test() { printf("\t\tMatrix step selection.\n"); printf( "\t\t\tValid value for K is any non-negative 32-bit integer. (default: " - "%d)\n\n", + "%d)\n", DEFAULT_STEP); printf("\t-w, --warm_up_loop=LOOP\n"); printf("\t\tWarm up loop selection. (untimed)\n"); printf( "\t\t\tValid value for LOOP is any non-negative 32-bit integer that's <= " - "ITER. (default: %d)\n\n", + "ITER. (default: %d)\n", DEFAULT_WARM_UP_N); printf("\t-i, --iter=ITER\n"); printf("\t\tIteration selection. (timed)\n"); printf( "\t\t\tValid value for ITER is any non-negative 32-bit integer. " - "(default: %d)\n\n", + "(default: %d)\n", DEFAULT_N); printf("\t-c, --csv=/path/to/file.csv\n"); printf("\t\tCsv output file selection.\n"); printf( "\t\t\tValid value for /path/to/file.csv is any valid file name. " - "(default: stdout)\n\n"); + "(default: stdout)\n"); printf("\t-r, --routines=ROUTINES\n"); printf("\t\tRoutine selection.\n"); printf( "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " - "delimited by a comma. (default: %s)\n\n", + "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); } @@ -250,6 +250,7 @@ int main(int argc, char **argv) { options.blas_args.team_size = DEFAULT_TEAM_SIZE; options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.use_auto = DEFAULT_USE_AUTO; + options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; From b5c7b88b1682e9eeb19d8358582f5b5df042a340 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 12:52:01 -0700 Subject: [PATCH 20/47] perf_test/batched: Add README.md --- perf_test/batched/README.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 perf_test/batched/README.md diff --git a/perf_test/batched/README.md b/perf_test/batched/README.md new file mode 100644 index 0000000000..ca5920ae39 --- /dev/null +++ b/perf_test/batched/README.md @@ -0,0 +1 @@ +Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`. From d9e9d04d2005334ed2638b37d149cc67fa43eee7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:14:04 -0700 Subject: [PATCH 21/47] perf_test/blas/blas3: Add last gemm test types - Added serial simd test types. - Added serial compact mkl test type. --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 29 ++++-- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++++---- 2 files changed, 97 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index b398ed62aa..d37f11eea9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -119,20 +119,28 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; /** * @var BLAS: Run the blas routine through the - * KokkosBlas namespace. + * KokkosBlas namespace. * @var BATCHED_SERIAL{_BLOCKED}: Run the serial blas routine through the * KokkosBatched namespace. + * @var BATCHED_SERIAL_SIMD{_BLOCKED}: Run the serial blas routine through the + * KokkosBatched namespace using SIMD views. + * @var BATCHED_SERIAL_COMPACT_MKL: Run the serial blas mkl routine through + * the KokkosBatched namespace. * @var BATCHED_TEAM{_BLOCKED}: Run the team blas routine through the - * KokkosBatched namespace. + * KokkosBatched namespace. * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through - * the KokkosBatched namespace. - * @var EXPERIMENT: Run the blas routine as a custom - * experiment. + * the KokkosBatched namespace. + * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through the + * KokkosBatched namespace using SIMD views. + * @var EXPERIMENT: Run the blas routine as a custom experiment. */ typedef enum TEST { BLAS, BATCHED_SERIAL, BATCHED_SERIAL_BLOCKED, + BATCHED_SERIAL_SIMD, + BATCHED_SERIAL_SIMD_BLOCKED, + BATCHED_SERIAL_COMPACT_MKL, BATCHED_TEAM, BATCHED_TEAM_BLOCKED, BATCHED_TEAM_VECTOR, @@ -145,10 +153,13 @@ typedef enum TEST { } test_e; static std::string test_e_str[TEST_N]{ - "blas", "batched_serial", "batched_serial_blocked", "batched_team", - "batched_team_blocked", "batched_team_vector", - "batched_team_vector_blocked", "batched_team_simd", - "batched_team_simd_blocked", + "blas", + "batched_serial", "batched_serial_blocked", + "batched_serial_simd", "batched_serial_simd_blocked", + "batched_serial_compact_mkl", + "batched_team", "batched_team_blocked", + "batched_team_vector", "batched_team_vector_blocked", + "batched_team_simd", "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 91bf649fed..5fffd02dc8 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -70,6 +70,9 @@ void do_gemm_serial_batched_blocked(options_t options); // invocation! void do_gemm_serial_batched_parallel(options_t options); void do_gemm_serial_batched_blocked_parallel(options_t options); +void do_gemm_serial_simd_batched_parallel(options_t options); +void do_gemm_serial_simd_batched_blocked_parallel(options_t options); +void do_gemm_serial_batched_compact_mkl_parallel(options_t options); void do_gemm_team_batched_parallel(options_t options); void do_gemm_team_batched_blocked_parallel(options_t options); void do_gemm_team_vector_batched_parallel(options_t options); @@ -104,8 +107,11 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { }, { NULL, // BLAS - do_gemm_serial_batched_parallel, - do_gemm_serial_batched_blocked_parallel, // Serial + do_gemm_serial_batched_parallel, // Serial + do_gemm_serial_batched_blocked_parallel, + do_gemm_serial_simd_batched_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector @@ -488,7 +494,7 @@ struct parallel_batched_gemm_range_policy { }; template + class BlockingType, class AlgoMode = void> struct parallel_batched_gemm { gemm_args_t gemm_args_; @@ -582,7 +588,7 @@ struct parallel_batched_gemm { auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane); auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane); - KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); }); } @@ -594,7 +600,7 @@ struct parallel_batched_gemm { auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - KokkosBatched::TeamGemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); }); } }; @@ -636,14 +642,14 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar } template + class device_type, class algo_mode = void> void __do_gemm_parallel_batched_template(options_t options, gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; @@ -702,7 +708,7 @@ void __do_gemm_parallel_batched_template(options_t options, return; } -template +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -714,19 +720,19 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); } else if (a == 'N' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, gemm_args); //} else if (a == 'T' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -1410,6 +1416,61 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) { return; } +void do_gemm_serial_simd_batched_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { + STATUS; + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + return; +} + +void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { + STATUS; +#if \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \ + defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + if (options.blas_args.batch_size_last_dim) + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); + else + __do_loop_and_invoke( + options, __do_gemm_parallel_batched); +#else + #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl; + #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl; + #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." << std::endl; + #endif +#endif + return; +} + void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) @@ -1454,11 +1515,11 @@ void do_gemm_team_simd_batched_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); return; } @@ -1467,11 +1528,11 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + default_device, Mode::Team>); return; } From fa23cf75b5b4da16a468dfa9640b8bc84b5d5614 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:19:04 -0700 Subject: [PATCH 22/47] perf_test/blas/blas3: Apply clang-format --- perf_test/blas/blas3/KokkosBlas3_common.hpp | 19 +- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 480 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 83 +-- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 116 +++-- 4 files changed, 405 insertions(+), 293 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index d37f11eea9..a991efe61e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -130,9 +130,10 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"}; * KokkosBatched namespace. * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through * the KokkosBatched namespace. - * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through the - * KokkosBatched namespace using SIMD views. - * @var EXPERIMENT: Run the blas routine as a custom experiment. + * @var BATCHED_TEAM_SIMD{_BLOCKED}: Run the team vector blas routine through + * the KokkosBatched namespace using SIMD views. + * @var EXPERIMENT: Run the blas routine as a custom + * experiment. */ typedef enum TEST { BLAS, @@ -153,13 +154,11 @@ typedef enum TEST { } test_e; static std::string test_e_str[TEST_N]{ - "blas", - "batched_serial", "batched_serial_blocked", - "batched_serial_simd", "batched_serial_simd_blocked", - "batched_serial_compact_mkl", - "batched_team", "batched_team_blocked", - "batched_team_vector", "batched_team_vector_blocked", - "batched_team_simd", "batched_team_simd_blocked", + "blas", "batched_serial", "batched_serial_blocked", "batched_serial_simd", + "batched_serial_simd_blocked", "batched_serial_compact_mkl", "batched_team", + "batched_team_blocked", "batched_team_vector", + "batched_team_vector_blocked", "batched_team_simd", + "batched_team_simd_blocked", // ADD MORE TEST TYPES HERE "experiment"}; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 5fffd02dc8..3e55a85799 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -106,16 +106,16 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { NULL // Serial Experiment }, { - NULL, // BLAS - do_gemm_serial_batched_parallel, // Serial + NULL, // BLAS + do_gemm_serial_batched_parallel, // Serial do_gemm_serial_batched_blocked_parallel, do_gemm_serial_simd_batched_parallel, - do_gemm_serial_simd_batched_blocked_parallel, - do_gemm_serial_batched_compact_mkl_parallel, + do_gemm_serial_simd_batched_blocked_parallel, + do_gemm_serial_batched_compact_mkl_parallel, do_gemm_team_batched_parallel, do_gemm_team_batched_blocked_parallel, // Team do_gemm_team_vector_batched_parallel, NULL, // TeamVector - do_gemm_team_simd_batched_parallel, + do_gemm_team_simd_batched_parallel, do_gemm_team_simd_batched_blocked_parallel, // TeamSimd do_gemm_experiment_parallel // Parallel Experiment }}; @@ -123,22 +123,29 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = { /*************************** Test types and defaults **************************/ #define DEFAULT_GEMM_ARGS "NN" #define DEFAULT_GEMM_ALPHA 1.0 -#define DEFAULT_GEMM_BETA 1.0 +#define DEFAULT_GEMM_BETA 1.0 using view_type_3d = Kokkos::View; -using view_type_4d = Kokkos::View; +using view_type_4d = + Kokkos::View; // Construct the vector type using memory_space = typename default_device::execution_space::memory_space; constexpr int simd_vector_size = KokkosBatched::DefaultVectorLength::value; -constexpr int simd_internal_vector_size = - KokkosBatched::DefaultInternalVectorLength::value; -using vector_type = KokkosBatched::Vector, simd_vector_size>; -using internal_vector_type = KokkosBatched::Vector, simd_internal_vector_size>; -using vector_view_type_3d = Kokkos::View; -using internal_vector_view_type_4d = Kokkos::View; +constexpr int simd_internal_vector_size = + KokkosBatched::DefaultInternalVectorLength::value; +using vector_type = KokkosBatched::Vector, + simd_vector_size>; +using internal_vector_type = + KokkosBatched::Vector, + simd_internal_vector_size>; +using vector_view_type_3d = + Kokkos::View; +using internal_vector_view_type_4d = + Kokkos::View; struct batched_params { int team_size; @@ -151,14 +158,14 @@ typedef struct batched_params batched_params_t; * for allocating and passing a single matrix to the KokkosBatched gemm * kernels. To invoke gemm on a batch of matrices, three instances of this * struct are required, one for each matrix, A, B, and C. - * + * * @var vec_3d: 3-rank view type used for allocating the underlying data. * A reference must be kept to this object to ensure the * data is not free'd by the C++ runtime. * @var mat_4d: 4-rank view type used for populating the simd view with random values. * @var ivec_4d: 4-rank view type used for passing to math kernels. This - * view type is used for leveraging simd instructions on + * view type is used for leveraging simd instructions on * both the host and device. */ struct gemm_simd_args { @@ -184,11 +191,12 @@ typedef struct gemm_simd_args gemm_simd_args_t; * @var A: 3-rank view type used in all non-simd tests. * @var B: 3-rank view type used in all non-simd tests. * @var C: 3-rank view type used in all non-simd tests. - * @var bp: team_size and vector_length for tests that use Kokkos::TeamPolicy. + * @var bp: team_size and vector_length for tests that use + * Kokkos::TeamPolicy. * @var Av: 3-rank and 4-rank vector view types for simd tests. * @var Bv: 3-rank and 4-rank vector view types for simd tests. * @var Cv: 3-rank and 4-rank vector view types for simd tests. - */ + */ struct gemm_args { char transA, transB; default_scalar alpha; @@ -207,19 +215,22 @@ static std::string gemm_csv_header_str = "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)"; /*************************** Internal helper fns **************************/ -// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { - if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return 2 * a_m * b_n * a_n; - else - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. - return (2 + 6) * a_m * b_n * a_n; + if (std::is_same::value || + std::is_same::value || + std::is_same::value) + return 2 * a_m * b_n * a_n; + else + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. + return (2 + 6) * a_m * b_n * a_n; } -static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) { - std::string x = "x"; +static inline std::string __gemm_output_dim_string(options_t options, + matrix_dim_t dim) { + std::string x = "x"; std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n); if (options.blas_args.batch_size_last_dim) @@ -232,8 +243,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double time_in_seconds, const char *experiment_name = nullptr) { std::string algo_name = test_e_str[options.test]; - std::string ts = std::to_string(gemm_args.bp.team_size); - std::string vlen = std::to_string(gemm_args.bp.vector_len); + std::string ts = std::to_string(gemm_args.bp.team_size); + std::string vlen = std::to_string(gemm_args.bp.vector_len); if (experiment_name) algo_name = std::string(experiment_name); if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO"; @@ -241,26 +252,22 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, - gemm_args.dims.b.n); + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, + gemm_args.dims.a.n, + gemm_args.dims.b.n); gflops = flops / 1e9; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," - << ts << "," - << vlen << "," - << loop_e_str[options.loop] << "," - << __gemm_output_dim_string(options, gemm_args.dims.a) << "," - << __gemm_output_dim_string(options, gemm_args.dims.b) << "," - << __gemm_output_dim_string(options, gemm_args.dims.c) << "," - << options.warm_up_n << "," << options.n << "," - << time_in_seconds << "," - << time_in_seconds / options.n << "," - << flops << "," - << gflops / average_time - << std::endl; + << options.blas_args.gemm.beta << "," << ts << "," << vlen + << "," << loop_e_str[options.loop] << "," + << __gemm_output_dim_string(options, gemm_args.dims.a) << "," + << __gemm_output_dim_string(options, gemm_args.dims.b) << "," + << __gemm_output_dim_string(options, gemm_args.dims.c) << "," + << options.warm_up_n << "," << options.n << "," + << time_in_seconds << "," << time_in_seconds / options.n << "," + << flops << "," << gflops / average_time << std::endl; } static void __print_gemm_perf_test_options(options_t options) { @@ -435,12 +442,12 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { return; } -template +template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; - parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} + parallel_batched_gemm_range_policy(gemm_args_t gemm_args) + : gemm_args_(gemm_args) {} KOKKOS_INLINE_FUNCTION void operator()(const SerialTag &, const int &i) const { @@ -470,27 +477,27 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const TeamBatchDim3Tag &, const int &i) const { Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorTag &, const int &i) const { Kokkos::abort("TeamVectorTag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamVectorBatchDim3Tag &, const int &i) const { Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamSimdTag &, const int &i) const { Kokkos::abort("TeamSimdTag not supported using RangePolicy."); - } + } KOKKOS_INLINE_FUNCTION void operator()(const TeamSimdBatchDim4Tag &, const int &i) const { Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy."); - } + } }; template ::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); - }); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), + Kokkos::ALL(), vector_lane); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); } KOKKOS_INLINE_FUNCTION - void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const { + void operator()(const TeamSimdBatchDim4Tag &, + const MemberType &member) const { auto i = member.league_rank(); - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) { - auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); - - KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); - }); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, simd_vector_size), + [&](const int &vector_lane) { + auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::Gemm::invoke(member, gemm_args_.alpha, + svA, svB, gemm_args_.beta, + svC); + }); } }; template -void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) { +void __do_gemm_parallel_batched_template_range_policy(options_t options, + gemm_args_t gemm_args) { using execution_space = typename device_type::execution_space; using policy_type = Kokkos::RangePolicy; using functor_type = @@ -623,16 +649,16 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), - parallel_batched_gemm_functor); + policy_type(0, options.start.c.k), + parallel_batched_gemm_functor); Kokkos::fence(); } @@ -649,22 +675,30 @@ void __do_gemm_parallel_batched_template(options_t options, using policy_type = Kokkos::TeamPolicy; using member_type = typename policy_type::member_type; using functor_type = - parallel_batched_gemm; + parallel_batched_gemm; uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; auto league_size = options.start.c.k; - auto team_size = gemm_args.bp.team_size; - auto vector_len = gemm_args.bp.vector_len; + auto team_size = gemm_args.bp.team_size; + auto vector_len = gemm_args.bp.vector_len; Kokkos::Timer timer; - if (std::is_same::value || std::is_same::value) { - return __do_gemm_parallel_batched_template_range_policy(options, gemm_args); + if (std::is_same::value || + std::is_same::value) { + return __do_gemm_parallel_batched_template_range_policy< + TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, + gemm_args); } - if (std::is_same::value || std::is_same::value) { - league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0); - vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len? + if (std::is_same::value || + std::is_same::value) { + league_size = options.blas_args.batch_size_last_dim + ? gemm_args.Cv.ivec_4d.extent(3) + : gemm_args.Cv.ivec_4d.extent(0); + vector_len = simd_vector_size / + simd_internal_vector_size; // TODO: use bp.vector_len? } STATUS; @@ -674,31 +708,31 @@ void __do_gemm_parallel_batched_template(options_t options, if (options.blas_args.use_auto) { for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), - parallel_batched_gemm_functor); + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), - parallel_batched_gemm_functor); + policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO), + parallel_batched_gemm_functor); Kokkos::fence(); } } else { for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); Kokkos::fence(); } timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(league_size, team_size, vector_len), - parallel_batched_gemm_functor); + policy_type(league_size, team_size, vector_len), + parallel_batched_gemm_functor); Kokkos::fence(); } } @@ -708,7 +742,8 @@ void __do_gemm_parallel_batched_template(options_t options, return; } -template +template void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { char a = gemm_args.transA; char b = gemm_args.transB; @@ -720,19 +755,23 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) { if (a == 'N' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'N' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'N' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); } else if (a == 'T' && b == 'N') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); } else if (a == 'T' && b == 'T') { __do_gemm_parallel_batched_template(options, gemm_args); + device_type, algo_mode>(options, + gemm_args); //} else if (a == 'T' && b == 'C') { // __do_gemm_parallel_batched_template(options, gemm_args); @@ -1093,7 +1132,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { using scalar_type = typename view_type_3d::value_type; constexpr int vl = KokkosBatched::DefaultVectorLength::value; - using simd_type = KokkosBatched::Vector, simd_vector_size>; + using simd_type = + KokkosBatched::Vector, simd_vector_size>; using simd_view_type = Kokkos::View; using functor_type = @@ -1118,12 +1158,12 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { // uint64_t seed = Kokkos::Impl::clock_tic(); // Kokkos::Random_XorShift64_Pool rand_pool(seed); // Kokkos::fill_random(A, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(B, rand_pool, - // Kokkos::rand, simd_type>::max()); - // Kokkos::fill_random(C, rand_pool, - // Kokkos::rand, simd_type>::max()); - // execution_space::fence(); + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(B, rand_pool, + // Kokkos::rand, + // simd_type>::max()); Kokkos::fill_random(C, rand_pool, + // Kokkos::rand, + // simd_type>::max()); execution_space::fence(); functor_type experiment5_functor(A, B, C, gemm_args); @@ -1151,8 +1191,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) { return; } -template +template class parallel_batched_gemm_experiment6 { private: SimdViewType &A, &B, &C; @@ -1165,14 +1205,16 @@ class parallel_batched_gemm_experiment6 { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { - auto i = member.league_rank(); + auto i = member.league_rank(); auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); // Uses two serial for-loops internally - KokkosBatched::TeamVectorGemm::invoke( - member, gemm_args.alpha, svA, svB, gemm_args.beta, svC); + KokkosBatched::TeamVectorGemm::invoke(member, gemm_args.alpha, + svA, svB, + gemm_args.beta, svC); } }; @@ -1254,77 +1296,111 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { Kokkos::Random_XorShift64_Pool rand_pool(seed); STATUS; - gemm_args.dims = dims; - gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; - gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; - if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) { + gemm_args.dims = dims; + gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; + gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; + if (options.test == BATCHED_TEAM_SIMD || + options.test == BATCHED_TEAM_SIMD_BLOCKED) { // Calculate the batch size for simd views - auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); - auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); - auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); + auto a_simd_batch_size = + dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); + auto b_simd_batch_size = + dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0); + auto c_simd_batch_size = + dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0); // Reference gemm simd arguments for allocating A, B, and C matrices gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv; if (options.blas_args.batch_size_last_dim) { - // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views) - A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size); - A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); - A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size); - - B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size); - B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); - B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size); - - C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size); - C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); - C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size); + // Construct simd matrices with batch_size in the last dimension (better + // for LayoutLeft views) + A.vec_3d = vector_view_type_3d("A_vector", dims.a.m, dims.a.n, + a_simd_batch_size); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), simd_vector_size, + dims.a.m, dims.a.n, a_simd_batch_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.a.m, dims.a.n, + a_simd_batch_size); + + B.vec_3d = vector_view_type_3d("B_vector", dims.b.m, dims.b.n, + b_simd_batch_size); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), simd_vector_size, + dims.b.m, dims.b.n, b_simd_batch_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.b.m, dims.b.n, + b_simd_batch_size); + + C.vec_3d = vector_view_type_3d("C_vector", dims.c.m, dims.c.n, + c_simd_batch_size); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), simd_vector_size, + dims.c.m, dims.c.n, c_simd_batch_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), + simd_vector_size / simd_internal_vector_size, dims.c.m, dims.c.n, + c_simd_batch_size); } else { - // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views) - A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n); - A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size); - A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size); - - B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n); - B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size); - B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size); - - C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n); - C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size); - C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size); + // Construct simd matrices with batch_size in the first dimension (better + // for LayoutRight views) + A.vec_3d = vector_view_type_3d("A_vector", a_simd_batch_size, dims.a.m, + dims.a.n); + A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), a_simd_batch_size, + dims.a.m, dims.a.n, simd_vector_size); + A.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, + dims.a.n, simd_vector_size / simd_internal_vector_size); + + B.vec_3d = vector_view_type_3d("B_vector", b_simd_batch_size, dims.b.m, + dims.b.n); + B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), b_simd_batch_size, + dims.b.m, dims.b.n, simd_vector_size); + B.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, + dims.b.n, simd_vector_size / simd_internal_vector_size); + + C.vec_3d = vector_view_type_3d("C_vector", c_simd_batch_size, dims.c.m, + dims.c.n); + C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), c_simd_batch_size, + dims.c.m, dims.c.n, simd_vector_size); + C.ivec_4d = internal_vector_view_type_4d( + (internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, + dims.c.n, simd_vector_size / simd_internal_vector_size); } - // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments + // Use the non-simd 4-rank view type to randomly populate the gemm simd + // arguments Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); } else { if (options.blas_args.batch_size_last_dim) { - gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); - gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); - gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); + gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); + gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k); + gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k); } else { - gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); - gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); - gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); + gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n); + gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n); + gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } Kokkos::fill_random(gemm_args.A, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.B, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); Kokkos::fill_random(gemm_args.C, rand_pool, Kokkos::rand, - scalar_type>::max()); + scalar_type>::max()); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; @@ -1344,7 +1420,7 @@ void __do_loop_and_invoke(options_t options, __print_gemm_perf_test_options(options); std::cout << "SCALAR:" << typeid(default_scalar).name() << ", LAYOUT:" << typeid(default_layout).name() - << ", DEVICE:" << typeid(default_device).name() + << ", DEVICE:" << typeid(default_device).name() << ", SPACE:" << typeid(memory_space).name() << std::endl; options.out[0] << gemm_csv_header_str << std::endl; @@ -1394,8 +1470,9 @@ void do_gemm_serial_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); #else - #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl; - #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl; - #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) - std::cerr << std::string(__func__) - << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." << std::endl; - #endif +#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) + std::cerr << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is " + "undefined." + << std::endl; +#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__) + std::cerr + << std::string(__func__) + << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ " + "is undefined." + << std::endl; +#endif #endif return; } @@ -1475,8 +1562,9 @@ void do_gemm_team_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, - __do_gemm_parallel_batched); + options, __do_gemm_parallel_batched); return; } void do_gemm_team_vector_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); return; } void do_gemm_team_simd_batched_parallel(options_t options) { STATUS; if (options.blas_args.batch_size_last_dim) - __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched); + __do_loop_and_invoke( + options, + __do_gemm_parallel_batched); else __do_loop_and_invoke( options, __do_gemm_parallel_batched(alpha); - options.blas_args.gemm.beta = static_cast(beta); + options.blas_args.gemm.beta = static_cast(beta); break; case 'a': // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4)); diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index a313eabbaf..f84479d26e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -78,34 +78,36 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * assumes KokkosBatched::SerialTrmm is being used. Since the dot products * do a multiply and add we can calculate the flops for any element in the last * column of the LHS to be 2*columns_LHS, any element in the last-1 column of - * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the LHS - * giving us this flop count: - * flops = columns_LHS * (columns_LHS + 1) - * flops = (flops / 2) * 2 - * flops = flops * rows_LHS + * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the + * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops + * = (flops / 2) * 2 flops = flops * rows_LHS */ -static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { int flops; if (side == 'L' || side == 'l') { - flops = (b_m * (b_m + 1)) * b_n; + flops = (b_m * (b_m + 1)) * b_n; } else { - flops = (b_n * (b_n + 1)) * b_m; + flops = (b_n * (b_n + 1)) * b_m; } if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return flops; + std::is_same::value || + std::is_same::value) + return flops; // Account for 6 additional flops when complex numbers are used. // Above we have counted 1 flop for each add and 1 flop for each multiply. - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. return flops * 4; } -// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { +// Flop count formula from lapack working note 41: +// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf +static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -115,13 +117,14 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) } if (std::is_same::value || - std::is_same::value || - std::is_same::value) - return flops; + std::is_same::value || + std::is_same::value) + return flops; // Account for 6 additional flops when complex numbers are used. // Above we have counted 1 flop for each add and 1 flop for each multiply. - // For complex, we need to count 2 flops for each add and 6 flops for each multiply. + // For complex, we need to count 2 flops for each add and 6 flops for each + // multiply. return flops * 4; } @@ -136,17 +139,21 @@ typedef struct trmm_args trmm_args_t; static std::string trmm_csv_header_str = "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n," - "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; + "iter,total_time(s),average_time(s),FLOPS,GFLOP/" + "average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)"; /*************************** Internal helper fns **************************/ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { - double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side, - trmm_args.B.extent(1), trmm_args.B.extent(2), - trmm_args.A.extent(1), trmm_args.A.extent(2)); - double gflops = flops / 1e9; - double average_time = time_in_seconds / options.n; - double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9; + double flops = trmm_args.A.extent(0) * + trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); + double gflops = flops / 1e9; + double average_time = time_in_seconds / options.n; + double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * sizeof(default_scalar)) / + 1e9; double min_memory_transactions, max_memory_transactions; // Assuming infinite cache size @@ -155,26 +162,29 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, min_memory_transactions = 3; // Assuming no register or real caching - // We have to go out to memory for every element we read from A and B as well as - // every element we write to B. - // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the - // write to B since this flop count is for one multiply and one add. + // We have to go out to memory for every element we read from A and B as well + // as every element we write to B. We use the trmm flops from lapack note 41 + // and multiple by 3/2 to account for the write to B since this flop count is + // for one multiply and one add. if (trmm_args.side == 'l' || trmm_args.side == 'L') - max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.); + max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * + trmm_args.B.extent(2) * (3. / 2.); else - max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.); + max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * + trmm_args.B.extent(1) * (3. / 2.); options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," << options.blas_args.trmm.alpha << "," - << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) - << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) + << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) + << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) + << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n << "," << options.n << "," << time_in_seconds << "," - << average_time << "," - << flops << "," - << gflops / average_time << "," - << (gbytes_in_matrix * min_memory_transactions) / average_time << "," + << average_time << "," << flops << "," << gflops / average_time + << "," + << (gbytes_in_matrix * min_memory_transactions) / average_time + << "," << (gbytes_in_matrix * max_memory_transactions) / average_time << std::endl; } @@ -218,7 +228,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + &trmm_args.diag, trmm_args.alpha, A, B); } // Fence after submitting each batch operation Kokkos::fence(); @@ -231,7 +241,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) { auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans, - &trmm_args.diag, trmm_args.alpha, A, B); + &trmm_args.diag, trmm_args.alpha, A, B); } // Fence after submitting each batch operation Kokkos::fence(); @@ -412,18 +422,20 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) { STATUS; for (uint32_t j = 0; j < warm_up_n; ++j) { - Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_blas_trmm_functor); + Kokkos::parallel_for( + "parallelBlasWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); // Fence after each batch operation Kokkos::fence(); } timer.reset(); for (uint32_t j = 0; j < n; ++j) { - Kokkos::parallel_for("parallelBlasTimedLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_blas_trmm_functor); + Kokkos::parallel_for( + "parallelBlasTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_blas_trmm_functor); // Fence after each batch operation Kokkos::fence(); } @@ -470,18 +482,20 @@ void __do_trmm_parallel_batched_template(options_t options, STATUS; for (uint32_t j = 0; j < warm_up_n; ++j) { - Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_batched_trmm_functor); + Kokkos::parallel_for( + "parallelBatchedWarmUpLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); // Fence after each batch operation Kokkos::fence(); } timer.reset(); for (uint32_t j = 0; j < n; ++j) { - Kokkos::parallel_for("parallelBatchedTimedLoopTrmm", - Kokkos::RangePolicy(0, options.start.a.k), - parallel_batched_trmm_functor); + Kokkos::parallel_for( + "parallelBatchedTimedLoopTrmm", + Kokkos::RangePolicy(0, options.start.a.k), + parallel_batched_trmm_functor); // Fence after each batch operation Kokkos::fence(); } From e5fb960c340f628242d0266f8dd9f03608d715c2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Mar 2021 16:20:25 -0700 Subject: [PATCH 23/47] perf_test/blas/blas3: Allocate simd views - Allocate simd views for serial simd tests. --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 3e55a85799..74f0771062 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1300,7 +1300,9 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0]; gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1]; if (options.test == BATCHED_TEAM_SIMD || - options.test == BATCHED_TEAM_SIMD_BLOCKED) { + options.test == BATCHED_TEAM_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_SIMD || + options.test == BATCHED_SERIAL_SIMD_BLOCKED) { // Calculate the batch size for simd views auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); From 30d54723e3d926ed3f65a3db521fec1929df27c9 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 08:27:47 -0700 Subject: [PATCH 24/47] perf_test/blas/blas3: Update compact mkl functors --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 74f0771062..d646653697 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -83,14 +83,14 @@ void do_gemm_experiment_parallel(options_t options); struct SerialTag {}; struct SerialBatchDim3Tag {}; +struct SerialSimdTag {}; +struct SerialSimdBatchDim3Tag {}; struct TeamTag {}; struct TeamBatchDim3Tag {}; struct TeamVectorTag {}; struct TeamVectorBatchDim3Tag {}; struct TeamSimdTag {}; struct TeamSimdBatchDim4Tag {}; -// TODO: struct SerialSimdTag {}; -// TODO: struct SerialSimdBatchDim4Tag {}; struct LayoutLeftTag {}; struct LayoutRightTag {}; struct SimdCpuTag {}; @@ -469,6 +469,32 @@ struct parallel_batched_gemm_range_policy { gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); } + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), + Kokkos::ALL()); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { + auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, + Kokkos::ALL(), Kokkos::ALL(), i); + + KokkosBatched::SerialGemm::invoke( + gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); + } + KOKKOS_INLINE_FUNCTION void operator()(const TeamTag &, const int &i) const { Kokkos::abort("TeamTag not supported using RangePolicy."); @@ -686,7 +712,9 @@ void __do_gemm_parallel_batched_template(options_t options, Kokkos::Timer timer; if (std::is_same::value || - std::is_same::value) { + std::is_same::value || + std::is_same::value || + std::is_same::value) { return __do_gemm_parallel_batched_template_range_policy< TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, gemm_args); @@ -1302,7 +1330,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED || options.test == BATCHED_SERIAL_SIMD || - options.test == BATCHED_SERIAL_SIMD_BLOCKED) { + options.test == BATCHED_SERIAL_SIMD_BLOCKED || + options.test == BATCHED_SERIAL_COMPACT_MKL) { // Calculate the batch size for simd views auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0); @@ -1532,11 +1561,11 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); #else #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) From 2401e9dbde85183f7655cc955e4375f2f48d34f3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 09:01:55 -0700 Subject: [PATCH 25/47] perf_test/blas/blas3: Added operators for SerialSimd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d646653697..c2f3f58ced 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -654,6 +654,16 @@ struct parallel_batched_gemm { svC); }); } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdTag &, const MemberType &member) const { + Kokkos::abort("SerialSimdTag not supported using RangePolicy."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const { + Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); + } }; template Date: Wed, 3 Mar 2021 15:34:21 -0700 Subject: [PATCH 26/47] perf_test/blas/blas3: Fix compactMKL batch size --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index c2f3f58ced..b575bc186b 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -677,15 +677,23 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; functor_type parallel_batched_gemm_functor(gemm_args); + if (std::is_same::value || + std::is_same::value) { + batch_size = options.blas_args.batch_size_last_dim + ? gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } + for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", - policy_type(0, options.start.c.k), + policy_type(0, batch_size), parallel_batched_gemm_functor); Kokkos::fence(); } @@ -693,7 +701,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, timer.reset(); for (uint32_t i = 0; i < n; i++) { Kokkos::parallel_for("parallelBatchedTimedLoopGemm", - policy_type(0, options.start.c.k), + policy_type(0, batch_size), parallel_batched_gemm_functor); Kokkos::fence(); } From 1eab5b4f04754ddbe18038a13733fec5bbc6176f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 12:56:01 -0700 Subject: [PATCH 27/47] perf_test/blas: Fix internal function names --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 6 +++--- perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index 32626cfba5..e6abeaefc4 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. */ -static inline int trtri_impl_flop_count(int a_m, int a_n) { +static inline int __trtri_impl_flop_count(int a_m, int a_n) { int flop_count = 0; int flops_per_div, flops_per_mul, flops_per_add; @@ -109,7 +109,7 @@ static inline int trtri_impl_flop_count(int a_m, int a_n) { } // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trtri_flop_count(int a_m, int a_n) { +static inline int __trtri_flop_count(int a_m, int a_n) { int flops; int flops_per_mul; int flops_per_add; @@ -151,7 +151,7 @@ static std::string trtri_csv_header_str = /*************************** Internal helper fns **************************/ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args, double time_in_seconds) { - double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); + double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index f84479d26e..bd6392cf06 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -82,7 +82,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops * = (flops / 2) * 2 flops = flops * rows_LHS */ -static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, +static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int flops; @@ -106,7 +106,7 @@ static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, +static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) { int flops; @@ -146,7 +146,7 @@ static std::string trmm_csv_header_str = static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { double flops = trmm_args.A.extent(0) * - trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), + __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), trmm_args.B.extent(2), trmm_args.A.extent(1), trmm_args.A.extent(2)); double gflops = flops / 1e9; From c7e4f5437c31c7f9c52928adcc8d31260b6418ea Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Mar 2021 15:36:31 -0700 Subject: [PATCH 28/47] perf_test/blas/blas3: Apply clang-format --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 44 ++++++++++--------- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 8 ++-- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b575bc186b..d7f2143dc6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -471,12 +471,12 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const SerialSimdTag &, const int &i) const { - auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); - auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); - auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), - Kokkos::ALL()); + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL()); KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); @@ -484,12 +484,12 @@ struct parallel_batched_gemm_range_policy { KOKKOS_INLINE_FUNCTION void operator()(const SerialSimdBatchDim3Tag &, const int &i) const { - auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); - auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); - auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, - Kokkos::ALL(), Kokkos::ALL(), i); + auto svA = + Kokkos::subview(gemm_args_.Av.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svB = + Kokkos::subview(gemm_args_.Bv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); + auto svC = + Kokkos::subview(gemm_args_.Cv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i); KokkosBatched::SerialGemm::invoke( gemm_args_.alpha, svA, svB, gemm_args_.beta, svC); @@ -661,7 +661,8 @@ struct parallel_batched_gemm { } KOKKOS_INLINE_FUNCTION - void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const { + void operator()(const SerialSimdBatchDim3Tag &, + const MemberType &member) const { Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy."); } }; @@ -677,7 +678,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, uint32_t warm_up_n = options.warm_up_n; uint32_t n = options.n; - auto batch_size = options.start.c.k; + auto batch_size = options.start.c.k; Kokkos::Timer timer; STATUS; @@ -687,9 +688,9 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, if (std::is_same::value || std::is_same::value) { batch_size = options.blas_args.batch_size_last_dim - ? gemm_args.Cv.vec_3d.extent(2) - : gemm_args.Cv.vec_3d.extent(0); - } + ? gemm_args.Cv.vec_3d.extent(2) + : gemm_args.Cv.vec_3d.extent(0); + } for (uint32_t i = 0; i < warm_up_n; i++) { Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm", @@ -1579,12 +1580,13 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); + __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); + options, + __do_gemm_parallel_batched); #else #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) std::cerr diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index bd6392cf06..86714b7e30 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -83,7 +83,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = { * = (flops / 2) * 2 flops = flops * rows_LHS */ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -107,7 +107,7 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { + int a_n) { int flops; if (side == 'L' || side == 'l') { @@ -147,8 +147,8 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, double time_in_seconds) { double flops = trmm_args.A.extent(0) * __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1), - trmm_args.B.extent(2), trmm_args.A.extent(1), - trmm_args.A.extent(2)); + trmm_args.B.extent(2), trmm_args.A.extent(1), + trmm_args.A.extent(2)); double gflops = flops / 1e9; double average_time = time_in_seconds / options.n; double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * From 147783e45bf8aeab0e8e6e37ee5952538fd9904b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 10:26:14 -0700 Subject: [PATCH 29/47] perf_test/blas/blas3: Fix -d 1 for team and serial simd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d7f2143dc6..b7be38fdb9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -639,7 +639,7 @@ struct parallel_batched_gemm { const MemberType &member) const { auto i = member.league_rank(); Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, simd_vector_size), + Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(0)), [&](const int &vector_lane) { auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i); From e3efd455be26670110cadb517724111dc86c3ba0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 11:24:44 -0700 Subject: [PATCH 30/47] perf_test/blas/blas3: Update serial simd to use RangePolicy --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index b7be38fdb9..09c3d27465 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1549,11 +1549,11 @@ void do_gemm_serial_simd_batched_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } @@ -1563,11 +1563,11 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } From 0127243a0363dd3bceb4dac90a95054a98656e6f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Mar 2021 11:31:09 -0700 Subject: [PATCH 31/47] perf_test/blas: Update flop counts to use double --- perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 14 +++++++------- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp index e6abeaefc4..d60f15b92b 100644 --- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp +++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp @@ -82,9 +82,9 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = { * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks * of the A matrix. a_m subblocks are selected. */ -static inline int __trtri_impl_flop_count(int a_m, int a_n) { - int flop_count = 0; - int flops_per_div, flops_per_mul, flops_per_add; +static inline double __trtri_impl_flop_count(double a_m, double a_n) { + double flop_count = 0; + double flops_per_div, flops_per_mul, flops_per_add; if (std::is_same::value || std::is_same::value || @@ -109,10 +109,10 @@ static inline int __trtri_impl_flop_count(int a_m, int a_n) { } // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __trtri_flop_count(int a_m, int a_n) { - int flops; - int flops_per_mul; - int flops_per_add; +static inline double __trtri_flop_count(double a_m, double a_n) { + double flops; + double flops_per_mul; + double flops_per_add; if (a_m != a_n) { fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__); diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 09c3d27465..36132db261 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -217,7 +217,7 @@ static std::string gemm_csv_header_str = /*************************** Internal helper fns **************************/ // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __gemm_flop_count(int a_m, int a_n, int b_n) { +static inline double __gemm_flop_count(double a_m, double a_n, double b_n) { if (std::is_same::value || std::is_same::value || std::is_same::value) diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 86714b7e30..6d67e96bd1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -106,9 +106,9 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m, - int a_n) { - int flops; +static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m, + double a_n) { + double flops; if (side == 'L' || side == 'l') { flops = b_m * b_m * b_n; From 4acdaf51142081f32b3139dcfca6aa24f8bf8ccc Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 11:59:04 -0700 Subject: [PATCH 32/47] perf_test/blas/blas3: Added verify option - Implemented verify checks in gemm. Simd verify is still failing when the batch_size is not divisible by the simd_vector_len. --- perf_test/blas/blas3/CMakeLists.txt | 1 + perf_test/blas/blas3/KokkosBlas3_common.hpp | 4 + .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 166 +++++++++++++++++- .../blas/blas3/KokkosBlas3_perf_test.cpp | 19 +- test_common/KokkosKernels_TestUtils.hpp | 15 ++ 5 files changed, 197 insertions(+), 8 deletions(-) diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt index c1e3a117fa..8f83bd6b99 100644 --- a/perf_test/blas/blas3/CMakeLists.txt +++ b/perf_test/blas/blas3/CMakeLists.txt @@ -1,5 +1,6 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tpls/gtest) KOKKOSKERNELS_ADD_EXECUTABLE( KokkosBlas3_perf_test diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp index a991efe61e..2103d0d57e 100644 --- a/perf_test/blas/blas3/KokkosBlas3_common.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp @@ -63,6 +63,7 @@ #define DEFAULT_VECTOR_LEN 1 #define DEFAULT_USE_AUTO 0 #define DEFAULT_BATCH_SIZE_LAST_DIM 0 +#define DEFAULT_VERIFY 1 /************************ blas routine structure definitions **********/ struct perf_test_trmm_args { @@ -192,6 +193,8 @@ typedef struct matrix_dims matrix_dims_t; * @var out_file: The file to write csv data to. Defaults to stdout. * @var blas_args: Arguments for each supported blas routine. * @var blas_routines: Selects which supported blas routines to test. + * @var verify: Performs verification of the blas routine for each input + * before timing it. */ struct perf_test_options { test_e test; @@ -205,6 +208,7 @@ struct perf_test_options { std::string out_file; blas_args_t blas_args; std::string blas_routines; + bool verify; }; typedef struct perf_test_options options_t; diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 36132db261..df08e30aaa 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,6 +56,8 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" +#include "gtest/gtest.h" // EXPECT_NEAR +#include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -252,6 +254,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; + if (options.verify) + return; + flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.b.n); @@ -360,8 +365,8 @@ void __do_gemm_serial_batched_template(options_t options, template void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { - char a = gemm_args.transA; - char b = gemm_args.transB; + char a = toupper(gemm_args.transA); + char b = toupper(gemm_args.transB); using N = Trans::NoTranspose; using T = Trans::Transpose; // using C = Trans::ConjTranspose; @@ -1333,6 +1338,154 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { return; } +/** + * Check difference of scalars expected and actual at indexes i,j,k + * @var expected: The expected result. + * @var actual: The actual result. + * @var epsilon: The tolerance to use when comparing. + * @return true if the comparison fails and false if the comparison succeeds. + */ +static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { + STATUS; + auto diff = static_cast(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k))); + + if (diff > epsilon) { + printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i,j,k,static_cast(expected(i,j,k)), + i,j,k,static_cast(actual(i,j,k)), + diff, + epsilon); + FATAL_ERROR("Comparison failure!"); + return true; + } + return false; +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { + double epsilon = Test::epsilon::value; + STATUS; + + for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t k = 0; k < expected.extent(2); k++) { + if (std::is_same::value) { + return __gemm_print_compare_failure(expected, actual, i, j, k, epsilon); + } + if (std::is_same::value) { + return __gemm_print_compare_failure(expected, actual, k, j, i, epsilon); + } + } + } + } + return false; +} + +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) { + std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl; + decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2)); + STATUS; + return __gemm_do_compare(expected, actual_data); +} + +template +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { + using execution_space = typename DeviceType::execution_space; + // Just create "expected" types using non-simd types. + decltype(gemm_args.C) C_expected; + decltype(gemm_args.A) A_expected; + decltype(gemm_args.B) B_expected; + STATUS; + + if (options.blas_args.batch_size_last_dim) { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k); + } else { + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n); + } + + // Initialize "expected" matrices. + if (gemm_args.C.data() != nullptr) { + Kokkos::deep_copy(C_expected, gemm_args.C); + Kokkos::deep_copy(A_expected, gemm_args.A); + Kokkos::deep_copy(B_expected, gemm_args.B); + + Kokkos::fence(); // Ensure that deep_copy has completed + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Inital values mismatch!"); + } else if (gemm_args.Cv.vec_3d.data() != nullptr) { + // TODO: Debug this when batch_size % simd_vector_len != 0. + memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n); + memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n); + memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n); + + // Check that initial values match + if (__gemm_do_compare(C_expected, gemm_args.Cv)) + FATAL_ERROR("Inital values mismatch!"); + } else { + FATAL_ERROR("Input arguments are empty!"); + } + + // Populate "expected" matrices via VanillaGemm + Test::Functor_BatchedVanillaGEMM vgemm; + vgemm.A_t = toupper(gemm_args.transA) == 'T'; + vgemm.B_t = toupper(gemm_args.transB) == 'T'; + vgemm.A_c = vgemm.B_c = false; + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. + auto warm_up_n_bak = options.warm_up_n; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; + fn(options, gemm_args); + + Kokkos::fence(); // Redundant fence. + + // Check the result + if (gemm_args.C.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.C)) + FATAL_ERROR("Result value mismatch!"); + } + + if (gemm_args.Cv.vec_3d.data() != nullptr) { + if (__gemm_do_compare(C_expected, gemm_args.Cv)) + FATAL_ERROR("Result value mismatch!"); + } + + // Run actual timed test. + options.verify = false; // Set verify to false for csv output. + options.warm_up_n = warm_up_n_bak; + options.n = n_bak; + fn(options, gemm_args); + + // Reset verify for next matrix size. + options.verify = true; +} + /*************************** Internal setup fns **************************/ template gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { @@ -1457,6 +1610,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; + Kokkos::fence(); // Ensure that fill_random has completed. + return gemm_args; } @@ -1484,7 +1639,12 @@ void __do_loop_and_invoke(options_t options, cur_dims.c.m += options.step, cur_dims.c.n += options.step) { gemm_args = __do_setup(options, cur_dims); - fn(options, gemm_args); + + if (options.verify) { + __gemm_do_verify(options, gemm_args, fn); + } else { + fn(options, gemm_args); + } } return; } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index daf68180c2..73f5a18452 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -68,6 +68,7 @@ static struct option long_options[] = { {"vector_len", required_argument, 0, 'n'}, {"batch_size", required_argument, 0, 'k'}, {"batch_size_last_dim", required_argument, 0, 'd'}, + {"verify", required_argument, 0, 'v'}, {0, 0, 0, 0}}; static void __print_help_blas3_perf_test() { @@ -122,23 +123,23 @@ static void __print_help_blas3_perf_test() { printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_VECTOR_LEN); - printf("\t-u, --use_auto={0,1}\n"); + printf("\t-u, --use_auto=AUTO\n"); printf( "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " "(Heirarchical parallelism).\n"); printf( - "\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size " - "will be used. (default: %d)\n", + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size " + "instead. (default: %d)\n", DEFAULT_USE_AUTO); printf("\t-k, --batch_size=LEN\n"); printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n"); printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K); - printf("\t-d, --batch_size_last_dim={0,1}\n"); + printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); printf("\t\tHow to allocate the batch_size in the matrices.\n"); printf( - "\t\t\t1 make the batch_size the last dimension, otherwise batch_size is " + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size " "the first dimension (default: %d)\n", DEFAULT_BATCH_SIZE_LAST_DIM); @@ -207,6 +208,13 @@ static void __print_help_blas3_perf_test() { "\t\t\tValid value for ROUTINES is one of more valid blas3 routines " "delimited by a comma. (default: %s)\n", DEFAULT_BLAS_ROUTINES); + + printf("\t-v, --verify=VERIFY\n"); + printf("\t\tVerification selection. (untimed)\n"); + printf( + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. " + "(default: %d)\n", + DEFAULT_VERIFY); } static void __blas3_perf_test_input_error(char **argv, char short_opt, @@ -258,6 +266,7 @@ int main(int argc, char **argv) { options.blas_args.vector_len = DEFAULT_VECTOR_LEN; options.blas_args.use_auto = DEFAULT_USE_AUTO; options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM; + options.verify = DEFAULT_VERIFY; options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS; options.blas_args.trmm.alpha = DEFAULT_TRMM_ALPHA; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 20a568bbc1..8ad7fe22af 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -202,5 +202,20 @@ namespace Test { *this); } }; + + template + class epsilon { + public: + constexpr static double value = std::numeric_limits::epsilon(); + }; + + // explicit epsilon specializations + #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT + template + class epsilon { + public: + constexpr static double value = 0009765625F; + }; + #endif // KOKKOS_HALF_T_IS_FLOAT } #endif From 0de685f74269557950aae2271c74d52d26d5c94f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 12:49:54 -0700 Subject: [PATCH 33/47] test_common: Fix half_t epsilon specialization --- test_common/KokkosKernels_TestUtils.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 8ad7fe22af..64b3902ec7 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -211,8 +211,8 @@ namespace Test { // explicit epsilon specializations #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - template - class epsilon { + template<> + class epsilon { public: constexpr static double value = 0009765625F; }; From 29322e8cac1f82484ba3c01120e16d9c706035d3 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 5 Mar 2021 14:45:00 -0700 Subject: [PATCH 34/47] perf_test/blas/blas3: Use TeamPolicy for serial simd --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index df08e30aaa..77d5850fab 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1706,28 +1706,32 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) { void do_gemm_serial_simd_batched_parallel(options_t options) { STATUS; + // SerialBatchDim3Tag + // SerialSimdTag if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } void do_gemm_serial_simd_batched_blocked_parallel(options_t options) { STATUS; + // SerialBatchDim3Tag + // SerialSimdTag if (options.blas_args.batch_size_last_dim) __do_loop_and_invoke( options, - __do_gemm_parallel_batched); else __do_loop_and_invoke( - options, __do_gemm_parallel_batched); return; } From 80ca02ebc6efc905bccf7036a419f1fce6ee414e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 11:49:49 -0700 Subject: [PATCH 35/47] perf_test/blas/blas3: Process verify option --- perf_test/blas/blas3/KokkosBlas3_perf_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 73f5a18452..7e1cdf0f2f 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -275,7 +275,7 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:", + while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; @@ -397,6 +397,7 @@ int main(int argc, char **argv) { atoi(optarg); break; case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break; + case 'v': options.verify = atoi(optarg); break; case 'z': options.blas_args.team_size = atoi(optarg); break; case 'n': options.blas_args.vector_len = atoi(optarg); break; case 'u': options.blas_args.use_auto = atoi(optarg); break; From 239d44de0fbee7a17a51d3d8c28cc7f1497ea0e0 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 12:02:24 -0700 Subject: [PATCH 36/47] perf_test/blas/blas3: Relax epsilon --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 77d5850fab..114cc49422 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1369,7 +1369,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type */ template static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { - double epsilon = Test::epsilon::value; + double epsilon = Test::epsilon::value * 1e3; STATUS; for (size_t i = 0; i < expected.extent(0); i++) { From 55e3eb30670202eb2eab54d261799fe2c3c5c84e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 9 Mar 2021 12:05:25 -0700 Subject: [PATCH 37/47] perf_test/blas/blas3: Add TODO for bug --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 114cc49422..d38bfccd60 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -312,6 +312,7 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + // TODO: Debug this when starting a matrix sizes <= 10x10 KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, A, B, _gemm_args.beta, C); } From 53aa6536ca7643a111383d4f7ae1fe4d65af5857 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 09:03:41 -0700 Subject: [PATCH 38/47] perf_test/blas/blas3: Fix verify for simd when batch_size is first dim --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 +++++++++++++++++-- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index d38bfccd60..ad01d9acad 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -131,6 +131,8 @@ using view_type_3d = Kokkos::View; using view_type_4d = Kokkos::View; +using view_type_5d = + Kokkos::View; // Construct the vector type using memory_space = typename default_device::execution_space::memory_space; @@ -1402,6 +1404,38 @@ static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t act return __gemm_do_compare(expected, actual_data); } +template +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { + using scalar_type = typename dstViewType::value_type; + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + + if (options.blas_args.batch_size_last_dim) { + exit(255); // TODO + } else { + size_t remainder = dst.extent(0) % simd_vector_size; + if (remainder > 0) { + // The below loops map a given 2-rank gemm within the simd view back to the + // 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { + auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { + auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); + } + } + } + } + } + } else { + memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + } + } +} + template static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; @@ -1433,10 +1467,9 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Inital values mismatch!"); } else if (gemm_args.Cv.vec_3d.data() != nullptr) { - // TODO: Debug this when batch_size % simd_vector_len != 0. - memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n); - memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n); - memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n); + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); // Check that initial values match if (__gemm_do_compare(C_expected, gemm_args.Cv)) From 192fde6a76d975ed5c324f97bb46ca9e0545b24e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 10:49:00 -0700 Subject: [PATCH 39/47] perf_test/blas/blas3: Complete verify for batch_size in first dimension --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 67 ++++++++++++------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index ad01d9acad..a1e870e4c0 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1375,33 +1375,29 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) double epsilon = Test::epsilon::value * 1e3; STATUS; - for (size_t i = 0; i < expected.extent(0); i++) { - for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t k = 0; k < expected.extent(2); k++) { - if (std::is_same::value) { - return __gemm_print_compare_failure(expected, actual, i, j, k, epsilon); + if (std::is_same::value) { + for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t k = 0; k < expected.extent(2); k++) { + if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + return true; } - if (std::is_same::value) { - return __gemm_print_compare_failure(expected, actual, k, j, i, epsilon); + } + } + } + + if (std::is_same::value) { + for (size_t k = 0; k < expected.extent(2); k++) { + for (size_t j = 0; j < expected.extent(1); j++) { + for (size_t i = 0; i < expected.extent(0); i++) { + if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) + return true; } } } } - return false; -} -/** - * Compare all values of expected with all values of actual. - * @var expected: the expected results - * @var actual: the actual results - * @return false if expected matches actual within epsilon, otherwise true. - */ -template -static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) { - std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl; - decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2)); - STATUS; - return __gemm_do_compare(expected, actual_data); + return false; } template @@ -1414,7 +1410,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } else { size_t remainder = dst.extent(0) % simd_vector_size; if (remainder > 0) { - // The below loops map a given 2-rank gemm within the simd view back to the + // The below loops copies each corresponding 2-rank matrix within the simd view back to the // 3-rank view. for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); @@ -1431,11 +1427,34 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } } } else { + // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location + // and the data can simply be copied. memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); } } } +/** + * Compare all values of expected with all values of actual. + * @var expected: the expected results + * @var actual: the actual results + * @return false if expected matches actual within epsilon, otherwise true. + */ +template +static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2)); + + STATUS; + + // Copy the simd view to a 3d view for comparision. + // NOTE: The raw results are different when batch_size % simd_vector_size != 0. + // Also note that when batch_size % simd_vector_size != 0, the simd operation + // calculates results that we do not require. + // So, we end up running an extra batch_size % simd_vector_size GEMMs! + __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); + return __gemm_do_compare(expected, actual_data); +} + template static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; @@ -1472,7 +1491,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); // Check that initial values match - if (__gemm_do_compare(C_expected, gemm_args.Cv)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) FATAL_ERROR("Inital values mismatch!"); } else { FATAL_ERROR("Input arguments are empty!"); @@ -1506,7 +1525,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo } if (gemm_args.Cv.vec_3d.data() != nullptr) { - if (__gemm_do_compare(C_expected, gemm_args.Cv)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) FATAL_ERROR("Result value mismatch!"); } From e4351716f2cf7fc4daebcfb60933e488b50b1d1e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:01:35 -0700 Subject: [PATCH 40/47] test_common: Update VanillaGEMM with batch_size_last_dim member --- test_common/KokkosKernels_TestUtils.hpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 64b3902ec7..1d383ffd35 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -159,7 +159,7 @@ namespace Test { // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:) template struct Functor_BatchedVanillaGEMM { - bool A_t, B_t, A_c, B_c; + bool A_t, B_t, A_c, B_c, batch_size_last_dim = false; ViewTypeA A; ViewTypeB B; ViewTypeC C; @@ -177,15 +177,20 @@ namespace Test { auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); + _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); + _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } using SubviewTypeA = decltype(_A); using SubviewTypeB = decltype(_B); using SubviewTypeC = decltype(_C); struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; - vgemm.C_rows = C.extent(1); - vgemm.C_cols = C.extent(2); - vgemm.A_cols = A_t?A.extent(1):A.extent(2); + vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1); + vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2); + vgemm.A_cols = batch_size_last_dim ? (A_t?A.extent(0):A.extent(1)) : (A_t?A.extent(1):A.extent(2)); vgemm.A = _A; vgemm.B = _B; vgemm.C = _C; @@ -198,7 +203,7 @@ namespace Test { void run() { Kokkos::parallel_for( "Test::VanillaGEMM", - Kokkos::TeamPolicy(C.extent(0), Kokkos::AUTO, 16), + Kokkos::TeamPolicy(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, 16), *this); } }; From 07906733755fda24433b974df27e97cc9bf080ca Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:01:47 -0700 Subject: [PATCH 41/47] perf_test/blas/blas3: Add batch_size_last_dim to vgemm --- perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index a1e870e4c0..8fca4e76b2 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1502,6 +1502,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo vgemm.A_t = toupper(gemm_args.transA) == 'T'; vgemm.B_t = toupper(gemm_args.transB) == 'T'; vgemm.A_c = vgemm.B_c = false; + vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim; vgemm.A = A_expected; vgemm.B = B_expected; vgemm.C = C_expected; From 137adccbbee971bfcbf073f6374cdad90a719874 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:45:38 -0700 Subject: [PATCH 42/47] perf_test/blas/blas3: Update compare routines - Handle simd with batch_size in last dimension - Work with device views --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 36 +++++++++++++++---- 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 8fca4e76b2..f560690e54 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -1350,12 +1350,14 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { */ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { STATUS; - auto diff = static_cast(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k))); + typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); + auto diff = static_cast(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k))); if (diff > epsilon) { printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", - i,j,k,static_cast(expected(i,j,k)), - i,j,k,static_cast(actual(i,j,k)), + i,j,k,static_cast(h_expected(i,j,k)), + i,j,k,static_cast(h_actual(i,j,k)), diff, epsilon); FATAL_ERROR("Comparison failure!"); @@ -1403,17 +1405,39 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { using scalar_type = typename dstViewType::value_type; - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); if (options.blas_args.batch_size_last_dim) { - exit(255); // TODO + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + size_t remainder = dst.extent(2) % simd_vector_size; + remainder = remainder == 0 ? simd_internal_vector_size : remainder; + + // The below loops copies each corresponding 2-rank matrix within the simd view back to the + // 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { + auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { + auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { + for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { + dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + } + } + } + } + } } else { + view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; + if (remainder > 0) { // The below loops copies each corresponding 2-rank matrix within the simd view back to the // 3-rank view. for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { From 891f4bd178b3c90a94d52eb10b05ccd8611d4454 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 13:54:56 -0700 Subject: [PATCH 43/47] test_common: Fix half_t epsilon --- test_common/KokkosKernels_TestUtils.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index 1d383ffd35..ad546fe0b4 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -219,7 +219,7 @@ namespace Test { template<> class epsilon { public: - constexpr static double value = 0009765625F; + constexpr static double value = 0.0009765625F; }; #endif // KOKKOS_HALF_T_IS_FLOAT } From a7558b5eaccfdbcb27de174e86ef1048a3d2f531 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 14:06:36 -0700 Subject: [PATCH 44/47] perf_test/blas/blas3: Update serial loops - Update serial loops for batch_size_last_dim option - Remove dead code --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 97 ++++++------------- 1 file changed, 30 insertions(+), 67 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index f560690e54..4ee8a676dd 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -308,22 +308,28 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } - // TODO: Debug this when starting a matrix sizes <= 10x10 - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, + A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); @@ -341,22 +347,29 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { - auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL()); + for (int j = 0; j < _gemm_args.dims.c.k; j++) { + auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); + auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL()); + auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL()); + if (batch_size_last_dim) { + A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); + B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + } - SerialGemm::invoke( - _gemm_args.alpha, A, B, _gemm_args.beta, C); + SerialGemm::invoke( + _gemm_args.alpha, A, B, _gemm_args.beta, C); + } } }; - __do_loop(options.warm_up_n, gemm_args); + __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); - __do_loop(options.n, gemm_args); + __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim); Kokkos::fence(); __gemm_output_csv_row(options, gemm_args, timer.seconds()); #else @@ -400,56 +413,6 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) { return; } -#if !defined(KOKKOS_ENABLE_CUDA) -template -struct parallel_blas_gemm { - gemm_args_t gemm_args_; - - parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int &i) const { - auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL()); - auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL()); - auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL()); - - KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha, - svA, svB, gemm_args_.beta, svC); - } -}; -#endif // !KOKKOS_ENABLE_CUDA - -template -void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) { -#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) - uint32_t warm_up_n = options.warm_up_n; - uint32_t n = options.n; - Kokkos::Timer timer; - using execution_space = typename device_type::execution_space; - using functor_type = parallel_blas_gemm; - functor_type parallel_blas_gemm_functor(gemm_args); - - STATUS; - - Kokkos::parallel_for("parallelBlasWarmUpLoopGemm", - Kokkos::RangePolicy(0, warm_up_n), - parallel_blas_gemm_functor); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for("parallelBlasTimedLoopGemm", - Kokkos::RangePolicy(0, n), - parallel_blas_gemm_functor); - Kokkos::fence(); - __gemm_output_csv_row(options, gemm_args, timer.seconds()); -#else - std::cerr << std::string(__func__) - << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl; - __gemm_output_csv_row(options, gemm_args, -1); -#endif // !KOKKOS_ENABLE_CUDA - return; -} - template struct parallel_batched_gemm_range_policy { gemm_args_t gemm_args_; From 4ea0e4c863d3eab7de25c42ea6a90b05b19f4492 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 10 Mar 2021 15:46:18 -0700 Subject: [PATCH 45/47] test_common: Update VanillaGemm - Fix VanillaGemm to work with batch_size_last_dim=true when Cuda is enabled. --- test_common/KokkosKernels_TestUtils.hpp | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index ad546fe0b4..43f2d48460 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -167,24 +167,29 @@ namespace Test { using ScalarA = typename ViewTypeA::value_type; using ScalarB = typename ViewTypeB::value_type; using ScalarC = typename ViewTypeC::value_type; + using SubviewTypeA = typename Kokkos::View; + using SubviewTypeB = typename Kokkos::View; + using SubviewTypeC = typename Kokkos::View; + ScalarA alpha; ScalarC beta; KOKKOS_INLINE_FUNCTION void operator()(const typename Kokkos::TeamPolicy::member_type& team) const { int i = team.league_rank(); + SubviewTypeA _A; + SubviewTypeB _B; + SubviewTypeC _C; - auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); - auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); - auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); if (batch_size_last_dim) { _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i); _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i); _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i); + } else { + _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL()); + _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL()); + _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL()); } - using SubviewTypeA = decltype(_A); - using SubviewTypeB = decltype(_B); - using SubviewTypeC = decltype(_C); struct SharedVanillaGEMM vgemm; vgemm.A_t = A_t; vgemm.B_t = B_t; vgemm.A_c = A_c; vgemm.B_c = B_c; From 8522c914cbe58b14927903c690daa0904206df92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Mar 2021 11:55:01 -0700 Subject: [PATCH 46/47] perf_test/blas/blas3: Updates for half_t src/batched: Allow compile with half_t --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 58 +++++++++++++------ .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 24 +++++--- src/batched/KokkosBatched_Util.hpp | 3 +- 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index 4ee8a676dd..ffb13819b6 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -266,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, gflops = flops / 1e9; options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," - << options.blas_args.gemm.alpha << "," - << options.blas_args.gemm.beta << "," << ts << "," << vlen + << static_cast(options.blas_args.gemm.alpha) << "," + << static_cast(options.blas_args.gemm.beta) << "," << ts << "," << vlen << "," << loop_e_str[options.loop] << "," << __gemm_output_dim_string(options, gemm_args.dims.a) << "," << __gemm_output_dim_string(options, gemm_args.dims.b) << "," @@ -1315,7 +1315,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type STATUS; typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); - auto diff = static_cast(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k))); + auto diff = static_cast(Kokkos::Experimental::fabs(static_cast(h_expected(i,j,k) - h_actual(i,j,k)))); if (diff > epsilon) { printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", @@ -1367,10 +1367,11 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) template static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { - using scalar_type = typename dstViewType::value_type; + using dst_scalar_type = typename dstViewType::value_type; + using src_scalar_type = typename view_type_5d::value_type; if (options.blas_args.batch_size_last_dim) { - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(2) % simd_vector_size; remainder = remainder == 0 ? simd_internal_vector_size : remainder; @@ -1392,7 +1393,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } } } else { - view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; @@ -1416,7 +1417,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie } else { // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location // and the data can simply be copied. - memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); } } } @@ -1616,15 +1617,24 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool, + using tmp_view_type_4d = Kokkos::View; + tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool, + double>::max()); + tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool, + double>::max()); + tmp_view_type_4d tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, - scalar_type>::max()); + double>::max()); + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.Av.mat_4d, tmpA); + Kokkos::deep_copy(gemm_args.Bv.mat_4d, tmpB); + Kokkos::deep_copy(gemm_args.Cv.mat_4d, tmpC); + Kokkos::fence(); } else { if (options.blas_args.batch_size_last_dim) { gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k); @@ -1636,15 +1646,25 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - Kokkos::fill_random(gemm_args.A, rand_pool, + using tmp_view_type_3d = Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); + Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.B, rand_pool, + double>::max()); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); + Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, - scalar_type>::max()); - Kokkos::fill_random(gemm_args.C, rand_pool, + double>::max()); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2)); + Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, - scalar_type>::max()); + double>::max()); + + Kokkos::fence(); + Kokkos::deep_copy(gemm_args.A, tmpA); + Kokkos::deep_copy(gemm_args.B, tmpB); + Kokkos::deep_copy(gemm_args.C, tmpC); + Kokkos::fence(); } gemm_args.alpha = options.blas_args.gemm.alpha; gemm_args.beta = options.blas_args.gemm.beta; diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 6d67e96bd1..0a6741c603 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -175,7 +175,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args, options.out[0] << test_e_str[options.test] << "," << options.blas_args.trmm.trmm_args << "," - << options.blas_args.trmm.alpha << "," + << static_cast(options.blas_args.trmm.alpha) << "," << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1) @@ -624,10 +624,14 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - Kokkos::fill_random(trmm_args.A, rand_pool, - Kokkos::rand, - scalar_type>::max()); - Kokkos::deep_copy(host_A, trmm_args.A); + + { + Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(host_A, tmp); + } if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') { // Make A upper triangular @@ -663,9 +667,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { } Kokkos::deep_copy(trmm_args.A, host_A); - Kokkos::fill_random(trmm_args.B, rand_pool, - Kokkos::rand, - scalar_type>::max()); + { + Kokkos::View tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); + Kokkos::fill_random(tmp, rand_pool, + Kokkos::rand, + double>::max()); + Kokkos::deep_copy(trmm_args.B, tmp); + } return trmm_args; } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 3253b6ce12..4a5c17d1df 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -204,7 +204,8 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same >::value, + std::is_same >::value || + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type." ); using value_type = T; }; From 4f9dafa854357505c1243cebea6afcec24405dff Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 11 Mar 2021 13:16:36 -0700 Subject: [PATCH 47/47] perf_test/blas: Apply clang-format --- .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 263 +++++++++++------- .../blas/blas3/KokkosBlas3_perf_test.cpp | 14 +- .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 21 +- src/batched/KokkosBatched_Util.hpp | 2 +- 4 files changed, 188 insertions(+), 112 deletions(-) diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp index ffb13819b6..081b01bb58 100644 --- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp @@ -56,7 +56,7 @@ //#include "KokkosBatched_Gemm_Team_Impl.hpp" //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp" #include "KokkosBatched_Util.hpp" -#include "gtest/gtest.h" // EXPECT_NEAR +#include "gtest/gtest.h" // EXPECT_NEAR #include "KokkosKernels_TestUtils.hpp" //#define GEMM_PERF_TEST_DEBUG @@ -256,8 +256,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, double gflops; double average_time = time_in_seconds / options.n; - if (options.verify) - return; + if (options.verify) return; flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n, @@ -267,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args, options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << "," << static_cast(options.blas_args.gemm.alpha) << "," - << static_cast(options.blas_args.gemm.beta) << "," << ts << "," << vlen - << "," << loop_e_str[options.loop] << "," + << static_cast(options.blas_args.gemm.beta) << "," + << ts << "," << vlen << "," << loop_e_str[options.loop] << "," << __gemm_output_dim_string(options, gemm_args.dims.a) << "," << __gemm_output_dim_string(options, gemm_args.dims.b) << "," << __gemm_output_dim_string(options, gemm_args.dims.c) << "," @@ -308,7 +307,8 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { STATUS; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { for (int j = 0; j < _gemm_args.dims.c.k; j++) { auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); @@ -317,15 +317,16 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) { if (batch_size_last_dim) { A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); - C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); } - KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha, - A, B, _gemm_args.beta, C); + KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, + _gemm_args.alpha, A, B, _gemm_args.beta, C); } } }; - __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); @@ -347,7 +348,8 @@ void __do_gemm_serial_batched_template(options_t options, #if !defined(KOKKOS_ENABLE_CUDA) Kokkos::Timer timer; - auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) { + auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, + bool batch_size_last_dim) { for (uint32_t i = 0; i < n; ++i) { for (int j = 0; j < _gemm_args.dims.c.k; j++) { auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL()); @@ -356,7 +358,7 @@ void __do_gemm_serial_batched_template(options_t options, if (batch_size_last_dim) { A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j); B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j); - C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); + C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j); } SerialGemm::invoke( @@ -365,7 +367,8 @@ void __do_gemm_serial_batched_template(options_t options, } }; - __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim); + __do_loop(options.warm_up_n, gemm_args, + options.blas_args.batch_size_last_dim); Kokkos::fence(); timer.reset(); @@ -1311,18 +1314,22 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) { * @var epsilon: The tolerance to use when comparing. * @return true if the comparison fails and false if the comparison succeeds. */ -static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) { +static inline bool __gemm_print_compare_failure(view_type_3d expected, + view_type_3d actual, int i, + int j, int k, double epsilon) { STATUS; - typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected); - typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual); - auto diff = static_cast(Kokkos::Experimental::fabs(static_cast(h_expected(i,j,k) - h_actual(i,j,k)))); + typename view_type_3d::HostMirror h_expected = + Kokkos::create_mirror_view(expected); + typename view_type_3d::HostMirror h_actual = + Kokkos::create_mirror_view(actual); + auto diff = static_cast(Kokkos::Experimental::fabs( + static_cast(h_expected(i, j, k) - h_actual(i, j, k)))); if (diff > epsilon) { - printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", - i,j,k,static_cast(h_expected(i,j,k)), - i,j,k,static_cast(h_actual(i,j,k)), - diff, - epsilon); + printf( + "fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", + i, j, k, static_cast(h_expected(i, j, k)), i, j, k, + static_cast(h_actual(i, j, k)), diff, epsilon); FATAL_ERROR("Comparison failure!"); return true; } @@ -1336,7 +1343,8 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type * @return false if expected matches actual within epsilon, otherwise true. */ template -static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) { +static inline bool __gemm_do_compare(view_type_3d expected, + view_type_3d actual) { double epsilon = Test::epsilon::value * 1e3; STATUS; @@ -1354,7 +1362,7 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) if (std::is_same::value) { for (size_t k = 0; k < expected.extent(2); k++) { for (size_t j = 0; j < expected.extent(1); j++) { - for (size_t i = 0; i < expected.extent(0); i++) { + for (size_t i = 0; i < expected.extent(0); i++) { if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon)) return true; } @@ -1366,58 +1374,90 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) } template -static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) { +static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, + dstViewType dst, + options_t options) { using dst_scalar_type = typename dstViewType::value_type; using src_scalar_type = typename view_type_5d::value_type; if (options.blas_args.batch_size_last_dim) { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = + Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(2) % simd_vector_size; - remainder = remainder == 0 ? simd_internal_vector_size : remainder; - - // The below loops copies each corresponding 2-rank matrix within the simd view back to the - // 3-rank view. - for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { - auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx); + remainder = remainder == 0 ? simd_internal_vector_size : remainder; + + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + auto sv0 = + Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; + vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) { + auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < src.ivec_4d.extent(3); + simd_batch_size_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), + simd_batch_size_idx); for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n); + dst(m, n, + simd_internal_vec_idx + simd_batch_size_idx + + vector_batch_idx) = sv2(m, n); } } } } } } else { - view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3)); - typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw); + view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), + simd_internal_vector_size, src.ivec_4d.extent(0), + src.ivec_4d.extent(1), src.ivec_4d.extent(2), + src.ivec_4d.extent(3)); + typename view_type_5d::HostMirror h_src_raw = + Kokkos::create_mirror_view(src_raw); size_t remainder = dst.extent(0) % simd_vector_size; if (remainder > 0) { - // The below loops copies each corresponding 2-rank matrix within the simd view back to the - // 3-rank view. - for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) { - auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) { - auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { - auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx); + // The below loops copies each corresponding 2-rank matrix within the simd + // view back to the 3-rank view. + for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; + simd_internal_vec_idx++) { + auto sv0 = + Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); + for (size_t simd_batch_size_idx = 0; + simd_batch_size_idx < src.ivec_4d.extent(0); + simd_batch_size_idx++) { + auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), + Kokkos::ALL(), Kokkos::ALL()); + for (size_t vector_batch_idx = 0; + vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) { + auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), + vector_batch_idx); for (size_t m = 0; m < src.ivec_4d.extent(1); m++) { for (size_t n = 0; n < src.ivec_4d.extent(2); n++) { - dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n); + dst(simd_internal_vec_idx + simd_batch_size_idx + + vector_batch_idx, + m, n) = sv2(m, n); } } } } } } else { - // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location - // and the data can simply be copied. - memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2)); + // When the batch_size is a multiple of the simd_vector_size, each 2-rank + // matrix lies in the correct location and the data can simply be copied. + memcpy(dst.data(), src.ivec_4d.data(), + sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * + dst.extent(2)); } } } @@ -1429,22 +1469,26 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie * @return false if expected matches actual within epsilon, otherwise true. */ template -static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) { - decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2)); +static inline bool __gemm_do_compare(view_type_3d expected, + gemm_simd_args_t actual, + options_t options) { + decltype(expected) actual_data("actual_data", expected.extent(0), + expected.extent(1), expected.extent(2)); STATUS; // Copy the simd view to a 3d view for comparision. - // NOTE: The raw results are different when batch_size % simd_vector_size != 0. - // Also note that when batch_size % simd_vector_size != 0, the simd operation - // calculates results that we do not require. - // So, we end up running an extra batch_size % simd_vector_size GEMMs! + // NOTE: The raw results are different when batch_size % simd_vector_size != + // 0. Also note that when batch_size % simd_vector_size != 0, the simd + // operation calculates results that we do not require. So, we end up running + // an extra batch_size % simd_vector_size GEMMs! __gemm_copy_simd_view_to_3d_view(actual, actual_data, options); return __gemm_do_compare(expected, actual_data); } template -static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) { +static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, + void (*fn)(options_t, gemm_args_t)) { using execution_space = typename DeviceType::execution_space; // Just create "expected" types using non-simd types. decltype(gemm_args.C) C_expected; @@ -1453,13 +1497,19 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo STATUS; if (options.blas_args.batch_size_last_dim) { - C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k); - A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k); - B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k); + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, + gemm_args.dims.c.n, gemm_args.dims.c.k); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, + gemm_args.dims.a.n, gemm_args.dims.a.k); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, + gemm_args.dims.b.n, gemm_args.dims.b.k); } else { - C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n); - A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n); - B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n); + C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, + gemm_args.dims.c.m, gemm_args.dims.c.n); + A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, + gemm_args.dims.a.m, gemm_args.dims.a.n); + B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, + gemm_args.dims.b.m, gemm_args.dims.b.n); } // Initialize "expected" matrices. @@ -1468,44 +1518,50 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo Kokkos::deep_copy(A_expected, gemm_args.A); Kokkos::deep_copy(B_expected, gemm_args.B); - Kokkos::fence(); // Ensure that deep_copy has completed + Kokkos::fence(); // Ensure that deep_copy has completed // Check that initial values match if (__gemm_do_compare(C_expected, gemm_args.C)) FATAL_ERROR("Inital values mismatch!"); } else if (gemm_args.Cv.vec_3d.data() != nullptr) { - __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, C_expected, options); - __gemm_copy_simd_view_to_3d_view(gemm_args.Av, A_expected, options); - __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, B_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Cv, + C_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Av, + A_expected, options); + __gemm_copy_simd_view_to_3d_view(gemm_args.Bv, + B_expected, options); // Check that initial values match - if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) FATAL_ERROR("Inital values mismatch!"); } else { FATAL_ERROR("Input arguments are empty!"); } // Populate "expected" matrices via VanillaGemm - Test::Functor_BatchedVanillaGEMM vgemm; + Test::Functor_BatchedVanillaGEMM + vgemm; vgemm.A_t = toupper(gemm_args.transA) == 'T'; vgemm.B_t = toupper(gemm_args.transB) == 'T'; - vgemm.A_c = vgemm.B_c = false; + vgemm.A_c = vgemm.B_c = false; vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim; - vgemm.A = A_expected; - vgemm.B = B_expected; - vgemm.C = C_expected; - vgemm.alpha = gemm_args.alpha; - vgemm.beta = gemm_args.beta; - vgemm.run(); // Compute C_expected - - // Run routine with warm_up_n = 1 and n = 0. + vgemm.A = A_expected; + vgemm.B = B_expected; + vgemm.C = C_expected; + vgemm.alpha = gemm_args.alpha; + vgemm.beta = gemm_args.beta; + vgemm.run(); // Compute C_expected + + // Run routine with warm_up_n = 1 and n = 0. auto warm_up_n_bak = options.warm_up_n; - options.warm_up_n = 1; - auto n_bak = options.n; - options.n = 0; + options.warm_up_n = 1; + auto n_bak = options.n; + options.n = 0; fn(options, gemm_args); - Kokkos::fence(); // Redundant fence. + Kokkos::fence(); // Redundant fence. // Check the result if (gemm_args.C.data() != nullptr) { @@ -1514,14 +1570,15 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo } if (gemm_args.Cv.vec_3d.data() != nullptr) { - if (__gemm_do_compare(C_expected, gemm_args.Cv, options)) + if (__gemm_do_compare(C_expected, gemm_args.Cv, + options)) FATAL_ERROR("Result value mismatch!"); } // Run actual timed test. - options.verify = false; // Set verify to false for csv output. + options.verify = false; // Set verify to false for csv output. options.warm_up_n = warm_up_n_bak; - options.n = n_bak; + options.n = n_bak; fn(options, gemm_args); // Reset verify for next matrix size. @@ -1617,16 +1674,23 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { // Use the non-simd 4-rank view type to randomly populate the gemm simd // arguments - using tmp_view_type_4d = Kokkos::View; - tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); + using tmp_view_type_4d = + Kokkos::View; + tmp_view_type_4d tmpA( + "tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), + gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); + tmp_view_type_4d tmpB( + "tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), + gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3)); Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_4d tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); + tmp_view_type_4d tmpC( + "tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), + gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3)); Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, double>::max()); @@ -1646,16 +1710,20 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n); } - using tmp_view_type_3d = Kokkos::View; - tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2)); + using tmp_view_type_3d = + Kokkos::View; + tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), + gemm_args.A.extent(2)); Kokkos::fill_random(tmpA, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2)); + tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), + gemm_args.B.extent(2)); Kokkos::fill_random(tmpB, rand_pool, Kokkos::rand, double>::max()); - tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2)); + tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), + gemm_args.C.extent(2)); Kokkos::fill_random(tmpC, rand_pool, Kokkos::rand, double>::max()); @@ -1671,7 +1739,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) { gemm_args.bp.team_size = options.blas_args.team_size; gemm_args.bp.vector_len = options.blas_args.vector_len; - Kokkos::fence(); // Ensure that fill_random has completed. + Kokkos::fence(); // Ensure that fill_random has completed. return gemm_args; } @@ -1702,7 +1770,8 @@ void __do_loop_and_invoke(options_t options, view_type_3d, default_device>(options, cur_dims); if (options.verify) { - __gemm_do_verify(options, gemm_args, fn); + __gemm_do_verify( + options, gemm_args, fn); } else { fn(options, gemm_args); } diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp index 7e1cdf0f2f..149cc00fd1 100644 --- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp +++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp @@ -128,7 +128,8 @@ static void __print_help_blas3_perf_test() { "\t\tWhether to use Kokkos::AUTO for vector_len and team_size " "(Heirarchical parallelism).\n"); printf( - "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size " + "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use " + "--vector_len and --team_size " "instead. (default: %d)\n", DEFAULT_USE_AUTO); @@ -139,7 +140,8 @@ static void __print_help_blas3_perf_test() { printf("\t-d, --batch_size_last_dim=LAST_DIM\n"); printf("\t\tHow to allocate the batch_size in the matrices.\n"); printf( - "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size " + "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last " + "dimension and 0 to make the batch_size " "the first dimension (default: %d)\n", DEFAULT_BATCH_SIZE_LAST_DIM); @@ -212,7 +214,8 @@ static void __print_help_blas3_perf_test() { printf("\t-v, --verify=VERIFY\n"); printf("\t\tVerification selection. (untimed)\n"); printf( - "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. " + "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to " + "verify before timing. " "(default: %d)\n", DEFAULT_VERIFY); } @@ -275,8 +278,9 @@ int main(int argc, char **argv) { options.blas_args.gemm.alpha = DEFAULT_GEMM_ALPHA; options.blas_args.gemm.beta = DEFAULT_GEMM_BETA; - while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", - long_options, &option_idx)) != -1) { + while ( + (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:", + long_options, &option_idx)) != -1) { switch (ret) { case 'h': __print_help_blas3_perf_test(); return 0; case 't': diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp index 0a6741c603..de2bbd9ce9 100644 --- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp +++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp @@ -106,8 +106,8 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, // Flop count formula from lapack working note 41: // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf -static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m, - double a_n) { +static inline double __trmm_flop_count(char side, double b_m, double b_n, + double a_m, double a_n) { double flops; if (side == 'L' || side == 'l') { @@ -624,12 +624,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { trmm_args.alpha = options.blas_args.trmm.alpha; host_A = Kokkos::create_mirror_view(trmm_args.A); - { - Kokkos::View tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), + trmm_args.A.extent(2)); Kokkos::fill_random(tmp, rand_pool, - Kokkos::rand, - double>::max()); + Kokkos::rand, + double>::max()); Kokkos::deep_copy(host_A, tmp); } @@ -668,10 +669,12 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) { Kokkos::deep_copy(trmm_args.A, host_A); { - Kokkos::View tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2)); + Kokkos::View tmp( + "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), + trmm_args.B.extent(2)); Kokkos::fill_random(tmp, rand_pool, - Kokkos::rand, - double>::max()); + Kokkos::rand, + double>::max()); Kokkos::deep_copy(trmm_args.B, tmp); } diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 4a5c17d1df..eb9883c425 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -205,7 +205,7 @@ namespace KokkosBatched { std::is_same >::value || std::is_same >::value || std::is_same >::value || - std::is_same::value, + std::is_same::value, "KokkosKernels:: Invalid SIMD<> type." ); using value_type = T; };