From b1d26ed2e98f1dd33f5c8584c41ec3a405499b3d Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 9 Feb 2021 14:20:42 -0700
Subject: [PATCH 01/47] perf_test/blas:     - Add GFLOP/s output     - Add
 support for separate batch_size option     - Update step option to add step
 size

---
 perf_test/blas/blas/KokkosBlas_common.hpp     |   3 +-
 perf_test/blas/blas/KokkosBlas_perf_test.cpp  |  17 +-
 .../blas/blas/KokkosBlas_trtri_perf_test.hpp  | 129 ++++++++++-----
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |   2 +-
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 150 ++++++++++++------
 5 files changed, 210 insertions(+), 91 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_common.hpp b/perf_test/blas/blas/KokkosBlas_common.hpp
index a6f9c65d8b..54e79647bf 100644
--- a/perf_test/blas/blas/KokkosBlas_common.hpp
+++ b/perf_test/blas/blas/KokkosBlas_common.hpp
@@ -56,6 +56,7 @@
 #define DEFAULT_STEP 3
 #define DEFAULT_WARM_UP_N 100
 #define DEFAULT_N 100
+#define DEFAULT_K 10
 #define DEFAULT_OUT &std::cout
 #define DEFAULT_BLAS_ROUTINES "trtri,"
 
@@ -117,7 +118,7 @@ static std::string test_e_str[TEST_N]{"BLAS", "BATCHED"};
  * @var n: Number of columns.
  */
 struct matrix_dim {
-  int m, n;
+  int k, m, n;
 };
 typedef struct matrix_dim matrix_dim_t;
 
diff --git a/perf_test/blas/blas/KokkosBlas_perf_test.cpp b/perf_test/blas/blas/KokkosBlas_perf_test.cpp
index 46e89d5abb..803286f266 100644
--- a/perf_test/blas/blas/KokkosBlas_perf_test.cpp
+++ b/perf_test/blas/blas/KokkosBlas_perf_test.cpp
@@ -57,6 +57,7 @@ static struct option long_options[] = {
     {"matrix_size_step", required_argument, 0, 's'},
     {"warm_up_loop", required_argument, 0, 'w'},
     {"iter", required_argument, 0, 'i'},
+    {"batch_size", required_argument, 0, 'k'},
     {"csv", required_argument, 0, 'c'},
     {"routines", required_argument, 0, 'r'},
     {"trtri_options", required_argument, 0, 'o'},
@@ -135,6 +136,11 @@ static void __print_help_blas_perf_test() {
       "(default: %d)\n\n",
       DEFAULT_N);
 
+  printf("\t-k, --batch_size=LEN\n");
+  printf("\t\tBatch size. Adds third dimension to matrices A and B.\n");
+  printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
+         DEFAULT_K);
+
   printf("\t-c, --csv=/path/to/file.csv\n");
   printf("\t\tCsv output file selection.\n");
   printf(
@@ -166,12 +172,16 @@ int main(int argc, char **argv) {
   /* set default options */
   options.test          = DEFAULT_TEST;
   options.loop          = DEFAULT_LOOP;
+  options.start.a.k     = DEFAULT_K;
   options.start.a.m     = DEFAULT_MATRIX_START;
   options.start.a.n     = DEFAULT_MATRIX_START;
+  options.stop.a.k      = DEFAULT_K;
   options.stop.a.m      = DEFAULT_MATRIX_STOP;
   options.stop.a.n      = DEFAULT_MATRIX_STOP;
+  options.start.b.k     = DEFAULT_K;
   options.start.b.m     = DEFAULT_MATRIX_START;
   options.start.b.n     = DEFAULT_MATRIX_START;
+  options.stop.b.k      = DEFAULT_K;
   options.stop.b.m      = DEFAULT_MATRIX_STOP;
   options.stop.b.n      = DEFAULT_MATRIX_STOP;
   options.step          = DEFAULT_STEP;
@@ -182,7 +192,7 @@ int main(int argc, char **argv) {
 
   options.blas_args.trtri.trtri_args = DEFAULT_TRTRI_ARGS;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:", long_options,
+  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:c:r:k:", long_options,
                             &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas_perf_test(); return 0;
@@ -255,6 +265,11 @@ int main(int argc, char **argv) {
       case 's': options.step = atoi(optarg); break;
       case 'w': options.warm_up_n = atoi(optarg); break;
       case 'i': options.n = atoi(optarg); break;
+      case 'k':
+        options.start.a.k = options.stop.a.k = 
+        options.start.b.k = options.stop.b.k =
+          atoi(optarg);
+        break;
       case 'c':
         out_file         = optarg;
         options.out_file = std::string(out_file);
diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index e6b7b825a7..34c0237871 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -78,6 +78,21 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
 /*************************** Test types and defaults **************************/
 #define DEFAULT_TRTRI_ARGS "UU"
 
+  /**
+   * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks
+   * of the A matrix. a_m subblocks are selected.
+   */
+static inline int trtri_flop_count(int a_m, int a_n) {
+  int flop_count = 0;
+
+  for (int i = 0; i < a_m; i++) {
+    flop_count++;                // 1 / A[i,j]
+    flop_count += (i * (i + 1)); // TRMM FLOPS
+    flop_count += i;             // SCAL FLOPS
+  }
+  return flop_count;
+}
+
 using view_type_3d =
     Kokkos::View<default_scalar***, default_layout, default_device>;
 struct trtri_args {
@@ -87,18 +102,25 @@ struct trtri_args {
 typedef struct trtri_args trtri_args_t;
 
 static std::string trtri_csv_header_str =
-    "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,warm_up_n,iter,"
-    "total_time(s),average_time(s)";
+    "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter,"
+    "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
 static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
                                    double time_in_seconds) {
+  double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2));
+  double gflops = flops / 1e9;  // 1 GFLOP = 1e9 FLOP (the literal 10e9 is 1e10)
+  double average_time = time_in_seconds / options.n;
+
   options.out[0] << test_e_str[options.test] << ","
                  << options.blas_args.trtri.trtri_args << ","
-                 << loop_e_str[options.loop] << "," << trtri_args.A.extent(1)
+                 << loop_e_str[options.loop] << "," << trtri_args.A.extent(0) << "x" << trtri_args.A.extent(1)
                  << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
-                 << time_in_seconds / options.n << std::endl;
+                 << average_time << ","
+                 << gflops << ","
+                 << gflops / average_time
+                 << std::endl;
 }
 
 static void __print_trtri_perf_test_options(options_t options) {
@@ -133,19 +155,26 @@ void __do_trtri_serial_blas(options_t options, trtri_args_t trtri_args) {
 
   STATUS;
 
-  for (uint32_t i = 0; i < warm_up_n; ++i) {
-    auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
 
-    KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
+      KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
+    }
+    // Fence after each batch operation
+    Kokkos::fence();
   }
 
   timer.reset();
-  for (uint32_t i = 0; i < n; ++i) {
-    auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
 
-    KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
+      KokkosBlas::trtri(&trtri_args.uplo, &trtri_args.diag, A);
+    }
+    // Fence after each batch operation
+    Kokkos::fence();
   }
-  Kokkos::fence();
   __trtri_output_csv_row(options, trtri_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -164,19 +193,26 @@ void __do_trtri_serial_batched_template(options_t options,
   Kokkos::Timer timer;
   using tag = Algo::Trtri::Unblocked;
 
-  for (uint32_t i = 0; i < warm_up_n; ++i) {
-    auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
 
-    SerialTrtri<uplo, diag, tag>::invoke(A);
+      SerialTrtri<uplo, diag, tag>::invoke(A);
+    }
+    // Fence after each batch operation
+    Kokkos::fence();
   }
 
   timer.reset();
-  for (uint32_t i = 0; i < n; ++i) {
-    auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trtri_args.A, i, Kokkos::ALL(), Kokkos::ALL());
 
-    SerialTrtri<uplo, diag, tag>::invoke(A);
+      SerialTrtri<uplo, diag, tag>::invoke(A);
+    }
+    // Fence after each batch operation
+    Kokkos::fence();
   }
-  Kokkos::fence();
   __trtri_output_csv_row(options, trtri_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -241,16 +277,22 @@ void __do_trtri_parallel_blas(options_t options, trtri_args_t trtri_args) {
 
   STATUS;
 
-  Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri",
-                       Kokkos::RangePolicy<execution_space>(0, warm_up_n),
-                       parallel_blas_trtri_functor);
-  Kokkos::fence();
+  for (uint32_t i = 0; i < warm_up_n; ++i) {
+    Kokkos::parallel_for("parallelBlasWarmUpLoopTrtri",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_blas_trtri_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
 
   timer.reset();
-  Kokkos::parallel_for("parallelBlasTimedLoopTrtri",
-                       Kokkos::RangePolicy<execution_space>(0, n),
-                       parallel_blas_trtri_functor);
-  Kokkos::fence();
+  for (uint32_t i = 0; i < n; ++i) {
+    Kokkos::parallel_for("parallelBlasTimedLoopTrtri",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_blas_trtri_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
   __trtri_output_csv_row(options, trtri_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -287,16 +329,23 @@ void __do_trtri_parallel_batched_template(options_t options,
 
   STATUS;
 
-  Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri",
-                       Kokkos::RangePolicy<execution_space>(0, warm_up_n),
-                       parallel_batched_trtri_functor);
-  Kokkos::fence();
+  for (uint32_t i = 0; i < warm_up_n; ++i) {
+    Kokkos::parallel_for("parallelBatchedWarmUpLoopTrtri",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_batched_trtri_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
 
   timer.reset();
-  Kokkos::parallel_for("parallelBatchedTimedLoopTrtri",
-                       Kokkos::RangePolicy<execution_space>(0, n),
-                       parallel_batched_trtri_functor);
-  Kokkos::fence();
+
+  for (uint32_t i = 0; i < n; ++i) {
+    Kokkos::parallel_for("parallelBatchedTimedLoopTrtri",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_batched_trtri_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
   __trtri_output_csv_row(options, trtri_args, timer.seconds());
 
   return;
@@ -345,7 +394,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
 
   trtri_args.uplo = options.blas_args.trtri.trtri_args.c_str()[0];
   trtri_args.diag = options.blas_args.trtri.trtri_args.c_str()[1];
-  trtri_args.A    = vta("trtri_args.A", options.n, dim.a.m, dim.a.n);
+  trtri_args.A    = vta("trtri_args.A", dim.a.k, dim.a.m, dim.a.n);
   host_A          = Kokkos::create_mirror_view(trtri_args.A);
 
   Kokkos::fill_random(trtri_args.A, rand_pool,
@@ -355,7 +404,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
 
   if (trtri_args.uplo == 'U' || trtri_args.uplo == 'u') {
     // Make A upper triangular
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 1; i < dim.a.m; i++) {
         for (int j = 0; j < i; j++) {
@@ -367,7 +416,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
     // Make A lower triangular
     // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int&
     // i) {
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 0; i < dim.a.m - 1; i++) {
         for (int j = i + 1; j < dim.a.n; j++) {
@@ -378,7 +427,7 @@ trtri_args_t __do_setup(options_t options, matrix_dims_t dim) {
   }
 
   if (trtri_args.diag == 'U' || trtri_args.diag == 'u') {
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 0; i < min_dim; i++) {
         A(i, i) = scalar_type(1);
@@ -408,8 +457,8 @@ void __do_loop_and_invoke(options_t options,
   for (cur_dims = options.start;
        cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n &&
        cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n;
-       cur_dims.a.m *= options.step, cur_dims.a.n *= options.step,
-      cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) {
+       cur_dims.a.m += options.step, cur_dims.a.n += options.step,
+      cur_dims.b.m += options.step, cur_dims.b.n += options.step) {
     trtri_args = __do_setup<default_scalar, view_type_3d, default_device>(
         options, cur_dims);
     fn(options, trtri_args);
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index b493c244d8..6c95960e25 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -122,7 +122,7 @@ static void __print_help_blas3_perf_test() {
   printf("\t-k, --batch_size=LEN\n");
   printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n");
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
-         DEFAULT_VECTOR_LEN);
+         DEFAULT_K);
 
   printf("\t-l, --loop_type=OPTION\n");
   printf("\t\tLoop selection.\n");
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 70f7664679..79b58dc7d8 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -72,6 +72,26 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
 #define DEFAULT_TRMM_ARGS "LUNU"
 #define DEFAULT_TRMM_ALPHA 1.0
 
+/**
+ * The KokkosBatched::SerialTrmm implementation performs dot products on
+ * non-zero elements of the triangular matrices. The flop calculation below
+ * assumes KokkosBatched::SerialTrmm is being used. Since the dot products
+ * do a multiply and add we can calculate the flops for any element in the last
+ * column of the LHS to be 2*columns_LHS, any element in the last-1 column of
+ * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the LHS
+ * giving us this flop count:
+ *  flops = columns_LHS * (columns_LHS + 1)
+ *  flops = (flops / 2) * 2
+ *  flops = flops * rows_LHS
+ */
+static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+  if (side == 'L' || side == 'l') {
+    return (a_n * (a_n + 1)) * a_m;
+  } else {
+    return (b_n * (b_n + 1)) * b_m;
+  }
+}
+
 using view_type_3d =
     Kokkos::View<default_scalar***, default_layout, default_device>;
 struct trmm_args {
@@ -83,19 +103,28 @@ typedef struct trmm_args trmm_args_t;
 
 static std::string trmm_csv_header_str =
     "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n,"
-    "iter,total_time(s),average_time(s)";
+    "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
 static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                                   double time_in_seconds) {
+  double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side,
+                                                         trmm_args.B.extent(1), trmm_args.B.extent(2),
+                                                         trmm_args.A.extent(1), trmm_args.A.extent(2));
+  double gflops = flops / 1e9;  // 1 GFLOP = 1e9 FLOP (the literal 10e9 is 1e10)
+  double average_time = time_in_seconds / options.n;
+
   options.out[0] << test_e_str[options.test] << ","
                  << options.blas_args.trmm.trmm_args << ","
                  << options.blas_args.trmm.alpha << ","
-                 << loop_e_str[options.loop] << "," << trmm_args.A.extent(1)
-                 << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(1)
+                 << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1)
+                 << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1)
                  << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
-                 << time_in_seconds / options.n << std::endl;
+                 << average_time << ","
+                 << gflops << ","
+                 << gflops / average_time
+                 << std::endl;
 }
 
 static void __print_trmm_perf_test_options(options_t options) {
@@ -131,24 +160,30 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) {
 
   STATUS;
 
-  for (uint32_t i = 0; i < warm_up_n; ++i) {
-    auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-    auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+      auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
-    KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
-                     &trmm_args.diag, trmm_args.alpha, A, B);
+      KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
+                      &trmm_args.diag, trmm_args.alpha, A, B);
+    }
+    // Fence after submitting each batch operation
+    Kokkos::fence();
   }
 
-  Kokkos::fence();
   timer.reset();
-  for (uint32_t i = 0; i < n; ++i) {
-    auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-    auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+      auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
-    KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
-                     &trmm_args.diag, trmm_args.alpha, A, B);
+      KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
+                      &trmm_args.diag, trmm_args.alpha, A, B);
+    }
+    // Fence after submitting each batch operation
+    Kokkos::fence();
   }
-  Kokkos::fence();
   __trmm_output_csv_row(options, trmm_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -167,21 +202,28 @@ void __do_trmm_serial_batched_template(options_t options,
   Kokkos::Timer timer;
   using tag = Algo::Trmm::Unblocked;
 
-  for (uint32_t i = 0; i < warm_up_n; ++i) {
-    auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-    auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+      auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
-    SerialTrmm<side, uplo, trans, diag, tag>::invoke(trmm_args.alpha, A, B);
+      SerialTrmm<side, uplo, trans, diag, tag>::invoke(trmm_args.alpha, A, B);
+    }
+    // Fence after submitting each batch operation
+    Kokkos::fence();
   }
 
   timer.reset();
-  for (uint32_t i = 0; i < n; ++i) {
-    auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-    auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
+  for (uint32_t j = 0; j < n; ++j) {
+    for (int i = 0; i < options.start.a.k; ++i) {
+      auto A = Kokkos::subview(trmm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
+      auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
-    SerialTrmm<side, uplo, trans, diag, tag>::invoke(trmm_args.alpha, A, B);
+      SerialTrmm<side, uplo, trans, diag, tag>::invoke(trmm_args.alpha, A, B);
+    }
+    // Fence after submitting each batch operation
+    Kokkos::fence();
   }
-  Kokkos::fence();
   __trmm_output_csv_row(options, trmm_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -316,16 +358,22 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) {
 
   STATUS;
 
-  Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm",
-                       Kokkos::RangePolicy<execution_space>(0, warm_up_n),
-                       parallel_blas_trmm_functor);
-  Kokkos::fence();
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_blas_trmm_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
 
   timer.reset();
-  Kokkos::parallel_for("parallelBlasTimedLoopTrmm",
-                       Kokkos::RangePolicy<execution_space>(0, n),
-                       parallel_blas_trmm_functor);
-  Kokkos::fence();
+  for (uint32_t j = 0; j < n; ++j) {
+    Kokkos::parallel_for("parallelBlasTimedLoopTrmm",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_blas_trmm_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
   __trmm_output_csv_row(options, trmm_args, timer.seconds());
 #else
   std::cerr << std::string(__func__)
@@ -368,16 +416,22 @@ void __do_trmm_parallel_batched_template(options_t options,
 
   STATUS;
 
-  Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm",
-                       Kokkos::RangePolicy<execution_space>(0, warm_up_n),
-                       parallel_batched_trmm_functor);
-  Kokkos::fence();
+  for (uint32_t j = 0; j < warm_up_n; ++j) {
+    Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_batched_trmm_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
 
   timer.reset();
-  Kokkos::parallel_for("parallelBatchedTimedLoopTrmm",
-                       Kokkos::RangePolicy<execution_space>(0, n),
-                       parallel_batched_trmm_functor);
-  Kokkos::fence();
+  for (uint32_t j = 0; j < n; ++j) {
+    Kokkos::parallel_for("parallelBatchedTimedLoopTrmm",
+                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+                        parallel_batched_trmm_functor);
+    // Fence after each batch operation
+    Kokkos::fence();
+  }
   __trmm_output_csv_row(options, trmm_args, timer.seconds());
 
   return;
@@ -498,8 +552,8 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   trmm_args.uplo  = options.blas_args.trmm.trmm_args.c_str()[1];
   trmm_args.trans = options.blas_args.trmm.trmm_args.c_str()[2];
   trmm_args.diag  = options.blas_args.trmm.trmm_args.c_str()[3];
-  trmm_args.A     = vta("trmm_args.A", options.n, dim.a.m, dim.a.n);
-  trmm_args.B     = vtb("trmm_args.B", options.n, dim.b.m, dim.b.n);
+  trmm_args.A     = vta("trmm_args.A", dim.a.k, dim.a.m, dim.a.n);
+  trmm_args.B     = vtb("trmm_args.B", dim.b.k, dim.b.m, dim.b.n);
   trmm_args.alpha = options.blas_args.trmm.alpha;
   host_A          = Kokkos::create_mirror_view(trmm_args.A);
 
@@ -510,7 +564,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
 
   if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') {
     // Make A upper triangular
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 1; i < dim.a.m; i++) {
         for (int j = 0; j < i; j++) {
@@ -522,7 +576,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
     // Make A lower triangular
     // Kokkos::parallel_for("toLowerLoop", options.n, KOKKOS_LAMBDA (const int&
     // i) {
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 0; i < dim.a.m - 1; i++) {
         for (int j = i + 1; j < dim.a.n; j++) {
@@ -533,7 +587,7 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   }
 
   if (trmm_args.diag == 'U' || trmm_args.diag == 'u') {
-    for (uint32_t k = 0; k < options.n; ++k) {
+    for (int k = 0; k < dim.a.k; ++k) {
       auto A = Kokkos::subview(host_A, k, Kokkos::ALL(), Kokkos::ALL());
       for (int i = 0; i < min_dim; i++) {
         A(i, i) = scalar_type(1);
@@ -566,8 +620,8 @@ void __do_loop_and_invoke(options_t options,
   for (cur_dims = options.start;
        cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n &&
        cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n;
-       cur_dims.a.m *= options.step, cur_dims.a.n *= options.step,
-      cur_dims.b.m *= options.step, cur_dims.b.n *= options.step) {
+       cur_dims.a.m += options.step, cur_dims.a.n += options.step,
+      cur_dims.b.m += options.step, cur_dims.b.n += options.step) {
     trmm_args =
         __do_setup<default_scalar, view_type_3d, view_type_3d, default_device>(
             options, cur_dims);

From 3211987c766583f587d3ad9bffca32e3e59d3d18 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 08:41:57 -0700
Subject: [PATCH 02/47] perf_test: Account for complex flop counts

---
 .../blas/blas/KokkosBlas_trtri_perf_test.hpp  | 21 ++++++++++++++++---
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 17 +++++++++++++--
 2 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index 34c0237871..3cacc73739 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -84,12 +84,27 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
    */
 static inline int trtri_flop_count(int a_m, int a_n) {
   int flop_count = 0;
+  int flops_per_div, flops_per_mul, flops_per_add;
+
+    if (std::is_same<double, default_scalar>::value ||
+        std::is_same<float, default_scalar>::value ||
+        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value) {
+      flops_per_div = 1;
+      flops_per_mul = 1;
+      flops_per_add = 1;
+    } else {
+      // For complex, we need to count 2 flops for each add and 6 flops for each multiply or divide.
+      flops_per_div = 6;
+      flops_per_mul = 6;
+      flops_per_add = 2;
+    }
 
   for (int i = 0; i < a_m; i++) {
-    flop_count++;                // 1 / A[i,j]
-    flop_count += (i * (i + 1)); // TRMM FLOPS
-    flop_count += i;             // SCAL FLOPS
+    flop_count += flops_per_div;                                         // 1 / A[i,j]
+    flop_count += ((i * (i + 1)) / 2) * (flops_per_mul + flops_per_add); // TRMM FLOPS
+    flop_count += i * flops_per_mul;                                     // SCAL FLOPS
   }
+
   return flop_count;
 }
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 79b58dc7d8..077c5b3d80 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -85,11 +85,23 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
  *  flops = flops * rows_LHS
  */
 static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+  int flops;
+
   if (side == 'L' || side == 'l') {
-    return (a_n * (a_n + 1)) * a_m;
+      flops = (a_n * (a_n + 1)) * a_m;
   } else {
-    return (b_n * (b_n + 1)) * b_m;
+      flops = (b_n * (b_n + 1)) * b_m;
   }
+
+  if (std::is_same<double, default_scalar>::value ||
+        std::is_same<float, default_scalar>::value ||
+        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+      return flops;
+
+  // Account for 6 additional flops when complex numbers are used.
+  // Above we have counted 1 flop for each add and 1 flop for each multiply.
+  // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+  return flops * 4;
 }
 
 using view_type_3d =
@@ -348,6 +360,7 @@ struct parallel_blas_trmm {
 
 template <class scalar_type, class vta, class vtb, class device_type>
 void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) {
+// TODO: Note why this is disabled on CUDA and HIP
 #if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;

From 667fee39d51110ee2bcd3b7e1216f0d91eac9685 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 14:08:10 -0700
Subject: [PATCH 03/47] perf_test: Use flop counts from lapack note 41

---
 .../blas/blas/KokkosBlas_trtri_perf_test.hpp  | 34 +++++++++++++++++--
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index 3cacc73739..de24a96254 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
    * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks
    * of the A matrix. a_m subblocks are selected.
    */
-static inline int trtri_flop_count(int a_m, int a_n) {
+static inline int trtri_impl_flop_count(int a_m, int a_n) {
   int flop_count = 0;
   int flops_per_div, flops_per_mul, flops_per_add;
 
@@ -108,6 +108,34 @@ static inline int trtri_flop_count(int a_m, int a_n) {
   return flop_count;
 }
 
+// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
+static inline double trtri_flop_count(int a_m, int a_n) {
+  double flops;  // double: an int overflows once n reaches the low thousands
+  int flops_per_mul;
+  int flops_per_add;
+
+  if (a_m != a_n) {  // the count below is only computed for a square A
+    fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__);
+    exit(255);
+  }
+
+  if (std::is_same<double, default_scalar>::value ||
+        std::is_same<float, default_scalar>::value ||
+        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value) {
+    flops_per_mul = 1;
+    flops_per_add = 1;
+  } else {
+    // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+    flops_per_mul = 6;
+    flops_per_add = 2;
+  }
+  // Multiplications: n^3/6 + n^2/2 + n/3; additions: n^3/6 - n^2/2 + n/3.
+  flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul +
+          (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add;
+
+  return flops;
+}
+
 using view_type_3d =
     Kokkos::View<default_scalar***, default_layout, default_device>;
 struct trtri_args {
@@ -118,7 +146,7 @@ typedef struct trtri_args trtri_args_t;
 
 static std::string trtri_csv_header_str =
     "algorithm,side-uplo-trans-diag,loop_type,A_dims,warm_up_n,iter,"
-    "total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)";
+    "total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
 static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
@@ -133,7 +161,7 @@ static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
                  << "x" << trtri_args.A.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
                  << average_time << ","
-                 << gflops << ","
+                 << flops << ","
                  << gflops / average_time
                  << std::endl;
 }

From 973afc564f7ba479731773856b9a93f4ddcda647 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 14:08:18 -0700
Subject: [PATCH 04/47] perf_test: Update trmm flop counts

  - Use flop counts from LAPACK Working Note 41
  - Fix impl flop counts for side == left

---
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 29 ++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 077c5b3d80..a35caad5dd 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -84,11 +84,11 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
  *  flops = (flops / 2) * 2
  *  flops = flops * rows_LHS
  */
-static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
   int flops;
 
   if (side == 'L' || side == 'l') {
-      flops = (a_n * (a_n + 1)) * a_m;
+      flops = (b_m * (b_m + 1)) * b_n;
   } else {
       flops = (b_n * (b_n + 1)) * b_m;
   }
@@ -104,6 +104,27 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n)
   return flops * 4;
 }
 
+// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
+static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+  int flops;
+
+  if (side == 'L' || side == 'l') {
+    flops = b_m * b_m * b_n;
+  } else {
+    flops = b_n * b_n * b_m;
+  }
+
+  if (std::is_same<double, default_scalar>::value ||
+        std::is_same<float, default_scalar>::value ||
+        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+      return flops;
+
+  // Account for 6 additional flops when complex numbers are used.
+  // Above we have counted 1 flop for each add and 1 flop for each multiply.
+  // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+  return flops * 4;
+}
+
 using view_type_3d =
     Kokkos::View<default_scalar***, default_layout, default_device>;
 struct trmm_args {
@@ -115,7 +136,7 @@ typedef struct trmm_args trmm_args_t;
 
 static std::string trmm_csv_header_str =
     "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n,"
-    "iter,total_time(s),average_time(s),GFLOPS,GFLOP/average_time(s)";
+    "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
 static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
@@ -134,7 +155,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                  << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
                  << time_in_seconds / options.n << ","
-                 << gflops << ","
+                 << flops << ","
                  << gflops / average_time
                  << std::endl;
 }

From 8d2868740c2dd960f7bddaac061d3bd5edfd61a9 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 14:27:36 -0700
Subject: [PATCH 05/47] perf_test: Update gemm to optionally use RangePolicy

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 46 ++++++++++++++++++-
 1 file changed, 44 insertions(+), 2 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index f26fbb7287..b66f4c3bd0 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -58,6 +58,7 @@
 #include "KokkosBatched_Util.hpp"
 
 //#define GEMM_PERF_TEST_DEBUG
+#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY
 
 // Forward declarations
 void do_gemm_serial_blas(options_t options);
@@ -322,6 +323,24 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
+template <class TransAType, class TransBType,
+          class BlockingType>
+struct parallel_batched_gemm_range_policy {
+  gemm_args_t gemm_args_;
+
+  parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int &i) const {
+    auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL());
+
+    KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
+        gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+  }
+};
+
 template <class MemberType, class TransAType, class TransBType,
           class BlockingType>
 struct parallel_batched_gemm {
@@ -375,36 +394,59 @@ template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
 void __do_gemm_parallel_batched_template(options_t options,
                                          gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
+#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
+  printf("Using RangePolicy!\n");
+  using policy_type     = Kokkos::RangePolicy<execution_space>;
+  using functor_type =
+      parallel_batched_gemm_range_policy<TransAType, TransBType, BlockingType>;
+#else
   using policy_type     = Kokkos::TeamPolicy<AlgoTag, execution_space>;
   using member_type     = typename policy_type::member_type;
   using functor_type =
       parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType>;
+#endif
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
+#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
   auto league_size   = options.start.c.k;
+#endif
   Kokkos::Timer timer;
 
   STATUS;
 
   functor_type parallel_batched_gemm_functor(gemm_args);
+#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
   auto team_size  = gemm_args.bp.team_size;
   auto vector_len = gemm_args.bp.vector_len;
+#endif
 
   for (uint32_t i = 0; i < warm_up_n; i++) {
+#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
+    Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                         policy_type(0, options.start.c.k),
+                         parallel_batched_gemm_functor);
+#else
     Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
                          policy_type(league_size, team_size, vector_len),
                          parallel_batched_gemm_functor);
+#endif
+    Kokkos::fence();
   }
-  Kokkos::fence();
 
   timer.reset();
   for (uint32_t i = 0; i < n; i++) {
+#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
+    Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                         policy_type(0, options.start.c.k),
+                         parallel_batched_gemm_functor);
+#else
     Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
                          policy_type(league_size, team_size, vector_len),
                          parallel_batched_gemm_functor);
+#endif
+    Kokkos::fence();
   }
-  Kokkos::fence();
 
   __gemm_output_csv_row(options, gemm_args, timer.seconds());
 

From ccbbad3546c4cad67ba1419b35ab9b02f95b1043 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 14:39:54 -0700
Subject: [PATCH 06/47] perf_test: Update GEMM to output GFLOPs

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp    | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index b66f4c3bd0..9792c3a061 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -128,15 +128,24 @@ typedef struct gemm_args gemm_args_t;
 static std::string gemm_csv_header_str =
     "algorithm,transAtransB,alpha,beta,team_size,vector_len,loop_type,A_dims,B_"
     "dims,C_dims,warm_up_n,"
-    "iter,total_time(s),average_time(s)";
+    "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
+// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
+static inline int __gemm_flop_count(int a_m, int a_n, int b_k) {
+  return 2 * a_m * b_k * a_n;
+}
 static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                                   double time_in_seconds,
                                   const char *experiment_name = nullptr) {
   std::string algo_name = test_e_str[options.test];
   if (experiment_name) algo_name = std::string(experiment_name);
 
+  double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
+                                                           gemm_args.B.extent(2));
+  double gflops = flops / 10e9;
+  double average_time = time_in_seconds / options.n;
+
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
                  << options.blas_args.gemm.alpha << ","
                  << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size
@@ -147,7 +156,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                  << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0)
                  << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2)
                  << "," << options.warm_up_n << "," << options.n << ","
-                 << time_in_seconds << "," << time_in_seconds / options.n
+                 << time_in_seconds << ","
+                 << time_in_seconds / options.n << ","
+                 << flops << ","
+                 << gflops / average_time
                  << std::endl;
 }
 

From 274e9289af0838c1b675eb0ec0bc31d2c506fc62 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Feb 2021 14:44:22 -0700
Subject: [PATCH 07/47] perf_test: Update gemm size step

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 9792c3a061..0f1b7f70b6 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -959,9 +959,9 @@ void __do_loop_and_invoke(options_t options,
        cur_dims.a.m <= options.stop.a.m && cur_dims.a.n <= options.stop.a.n &&
        cur_dims.b.m <= options.stop.b.m && cur_dims.b.n <= options.stop.b.n &&
        cur_dims.c.m <= options.stop.c.m && cur_dims.c.n <= options.stop.c.n;
-       cur_dims.a.m *= options.step, cur_dims.a.n *= options.step,
-      cur_dims.b.m *= options.step, cur_dims.b.n *= options.step,
-      cur_dims.c.m *= options.step, cur_dims.c.n *= options.step) {
+       cur_dims.a.m += options.step, cur_dims.a.n += options.step,
+      cur_dims.b.m += options.step, cur_dims.b.n += options.step,
+      cur_dims.c.m += options.step, cur_dims.c.n += options.step) {
     gemm_args = __do_setup<default_scalar, view_type_3d, view_type_3d,
                            view_type_3d, default_device>(options, cur_dims);
     fn(options, gemm_args);

From 6f4e05bd022a61c1b2c71b8f822fdb2a4165aab1 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 11 Feb 2021 11:42:41 -0700
Subject: [PATCH 08/47] perf_test: Disable
 KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 0f1b7f70b6..59f5a84803 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -58,7 +58,7 @@
 #include "KokkosBatched_Util.hpp"
 
 //#define GEMM_PERF_TEST_DEBUG
-#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY
+//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY
 
 // Forward declarations
 void do_gemm_serial_blas(options_t options);

From b066fa9b2a3170e62e33cc686a386749089942b3 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 11 Feb 2021 14:32:06 -0700
Subject: [PATCH 09/47] perf_test/blas: Fix GFLOP calculation

---
 perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp  | 4 ++--
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +-
 perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index de24a96254..32626cfba5 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -129,7 +129,7 @@ static inline int trtri_flop_count(int a_m, int a_n) {
     flops_per_mul = 6;
     flops_per_add = 2;
   }
-  
+
   flops = (1./6.*a_n*a_n*a_n + 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_mul +
           (1./6.*a_n*a_n*a_n - 1./2.*a_n*a_n + 1./3.*a_n) * flops_per_add;
 
@@ -152,7 +152,7 @@ static std::string trtri_csv_header_str =
 static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
                                    double time_in_seconds) {
   double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2));
-  double gflops = flops / 10e9;
+  double gflops = flops / 1e9;
   double average_time = time_in_seconds / options.n;
 
   options.out[0] << test_e_str[options.test] << ","
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 59f5a84803..29fcace727 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -143,7 +143,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
 
   double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
                                                            gemm_args.B.extent(2));
-  double gflops = flops / 10e9;
+  double gflops = flops / 1e9;
   double average_time = time_in_seconds / options.n;
 
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index a35caad5dd..9a7f7cc480 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -144,7 +144,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
   double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side,
                                                          trmm_args.B.extent(1), trmm_args.B.extent(2),
                                                          trmm_args.A.extent(1), trmm_args.A.extent(2));
-  double gflops = flops / 10e9;
+  double gflops = flops / 1e9;
   double average_time = time_in_seconds / options.n;
 
   options.out[0] << test_e_str[options.test] << ","

From 63382d3722d2868258cefffd4b5639a475d35198 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Fri, 12 Feb 2021 20:38:12 -0700
Subject: [PATCH 10/47] perf_test/blas/blas3: Add bandwidth metric to trmm

---
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 25 ++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 9a7f7cc480..a313eabbaf 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -136,7 +136,7 @@ typedef struct trmm_args trmm_args_t;
 
 static std::string trmm_csv_header_str =
     "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n,"
-    "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";
+    "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)";
 
 /*************************** Internal helper fns **************************/
 static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
@@ -146,6 +146,23 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                                                          trmm_args.A.extent(1), trmm_args.A.extent(2));
   double gflops = flops / 1e9;
   double average_time = time_in_seconds / options.n;
+  double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9;
+  double min_memory_transactions, max_memory_transactions;
+
+  // Assuming infinite cache size
+  // We have to read A and B into the cache once and then write
+  // B back out to main memory once.
+  min_memory_transactions = 3;
+
+  // Assuming no register or real caching
+  // We have to go out to memory for every element we read from A and B as well as
+  // every element we write to B.
+  // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the
+  // write to B since this flop count is for one multiply and one add.
+  if (trmm_args.side == 'l' || trmm_args.side == 'L')
+    max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.);
+  else
+    max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.);
 
   options.out[0] << test_e_str[options.test] << ","
                  << options.blas_args.trmm.trmm_args << ","
@@ -154,9 +171,11 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                  << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1)
                  << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
-                 << time_in_seconds / options.n << ","
+                 << average_time << ","
                  << flops << ","
-                 << gflops / average_time
+                 << gflops / average_time << ","
+                 << (gbytes_in_matrix * min_memory_transactions) / average_time << ","
+                 << (gbytes_in_matrix * max_memory_transactions) / average_time
                  << std::endl;
 }
 

From 898794eb3860f897a44a03b77211d06d4d74809a Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Mon, 15 Feb 2021 16:20:06 -0700
Subject: [PATCH 11/47] perf_test: Handle complex numbers in flop count

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 29fcace727..d6572dfd34 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -133,7 +133,13 @@ static std::string gemm_csv_header_str =
 /*************************** Internal helper fns **************************/
 // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
 static inline int __gemm_flop_count(int a_m, int a_n, int b_k) {
-  return 2 * a_m * b_k * a_n;
+    if (std::is_same<double, default_scalar>::value ||
+        std::is_same<float, default_scalar>::value ||
+        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+      return 2 * a_m * b_k * a_n;
+    else
+      // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+      return (2 + 6) * a_m * b_k * a_n;
 }
 static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                                   double time_in_seconds,

From f11f9138e9c69a6387b9e8c67c0809d81be7f872 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Mon, 15 Feb 2021 16:35:38 -0700
Subject: [PATCH 12/47] perf_test/blas/blas3: Gemm perf_test updates

  - Fix batched_serial to use RangePolicy instead of TeamPolicy
  - Add --use_auto option and optionally use Kokkos::AUTO for team_size
  and vector_len in gemm.
---
 perf_test/blas/blas3/KokkosBlas3_common.hpp   |  2 +
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++------
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  9 +-
 3 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index 4952a8e606..01e368e15c 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -61,6 +61,7 @@
 #define DEFAULT_BLAS_ROUTINES "trmm,gemm,"
 #define DEFAULT_TEAM_SIZE 1
 #define DEFAULT_VECTOR_LEN 1
+#define DEFAULT_USE_AUTO 0
 
 /************************ blas routine structure definitions **********/
 struct perf_test_trmm_args {
@@ -83,6 +84,7 @@ struct blas_args {
   // ADD MORE BLAS3 ROUTINES HERE
   int team_size;
   int vector_len;
+  bool use_auto;
   // ADD MORE COMMON BLAS3 OPTIONS HERE
 };
 typedef struct blas_args blas_args_t;
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index d6572dfd34..b4d55d0e90 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -58,7 +58,6 @@
 #include "KokkosBatched_Util.hpp"
 
 //#define GEMM_PERF_TEST_DEBUG
-//#define KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY
 
 // Forward declarations
 void do_gemm_serial_blas(options_t options);
@@ -409,60 +408,32 @@ struct parallel_batched_gemm {
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
           class device_type>
-void __do_gemm_parallel_batched_template(options_t options,
-                                         gemm_args_t gemm_args) {
+void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
-#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
-  printf("Using RangePolicy!\n");
   using policy_type     = Kokkos::RangePolicy<execution_space>;
   using functor_type =
       parallel_batched_gemm_range_policy<TransAType, TransBType, BlockingType>;
-#else
-  using policy_type     = Kokkos::TeamPolicy<AlgoTag, execution_space>;
-  using member_type     = typename policy_type::member_type;
-  using functor_type =
-      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType>;
-#endif
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
-#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
-  auto league_size   = options.start.c.k;
-#endif
   Kokkos::Timer timer;
 
   STATUS;
 
   functor_type parallel_batched_gemm_functor(gemm_args);
-#if !defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
-  auto team_size  = gemm_args.bp.team_size;
-  auto vector_len = gemm_args.bp.vector_len;
-#endif
 
   for (uint32_t i = 0; i < warm_up_n; i++) {
-#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
     Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
                          policy_type(0, options.start.c.k),
                          parallel_batched_gemm_functor);
-#else
-    Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                         policy_type(league_size, team_size, vector_len),
-                         parallel_batched_gemm_functor);
-#endif
     Kokkos::fence();
   }
 
   timer.reset();
   for (uint32_t i = 0; i < n; i++) {
-#if defined(KOKKOSKERNELS_GEMM_PERF_TEST_USE_RANGE_POLICY)
     Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
                          policy_type(0, options.start.c.k),
                          parallel_batched_gemm_functor);
-#else
-    Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                         policy_type(league_size, team_size, vector_len),
-                         parallel_batched_gemm_functor);
-#endif
     Kokkos::fence();
   }
 
@@ -471,6 +442,68 @@ void __do_gemm_parallel_batched_template(options_t options,
   return;
 }
 
+template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
+          class device_type>
+void __do_gemm_parallel_batched_template(options_t options,
+                                         gemm_args_t gemm_args) {
+  using execution_space = typename device_type::execution_space;
+  using policy_type     = Kokkos::TeamPolicy<AlgoTag, execution_space>;
+  using member_type     = typename policy_type::member_type;
+  using functor_type =
+      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType>;
+
+  uint32_t warm_up_n = options.warm_up_n;
+  uint32_t n         = options.n;
+  auto league_size   = options.start.c.k;
+  Kokkos::Timer timer;
+
+  if (std::is_same<AlgoTag, SerialTag>::value) {
+    return __do_gemm_parallel_batched_template_range_policy<TransAType, TransBType, BlockingType, SerialTag, device_type>(options, gemm_args);
+  }
+
+  STATUS;
+
+  functor_type parallel_batched_gemm_functor(gemm_args);
+  auto team_size  = gemm_args.bp.team_size;
+  auto vector_len = gemm_args.bp.vector_len;
+
+  if (options.blas_args.use_auto) {
+    for (uint32_t i = 0; i < warm_up_n; i++) {
+      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                          policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+
+    timer.reset();
+    for (uint32_t i = 0; i < n; i++) {
+      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                          policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+  } else {
+    for (uint32_t i = 0; i < warm_up_n; i++) {
+      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                          policy_type(league_size, team_size, vector_len),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+
+    timer.reset();
+    for (uint32_t i = 0; i < n; i++) {
+      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                          policy_type(league_size, team_size, vector_len),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+  }
+
+  __gemm_output_csv_row(options, gemm_args, timer.seconds());
+
+  return;
+}
+
 template <class algo_tag, class blocking_type, class device_type>
 void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) {
   char a  = gemm_args.transA;
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 6c95960e25..0f1f2b5d07 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -119,6 +119,11 @@ static void __print_help_blas3_perf_test() {
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
          DEFAULT_VECTOR_LEN);
 
+  printf("\t-u, --use_auto={0,1}\n");
+  printf("\t\tWhether to use Kokkos::AUTO for vector_len and team_size (Heirarchical parallelism).\n");
+  printf("\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size will be used. (default: %d)\n",
+         DEFAULT_USE_AUTO);
+
   printf("\t-k, --batch_size=LEN\n");
   printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n");
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
@@ -238,6 +243,7 @@ int main(int argc, char **argv) {
   options.blas_routines        = std::string(DEFAULT_BLAS_ROUTINES);
   options.blas_args.team_size  = DEFAULT_TEAM_SIZE;
   options.blas_args.vector_len = DEFAULT_VECTOR_LEN;
+  options.blas_args.use_auto   = DEFAULT_USE_AUTO;
 
   options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS;
   options.blas_args.trmm.alpha     = DEFAULT_TRMM_ALPHA;
@@ -245,7 +251,7 @@ int main(int argc, char **argv) {
   options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS;
   options.blas_args.gemm.alpha     = DEFAULT_GEMM_ALPHA;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:",
+  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:",
                             long_options, &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas3_perf_test(); return 0;
@@ -363,6 +369,7 @@ int main(int argc, char **argv) {
         break;
       case 'z': options.blas_args.team_size = atoi(optarg); break;
       case 'n': options.blas_args.vector_len = atoi(optarg); break;
+      case 'u': options.blas_args.use_auto = atoi(optarg); break;
       case 'c':
         out_file         = optarg;
         options.out_file = std::string(out_file);

From fb41b4c01582cfab5c88cb39261660030af260c2 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 16 Feb 2021 10:11:21 -0700
Subject: [PATCH 13/47] perf_test/blas/blas3: Fix gemm beta handling

  - Initialize options.blas_args.gemm.beta.
  - Rename --gemm_alpha to --gemm_scalars and accept beta input arg.
---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp |  3 ++-
 .../blas/blas3/KokkosBlas3_perf_test.cpp      | 22 ++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index b4d55d0e90..06d854bc2a 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -105,6 +105,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
 /*************************** Test types and defaults **************************/
 #define DEFAULT_GEMM_ARGS "NN"
 #define DEFAULT_GEMM_ALPHA 1.0
+#define DEFAULT_GEMM_BETA  1.0
 
 using view_type_3d =
     Kokkos::View<default_scalar ***, default_layout, default_device>;
@@ -963,7 +964,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   gemm_args.B             = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n);
   gemm_args.C             = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n);
   gemm_args.alpha         = options.blas_args.gemm.alpha;
-  gemm_args.alpha         = options.blas_args.gemm.beta;
+  gemm_args.beta          = options.blas_args.gemm.beta;
   gemm_args.bp.team_size  = options.blas_args.team_size;
   gemm_args.bp.vector_len = options.blas_args.vector_len;
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 0f1f2b5d07..0ec88f42f7 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -63,7 +63,7 @@ static struct option long_options[] = {
     {"trmm_options", required_argument, 0, 'o'},
     {"trmm_alpha", required_argument, 0, 'a'},
     {"gemm_options", required_argument, 0, 'g'},
-    {"gemm_alpha", required_argument, 0, 'p'},
+    {"gemm_scalars", required_argument, 0, 'p'},
     {"team_size", required_argument, 0, 'z'},
     {"vector_len", required_argument, 0, 'n'},
     {"batch_size", required_argument, 0, 'k'},
@@ -104,10 +104,10 @@ static void __print_help_blas3_perf_test() {
       "%s)\n",
       DEFAULT_GEMM_ARGS);
 
-  printf("\t-p, --gemm_alpha=SCALAR_VALUE\n");
-  printf("\t\tGEMM alpha value.\n");
-  printf("\t\t\tThe value of alpha in floating point. (default: %lf)\n",
-         DEFAULT_GEMM_ALPHA);
+  printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n");
+  printf("\t\tGEMM alpha and beta values.\n");
+  printf("\t\t\tThe value of alpha and beta in floating point. (default: %lf,%lf)\n",
+         DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA);
 
   printf("\t-z, --team_size=SIZE\n");
   printf("\t\tKokkos team size.\n");
@@ -250,8 +250,9 @@ int main(int argc, char **argv) {
 
   options.blas_args.gemm.gemm_args = DEFAULT_GEMM_ARGS;
   options.blas_args.gemm.alpha     = DEFAULT_GEMM_ALPHA;
+  options.blas_args.gemm.beta      = DEFAULT_GEMM_BETA;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:",
+  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:",
                             long_options, &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas3_perf_test(); return 0;
@@ -275,14 +276,19 @@ int main(int argc, char **argv) {
         break;
       case 'g':
         // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4));
-        if (strlen(optarg) != 3) {
+        if (strlen(optarg) != 2) {
           __blas3_perf_test_input_error(argv, ret, optarg);
         }
         options.blas_args.gemm.gemm_args = optarg;
         break;
       case 'p':
         // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4));
-        options.blas_args.gemm.alpha = (default_scalar)atof(optarg);
+        double alpha, beta;
+        if (sscanf(optarg, "%lf,%lf", &alpha, &beta) != 2)
+          __blas3_perf_test_input_error(argv, ret, optarg);
+
+        options.blas_args.gemm.alpha = static_cast<default_scalar>(alpha);
+        options.blas_args.gemm.beta = static_cast<default_scalar>(beta);
         break;
       case 'a':
         // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4));

From a91bd6c9d26f7c8dbddae2b06150f2c3f4bad579 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 16 Feb 2021 10:53:56 -0700
Subject: [PATCH 14/47] perf_test/blas/blas3: Update csv row for --use_auto

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 06d854bc2a..a5dcbbfb0f 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -145,7 +145,10 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                                   double time_in_seconds,
                                   const char *experiment_name = nullptr) {
   std::string algo_name = test_e_str[options.test];
+  std::string ts = std::to_string(gemm_args.bp.team_size);
+  std::string vlen = std::to_string(gemm_args.bp.vector_len);
   if (experiment_name) algo_name = std::string(experiment_name);
+  if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO";
 
   double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
                                                            gemm_args.B.extent(2));
@@ -154,8 +157,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
 
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
                  << options.blas_args.gemm.alpha << ","
-                 << options.blas_args.gemm.beta << "," << gemm_args.bp.team_size
-                 << "," << gemm_args.bp.vector_len << ","
+                 << options.blas_args.gemm.beta << ","
+                 << ts << ","
+                 << vlen << ","
                  << loop_e_str[options.loop] << "," << gemm_args.A.extent(0)
                  << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2)
                  << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1)

From 0d4fe93f72950903c138438738c0b1b2789679dd Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 17 Feb 2021 13:25:11 -0700
Subject: [PATCH 15/47] perf_test/blas/blas3: Add -d option for view allocation

---
 perf_test/blas/blas3/KokkosBlas3_common.hpp   |  3 +-
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 85 ++++++++++++++-----
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  9 +-
 3 files changed, 76 insertions(+), 21 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index 01e368e15c..a2c1e6f6ae 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -62,6 +62,7 @@
 #define DEFAULT_TEAM_SIZE 1
 #define DEFAULT_VECTOR_LEN 1
 #define DEFAULT_USE_AUTO 0
+#define DEFAULT_BATCH_SIZE_LAST_DIM 0
 
 /************************ blas routine structure definitions **********/
 struct perf_test_trmm_args {
@@ -84,7 +85,7 @@ struct blas_args {
   // ADD MORE BLAS3 ROUTINES HERE
   int team_size;
   int vector_len;
-  bool use_auto;
+  bool use_auto, batch_size_last_dim;
   // ADD MORE COMMON BLAS3 OPTIONS HERE
 };
 typedef struct blas_args blas_args_t;
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index a5dcbbfb0f..7e86d04a4f 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -82,6 +82,8 @@ struct TeamVectorTag {};
 struct LayoutLeftTag {};
 struct LayoutRightTag {};
 struct SimdCpuTag {};
+struct LastDimTag {};
+struct FirstDimTag {};
 
 // gemm invoke table
 void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
@@ -150,11 +152,20 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   if (experiment_name) algo_name = std::string(experiment_name);
   if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO";
 
-  double flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
-                                                           gemm_args.B.extent(2));
-  double gflops = flops / 1e9;
+  double flops;
+  double gflops;
   double average_time = time_in_seconds / options.n;
 
+  if (options.blas_args.batch_size_last_dim) {
+    flops = gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1),
+                                                      gemm_args.B.extent(1));
+  } else {
+    flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
+                                                      gemm_args.B.extent(2));
+  }
+
+  gflops = flops / 1e9;
+
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
                  << options.blas_args.gemm.alpha << ","
                  << options.blas_args.gemm.beta << ","
@@ -353,7 +364,7 @@ struct parallel_batched_gemm_range_policy {
   parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const int &i) const {
+  void operator()(const FirstDimTag &, const int &i) const {
     auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL());
     auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL());
     auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL());
@@ -361,6 +372,16 @@ struct parallel_batched_gemm_range_policy {
     KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
   }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const LastDimTag &, const int &i) const {
+    auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i);
+
+    KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
+        gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+  }
 };
 
 template <class MemberType, class TransAType, class TransBType,
@@ -415,7 +436,8 @@ template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
           class device_type>
 void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
-  using policy_type     = Kokkos::RangePolicy<execution_space>;
+  using policy_type     = Kokkos::RangePolicy<FirstDimTag, execution_space>;
+  using policy_type_last_dim = Kokkos::RangePolicy<LastDimTag, execution_space>;
   using functor_type =
       parallel_batched_gemm_range_policy<TransAType, TransBType, BlockingType>;
 
@@ -427,19 +449,38 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar
 
   functor_type parallel_batched_gemm_functor(gemm_args);
 
-  for (uint32_t i = 0; i < warm_up_n; i++) {
-    Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                         policy_type(0, options.start.c.k),
-                         parallel_batched_gemm_functor);
-    Kokkos::fence();
+  if (options.blas_args.batch_size_last_dim) {
+    for (uint32_t i = 0; i < warm_up_n; i++) {
+      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                          policy_type_last_dim(0, options.start.c.k),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+  } else {
+    for (uint32_t i = 0; i < warm_up_n; i++) {
+      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                          policy_type(0, options.start.c.k),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
   }
 
-  timer.reset();
-  for (uint32_t i = 0; i < n; i++) {
-    Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                         policy_type(0, options.start.c.k),
-                         parallel_batched_gemm_functor);
-    Kokkos::fence();
+  if (options.blas_args.batch_size_last_dim) {
+    timer.reset();
+    for (uint32_t i = 0; i < n; i++) {
+      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                          policy_type_last_dim(0, options.start.c.k),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
+  } else {
+    timer.reset();
+    for (uint32_t i = 0; i < n; i++) {
+      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                          policy_type(0, options.start.c.k),
+                          parallel_batched_gemm_functor);
+      Kokkos::fence();
+    }
   }
 
   __gemm_output_csv_row(options, gemm_args, timer.seconds());
@@ -964,9 +1005,15 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
 
   gemm_args.transA        = options.blas_args.gemm.gemm_args.c_str()[0];
   gemm_args.transB        = options.blas_args.gemm.gemm_args.c_str()[1];
-  gemm_args.A             = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n);
-  gemm_args.B             = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n);
-  gemm_args.C             = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n);
+  if (options.blas_args.batch_size_last_dim) {
+    gemm_args.A             = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k);
+    gemm_args.B             = vtb("gemm_args.B", dim.b.m, dim.b.n, dim.b.k);
+    gemm_args.C             = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k);
+  } else {
+    gemm_args.A             = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n);
+    gemm_args.B             = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n);
+    gemm_args.C             = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n);
+  }
   gemm_args.alpha         = options.blas_args.gemm.alpha;
   gemm_args.beta          = options.blas_args.gemm.beta;
   gemm_args.bp.team_size  = options.blas_args.team_size;
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 0ec88f42f7..72a92a32b1 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -67,6 +67,7 @@ static struct option long_options[] = {
     {"team_size", required_argument, 0, 'z'},
     {"vector_len", required_argument, 0, 'n'},
     {"batch_size", required_argument, 0, 'k'},
+    {"batch_size_last_dim", required_argument, 0, 'd'},
     {0, 0, 0, 0}};
 
 static void __print_help_blas3_perf_test() {
@@ -129,6 +130,11 @@ static void __print_help_blas3_perf_test() {
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
          DEFAULT_K);
 
+  printf("\t-d, --batch_size_last_dim={0,1}\n");
+  printf("\t\tHow to allocate the batch_size in the matrices.\n");
+  printf("\t\t\t1 make the batch_size the last dimension, otherwise batch_size is the first dimension (default: %d)\n",
+         DEFAULT_BATCH_SIZE_LAST_DIM);
+
   printf("\t-l, --loop_type=OPTION\n");
   printf("\t\tLoop selection.\n");
   printf("\t\t\tValid values for OPTION:\n");
@@ -252,7 +258,7 @@ int main(int argc, char **argv) {
   options.blas_args.gemm.alpha     = DEFAULT_GEMM_ALPHA;
   options.blas_args.gemm.beta      = DEFAULT_GEMM_BETA;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:",
+  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:",
                             long_options, &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas3_perf_test(); return 0;
@@ -373,6 +379,7 @@ int main(int argc, char **argv) {
             options.stop.a.k = options.stop.b.k = options.stop.c.k =
                 atoi(optarg);
         break;
+      case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break;
       case 'z': options.blas_args.team_size = atoi(optarg); break;
       case 'n': options.blas_args.vector_len = atoi(optarg); break;
       case 'u': options.blas_args.use_auto = atoi(optarg); break;

From 5c729bc9243b903059c2abd1422519079e655d07 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 17 Feb 2021 13:55:15 -0700
Subject: [PATCH 16/47] perf_test/blas/blas3: Update team and team_vector for
 -d

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 159 ++++++++++++------
 1 file changed, 107 insertions(+), 52 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 7e86d04a4f..3db8f0dc1a 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -77,13 +77,14 @@ void do_gemm_team_vector_batched_blocked_parallel(options_t options);
 void do_gemm_experiment_parallel(options_t options);
 
 struct SerialTag {};
+struct SerialBatchDim3Tag {};
 struct TeamTag {};
+struct TeamBatchDim3Tag {};
 struct TeamVectorTag {};
+struct TeamVectorBatchDim3Tag {};
 struct LayoutLeftTag {};
 struct LayoutRightTag {};
 struct SimdCpuTag {};
-struct LastDimTag {};
-struct FirstDimTag {};
 
 // gemm invoke table
 void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
@@ -364,7 +365,7 @@ struct parallel_batched_gemm_range_policy {
   parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {}
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const FirstDimTag &, const int &i) const {
+  void operator()(const SerialTag &, const int &i) const {
     auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL());
     auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL());
     auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL());
@@ -374,7 +375,7 @@ struct parallel_batched_gemm_range_policy {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const LastDimTag &, const int &i) const {
+  void operator()(const SerialBatchDim3Tag &, const int &i) const {
     auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i);
     auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i);
     auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i);
@@ -382,6 +383,15 @@ struct parallel_batched_gemm_range_policy {
     KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
   }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamTag &, const int &i) const {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamBatchDim3Tag &, const int &i) const {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamVectorTag &, const int &i) const {}
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {}
 };
 
 template <class MemberType, class TransAType, class TransBType,
@@ -402,6 +412,17 @@ struct parallel_batched_gemm {
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
   }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const SerialBatchDim3Tag &, const MemberType &member) const {
+    auto i   = member.league_rank();
+    auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i);
+
+    KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
+        gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamTag &, const MemberType &member) const {
     auto i   = member.league_rank();
@@ -414,6 +435,18 @@ struct parallel_batched_gemm {
                                                   svB, gemm_args_.beta, svC);
   }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamBatchDim3Tag &, const MemberType &member) const {
+    auto i   = member.league_rank();
+    auto svA = Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svB = Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svC = Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), i);
+
+    KokkosBatched::TeamGemm<MemberType, TransAType, TransBType,
+                            BlockingType>::invoke(member, gemm_args_.alpha, svA,
+                                                  svB, gemm_args_.beta, svC);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamVectorTag &, const MemberType &member) const {
     auto team_idx = member.league_rank();
@@ -430,14 +463,30 @@ struct parallel_batched_gemm {
                                                         svB, gemm_args_.beta,
                                                         svC);
   }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamVectorBatchDim3Tag &, const MemberType &member) const {
+    auto team_idx = member.league_rank();
+    auto svA =
+        Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx);
+    auto svB =
+        Kokkos::subview(gemm_args_.B, Kokkos::ALL(), Kokkos::ALL(), team_idx);
+    auto svC =
+        Kokkos::subview(gemm_args_.C, Kokkos::ALL(), Kokkos::ALL(), team_idx);
+
+    KokkosBatched::TeamVectorGemm<MemberType, TransAType, TransBType,
+                                  BlockingType>::invoke(member,
+                                                        gemm_args_.alpha, svA,
+                                                        svB, gemm_args_.beta,
+                                                        svC);
+  }
 };
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
           class device_type>
 void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
-  using policy_type     = Kokkos::RangePolicy<FirstDimTag, execution_space>;
-  using policy_type_last_dim = Kokkos::RangePolicy<LastDimTag, execution_space>;
+  using policy_type     = Kokkos::RangePolicy<AlgoTag, execution_space>;
   using functor_type =
       parallel_batched_gemm_range_policy<TransAType, TransBType, BlockingType>;
 
@@ -449,38 +498,19 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar
 
   functor_type parallel_batched_gemm_functor(gemm_args);
 
-  if (options.blas_args.batch_size_last_dim) {
-    for (uint32_t i = 0; i < warm_up_n; i++) {
-      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                          policy_type_last_dim(0, options.start.c.k),
-                          parallel_batched_gemm_functor);
-      Kokkos::fence();
-    }
-  } else {
-    for (uint32_t i = 0; i < warm_up_n; i++) {
-      Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                          policy_type(0, options.start.c.k),
-                          parallel_batched_gemm_functor);
-      Kokkos::fence();
-    }
+  for (uint32_t i = 0; i < warm_up_n; i++) {
+    Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
+                        policy_type(0, options.start.c.k),
+                        parallel_batched_gemm_functor);
+    Kokkos::fence();
   }
 
-  if (options.blas_args.batch_size_last_dim) {
-    timer.reset();
-    for (uint32_t i = 0; i < n; i++) {
-      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                          policy_type_last_dim(0, options.start.c.k),
-                          parallel_batched_gemm_functor);
-      Kokkos::fence();
-    }
-  } else {
-    timer.reset();
-    for (uint32_t i = 0; i < n; i++) {
-      Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                          policy_type(0, options.start.c.k),
-                          parallel_batched_gemm_functor);
-      Kokkos::fence();
-    }
+  timer.reset();
+  for (uint32_t i = 0; i < n; i++) {
+    Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
+                        policy_type(0, options.start.c.k),
+                        parallel_batched_gemm_functor);
+    Kokkos::fence();
   }
 
   __gemm_output_csv_row(options, gemm_args, timer.seconds());
@@ -503,8 +533,8 @@ void __do_gemm_parallel_batched_template(options_t options,
   auto league_size   = options.start.c.k;
   Kokkos::Timer timer;
 
-  if (std::is_same<AlgoTag, SerialTag>::value) {
-    return __do_gemm_parallel_batched_template_range_policy<TransAType, TransBType, BlockingType, SerialTag, device_type>(options, gemm_args);
+  if (std::is_same<AlgoTag, SerialTag>::value || std::is_same<AlgoTag, SerialBatchDim3Tag>::value) {
+    return __do_gemm_parallel_batched_template_range_policy<TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, gemm_args);
   }
 
   STATUS;
@@ -1089,41 +1119,66 @@ void do_gemm_serial_batched_blocked(options_t options) {
 
 void do_gemm_serial_batched_parallel(options_t options) {
   STATUS;
-  __do_loop_and_invoke(
-      options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Unblocked,
+  if (options.blas_args.batch_size_last_dim)
+    __do_loop_and_invoke(
+      options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Unblocked,
                                           default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Unblocked,
+                                            default_device>);
   return;
 }
 
 void do_gemm_serial_batched_blocked_parallel(options_t options) {
   STATUS;
-  __do_loop_and_invoke(
-      options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Blocked,
-                                          default_device>);
+  if (options.blas_args.batch_size_last_dim)
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Blocked,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Blocked,
+                                            default_device>);
   return;
 }
 
 void do_gemm_team_batched_parallel(options_t options) {
   STATUS;
-  __do_loop_and_invoke(
-      options, __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Unblocked,
-                                          default_device>);
+  if (options.blas_args.batch_size_last_dim)
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Unblocked,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Unblocked,
+                                            default_device>);
   return;
 }
 
 void do_gemm_team_batched_blocked_parallel(options_t options) {
   STATUS;
-  __do_loop_and_invoke(
-      options,
-      __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Blocked, default_device>);
+  if (options.blas_args.batch_size_last_dim)
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Blocked, default_device>);
+  else
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Blocked, default_device>);
   return;
 }
 
 void do_gemm_team_vector_batched_parallel(options_t options) {
   STATUS;
-  __do_loop_and_invoke(
-      options, __do_gemm_parallel_batched<TeamVectorTag, Algo::Gemm::Unblocked,
-                                          default_device>);
+  if (options.blas_args.batch_size_last_dim)
+      __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamVectorBatchDim3Tag, Algo::Gemm::Unblocked,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamVectorTag, Algo::Gemm::Unblocked,
+                                            default_device>);
   return;
 }
 

From 6da5a7b637552ba325a1d42c0561cf8b294b362a Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 18 Feb 2021 13:35:09 -0700
Subject: [PATCH 17/47] perf_test/blas/blas3: Add simd gemm as experiment6.

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 98 ++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 3db8f0dc1a..f24a1091b7 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1023,6 +1023,99 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
+template <class MemberType, class SimdViewType, class TransAType, class TransBType,
+          class BlockingType>
+class parallel_batched_gemm_experiment6 {
+ private:
+  SimdViewType &A, &B, &C;
+  gemm_args_t gemm_args;
+
+ public:
+  parallel_batched_gemm_experiment6(SimdViewType &_A, SimdViewType &_B,
+                                    SimdViewType &_C, gemm_args_t _gemm_args)
+      : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const MemberType &member) const {
+    auto i = member.league_rank();
+    auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
+
+    // Uses two serial for-loops internally
+    KokkosBatched::TeamVectorGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(
+        member, gemm_args.alpha, svA, svB, gemm_args.beta, svC);
+  }
+};
+
+template <class TransAType, class TransBType, class BlockingType,
+          class device_type>
+void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
+  using execution_space = typename device_type::execution_space;
+  using policy_type     = Kokkos::TeamPolicy<execution_space>;
+  using member_type     = typename policy_type::member_type;
+
+  // Construct the vector type
+  using scalar_type = typename view_type_3d::value_type;
+  constexpr int vl =
+      KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
+  constexpr int il = 
+      KokkosBatched::DefaultInternalVectorLength<scalar_type, execution_space>::value;
+  using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, vl>;
+  using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, il>;
+  using view_type = Kokkos::View<scalar_type***[vl], default_layout, default_device>;
+  using vector_view_type = Kokkos::View<vector_type***, default_layout, default_device>;
+  using internal_vector_view_type = Kokkos::View<internal_vector_type***, default_layout, default_device>;
+  using functor_type =
+      parallel_batched_gemm_experiment6<member_type, internal_vector_view_type,
+                                        TransAType, TransBType, BlockingType>;
+
+  uint32_t warm_up_n = options.warm_up_n;
+  uint32_t n         = options.n;
+  auto k             = options.start.c.k;
+  Kokkos::Timer timer;
+  auto simd_batch_size = k / vl + (k % vl > 0);
+  STATUS;
+
+  // Construct matrices
+  vector_view_type A_vector("A_vector", simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1));
+  view_type A((scalar_type *)A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1));
+  internal_vector_view_type A_vector_internal(A_vector.data(), simd_batch_size, gemm_args.A.extent(0), gemm_args.A.extent(1));
+
+  vector_view_type B_vector("B_vector", simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1));
+  view_type B((scalar_type *)B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1));
+  internal_vector_view_type B_vector_internal(B_vector.data(), simd_batch_size, gemm_args.B.extent(0), gemm_args.B.extent(1));
+
+  vector_view_type C_vector("C_vector", simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1));
+  view_type C((scalar_type *)C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1));
+  internal_vector_view_type C_vector_internal(C_vector.data(), simd_batch_size, gemm_args.C.extent(0), gemm_args.C.extent(1));
+
+  uint64_t seed = Kokkos::Impl::clock_tic();
+  Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
+  Kokkos::fill_random(A, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fill_random(B, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fill_random(C, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fence();
+
+  functor_type experiment6_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args);
+
+  for (uint32_t i = 0; i < warm_up_n; ++i) {
+    Kokkos::parallel_for("parallelBatchedUntimedExperiment6Gemm",
+                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor);
+    Kokkos::fence();
+  }
+
+  timer.reset();
+  for (uint32_t i = 0; i < n; ++i) {
+    Kokkos::parallel_for("parallelBatchedTimedExperiment6Gemm",
+                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment6_functor);
+    Kokkos::fence();
+  }
+
+  __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6");
+  return;
+}
+
 /*************************** Internal setup fns **************************/
 template <class scalar_type, class vta, class vtb, class vtc, class device_type>
 gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
@@ -1195,7 +1288,7 @@ void do_gemm_experiment_parallel(options_t options) {
   using TransBType   = Trans::NoTranspose;
   using BlockingType = Algo::Gemm::Unblocked;
 
-  __do_loop_and_invoke(
+/*   __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment1<TransAType, TransBType,
                                               BlockingType, default_device>);
   __do_loop_and_invoke(
@@ -1209,6 +1302,9 @@ void do_gemm_experiment_parallel(options_t options) {
                                               BlockingType, default_device>);
   __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment5<TransAType, TransBType,
+                                              BlockingType, default_device>); */
+  __do_loop_and_invoke(
+      options, __do_gemm_parallel_experiment6<TransAType, TransBType,
                                               BlockingType, default_device>);
 }
 

From 441c4d4a6bfaf16b5e9ae28e8e0795ccea3b1c21 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 23 Feb 2021 11:45:10 -0700
Subject: [PATCH 18/47] perf_test/blas/blas3: Add experiment7 (Simd + TeamGemm)

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 105 +++++++++++++++++-
 1 file changed, 104 insertions(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index f24a1091b7..86b46e5adb 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1116,6 +1116,106 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
+template <class MemberType, class SimdViewType, class TransAType, class TransBType,
+          class BlockingType>
+class parallel_batched_gemm_experiment7 {
+ private:
+  SimdViewType &A, &B, &C;
+  gemm_args_t gemm_args;
+
+ public:
+  parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B,
+                                    SimdViewType &_C, gemm_args_t _gemm_args)
+      : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const MemberType &member) const {
+    auto i = member.league_rank();
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) {
+	auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+	auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+	auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+
+	KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC);
+   });
+  }
+};
+
+template <class TransAType, class TransBType, class BlockingType,
+          class device_type>
+void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) {
+  using execution_space = typename device_type::execution_space;
+  using policy_type     = Kokkos::TeamPolicy<execution_space>;
+  using member_type     = typename policy_type::member_type;
+
+  // Construct the vector type
+  using scalar_type = typename view_type_3d::value_type;
+  constexpr int vl =
+      KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
+  constexpr int il = 
+      KokkosBatched::DefaultInternalVectorLength<scalar_type, execution_space>::value;
+  using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, vl>;
+  using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, il>;
+  using view_type = Kokkos::View<scalar_type****, default_layout, default_device>;
+  using vector_view_type = Kokkos::View<vector_type***, default_layout, default_device>;
+  using internal_vector_view_type = Kokkos::View<internal_vector_type****, default_layout, default_device>;
+
+  uint32_t warm_up_n = options.warm_up_n;
+  uint32_t n         = options.n;
+  auto k             = options.start.c.k;
+  Kokkos::Timer timer;
+  auto simd_batch_size = k / vl + (k % vl > 0);
+  STATUS;
+
+  // Construct matrices
+  vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
+  view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
+  internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
+
+  vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
+  view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
+  internal_vector_view_type B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
+
+  vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
+  view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
+  internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
+
+  uint64_t seed = Kokkos::Impl::clock_tic();
+  Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
+  Kokkos::fill_random(A, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fill_random(B, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fill_random(C, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
+  Kokkos::fence();
+
+   using functor_type =
+       parallel_batched_gemm_experiment7<member_type, internal_vector_view_type,
+                                         TransAType, TransBType, BlockingType>;
+    functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args);
+
+  //using functor_type =
+  //    parallel_batched_gemm_experiment7<member_type, view_type,
+  //                                      TransAType, TransBType, BlockingType>;
+  // functor_type experiment7_functor(A, B, C, gemm_args);
+
+  for (uint32_t i = 0; i < warm_up_n; ++i) {
+    Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm",
+                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor);
+                         //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor);
+    Kokkos::fence();
+  }
+
+  timer.reset();
+  for (uint32_t i = 0; i < n; ++i) {
+    Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm",
+                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor);
+                         //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor);
+    Kokkos::fence();
+  }
+
+  __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7");
+  return;
+}
+
 /*************************** Internal setup fns **************************/
 template <class scalar_type, class vta, class vtb, class vtc, class device_type>
 gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
@@ -1302,9 +1402,12 @@ void do_gemm_experiment_parallel(options_t options) {
                                               BlockingType, default_device>);
   __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment5<TransAType, TransBType,
-                                              BlockingType, default_device>); */
+                                              BlockingType, default_device>);
   __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment6<TransAType, TransBType,
+      BlockingType, default_device>); */
+  __do_loop_and_invoke(
+      options, __do_gemm_parallel_experiment7<TransAType, TransBType,
                                               BlockingType, default_device>);
 }
 

From 3c805868b780334ab037d2ebc47ce711f1246cc5 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 2 Mar 2021 12:51:37 -0700
Subject: [PATCH 19/47] perf_test/blas/blas3: replace experiment7 with
 batched_team_simd

---
 perf_test/blas/blas3/KokkosBlas3_common.hpp   |   5 +-
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 395 +++++++++++-------
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  19 +-
 3 files changed, 261 insertions(+), 158 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index a2c1e6f6ae..b398ed62aa 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -137,6 +137,8 @@ typedef enum TEST {
   BATCHED_TEAM_BLOCKED,
   BATCHED_TEAM_VECTOR,
   BATCHED_TEAM_VECTOR_BLOCKED,
+  BATCHED_TEAM_SIMD,
+  BATCHED_TEAM_SIMD_BLOCKED,
   // ADD MORE TEST TYPES HERE
   EXPERIMENT,
   TEST_N
@@ -145,7 +147,8 @@ typedef enum TEST {
 static std::string test_e_str[TEST_N]{
     "blas", "batched_serial", "batched_serial_blocked", "batched_team",
     "batched_team_blocked", "batched_team_vector",
-    "batched_team_vector_blocked",
+    "batched_team_vector_blocked", "batched_team_simd",
+    "batched_team_simd_blocked",
     // ADD MORE TEST TYPES HERE
     "experiment"};
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 86b46e5adb..91bf649fed 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -74,6 +74,8 @@ void do_gemm_team_batched_parallel(options_t options);
 void do_gemm_team_batched_blocked_parallel(options_t options);
 void do_gemm_team_vector_batched_parallel(options_t options);
 void do_gemm_team_vector_batched_blocked_parallel(options_t options);
+void do_gemm_team_simd_batched_parallel(options_t options);
+void do_gemm_team_simd_batched_blocked_parallel(options_t options);
 void do_gemm_experiment_parallel(options_t options);
 
 struct SerialTag {};
@@ -82,6 +84,10 @@ struct TeamTag {};
 struct TeamBatchDim3Tag {};
 struct TeamVectorTag {};
 struct TeamVectorBatchDim3Tag {};
+struct TeamSimdTag {};
+struct TeamSimdBatchDim4Tag {};
+// TODO: struct SerialSimdTag {};
+// TODO: struct SerialSimdBatchDim4Tag {};
 struct LayoutLeftTag {};
 struct LayoutRightTag {};
 struct SimdCpuTag {};
@@ -93,6 +99,7 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
         do_gemm_serial_batched, do_gemm_serial_batched_blocked,  // Serial
         NULL, NULL,                                              // Team
         NULL, NULL,                                              // TeamVector
+        NULL, NULL,                                              // TeamSimd
         NULL  // Serial Experiment
     },
     {
@@ -102,6 +109,8 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
         do_gemm_team_batched_parallel,
         do_gemm_team_batched_blocked_parallel,       // Team
         do_gemm_team_vector_batched_parallel, NULL,  // TeamVector
+        do_gemm_team_simd_batched_parallel, 
+        do_gemm_team_simd_batched_blocked_parallel,  // TeamSimd
         do_gemm_experiment_parallel                  // Parallel Experiment
     }};
 
@@ -112,6 +121,18 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
 
 using view_type_3d =
     Kokkos::View<default_scalar ***, default_layout, default_device>;
+using view_type_4d = Kokkos::View<default_scalar****, default_layout, default_device>;
+
+// Construct the vector type
+using memory_space = typename default_device::execution_space::memory_space;
+constexpr int simd_vector_size =
+    KokkosBatched::DefaultVectorLength<default_scalar, memory_space>::value;
+constexpr int simd_internal_vector_size = 
+    KokkosBatched::DefaultInternalVectorLength<default_scalar, memory_space>::value;
+using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>, simd_vector_size>;
+using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>, simd_internal_vector_size>;
+using vector_view_type_3d = Kokkos::View<vector_type***, default_layout, default_device>;
+using internal_vector_view_type_4d = Kokkos::View<internal_vector_type****, default_layout, default_device>;
 
 struct batched_params {
   int team_size;
@@ -119,12 +140,58 @@ struct batched_params {
 };
 typedef struct batched_params batched_params_t;
 
+/**
+ * @brief struct gemm_simd_args encapsulates the data types required
+ * for allocating and passing a single matrix to the KokkosBatched gemm
+ * kernels. To invoke gemm on a batch of matrices, three instances of this
+ * struct are required, one for each matrix, A, B, and C.
+ * 
+ * @var  vec_3d: 3-rank view type used for allocating the underlying data.
+ *               A reference must be kept to this object to ensure the
+ *               data is not free'd by the C++ runtime.
+ * @var  mat_4d: 4-rank view type used for populating the simd view with
+ *               random values.
+ * @var ivec_4d: 4-rank view type used for passing to math kernels. This
+ *               view type is used for leveraging simd instructions on 
+ *               both the host and device.
+ */
+struct gemm_simd_args {
+  vector_view_type_3d vec_3d;
+  view_type_4d mat_4d;
+  internal_vector_view_type_4d ivec_4d;
+};
+typedef struct gemm_simd_args gemm_simd_args_t;
+
+/**
+ * @brief struct gemm_args are common arguments passed to
+ * both gemm implementations in the KokkosBlas and KokkosBatched
+ * namespaces throughout these performance tests.
+ *
+ * @var transA: transpose type for A matrix.
+ *              supported types:   'n' - no transpose, 't' - transpose.
+ *              unsupported types: 'c' - conjugate transpose.
+ * @var transB: transpose type for B matrix.
+ *              supported types:   'n' - no transpose, 't' - transpose.
+ *              unsupported types: 'c' - conjugate transpose.
+ * @var alpha: scalar applied to the A*B product.
+ * @var beta:  scalar applied to C matrix.
+ * @var A:     3-rank view type used in all non-simd tests.
+ * @var B:     3-rank view type used in all non-simd tests.
+ * @var C:     3-rank view type used in all non-simd tests.
+ * @var bp:    team_size and vector_length for tests that use Kokkos::TeamPolicy.
+ * @var Av:    3-rank and 4-rank vector view types for simd tests.
+ * @var Bv:    3-rank and 4-rank vector view types for simd tests.
+ * @var Cv:    3-rank and 4-rank vector view types for simd tests.
+ */ 
 struct gemm_args {
   char transA, transB;
   default_scalar alpha;
   default_scalar beta;
   view_type_3d A, B, C;
   batched_params_t bp;
+  // Below are matrices for simd tests
+  gemm_simd_args_t Av, Bv, Cv;
+  matrix_dims_t dims;
 };
 typedef struct gemm_args gemm_args_t;
 
@@ -135,15 +202,26 @@ static std::string gemm_csv_header_str =
 
 /*************************** Internal helper fns **************************/
 // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int __gemm_flop_count(int a_m, int a_n, int b_k) {
+static inline int __gemm_flop_count(int a_m, int a_n, int b_n) {
     if (std::is_same<double, default_scalar>::value ||
         std::is_same<float, default_scalar>::value ||
         std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
-      return 2 * a_m * b_k * a_n;
+      return 2 * a_m * b_n * a_n;
     else
       // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
-      return (2 + 6) * a_m * b_k * a_n;
+      return (2 + 6) * a_m * b_n * a_n;
 }
+
+static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) {
+  std::string x = "x";
+  std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n);
+
+  if (options.blas_args.batch_size_last_dim)
+    return ret + x + std::to_string(dim.k);
+  else
+    return std::to_string(dim.k) + x + ret;
+}
+
 static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                                   double time_in_seconds,
                                   const char *experiment_name = nullptr) {
@@ -157,13 +235,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   double gflops;
   double average_time = time_in_seconds / options.n;
 
-  if (options.blas_args.batch_size_last_dim) {
-    flops = gemm_args.A.extent(2) * __gemm_flop_count(gemm_args.A.extent(0), gemm_args.A.extent(1),
-                                                      gemm_args.B.extent(1));
-  } else {
-    flops = gemm_args.A.extent(0) * __gemm_flop_count(gemm_args.A.extent(1), gemm_args.A.extent(2),
-                                                      gemm_args.B.extent(2));
-  }
+  flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n,
+						 gemm_args.dims.b.n);
 
   gflops = flops / 1e9;
 
@@ -172,12 +245,11 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                  << options.blas_args.gemm.beta << ","
                  << ts << ","
                  << vlen << ","
-                 << loop_e_str[options.loop] << "," << gemm_args.A.extent(0)
-                 << "x" << gemm_args.A.extent(1) << "x" << gemm_args.A.extent(2)
-                 << "," << gemm_args.B.extent(0) << "x" << gemm_args.B.extent(1)
-                 << "x" << gemm_args.B.extent(2) << "," << gemm_args.C.extent(0)
-                 << "x" << gemm_args.C.extent(1) << "x" << gemm_args.C.extent(2)
-                 << "," << options.warm_up_n << "," << options.n << ","
+                 << loop_e_str[options.loop] << "," 
+		 << __gemm_output_dim_string(options, gemm_args.dims.a) << ","
+		 << __gemm_output_dim_string(options, gemm_args.dims.b) << ","
+		 << __gemm_output_dim_string(options, gemm_args.dims.c) << ","
+		 << options.warm_up_n << "," << options.n << ","
                  << time_in_seconds << ","
                  << time_in_seconds / options.n << ","
                  << flops << ","
@@ -385,13 +457,34 @@ struct parallel_batched_gemm_range_policy {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamTag &, const int &i) const {}
+  void operator()(const TeamTag &, const int &i) const {
+    Kokkos::abort("TeamTag not supported using RangePolicy.");
+  }
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamBatchDim3Tag &, const int &i) const {}
+  void operator()(const TeamBatchDim3Tag &, const int &i) const {
+    Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy.");
+      }
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamVectorTag &, const int &i) const {}
+  void operator()(const TeamVectorTag &, const int &i) const {
+    Kokkos::abort("TeamVectorTag not supported using RangePolicy.");
+      }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {
+    Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy.");
+      }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamSimdTag &, const int &i) const {
+    Kokkos::abort("TeamSimdTag not supported using RangePolicy.");
+      }
+
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {}
+  void operator()(const TeamSimdBatchDim4Tag &, const int &i) const {
+    Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy.");
+      }
 };
 
 template <class MemberType, class TransAType, class TransBType,
@@ -480,6 +573,30 @@ struct parallel_batched_gemm {
                                                         svB, gemm_args_.beta,
                                                         svC);
   }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamSimdTag &, const MemberType &member) const {
+    auto i = member.league_rank();
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)),[&](const int &vector_lane) {
+      auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
+      auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
+      auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
+
+      KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+   });
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const {
+    auto i = member.league_rank();
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) {
+      auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+      auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+      auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
+
+      KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+   });
+  }
 };
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
@@ -531,17 +648,22 @@ void __do_gemm_parallel_batched_template(options_t options,
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
   auto league_size   = options.start.c.k;
+  auto team_size  = gemm_args.bp.team_size;
+  auto vector_len = gemm_args.bp.vector_len;
   Kokkos::Timer timer;
 
   if (std::is_same<AlgoTag, SerialTag>::value || std::is_same<AlgoTag, SerialBatchDim3Tag>::value) {
     return __do_gemm_parallel_batched_template_range_policy<TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, gemm_args);
   }
 
+  if (std::is_same<AlgoTag, TeamSimdTag>::value || std::is_same<AlgoTag, TeamSimdBatchDim4Tag>::value) {
+    league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0);
+    vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len?
+  }
+
   STATUS;
 
   functor_type parallel_batched_gemm_functor(gemm_args);
-  auto team_size  = gemm_args.bp.team_size;
-  auto vector_len = gemm_args.bp.vector_len;
 
   if (options.blas_args.use_auto) {
     for (uint32_t i = 0; i < warm_up_n; i++) {
@@ -965,7 +1087,7 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) {
   using scalar_type = typename view_type_3d::value_type;
   constexpr int vl =
       KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
-  using simd_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, vl>;
+  using simd_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, simd_vector_size>;
   using simd_view_type =
       Kokkos::View<simd_type ***, default_layout, default_device>;
   using functor_type =
@@ -1051,6 +1173,7 @@ class parallel_batched_gemm_experiment6 {
 template <class TransAType, class TransBType, class BlockingType,
           class device_type>
 void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
+#if 0
   using execution_space = typename device_type::execution_space;
   using policy_type     = Kokkos::TeamPolicy<execution_space>;
   using member_type     = typename policy_type::member_type;
@@ -1061,8 +1184,6 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
       KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
   constexpr int il = 
       KokkosBatched::DefaultInternalVectorLength<scalar_type, execution_space>::value;
-  using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, vl>;
-  using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, il>;
   using view_type = Kokkos::View<scalar_type***[vl], default_layout, default_device>;
   using vector_view_type = Kokkos::View<vector_type***, default_layout, default_device>;
   using internal_vector_view_type = Kokkos::View<internal_vector_type***, default_layout, default_device>;
@@ -1113,112 +1234,13 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
   }
 
   __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment6");
-  return;
-}
-
-template <class MemberType, class SimdViewType, class TransAType, class TransBType,
-          class BlockingType>
-class parallel_batched_gemm_experiment7 {
- private:
-  SimdViewType &A, &B, &C;
-  gemm_args_t gemm_args;
-
- public:
-  parallel_batched_gemm_experiment7(SimdViewType &_A, SimdViewType &_B,
-                                    SimdViewType &_C, gemm_args_t _gemm_args)
-      : A(_A), B(_B), C(_C), gemm_args(_gemm_args) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const MemberType &member) const {
-    auto i = member.league_rank();
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, A.extent(0)),[&](const int &vector_lane) {
-	auto svA = Kokkos::subview(A, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-	auto svB = Kokkos::subview(B, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-	auto svC = Kokkos::subview(C, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-
-	KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args.alpha, svA, svB, gemm_args.beta, svC);
-   });
-  }
-};
-
-template <class TransAType, class TransBType, class BlockingType,
-          class device_type>
-void __do_gemm_parallel_experiment7(options_t options, gemm_args_t gemm_args) {
-  using execution_space = typename device_type::execution_space;
-  using policy_type     = Kokkos::TeamPolicy<execution_space>;
-  using member_type     = typename policy_type::member_type;
-
-  // Construct the vector type
-  using scalar_type = typename view_type_3d::value_type;
-  constexpr int vl =
-      KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
-  constexpr int il = 
-      KokkosBatched::DefaultInternalVectorLength<scalar_type, execution_space>::value;
-  using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, vl>;
-  using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, il>;
-  using view_type = Kokkos::View<scalar_type****, default_layout, default_device>;
-  using vector_view_type = Kokkos::View<vector_type***, default_layout, default_device>;
-  using internal_vector_view_type = Kokkos::View<internal_vector_type****, default_layout, default_device>;
-
-  uint32_t warm_up_n = options.warm_up_n;
-  uint32_t n         = options.n;
-  auto k             = options.start.c.k;
-  Kokkos::Timer timer;
-  auto simd_batch_size = k / vl + (k % vl > 0);
-  STATUS;
-
-  // Construct matrices
-  vector_view_type A_vector("A_vector", gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
-  view_type A((scalar_type *)A_vector.data(), vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
-  internal_vector_view_type A_vector_internal(A_vector.data(), il/vl, gemm_args.A.extent(0), gemm_args.A.extent(1), simd_batch_size);
-
-  vector_view_type B_vector("B_vector", gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
-  view_type B((scalar_type *)B_vector.data(), vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
-  internal_vector_view_type B_vector_internal(B_vector.data(), il/vl, gemm_args.B.extent(0), gemm_args.B.extent(1), simd_batch_size);
-
-  vector_view_type C_vector("C_vector", gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
-  view_type C((scalar_type *)C_vector.data(), vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
-  internal_vector_view_type C_vector_internal(C_vector.data(), il/vl, gemm_args.C.extent(0), gemm_args.C.extent(1), simd_batch_size);
-
-  uint64_t seed = Kokkos::Impl::clock_tic();
-  Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
-  Kokkos::fill_random(A, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
-  Kokkos::fill_random(B, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
-  Kokkos::fill_random(C, rand_pool, Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, scalar_type>::max());
-  Kokkos::fence();
-
-   using functor_type =
-       parallel_batched_gemm_experiment7<member_type, internal_vector_view_type,
-                                         TransAType, TransBType, BlockingType>;
-    functor_type experiment7_functor(A_vector_internal, B_vector_internal, C_vector_internal, gemm_args);
-
-  //using functor_type =
-  //    parallel_batched_gemm_experiment7<member_type, view_type,
-  //                                      TransAType, TransBType, BlockingType>;
-  // functor_type experiment7_functor(A, B, C, gemm_args);
-
-  for (uint32_t i = 0; i < warm_up_n; ++i) {
-    Kokkos::parallel_for("parallelBatchedUntimedExperiment7Gemm",
-                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor);
-                         //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor);
-    Kokkos::fence();
-  }
-
-  timer.reset();
-  for (uint32_t i = 0; i < n; ++i) {
-    Kokkos::parallel_for("parallelBatchedTimedExperiment7Gemm",
-                         policy_type(simd_batch_size, Kokkos::AUTO, vl/il), experiment7_functor);
-                         //policy_type(simd_batch_size, Kokkos::AUTO, vl), experiment7_functor);
-    Kokkos::fence();
-  }
-
-  __gemm_output_csv_row(options, gemm_args, timer.seconds(), "experiment7");
+#endif
   return;
 }
 
 /*************************** Internal setup fns **************************/
 template <class scalar_type, class vta, class vtb, class vtc, class device_type>
-gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
+gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   using execution_space = typename device_type::execution_space;
 
   gemm_args_t gemm_args;
@@ -1226,32 +1248,83 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
   STATUS;
 
+  gemm_args.dims = dims;
   gemm_args.transA        = options.blas_args.gemm.gemm_args.c_str()[0];
   gemm_args.transB        = options.blas_args.gemm.gemm_args.c_str()[1];
-  if (options.blas_args.batch_size_last_dim) {
-    gemm_args.A             = vta("gemm_args.A", dim.a.m, dim.a.n, dim.a.k);
-    gemm_args.B             = vtb("gemm_args.B", dim.b.m, dim.b.n, dim.b.k);
-    gemm_args.C             = vtc("gemm_args.C", dim.c.m, dim.c.n, dim.c.k);
+  if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) {
+    // Calculate the batch size for simd views
+    auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0);
+    auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0);
+    auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0);
+
+    // Reference gemm simd arguments for allocating A, B, and C matrices
+    gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv;
+
+    if (options.blas_args.batch_size_last_dim) {
+      // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views)
+      A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size);
+      A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size);
+      A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size);
+
+      B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size);
+      B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size);
+      B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size);
+
+      C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size);
+      C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size);
+      C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size);
+
+    } else {
+      // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views)
+      A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n);
+      A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size);
+      A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size);
+
+      B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n);
+      B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size);
+      B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size);
+
+      C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n);
+      C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size);
+      C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size);
+    }
+
+    // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments
+    Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
+    Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
+    Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
   } else {
-    gemm_args.A             = vta("gemm_args.A", dim.a.k, dim.a.m, dim.a.n);
-    gemm_args.B             = vtb("gemm_args.B", dim.b.k, dim.b.m, dim.b.n);
-    gemm_args.C             = vtc("gemm_args.C", dim.c.k, dim.c.m, dim.c.n);
+    if (options.blas_args.batch_size_last_dim) {
+      gemm_args.A             = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k);
+      gemm_args.B             = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k);
+      gemm_args.C             = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k);
+    } else {
+      gemm_args.A             = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n);
+      gemm_args.B             = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n);
+      gemm_args.C             = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n);
+    }
+
+    Kokkos::fill_random(gemm_args.A, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
+    Kokkos::fill_random(gemm_args.B, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
+    Kokkos::fill_random(gemm_args.C, rand_pool,
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                    scalar_type>::max());
   }
   gemm_args.alpha         = options.blas_args.gemm.alpha;
   gemm_args.beta          = options.blas_args.gemm.beta;
   gemm_args.bp.team_size  = options.blas_args.team_size;
   gemm_args.bp.vector_len = options.blas_args.vector_len;
 
-  Kokkos::fill_random(gemm_args.A, rand_pool,
-                      Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                   scalar_type>::max());
-  Kokkos::fill_random(gemm_args.B, rand_pool,
-                      Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                   scalar_type>::max());
-  Kokkos::fill_random(gemm_args.C, rand_pool,
-                      Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                   scalar_type>::max());
-
   return gemm_args;
 }
 
@@ -1265,7 +1338,8 @@ void __do_loop_and_invoke(options_t options,
   __print_gemm_perf_test_options(options);
   std::cout << "SCALAR:" << typeid(default_scalar).name()
             << ", LAYOUT:" << typeid(default_layout).name()
-            << ", DEVICE:" << typeid(default_device).name() << std::endl;
+            << ", DEVICE:" << typeid(default_device).name() 
+            << ", SPACE:" << typeid(memory_space).name() << std::endl;
 
   options.out[0] << gemm_csv_header_str << std::endl;
 
@@ -1375,6 +1449,34 @@ void do_gemm_team_vector_batched_parallel(options_t options) {
   return;
 }
 
+void do_gemm_team_simd_batched_parallel(options_t options) {
+  STATUS;
+  if (options.blas_args.batch_size_last_dim)
+      __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
+                                            default_device>);
+  return;
+}
+
+void do_gemm_team_simd_batched_blocked_parallel(options_t options) {
+  STATUS;
+  if (options.blas_args.batch_size_last_dim)
+      __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
+                                            default_device>);
+  return;
+}
+
+
+// Blocked algo not yet implemented for TeamVectorGemm.
 /* void do_gemm_team_vector_batched_blocked_parallel(options_t options) {
   STATUS;
   __do_loop_and_invoke(
@@ -1388,7 +1490,7 @@ void do_gemm_experiment_parallel(options_t options) {
   using TransBType   = Trans::NoTranspose;
   using BlockingType = Algo::Gemm::Unblocked;
 
-/*   __do_loop_and_invoke(
+  __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment1<TransAType, TransBType,
                                               BlockingType, default_device>);
   __do_loop_and_invoke(
@@ -1405,9 +1507,6 @@ void do_gemm_experiment_parallel(options_t options) {
                                               BlockingType, default_device>);
   __do_loop_and_invoke(
       options, __do_gemm_parallel_experiment6<TransAType, TransBType,
-      BlockingType, default_device>); */
-  __do_loop_and_invoke(
-      options, __do_gemm_parallel_experiment7<TransAType, TransBType,
                                               BlockingType, default_device>);
 }
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 72a92a32b1..17aac3d526 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -74,7 +74,7 @@ static void __print_help_blas3_perf_test() {
   printf("Options:\n");
 
   printf("\t-h, --help\n");
-  printf("\t\tPrint this help menu.\n\n");
+  printf("\t\tPrint this help menu.\n");
 
   printf("\t-t, --test=OPTION\n");
   printf("\t\tAlgorithm selection.\n");
@@ -145,7 +145,7 @@ static void __print_help_blas3_perf_test() {
   printf("%c[1m", 27);
   printf("\t\t\t\tparallel:");
   printf("%c[0m", 27);
-  printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n\n");
+  printf(" invoke blas routine in a Kokkos::parallel_for-loop.\n");
 
   printf("\t-b, --matrix_size_start=MxN,IxJ,PxQ\n");
   printf(
@@ -153,7 +153,7 @@ static void __print_help_blas3_perf_test() {
       "(start)\n");
   printf(
       "\t\t\tValid values for M and N are any non-negative 32-bit integers. "
-      "(default: %dx%d,%dx%d,%dx%d)\n\n",
+      "(default: %dx%d,%dx%d,%dx%d)\n",
       DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START,
       DEFAULT_MATRIX_START, DEFAULT_MATRIX_START, DEFAULT_MATRIX_START);
 
@@ -163,7 +163,7 @@ static void __print_help_blas3_perf_test() {
       "(stop)\n");
   printf(
       "\t\t\tValid dimension values are any non-negative 32-bit integers. "
-      "(default: %dx%d,%dx%d,%dx%d)\n\n",
+      "(default: %dx%d,%dx%d,%dx%d)\n",
       DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP,
       DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP, DEFAULT_MATRIX_STOP);
 
@@ -171,34 +171,34 @@ static void __print_help_blas3_perf_test() {
   printf("\t\tMatrix step selection.\n");
   printf(
       "\t\t\tValid value for K is any non-negative 32-bit integer. (default: "
-      "%d)\n\n",
+      "%d)\n",
       DEFAULT_STEP);
 
   printf("\t-w, --warm_up_loop=LOOP\n");
   printf("\t\tWarm up loop selection. (untimed)\n");
   printf(
       "\t\t\tValid value for LOOP is any non-negative 32-bit integer that's <= "
-      "ITER. (default: %d)\n\n",
+      "ITER. (default: %d)\n",
       DEFAULT_WARM_UP_N);
 
   printf("\t-i, --iter=ITER\n");
   printf("\t\tIteration selection. (timed)\n");
   printf(
       "\t\t\tValid value for ITER is any non-negative 32-bit integer. "
-      "(default: %d)\n\n",
+      "(default: %d)\n",
       DEFAULT_N);
 
   printf("\t-c, --csv=/path/to/file.csv\n");
   printf("\t\tCsv output file selection.\n");
   printf(
       "\t\t\tValid value for /path/to/file.csv is any valid file name. "
-      "(default: stdout)\n\n");
+      "(default: stdout)\n");
 
   printf("\t-r, --routines=ROUTINES\n");
   printf("\t\tRoutine selection.\n");
   printf(
       "\t\t\tValid value for ROUTINES is one of more valid blas3 routines "
-      "delimited by a comma. (default: %s)\n\n",
+      "delimited by a comma. (default: %s)\n",
       DEFAULT_BLAS_ROUTINES);
 }
 
@@ -250,6 +250,7 @@ int main(int argc, char **argv) {
   options.blas_args.team_size  = DEFAULT_TEAM_SIZE;
   options.blas_args.vector_len = DEFAULT_VECTOR_LEN;
   options.blas_args.use_auto   = DEFAULT_USE_AUTO;
+  options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM;
 
   options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS;
   options.blas_args.trmm.alpha     = DEFAULT_TRMM_ALPHA;

From b5c7b88b1682e9eeb19d8358582f5b5df042a340 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 2 Mar 2021 12:52:01 -0700
Subject: [PATCH 20/47] perf_test/batched: Add README.md

---
 perf_test/batched/README.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 perf_test/batched/README.md

diff --git a/perf_test/batched/README.md b/perf_test/batched/README.md
new file mode 100644
index 0000000000..ca5920ae39
--- /dev/null
+++ b/perf_test/batched/README.md
@@ -0,0 +1 @@
+Batched BLAS performance tests reside in `perf_test/blas/{blas,blas3}`.

From d9e9d04d2005334ed2638b37d149cc67fa43eee7 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 2 Mar 2021 16:14:04 -0700
Subject: [PATCH 21/47] perf_test/blas/blas3: Add last gemm test types

  - Added serial SIMD test types.
  - Added serial compact MKL test type.
---
 perf_test/blas/blas3/KokkosBlas3_common.hpp   | 29 ++++--
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 93 +++++++++++++++----
 2 files changed, 97 insertions(+), 25 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index b398ed62aa..d37f11eea9 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -119,20 +119,28 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"};
 
 /**
  * @var BLAS:                          Run the blas routine through the
- * KokkosBlas namespace.
+ *                                     KokkosBlas namespace.
  * @var BATCHED_SERIAL{_BLOCKED}:      Run the serial blas routine through the
  *                                     KokkosBatched namespace.
+ * @var BATCHED_SERIAL_SIMD{_BLOCKED}: Run the serial blas routine through the
+ *                                     KokkosBatched namespace using SIMD views.
+ * @var BATCHED_SERIAL_COMPACT_MKL:    Run the serial blas mkl routine through
+ *                                     the KokkosBatched namespace.
  * @var BATCHED_TEAM{_BLOCKED}:        Run the team blas routine through the
- * KokkosBatched namespace.
+ *                                     KokkosBatched namespace.
  * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through
- * the KokkosBatched namespace.
- * @var EXPERIMENT:                    Run the blas routine as a custom
- * experiment.
+ *                                     the KokkosBatched namespace.
+ * @var BATCHED_TEAM_SIMD{_BLOCKED}:   Run the team vector blas routine through the
+ *                                     KokkosBatched namespace using SIMD views.
+ * @var EXPERIMENT:                    Run the blas routine as a custom experiment.
  */
 typedef enum TEST {
   BLAS,
   BATCHED_SERIAL,
   BATCHED_SERIAL_BLOCKED,
+  BATCHED_SERIAL_SIMD,
+  BATCHED_SERIAL_SIMD_BLOCKED,
+  BATCHED_SERIAL_COMPACT_MKL,
   BATCHED_TEAM,
   BATCHED_TEAM_BLOCKED,
   BATCHED_TEAM_VECTOR,
@@ -145,10 +153,13 @@ typedef enum TEST {
 } test_e;
 
 static std::string test_e_str[TEST_N]{
-    "blas", "batched_serial", "batched_serial_blocked", "batched_team",
-    "batched_team_blocked", "batched_team_vector",
-    "batched_team_vector_blocked", "batched_team_simd",
-    "batched_team_simd_blocked",
+    "blas", 
+    "batched_serial", "batched_serial_blocked",
+    "batched_serial_simd", "batched_serial_simd_blocked",
+    "batched_serial_compact_mkl",
+    "batched_team", "batched_team_blocked", 
+    "batched_team_vector", "batched_team_vector_blocked", 
+    "batched_team_simd", "batched_team_simd_blocked",
     // ADD MORE TEST TYPES HERE
     "experiment"};
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 91bf649fed..5fffd02dc8 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -70,6 +70,9 @@ void do_gemm_serial_batched_blocked(options_t options);
 // invocation!
 void do_gemm_serial_batched_parallel(options_t options);
 void do_gemm_serial_batched_blocked_parallel(options_t options);
+void do_gemm_serial_simd_batched_parallel(options_t options);
+void do_gemm_serial_simd_batched_blocked_parallel(options_t options);
+void do_gemm_serial_batched_compact_mkl_parallel(options_t options);
 void do_gemm_team_batched_parallel(options_t options);
 void do_gemm_team_batched_blocked_parallel(options_t options);
 void do_gemm_team_vector_batched_parallel(options_t options);
@@ -104,8 +107,11 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
     },
     {
         NULL,  // BLAS
-        do_gemm_serial_batched_parallel,
-        do_gemm_serial_batched_blocked_parallel,  // Serial
+        do_gemm_serial_batched_parallel, // Serial
+        do_gemm_serial_batched_blocked_parallel,
+        do_gemm_serial_simd_batched_parallel,
+        do_gemm_serial_simd_batched_blocked_parallel,  
+        do_gemm_serial_batched_compact_mkl_parallel, 
         do_gemm_team_batched_parallel,
         do_gemm_team_batched_blocked_parallel,       // Team
         do_gemm_team_vector_batched_parallel, NULL,  // TeamVector
@@ -488,7 +494,7 @@ struct parallel_batched_gemm_range_policy {
 };
 
 template <class MemberType, class TransAType, class TransBType,
-          class BlockingType>
+          class BlockingType, class AlgoMode = void>
 struct parallel_batched_gemm {
   gemm_args_t gemm_args_;
 
@@ -582,7 +588,7 @@ struct parallel_batched_gemm {
       auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
       auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
 
-      KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+      KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
    });
   }
 
@@ -594,7 +600,7 @@ struct parallel_batched_gemm {
       auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
       auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
 
-      KokkosBatched::TeamGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+      KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
    });
   }
 };
@@ -636,14 +642,14 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar
 }
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
-          class device_type>
+          class device_type, class algo_mode = void>
 void __do_gemm_parallel_batched_template(options_t options,
                                          gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
   using policy_type     = Kokkos::TeamPolicy<AlgoTag, execution_space>;
   using member_type     = typename policy_type::member_type;
   using functor_type =
-      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType>;
+      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType, algo_mode>;
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
@@ -702,7 +708,7 @@ void __do_gemm_parallel_batched_template(options_t options,
   return;
 }
 
-template <class algo_tag, class blocking_type, class device_type>
+template <class algo_tag, class blocking_type, class device_type, class algo_mode = void>
 void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) {
   char a  = gemm_args.transA;
   char b  = gemm_args.transB;
@@ -714,19 +720,19 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) {
 
   if (a == 'N' && b == 'N') {
     __do_gemm_parallel_batched_template<N, N, blocking_type, algo_tag,
-                                        device_type>(options, gemm_args);
+                                        device_type, algo_mode>(options, gemm_args);
   } else if (a == 'N' && b == 'T') {
     __do_gemm_parallel_batched_template<N, T, blocking_type, algo_tag,
-                                        device_type>(options, gemm_args);
+                                        device_type, algo_mode>(options, gemm_args);
     //} else if (a == 'N' && b == 'C') {
     //  __do_gemm_parallel_batched_template<N, C, blocking_type, algo_tag,
     //  device_type>(options, gemm_args);
   } else if (a == 'T' && b == 'N') {
     __do_gemm_parallel_batched_template<T, N, blocking_type, algo_tag,
-                                        device_type>(options, gemm_args);
+                                        device_type, algo_mode>(options, gemm_args);
   } else if (a == 'T' && b == 'T') {
     __do_gemm_parallel_batched_template<T, T, blocking_type, algo_tag,
-                                        device_type>(options, gemm_args);
+                                        device_type, algo_mode>(options, gemm_args);
     //} else if (a == 'T' && b == 'C') {
     //  __do_gemm_parallel_batched_template<T, C, blocking_type, algo_tag,
     //  device_type>(options, gemm_args);
@@ -1410,6 +1416,61 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) {
   return;
 }
 
+void do_gemm_serial_simd_batched_parallel(options_t options) {
+  STATUS;
+  if (options.blas_args.batch_size_last_dim)
+      __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
+                                            default_device, Mode::Serial>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
+                                            default_device, Mode::Serial>);
+  return;
+}
+
+void do_gemm_serial_simd_batched_blocked_parallel(options_t options) {
+  STATUS;
+  if (options.blas_args.batch_size_last_dim)
+      __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
+                                            default_device, Mode::Serial>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
+                                            default_device, Mode::Serial>);
+  return;
+}
+
+void do_gemm_serial_batched_compact_mkl_parallel(options_t options) {
+  STATUS;
+#if                                                            \
+  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) &&               \
+  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) &&       \
+  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
+  if (options.blas_args.batch_size_last_dim)
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::CompactMKL,
+                                            default_device>);
+  else
+    __do_loop_and_invoke(
+        options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::CompactMKL,
+                                            default_device>);
+#else
+  #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__)
+    std::cerr << std::string(__func__)
+              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl;
+  #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__)
+    std::cerr << std::string(__func__)
+              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl;
+  #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
+    std::cerr << std::string(__func__)
+              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." << std::endl;
+  #endif
+#endif
+  return;
+}
+
 void do_gemm_team_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
@@ -1454,11 +1515,11 @@ void do_gemm_team_simd_batched_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
       __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
-                                            default_device>);
+                                            default_device, Mode::Team>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
-                                            default_device>);
+                                            default_device, Mode::Team>);
   return;
 }
 
@@ -1467,11 +1528,11 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
       __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
-                                            default_device>);
+                                            default_device, Mode::Team>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
-                                            default_device>);
+                                            default_device, Mode::Team>);
   return;
 }
 

From fa23cf75b5b4da16a468dfa9640b8bc84b5d5614 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 2 Mar 2021 16:19:04 -0700
Subject: [PATCH 22/47] perf_test/blas/blas3: Apply clang-format

---
 perf_test/blas/blas3/KokkosBlas3_common.hpp   |  19 +-
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 480 +++++++++++-------
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  83 +--
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 116 +++--
 4 files changed, 405 insertions(+), 293 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index d37f11eea9..a991efe61e 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -130,9 +130,10 @@ static std::string loop_e_str[LOOP_N] = {"serial", "parallel"};
  *                                     KokkosBatched namespace.
  * @var BATCHED_TEAM_VECTOR{_BLOCKED}: Run the team vector blas routine through
  *                                     the KokkosBatched namespace.
- * @var BATCHED_TEAM_SIMD{_BLOCKED}:   Run the team vector blas routine through the
- *                                     KokkosBatched namespace using SIMD views.
- * @var EXPERIMENT:                    Run the blas routine as a custom experiment.
+ * @var BATCHED_TEAM_SIMD{_BLOCKED}:   Run the team vector blas routine through
+ * the KokkosBatched namespace using SIMD views.
+ * @var EXPERIMENT:                    Run the blas routine as a custom
+ * experiment.
  */
 typedef enum TEST {
   BLAS,
@@ -153,13 +154,11 @@ typedef enum TEST {
 } test_e;
 
 static std::string test_e_str[TEST_N]{
-    "blas", 
-    "batched_serial", "batched_serial_blocked",
-    "batched_serial_simd", "batched_serial_simd_blocked",
-    "batched_serial_compact_mkl",
-    "batched_team", "batched_team_blocked", 
-    "batched_team_vector", "batched_team_vector_blocked", 
-    "batched_team_simd", "batched_team_simd_blocked",
+    "blas", "batched_serial", "batched_serial_blocked", "batched_serial_simd",
+    "batched_serial_simd_blocked", "batched_serial_compact_mkl", "batched_team",
+    "batched_team_blocked", "batched_team_vector",
+    "batched_team_vector_blocked", "batched_team_simd",
+    "batched_team_simd_blocked",
     // ADD MORE TEST TYPES HERE
     "experiment"};
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 5fffd02dc8..3e55a85799 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -106,16 +106,16 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
         NULL  // Serial Experiment
     },
     {
-        NULL,  // BLAS
-        do_gemm_serial_batched_parallel, // Serial
+        NULL,                             // BLAS
+        do_gemm_serial_batched_parallel,  // Serial
         do_gemm_serial_batched_blocked_parallel,
         do_gemm_serial_simd_batched_parallel,
-        do_gemm_serial_simd_batched_blocked_parallel,  
-        do_gemm_serial_batched_compact_mkl_parallel, 
+        do_gemm_serial_simd_batched_blocked_parallel,
+        do_gemm_serial_batched_compact_mkl_parallel,
         do_gemm_team_batched_parallel,
         do_gemm_team_batched_blocked_parallel,       // Team
         do_gemm_team_vector_batched_parallel, NULL,  // TeamVector
-        do_gemm_team_simd_batched_parallel, 
+        do_gemm_team_simd_batched_parallel,
         do_gemm_team_simd_batched_blocked_parallel,  // TeamSimd
         do_gemm_experiment_parallel                  // Parallel Experiment
     }};
@@ -123,22 +123,29 @@ void (*do_gemm_invoke[LOOP_N][TEST_N])(options_t) = {
 /*************************** Test types and defaults **************************/
 #define DEFAULT_GEMM_ARGS "NN"
 #define DEFAULT_GEMM_ALPHA 1.0
-#define DEFAULT_GEMM_BETA  1.0
+#define DEFAULT_GEMM_BETA 1.0
 
 using view_type_3d =
     Kokkos::View<default_scalar ***, default_layout, default_device>;
-using view_type_4d = Kokkos::View<default_scalar****, default_layout, default_device>;
+using view_type_4d =
+    Kokkos::View<default_scalar ****, default_layout, default_device>;
 
 // Construct the vector type
 using memory_space = typename default_device::execution_space::memory_space;
 constexpr int simd_vector_size =
     KokkosBatched::DefaultVectorLength<default_scalar, memory_space>::value;
-constexpr int simd_internal_vector_size = 
-    KokkosBatched::DefaultInternalVectorLength<default_scalar, memory_space>::value;
-using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>, simd_vector_size>;
-using internal_vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>, simd_internal_vector_size>;
-using vector_view_type_3d = Kokkos::View<vector_type***, default_layout, default_device>;
-using internal_vector_view_type_4d = Kokkos::View<internal_vector_type****, default_layout, default_device>;
+constexpr int simd_internal_vector_size =
+    KokkosBatched::DefaultInternalVectorLength<default_scalar,
+                                               memory_space>::value;
+using vector_type = KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>,
+                                          simd_vector_size>;
+using internal_vector_type =
+    KokkosBatched::Vector<KokkosBatched::SIMD<default_scalar>,
+                          simd_internal_vector_size>;
+using vector_view_type_3d =
+    Kokkos::View<vector_type ***, default_layout, default_device>;
+using internal_vector_view_type_4d =
+    Kokkos::View<internal_vector_type ****, default_layout, default_device>;
 
 struct batched_params {
   int team_size;
@@ -151,14 +158,14 @@ typedef struct batched_params batched_params_t;
  * for allocating and passing a single matrix to the KokkosBatched gemm
  * kernels. To invoke gemm on a batch of matrices, three instances of this
  * struct are required, one for each matrix, A, B, and C.
- * 
+ *
  * @var  vec_3d: 3-rank view type used for allocating the underlying data.
  *               A reference must be kept to this object to ensure the
  *               data is not free'd by the C++ runtime.
  * @var  mat_4d: 4-rank view type used for populating the simd view with
                  random values.
  * @var ivec_4d: 4-rank view type used for passing to math kernels. This
- *               view type is used for leveraging simd instructions on 
+ *               view type is used for leveraging simd instructions on
  *               both the host and device.
  */
 struct gemm_simd_args {
@@ -184,11 +191,12 @@ typedef struct gemm_simd_args gemm_simd_args_t;
  * @var A:     3-rank view type used in all non-simd tests.
  * @var B:     3-rank view type used in all non-simd tests.
  * @var C:     3-rank view type used in all non-simd tests.
- * @var bp:    team_size and vector_length for tests that use Kokkos::TeamPolicy.
+ * @var bp:    team_size and vector_length for tests that use
+ *             Kokkos::TeamPolicy.
  * @var Av:    3-rank and 4-rank vector view types for simd tests.
  * @var Bv:    3-rank and 4-rank vector view types for simd tests.
  * @var Cv:    3-rank and 4-rank vector view types for simd tests.
- */ 
+ */
 struct gemm_args {
   char transA, transB;
   default_scalar alpha;
@@ -207,19 +215,22 @@ static std::string gemm_csv_header_str =
     "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s)";
 
 /*************************** Internal helper fns **************************/
-// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
+// Flop count formula from lapack working note 41:
+// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
 static inline int __gemm_flop_count(int a_m, int a_n, int b_n) {
-    if (std::is_same<double, default_scalar>::value ||
-        std::is_same<float, default_scalar>::value ||
-        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
-      return 2 * a_m * b_n * a_n;
-    else
-      // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
-      return (2 + 6) * a_m * b_n * a_n;
+  if (std::is_same<double, default_scalar>::value ||
+      std::is_same<float, default_scalar>::value ||
+      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+    return 2 * a_m * b_n * a_n;
+  else
+    // For complex, we need to count 2 flops for each add and 6 flops for each
+    // multiply.
+    return (2 + 6) * a_m * b_n * a_n;
 }
 
-static inline std::string __gemm_output_dim_string(options_t options, matrix_dim_t dim) {
-  std::string x = "x";
+static inline std::string __gemm_output_dim_string(options_t options,
+                                                   matrix_dim_t dim) {
+  std::string x   = "x";
   std::string ret = std::to_string(dim.m) + x + std::to_string(dim.n);
 
   if (options.blas_args.batch_size_last_dim)
@@ -232,8 +243,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
                                   double time_in_seconds,
                                   const char *experiment_name = nullptr) {
   std::string algo_name = test_e_str[options.test];
-  std::string ts = std::to_string(gemm_args.bp.team_size);
-  std::string vlen = std::to_string(gemm_args.bp.vector_len);
+  std::string ts        = std::to_string(gemm_args.bp.team_size);
+  std::string vlen      = std::to_string(gemm_args.bp.vector_len);
   if (experiment_name) algo_name = std::string(experiment_name);
   if (options.blas_args.use_auto) ts = vlen = "Kokkos::AUTO";
 
@@ -241,26 +252,22 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   double gflops;
   double average_time = time_in_seconds / options.n;
 
-  flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m, gemm_args.dims.a.n,
-						 gemm_args.dims.b.n);
+  flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m,
+                                                 gemm_args.dims.a.n,
+                                                 gemm_args.dims.b.n);
 
   gflops = flops / 1e9;
 
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
                  << options.blas_args.gemm.alpha << ","
-                 << options.blas_args.gemm.beta << ","
-                 << ts << ","
-                 << vlen << ","
-                 << loop_e_str[options.loop] << "," 
-		 << __gemm_output_dim_string(options, gemm_args.dims.a) << ","
-		 << __gemm_output_dim_string(options, gemm_args.dims.b) << ","
-		 << __gemm_output_dim_string(options, gemm_args.dims.c) << ","
-		 << options.warm_up_n << "," << options.n << ","
-                 << time_in_seconds << ","
-                 << time_in_seconds / options.n << ","
-                 << flops << ","
-                 << gflops / average_time
-                 << std::endl;
+                 << options.blas_args.gemm.beta << "," << ts << "," << vlen
+                 << "," << loop_e_str[options.loop] << ","
+                 << __gemm_output_dim_string(options, gemm_args.dims.a) << ","
+                 << __gemm_output_dim_string(options, gemm_args.dims.b) << ","
+                 << __gemm_output_dim_string(options, gemm_args.dims.c) << ","
+                 << options.warm_up_n << "," << options.n << ","
+                 << time_in_seconds << "," << time_in_seconds / options.n << ","
+                 << flops << "," << gflops / average_time << std::endl;
 }
 
 static void __print_gemm_perf_test_options(options_t options) {
@@ -435,12 +442,12 @@ void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
-template <class TransAType, class TransBType,
-          class BlockingType>
+template <class TransAType, class TransBType, class BlockingType>
 struct parallel_batched_gemm_range_policy {
   gemm_args_t gemm_args_;
 
-  parallel_batched_gemm_range_policy(gemm_args_t gemm_args) : gemm_args_(gemm_args) {}
+  parallel_batched_gemm_range_policy(gemm_args_t gemm_args)
+      : gemm_args_(gemm_args) {}
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const SerialTag &, const int &i) const {
@@ -470,27 +477,27 @@ struct parallel_batched_gemm_range_policy {
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamBatchDim3Tag &, const int &i) const {
     Kokkos::abort("TeamBatchDim3Tag not supported using RangePolicy.");
-      }
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamVectorTag &, const int &i) const {
     Kokkos::abort("TeamVectorTag not supported using RangePolicy.");
-      }
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamVectorBatchDim3Tag &, const int &i) const {
     Kokkos::abort("TeamVectorBatchDim3Tag not supported using RangePolicy.");
-      }
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamSimdTag &, const int &i) const {
     Kokkos::abort("TeamSimdTag not supported using RangePolicy.");
-      }
+  }
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamSimdBatchDim4Tag &, const int &i) const {
     Kokkos::abort("TeamSimdBatchDim4Tag not supported using RangePolicy.");
-      }
+  }
 };
 
 template <class MemberType, class TransAType, class TransBType,
@@ -564,7 +571,8 @@ struct parallel_batched_gemm {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamVectorBatchDim3Tag &, const MemberType &member) const {
+  void operator()(const TeamVectorBatchDim3Tag &,
+                  const MemberType &member) const {
     auto team_idx = member.league_rank();
     auto svA =
         Kokkos::subview(gemm_args_.A, Kokkos::ALL(), Kokkos::ALL(), team_idx);
@@ -583,31 +591,49 @@ struct parallel_batched_gemm {
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamSimdTag &, const MemberType &member) const {
     auto i = member.league_rank();
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)),[&](const int &vector_lane) {
-      auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
-      auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
-      auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(), Kokkos::ALL(), vector_lane);
-
-      KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
-   });
+    Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(3)),
+        [&](const int &vector_lane) {
+          auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, i, Kokkos::ALL(),
+                                     Kokkos::ALL(), vector_lane);
+          auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, i, Kokkos::ALL(),
+                                     Kokkos::ALL(), vector_lane);
+          auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, i, Kokkos::ALL(),
+                                     Kokkos::ALL(), vector_lane);
+
+          KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode,
+                              BlockingType>::invoke(member, gemm_args_.alpha,
+                                                    svA, svB, gemm_args_.beta,
+                                                    svC);
+        });
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const TeamSimdBatchDim4Tag &, const MemberType &member) const {
+  void operator()(const TeamSimdBatchDim4Tag &,
+                  const MemberType &member) const {
     auto i = member.league_rank();
-    Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, simd_vector_size),[&](const int &vector_lane) {
-      auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-      auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-      auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane, Kokkos::ALL(), Kokkos::ALL(), i);
-
-      KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode, BlockingType>::invoke(member, gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
-   });
+    Kokkos::parallel_for(
+        Kokkos::ThreadVectorRange(member, simd_vector_size),
+        [&](const int &vector_lane) {
+          auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane,
+                                     Kokkos::ALL(), Kokkos::ALL(), i);
+          auto svB = Kokkos::subview(gemm_args_.Bv.ivec_4d, vector_lane,
+                                     Kokkos::ALL(), Kokkos::ALL(), i);
+          auto svC = Kokkos::subview(gemm_args_.Cv.ivec_4d, vector_lane,
+                                     Kokkos::ALL(), Kokkos::ALL(), i);
+
+          KokkosBatched::Gemm<MemberType, TransAType, TransBType, AlgoMode,
+                              BlockingType>::invoke(member, gemm_args_.alpha,
+                                                    svA, svB, gemm_args_.beta,
+                                                    svC);
+        });
   }
 };
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,
           class device_type>
-void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_args_t gemm_args) {
+void __do_gemm_parallel_batched_template_range_policy(options_t options,
+                                                      gemm_args_t gemm_args) {
   using execution_space = typename device_type::execution_space;
   using policy_type     = Kokkos::RangePolicy<AlgoTag, execution_space>;
   using functor_type =
@@ -623,16 +649,16 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options, gemm_ar
 
   for (uint32_t i = 0; i < warm_up_n; i++) {
     Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                        policy_type(0, options.start.c.k),
-                        parallel_batched_gemm_functor);
+                         policy_type(0, options.start.c.k),
+                         parallel_batched_gemm_functor);
     Kokkos::fence();
   }
 
   timer.reset();
   for (uint32_t i = 0; i < n; i++) {
     Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                        policy_type(0, options.start.c.k),
-                        parallel_batched_gemm_functor);
+                         policy_type(0, options.start.c.k),
+                         parallel_batched_gemm_functor);
     Kokkos::fence();
   }
 
@@ -649,22 +675,30 @@ void __do_gemm_parallel_batched_template(options_t options,
   using policy_type     = Kokkos::TeamPolicy<AlgoTag, execution_space>;
   using member_type     = typename policy_type::member_type;
   using functor_type =
-      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType, algo_mode>;
+      parallel_batched_gemm<member_type, TransAType, TransBType, BlockingType,
+                            algo_mode>;
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
   auto league_size   = options.start.c.k;
-  auto team_size  = gemm_args.bp.team_size;
-  auto vector_len = gemm_args.bp.vector_len;
+  auto team_size     = gemm_args.bp.team_size;
+  auto vector_len    = gemm_args.bp.vector_len;
   Kokkos::Timer timer;
 
-  if (std::is_same<AlgoTag, SerialTag>::value || std::is_same<AlgoTag, SerialBatchDim3Tag>::value) {
-    return __do_gemm_parallel_batched_template_range_policy<TransAType, TransBType, BlockingType, AlgoTag, device_type>(options, gemm_args);
+  if (std::is_same<AlgoTag, SerialTag>::value ||
+      std::is_same<AlgoTag, SerialBatchDim3Tag>::value) {
+    return __do_gemm_parallel_batched_template_range_policy<
+        TransAType, TransBType, BlockingType, AlgoTag, device_type>(options,
+                                                                    gemm_args);
   }
 
-  if (std::is_same<AlgoTag, TeamSimdTag>::value || std::is_same<AlgoTag, TeamSimdBatchDim4Tag>::value) {
-    league_size = options.blas_args.batch_size_last_dim ? gemm_args.Cv.ivec_4d.extent(3) : gemm_args.Cv.ivec_4d.extent(0);
-    vector_len = simd_vector_size/simd_internal_vector_size; // TODO: use bp.vector_len?
+  if (std::is_same<AlgoTag, TeamSimdTag>::value ||
+      std::is_same<AlgoTag, TeamSimdBatchDim4Tag>::value) {
+    league_size = options.blas_args.batch_size_last_dim
+                      ? gemm_args.Cv.ivec_4d.extent(3)
+                      : gemm_args.Cv.ivec_4d.extent(0);
+    vector_len  = simd_vector_size /
+                 simd_internal_vector_size;  // TODO: use bp.vector_len?
   }
 
   STATUS;
@@ -674,31 +708,31 @@ void __do_gemm_parallel_batched_template(options_t options,
   if (options.blas_args.use_auto) {
     for (uint32_t i = 0; i < warm_up_n; i++) {
       Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                          policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
-                          parallel_batched_gemm_functor);
+                           policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
+                           parallel_batched_gemm_functor);
       Kokkos::fence();
     }
 
     timer.reset();
     for (uint32_t i = 0; i < n; i++) {
       Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                          policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
-                          parallel_batched_gemm_functor);
+                           policy_type(league_size, Kokkos::AUTO, Kokkos::AUTO),
+                           parallel_batched_gemm_functor);
       Kokkos::fence();
     }
   } else {
     for (uint32_t i = 0; i < warm_up_n; i++) {
       Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                          policy_type(league_size, team_size, vector_len),
-                          parallel_batched_gemm_functor);
+                           policy_type(league_size, team_size, vector_len),
+                           parallel_batched_gemm_functor);
       Kokkos::fence();
     }
 
     timer.reset();
     for (uint32_t i = 0; i < n; i++) {
       Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                          policy_type(league_size, team_size, vector_len),
-                          parallel_batched_gemm_functor);
+                           policy_type(league_size, team_size, vector_len),
+                           parallel_batched_gemm_functor);
       Kokkos::fence();
     }
   }
@@ -708,7 +742,8 @@ void __do_gemm_parallel_batched_template(options_t options,
   return;
 }
 
-template <class algo_tag, class blocking_type, class device_type, class algo_mode = void>
+template <class algo_tag, class blocking_type, class device_type,
+          class algo_mode = void>
 void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) {
   char a  = gemm_args.transA;
   char b  = gemm_args.transB;
@@ -720,19 +755,23 @@ void __do_gemm_parallel_batched(options_t options, gemm_args_t gemm_args) {
 
   if (a == 'N' && b == 'N') {
     __do_gemm_parallel_batched_template<N, N, blocking_type, algo_tag,
-                                        device_type, algo_mode>(options, gemm_args);
+                                        device_type, algo_mode>(options,
+                                                                gemm_args);
   } else if (a == 'N' && b == 'T') {
     __do_gemm_parallel_batched_template<N, T, blocking_type, algo_tag,
-                                        device_type, algo_mode>(options, gemm_args);
+                                        device_type, algo_mode>(options,
+                                                                gemm_args);
     //} else if (a == 'N' && b == 'C') {
     //  __do_gemm_parallel_batched_template<N, C, blocking_type, algo_tag,
     //  device_type>(options, gemm_args);
   } else if (a == 'T' && b == 'N') {
     __do_gemm_parallel_batched_template<T, N, blocking_type, algo_tag,
-                                        device_type, algo_mode>(options, gemm_args);
+                                        device_type, algo_mode>(options,
+                                                                gemm_args);
   } else if (a == 'T' && b == 'T') {
     __do_gemm_parallel_batched_template<T, T, blocking_type, algo_tag,
-                                        device_type, algo_mode>(options, gemm_args);
+                                        device_type, algo_mode>(options,
+                                                                gemm_args);
     //} else if (a == 'T' && b == 'C') {
     //  __do_gemm_parallel_batched_template<T, C, blocking_type, algo_tag,
     //  device_type>(options, gemm_args);
@@ -1093,7 +1132,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) {
   using scalar_type = typename view_type_3d::value_type;
   constexpr int vl =
       KokkosBatched::DefaultVectorLength<scalar_type, execution_space>::value;
-  using simd_type = KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, simd_vector_size>;
+  using simd_type =
+      KokkosBatched::Vector<KokkosBatched::SIMD<scalar_type>, simd_vector_size>;
   using simd_view_type =
       Kokkos::View<simd_type ***, default_layout, default_device>;
   using functor_type =
@@ -1118,12 +1158,12 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) {
   // uint64_t seed = Kokkos::Impl::clock_tic();
   // Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
   // Kokkos::fill_random(A, rand_pool,
-  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, simd_type>::max());
-  // Kokkos::fill_random(B, rand_pool,
-  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, simd_type>::max());
-  // Kokkos::fill_random(C, rand_pool,
-  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>, simd_type>::max());
-  // execution_space::fence();
+  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+  // simd_type>::max()); Kokkos::fill_random(B, rand_pool,
+  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+  // simd_type>::max()); Kokkos::fill_random(C, rand_pool,
+  // Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+  // simd_type>::max()); execution_space::fence();
 
   functor_type experiment5_functor(A, B, C, gemm_args);
 
@@ -1151,8 +1191,8 @@ void __do_gemm_parallel_experiment5(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
-template <class MemberType, class SimdViewType, class TransAType, class TransBType,
-          class BlockingType>
+template <class MemberType, class SimdViewType, class TransAType,
+          class TransBType, class BlockingType>
 class parallel_batched_gemm_experiment6 {
  private:
   SimdViewType &A, &B, &C;
@@ -1165,14 +1205,16 @@ class parallel_batched_gemm_experiment6 {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const MemberType &member) const {
-    auto i = member.league_rank();
+    auto i   = member.league_rank();
     auto svA = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
     auto svB = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
     auto svC = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
 
     // Uses two serial for-loops internally
-    KokkosBatched::TeamVectorGemm<MemberType, TransAType, TransBType, BlockingType>::invoke(
-        member, gemm_args.alpha, svA, svB, gemm_args.beta, svC);
+    KokkosBatched::TeamVectorGemm<MemberType, TransAType, TransBType,
+                                  BlockingType>::invoke(member, gemm_args.alpha,
+                                                        svA, svB,
+                                                        gemm_args.beta, svC);
   }
 };
 
@@ -1254,77 +1296,111 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
   STATUS;
 
-  gemm_args.dims = dims;
-  gemm_args.transA        = options.blas_args.gemm.gemm_args.c_str()[0];
-  gemm_args.transB        = options.blas_args.gemm.gemm_args.c_str()[1];
-  if (options.test == BATCHED_TEAM_SIMD || options.test == BATCHED_TEAM_SIMD_BLOCKED) {
+  gemm_args.dims   = dims;
+  gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0];
+  gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1];
+  if (options.test == BATCHED_TEAM_SIMD ||
+      options.test == BATCHED_TEAM_SIMD_BLOCKED) {
     // Calculate the batch size for simd views
-    auto a_simd_batch_size = dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0);
-    auto b_simd_batch_size = dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0);
-    auto c_simd_batch_size = dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0);
+    auto a_simd_batch_size =
+        dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0);
+    auto b_simd_batch_size =
+        dims.b.k / simd_vector_size + (dims.b.k % simd_vector_size > 0);
+    auto c_simd_batch_size =
+        dims.c.k / simd_vector_size + (dims.c.k % simd_vector_size > 0);
 
     // Reference gemm simd arguments for allocating A, B, and C matrices
     gemm_simd_args_t &A = gemm_args.Av, &B = gemm_args.Bv, &C = gemm_args.Cv;
 
     if (options.blas_args.batch_size_last_dim) {
-      // Construct simd matrices with batch_size in the last dimension (better for LayoutLeft views)
-      A.vec_3d = vector_view_type_3d ("A_vector", dims.a.m, dims.a.n, a_simd_batch_size);
-      A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), simd_vector_size, dims.a.m, dims.a.n, a_simd_batch_size);
-      A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.a.m, dims.a.n, a_simd_batch_size);
-
-      B.vec_3d = vector_view_type_3d ("B_vector", dims.b.m, dims.b.n, b_simd_batch_size);
-      B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), simd_vector_size, dims.b.m, dims.b.n, b_simd_batch_size);
-      B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.b.m, dims.b.n, b_simd_batch_size);
-
-      C.vec_3d = vector_view_type_3d ("C_vector", dims.c.m, dims.c.n, c_simd_batch_size);
-      C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), simd_vector_size, dims.c.m, dims.c.n, c_simd_batch_size);
-      C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), simd_vector_size/simd_internal_vector_size, dims.c.m, dims.c.n, c_simd_batch_size);
+      // Construct simd matrices with batch_size in the last dimension (better
+      // for LayoutLeft views)
+      A.vec_3d  = vector_view_type_3d("A_vector", dims.a.m, dims.a.n,
+                                     a_simd_batch_size);
+      A.mat_4d  = view_type_4d((scalar_type *)A.vec_3d.data(), simd_vector_size,
+                              dims.a.m, dims.a.n, a_simd_batch_size);
+      A.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)A.mat_4d.data(),
+          simd_vector_size / simd_internal_vector_size, dims.a.m, dims.a.n,
+          a_simd_batch_size);
+
+      B.vec_3d  = vector_view_type_3d("B_vector", dims.b.m, dims.b.n,
+                                     b_simd_batch_size);
+      B.mat_4d  = view_type_4d((scalar_type *)B.vec_3d.data(), simd_vector_size,
+                              dims.b.m, dims.b.n, b_simd_batch_size);
+      B.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)B.mat_4d.data(),
+          simd_vector_size / simd_internal_vector_size, dims.b.m, dims.b.n,
+          b_simd_batch_size);
+
+      C.vec_3d  = vector_view_type_3d("C_vector", dims.c.m, dims.c.n,
+                                     c_simd_batch_size);
+      C.mat_4d  = view_type_4d((scalar_type *)C.vec_3d.data(), simd_vector_size,
+                              dims.c.m, dims.c.n, c_simd_batch_size);
+      C.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)C.mat_4d.data(),
+          simd_vector_size / simd_internal_vector_size, dims.c.m, dims.c.n,
+          c_simd_batch_size);
 
     } else {
-      // Construct simd matrices with batch_size in the first dimension (better for LayoutRight views)
-      A.vec_3d = vector_view_type_3d ("A_vector", a_simd_batch_size, dims.a.m, dims.a.n);
-      A.mat_4d = view_type_4d ((scalar_type *)A.vec_3d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size);
-      A.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m, dims.a.n, simd_vector_size/simd_internal_vector_size);
-
-      B.vec_3d = vector_view_type_3d ("B_vector", b_simd_batch_size, dims.b.m, dims.b.n);
-      B.mat_4d = view_type_4d ((scalar_type *)B.vec_3d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size);
-      B.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m, dims.b.n, simd_vector_size/simd_internal_vector_size);
-
-      C.vec_3d = vector_view_type_3d ("C_vector", c_simd_batch_size, dims.c.m, dims.c.n);
-      C.mat_4d = view_type_4d ((scalar_type *)C.vec_3d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size);
-      C.ivec_4d = internal_vector_view_type_4d ((internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m, dims.c.n, simd_vector_size/simd_internal_vector_size);
+      // Construct simd matrices with batch_size in the first dimension (better
+      // for LayoutRight views)
+      A.vec_3d = vector_view_type_3d("A_vector", a_simd_batch_size, dims.a.m,
+                                     dims.a.n);
+      A.mat_4d = view_type_4d((scalar_type *)A.vec_3d.data(), a_simd_batch_size,
+                              dims.a.m, dims.a.n, simd_vector_size);
+      A.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)A.mat_4d.data(), a_simd_batch_size, dims.a.m,
+          dims.a.n, simd_vector_size / simd_internal_vector_size);
+
+      B.vec_3d = vector_view_type_3d("B_vector", b_simd_batch_size, dims.b.m,
+                                     dims.b.n);
+      B.mat_4d = view_type_4d((scalar_type *)B.vec_3d.data(), b_simd_batch_size,
+                              dims.b.m, dims.b.n, simd_vector_size);
+      B.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)B.mat_4d.data(), b_simd_batch_size, dims.b.m,
+          dims.b.n, simd_vector_size / simd_internal_vector_size);
+
+      C.vec_3d = vector_view_type_3d("C_vector", c_simd_batch_size, dims.c.m,
+                                     dims.c.n);
+      C.mat_4d = view_type_4d((scalar_type *)C.vec_3d.data(), c_simd_batch_size,
+                              dims.c.m, dims.c.n, simd_vector_size);
+      C.ivec_4d = internal_vector_view_type_4d(
+          (internal_vector_type *)C.mat_4d.data(), c_simd_batch_size, dims.c.m,
+          dims.c.n, simd_vector_size / simd_internal_vector_size);
     }
 
-    // Use the non-simd 4-rank view type to randomly populate the gemm simd arguments
+    // Use the non-simd 4-rank view type to randomly populate the gemm simd
+    // arguments
     Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
     Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
     Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
   } else {
     if (options.blas_args.batch_size_last_dim) {
-      gemm_args.A             = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k);
-      gemm_args.B             = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k);
-      gemm_args.C             = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k);
+      gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k);
+      gemm_args.B = vtb("gemm_args.B", dims.b.m, dims.b.n, dims.b.k);
+      gemm_args.C = vtc("gemm_args.C", dims.c.m, dims.c.n, dims.c.k);
     } else {
-      gemm_args.A             = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n);
-      gemm_args.B             = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n);
-      gemm_args.C             = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n);
+      gemm_args.A = vta("gemm_args.A", dims.a.k, dims.a.m, dims.a.n);
+      gemm_args.B = vtb("gemm_args.B", dims.b.k, dims.b.m, dims.b.n);
+      gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n);
     }
 
     Kokkos::fill_random(gemm_args.A, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
     Kokkos::fill_random(gemm_args.B, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
     Kokkos::fill_random(gemm_args.C, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                    scalar_type>::max());
+                                     scalar_type>::max());
   }
   gemm_args.alpha         = options.blas_args.gemm.alpha;
   gemm_args.beta          = options.blas_args.gemm.beta;
@@ -1344,7 +1420,7 @@ void __do_loop_and_invoke(options_t options,
   __print_gemm_perf_test_options(options);
   std::cout << "SCALAR:" << typeid(default_scalar).name()
             << ", LAYOUT:" << typeid(default_layout).name()
-            << ", DEVICE:" << typeid(default_device).name() 
+            << ", DEVICE:" << typeid(default_device).name()
             << ", SPACE:" << typeid(memory_space).name() << std::endl;
 
   options.out[0] << gemm_csv_header_str << std::endl;
@@ -1394,8 +1470,9 @@ void do_gemm_serial_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
-      options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Unblocked,
-                                          default_device>);
+        options,
+        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Unblocked,
+                                   default_device>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Unblocked,
@@ -1407,8 +1484,9 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Blocked,
-                                            default_device>);
+        options,
+        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Blocked,
+                                   default_device>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::Blocked,
@@ -1419,9 +1497,10 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) {
 void do_gemm_serial_simd_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
-      __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
-                                            default_device, Mode::Serial>);
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
+                                   default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
@@ -1432,9 +1511,10 @@ void do_gemm_serial_simd_batched_parallel(options_t options) {
 void do_gemm_serial_simd_batched_blocked_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
-      __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
-                                            default_device, Mode::Serial>);
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
+                                   default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
@@ -1444,29 +1524,36 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) {
 
 void do_gemm_serial_batched_compact_mkl_parallel(options_t options) {
   STATUS;
-#if                                                            \
-  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) &&               \
-  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) &&       \
-  defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
+#if defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__) &&         \
+    defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__) && \
+    defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::CompactMKL,
-                                            default_device>);
+        options,
+        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::CompactMKL,
+                                   default_device>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::CompactMKL,
                                             default_device>);
 #else
-  #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__)
-    std::cerr << std::string(__func__)
-              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined." << std::endl;
-  #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__)
-    std::cerr << std::string(__func__)
-              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is undefined." << std::endl;
-  #elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
-    std::cerr << std::string(__func__)
-              << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ is undefined." << std::endl;
-  #endif
+#if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__)
+  std::cerr
+      << std::string(__func__)
+      << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL__ is undefined."
+      << std::endl;
+#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__)
+  std::cerr << std::string(__func__)
+            << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_BATCHED__ is "
+               "undefined."
+            << std::endl;
+#elif !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__)
+  std::cerr
+      << std::string(__func__)
+      << " disabled since __KOKKOSBATCHED_ENABLE_INTEL_MKL_COMPACT_BATCHED__ "
+         "is undefined."
+      << std::endl;
+#endif
 #endif
   return;
 }
@@ -1475,8 +1562,9 @@ void do_gemm_team_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Unblocked,
-                                            default_device>);
+        options,
+        __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Unblocked,
+                                   default_device>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Unblocked,
@@ -1489,33 +1577,37 @@ void do_gemm_team_batched_blocked_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Blocked, default_device>);
+        __do_gemm_parallel_batched<TeamBatchDim3Tag, Algo::Gemm::Blocked,
+                                   default_device>);
   else
     __do_loop_and_invoke(
-        options,
-        __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Blocked, default_device>);
+        options, __do_gemm_parallel_batched<TeamTag, Algo::Gemm::Blocked,
+                                            default_device>);
   return;
 }
 
 void do_gemm_team_vector_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
-      __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamVectorBatchDim3Tag, Algo::Gemm::Unblocked,
-                                            default_device>);
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamVectorBatchDim3Tag,
+                                   Algo::Gemm::Unblocked, default_device>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamVectorTag, Algo::Gemm::Unblocked,
-                                            default_device>);
+        options,
+        __do_gemm_parallel_batched<TeamVectorTag, Algo::Gemm::Unblocked,
+                                   default_device>);
   return;
 }
 
 void do_gemm_team_simd_batched_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
-      __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
-                                            default_device, Mode::Team>);
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
+                                   default_device, Mode::Team>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
@@ -1526,9 +1618,10 @@ void do_gemm_team_simd_batched_parallel(options_t options) {
 void do_gemm_team_simd_batched_blocked_parallel(options_t options) {
   STATUS;
   if (options.blas_args.batch_size_last_dim)
-      __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
-                                            default_device, Mode::Team>);
+    __do_loop_and_invoke(
+        options,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
+                                   default_device, Mode::Team>);
   else
     __do_loop_and_invoke(
         options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
@@ -1536,7 +1629,6 @@ void do_gemm_team_simd_batched_blocked_parallel(options_t options) {
   return;
 }
 
-
 // Blocked algo not yet implemented for TeamVectorGemm.
 /* void do_gemm_team_vector_batched_blocked_parallel(options_t options) {
   STATUS;
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 17aac3d526..daf68180c2 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -107,8 +107,10 @@ static void __print_help_blas3_perf_test() {
 
   printf("\t-p, --gemm_scalars=ALPHA_SCALAR_VALUE,BETA_SCALAR_VALUE\n");
   printf("\t\tGEMM alpha and beta values.\n");
-  printf("\t\t\tThe value of alpha and beta in floating point. (default: %lf,%lf)\n",
-         DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA);
+  printf(
+      "\t\t\tThe value of alpha and beta in floating point. (default: "
+      "%lf,%lf)\n",
+      DEFAULT_GEMM_ALPHA, DEFAULT_GEMM_BETA);
 
   printf("\t-z, --team_size=SIZE\n");
   printf("\t\tKokkos team size.\n");
@@ -121,19 +123,24 @@ static void __print_help_blas3_perf_test() {
          DEFAULT_VECTOR_LEN);
 
   printf("\t-u, --use_auto={0,1}\n");
-  printf("\t\tWhether to use Kokkos::AUTO for vector_len and team_size (Heirarchical parallelism).\n");
-  printf("\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size will be used. (default: %d)\n",
-         DEFAULT_USE_AUTO);
+  printf(
+      "\t\tWhether to use Kokkos::AUTO for vector_len and team_size "
+      "(Heirarchical parallelism).\n");
+  printf(
+      "\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size "
+      "will be used. (default: %d)\n",
+      DEFAULT_USE_AUTO);
 
   printf("\t-k, --batch_size=LEN\n");
   printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n");
-  printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
-         DEFAULT_K);
+  printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K);
 
   printf("\t-d, --batch_size_last_dim={0,1}\n");
   printf("\t\tHow to allocate the batch_size in the matrices.\n");
-  printf("\t\t\t1 make the batch_size the last dimension, otherwise batch_size is the first dimension (default: %d)\n",
-         DEFAULT_BATCH_SIZE_LAST_DIM);
+  printf(
+      "\t\t\t1 make the batch_size the last dimension, otherwise batch_size is "
+      "the first dimension (default: %d)\n",
+      DEFAULT_BATCH_SIZE_LAST_DIM);
 
   printf("\t-l, --loop_type=OPTION\n");
   printf("\t\tLoop selection.\n");
@@ -222,34 +229,34 @@ int main(int argc, char **argv) {
   };
 
   /* set default options */
-  options.test                 = DEFAULT_TEST;
-  options.loop                 = DEFAULT_LOOP;
-  options.start.a.k            = DEFAULT_K;
-  options.start.a.m            = DEFAULT_MATRIX_START;
-  options.start.a.n            = DEFAULT_MATRIX_START;
-  options.stop.a.k             = DEFAULT_K;
-  options.stop.a.m             = DEFAULT_MATRIX_STOP;
-  options.stop.a.n             = DEFAULT_MATRIX_STOP;
-  options.start.b.k            = DEFAULT_K;
-  options.start.b.m            = DEFAULT_MATRIX_START;
-  options.start.b.n            = DEFAULT_MATRIX_START;
-  options.stop.b.k             = DEFAULT_K;
-  options.stop.b.m             = DEFAULT_MATRIX_STOP;
-  options.stop.b.n             = DEFAULT_MATRIX_STOP;
-  options.start.c.k            = DEFAULT_K;
-  options.start.c.m            = DEFAULT_MATRIX_START;
-  options.start.c.n            = DEFAULT_MATRIX_START;
-  options.stop.c.k             = DEFAULT_K;
-  options.stop.c.m             = DEFAULT_MATRIX_STOP;
-  options.stop.c.n             = DEFAULT_MATRIX_STOP;
-  options.step                 = DEFAULT_STEP;
-  options.warm_up_n            = DEFAULT_WARM_UP_N;
-  options.n                    = DEFAULT_N;
-  options.out                  = DEFAULT_OUT;
-  options.blas_routines        = std::string(DEFAULT_BLAS_ROUTINES);
-  options.blas_args.team_size  = DEFAULT_TEAM_SIZE;
-  options.blas_args.vector_len = DEFAULT_VECTOR_LEN;
-  options.blas_args.use_auto   = DEFAULT_USE_AUTO;
+  options.test                          = DEFAULT_TEST;
+  options.loop                          = DEFAULT_LOOP;
+  options.start.a.k                     = DEFAULT_K;
+  options.start.a.m                     = DEFAULT_MATRIX_START;
+  options.start.a.n                     = DEFAULT_MATRIX_START;
+  options.stop.a.k                      = DEFAULT_K;
+  options.stop.a.m                      = DEFAULT_MATRIX_STOP;
+  options.stop.a.n                      = DEFAULT_MATRIX_STOP;
+  options.start.b.k                     = DEFAULT_K;
+  options.start.b.m                     = DEFAULT_MATRIX_START;
+  options.start.b.n                     = DEFAULT_MATRIX_START;
+  options.stop.b.k                      = DEFAULT_K;
+  options.stop.b.m                      = DEFAULT_MATRIX_STOP;
+  options.stop.b.n                      = DEFAULT_MATRIX_STOP;
+  options.start.c.k                     = DEFAULT_K;
+  options.start.c.m                     = DEFAULT_MATRIX_START;
+  options.start.c.n                     = DEFAULT_MATRIX_START;
+  options.stop.c.k                      = DEFAULT_K;
+  options.stop.c.m                      = DEFAULT_MATRIX_STOP;
+  options.stop.c.n                      = DEFAULT_MATRIX_STOP;
+  options.step                          = DEFAULT_STEP;
+  options.warm_up_n                     = DEFAULT_WARM_UP_N;
+  options.n                             = DEFAULT_N;
+  options.out                           = DEFAULT_OUT;
+  options.blas_routines                 = std::string(DEFAULT_BLAS_ROUTINES);
+  options.blas_args.team_size           = DEFAULT_TEAM_SIZE;
+  options.blas_args.vector_len          = DEFAULT_VECTOR_LEN;
+  options.blas_args.use_auto            = DEFAULT_USE_AUTO;
   options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM;
 
   options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS;
@@ -295,7 +302,7 @@ int main(int argc, char **argv) {
           __blas3_perf_test_input_error(argv, ret, optarg);
 
         options.blas_args.gemm.alpha = static_cast<default_scalar>(alpha);
-        options.blas_args.gemm.beta = static_cast<default_scalar>(beta);
+        options.blas_args.gemm.beta  = static_cast<default_scalar>(beta);
         break;
       case 'a':
         // printf("optarg=%s. %d\n", optarg, strncasecmp(optarg, "blas", 4));
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index a313eabbaf..f84479d26e 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -78,34 +78,36 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
  * assumes KokkosBatched::SerialTrmm is being used. Since the dot products
  * do a multiply and add we can calculate the flops for any element in the last
  * column of the LHS to be 2*columns_LHS, any element in the last-1 column of
- * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the LHS
- * giving us this flop count:
- *  flops = columns_LHS * (columns_LHS + 1)
- *  flops = (flops / 2) * 2
- *  flops = flops * rows_LHS
+ * the LHS to be 2*(columns_LHS-1), and so on. We do this for every row of the
+ * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1);
+ * flops = (flops / 2) * 2; flops = flops * rows_LHS
  */
-static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
+                                       int a_n) {
   int flops;
 
   if (side == 'L' || side == 'l') {
-      flops = (b_m * (b_m + 1)) * b_n;
+    flops = (b_m * (b_m + 1)) * b_n;
   } else {
-      flops = (b_n * (b_n + 1)) * b_m;
+    flops = (b_n * (b_n + 1)) * b_m;
   }
 
   if (std::is_same<double, default_scalar>::value ||
-        std::is_same<float, default_scalar>::value ||
-        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
-      return flops;
+      std::is_same<float, default_scalar>::value ||
+      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+    return flops;
 
   // Account for 6 additional flops when complex numbers are used.
   // Above we have counted 1 flop for each add and 1 flop for each multiply.
-  // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+  // For complex, we need to count 2 flops for each add and 6 flops for each
+  // multiply.
   return flops * 4;
 }
 
-// Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n) {
+// Flop count formula from lapack working note 41:
+// http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
+static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m,
+                                  int a_n) {
   int flops;
 
   if (side == 'L' || side == 'l') {
@@ -115,13 +117,14 @@ static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m, int a_n)
   }
 
   if (std::is_same<double, default_scalar>::value ||
-        std::is_same<float, default_scalar>::value ||
-        std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
-      return flops;
+      std::is_same<float, default_scalar>::value ||
+      std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
+    return flops;
 
   // Account for 6 additional flops when complex numbers are used.
   // Above we have counted 1 flop for each add and 1 flop for each multiply.
-  // For complex, we need to count 2 flops for each add and 6 flops for each multiply.
+  // For complex, we need to count 2 flops for each add and 6 flops for each
+  // multiply.
   return flops * 4;
 }
 
@@ -136,17 +139,21 @@ typedef struct trmm_args trmm_args_t;
 
 static std::string trmm_csv_header_str =
     "algorithm,side-uplo-trans-diag,alpha,loop_type,A_dims,B_dims,warm_up_n,"
-    "iter,total_time(s),average_time(s),FLOPS,GFLOP/average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)";
+    "iter,total_time(s),average_time(s),FLOPS,GFLOP/"
+    "average_time(s),min_achieved_bandwidth(GB/s),max_achieved_bandwidth(GB/s)";
 
 /*************************** Internal helper fns **************************/
 static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                                   double time_in_seconds) {
-  double flops = trmm_args.A.extent(0) * trmm_flop_count(trmm_args.side,
-                                                         trmm_args.B.extent(1), trmm_args.B.extent(2),
-                                                         trmm_args.A.extent(1), trmm_args.A.extent(2));
-  double gflops = flops / 1e9;
-  double average_time = time_in_seconds / options.n;
-  double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * sizeof(default_scalar)) / 1e9;
+  double flops = trmm_args.A.extent(0) *
+                 trmm_flop_count(trmm_args.side, trmm_args.B.extent(1),
+                                 trmm_args.B.extent(2), trmm_args.A.extent(1),
+                                 trmm_args.A.extent(2));
+  double gflops           = flops / 1e9;
+  double average_time     = time_in_seconds / options.n;
+  double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) *
+                             trmm_args.B.extent(2) * sizeof(default_scalar)) /
+                            1e9;
   double min_memory_transactions, max_memory_transactions;
 
   // Assuming infinite cache size
@@ -155,26 +162,29 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
   min_memory_transactions = 3;
 
   // Assuming no register or real caching
-  // We have to go out to memory for every element we read from A and B as well as
-  // every element we write to B.
-  // We use the trmm flops from lapack note 41 and multiple by 3/2 to account for the
-  // write to B since this flop count is for one multiply and one add.
+  // We have to go out to memory for every element we read from A and B as well
+  // as every element we write to B. We use the trmm flops from lapack note 41
+  // and multiply by 3/2 to account for the write to B since this flop count is
+  // for one multiply and one add.
   if (trmm_args.side == 'l' || trmm_args.side == 'L')
-    max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) * trmm_args.B.extent(2) * (3./2.);
+    max_memory_transactions = trmm_args.B.extent(1) * trmm_args.B.extent(1) *
+                              trmm_args.B.extent(2) * (3. / 2.);
   else
-    max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) * trmm_args.B.extent(1) * (3./2.);
+    max_memory_transactions = trmm_args.B.extent(2) * trmm_args.B.extent(2) *
+                              trmm_args.B.extent(1) * (3. / 2.);
 
   options.out[0] << test_e_str[options.test] << ","
                  << options.blas_args.trmm.trmm_args << ","
                  << options.blas_args.trmm.alpha << ","
-                 << loop_e_str[options.loop] << "," << trmm_args.A.extent(0) << "x" << trmm_args.A.extent(1)
-                 << "x" << trmm_args.A.extent(2) << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1)
+                 << loop_e_str[options.loop] << "," << trmm_args.A.extent(0)
+                 << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2)
+                 << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1)
                  << "x" << trmm_args.B.extent(2) << "," << options.warm_up_n
                  << "," << options.n << "," << time_in_seconds << ","
-                 << average_time << ","
-                 << flops << ","
-                 << gflops / average_time << ","
-                 << (gbytes_in_matrix * min_memory_transactions) / average_time << ","
+                 << average_time << "," << flops << "," << gflops / average_time
+                 << ","
+                 << (gbytes_in_matrix * min_memory_transactions) / average_time
+                 << ","
                  << (gbytes_in_matrix * max_memory_transactions) / average_time
                  << std::endl;
 }
@@ -218,7 +228,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) {
       auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
       KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
-                      &trmm_args.diag, trmm_args.alpha, A, B);
+                       &trmm_args.diag, trmm_args.alpha, A, B);
     }
     // Fence after submitting each batch operation
     Kokkos::fence();
@@ -231,7 +241,7 @@ void __do_trmm_serial_blas(options_t options, trmm_args_t trmm_args) {
       auto B = Kokkos::subview(trmm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
 
       KokkosBlas::trmm(&trmm_args.side, &trmm_args.uplo, &trmm_args.trans,
-                      &trmm_args.diag, trmm_args.alpha, A, B);
+                       &trmm_args.diag, trmm_args.alpha, A, B);
     }
     // Fence after submitting each batch operation
     Kokkos::fence();
@@ -412,18 +422,20 @@ void __do_trmm_parallel_blas(options_t options, trmm_args_t trmm_args) {
   STATUS;
 
   for (uint32_t j = 0; j < warm_up_n; ++j) {
-    Kokkos::parallel_for("parallelBlasWarmUpLoopTrmm",
-                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
-                        parallel_blas_trmm_functor);
+    Kokkos::parallel_for(
+        "parallelBlasWarmUpLoopTrmm",
+        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+        parallel_blas_trmm_functor);
     // Fence after each batch operation
     Kokkos::fence();
   }
 
   timer.reset();
   for (uint32_t j = 0; j < n; ++j) {
-    Kokkos::parallel_for("parallelBlasTimedLoopTrmm",
-                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
-                        parallel_blas_trmm_functor);
+    Kokkos::parallel_for(
+        "parallelBlasTimedLoopTrmm",
+        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+        parallel_blas_trmm_functor);
     // Fence after each batch operation
     Kokkos::fence();
   }
@@ -470,18 +482,20 @@ void __do_trmm_parallel_batched_template(options_t options,
   STATUS;
 
   for (uint32_t j = 0; j < warm_up_n; ++j) {
-    Kokkos::parallel_for("parallelBatchedWarmUpLoopTrmm",
-                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
-                        parallel_batched_trmm_functor);
+    Kokkos::parallel_for(
+        "parallelBatchedWarmUpLoopTrmm",
+        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+        parallel_batched_trmm_functor);
     // Fence after each batch operation
     Kokkos::fence();
   }
 
   timer.reset();
   for (uint32_t j = 0; j < n; ++j) {
-    Kokkos::parallel_for("parallelBatchedTimedLoopTrmm",
-                        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
-                        parallel_batched_trmm_functor);
+    Kokkos::parallel_for(
+        "parallelBatchedTimedLoopTrmm",
+        Kokkos::RangePolicy<execution_space>(0, options.start.a.k),
+        parallel_batched_trmm_functor);
     // Fence after each batch operation
     Kokkos::fence();
   }

From e5fb960c340f628242d0266f8dd9f03608d715c2 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 2 Mar 2021 16:20:25 -0700
Subject: [PATCH 23/47] perf_test/blas/blas3: Allocate simd views

  - Allocate simd views for serial simd tests.
---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 3e55a85799..74f0771062 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1300,7 +1300,9 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   gemm_args.transA = options.blas_args.gemm.gemm_args.c_str()[0];
   gemm_args.transB = options.blas_args.gemm.gemm_args.c_str()[1];
   if (options.test == BATCHED_TEAM_SIMD ||
-      options.test == BATCHED_TEAM_SIMD_BLOCKED) {
+      options.test == BATCHED_TEAM_SIMD_BLOCKED ||
+      options.test == BATCHED_SERIAL_SIMD ||
+      options.test == BATCHED_SERIAL_SIMD_BLOCKED) {
     // Calculate the batch size for simd views
     auto a_simd_batch_size =
         dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0);

From 30d54723e3d926ed3f65a3db521fec1929df27c9 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 3 Mar 2021 08:27:47 -0700
Subject: [PATCH 24/47] perf_test/blas/blas3: Update compact mkl functors

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 ++++++++++++++++---
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 74f0771062..d646653697 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -83,14 +83,14 @@ void do_gemm_experiment_parallel(options_t options);
 
 struct SerialTag {};
 struct SerialBatchDim3Tag {};
+struct SerialSimdTag {};
+struct SerialSimdBatchDim3Tag {};
 struct TeamTag {};
 struct TeamBatchDim3Tag {};
 struct TeamVectorTag {};
 struct TeamVectorBatchDim3Tag {};
 struct TeamSimdTag {};
 struct TeamSimdBatchDim4Tag {};
-// TODO: struct SerialSimdTag {};
-// TODO: struct SerialSimdBatchDim4Tag {};
 struct LayoutLeftTag {};
 struct LayoutRightTag {};
 struct SimdCpuTag {};
@@ -469,6 +469,32 @@ struct parallel_batched_gemm_range_policy {
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
   }
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const SerialSimdTag &, const int &i) const {
+    auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(),
+                                Kokkos::ALL());
+    auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(),
+                                Kokkos::ALL());
+    auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(),
+                                Kokkos::ALL());
+
+    KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
+        gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const SerialSimdBatchDim3Tag &, const int &i) const {
+    auto svA = Kokkos::subview(gemm_args_.Av.vec_3d,
+                                Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d,
+                                Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d,
+                                Kokkos::ALL(), Kokkos::ALL(), i);
+
+    KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
+        gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
+  }
+
   KOKKOS_INLINE_FUNCTION
   void operator()(const TeamTag &, const int &i) const {
     Kokkos::abort("TeamTag not supported using RangePolicy.");
@@ -686,7 +712,9 @@ void __do_gemm_parallel_batched_template(options_t options,
   Kokkos::Timer timer;
 
   if (std::is_same<AlgoTag, SerialTag>::value ||
-      std::is_same<AlgoTag, SerialBatchDim3Tag>::value) {
+      std::is_same<AlgoTag, SerialBatchDim3Tag>::value ||
+      std::is_same<AlgoTag, SerialSimdTag>::value ||
+      std::is_same<AlgoTag, SerialSimdBatchDim3Tag>::value) {
     return __do_gemm_parallel_batched_template_range_policy<
         TransAType, TransBType, BlockingType, AlgoTag, device_type>(options,
                                                                     gemm_args);
@@ -1302,7 +1330,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   if (options.test == BATCHED_TEAM_SIMD ||
       options.test == BATCHED_TEAM_SIMD_BLOCKED ||
       options.test == BATCHED_SERIAL_SIMD ||
-      options.test == BATCHED_SERIAL_SIMD_BLOCKED) {
+      options.test == BATCHED_SERIAL_SIMD_BLOCKED ||
+      options.test == BATCHED_SERIAL_COMPACT_MKL) {
     // Calculate the batch size for simd views
     auto a_simd_batch_size =
         dims.a.k / simd_vector_size + (dims.a.k % simd_vector_size > 0);
@@ -1532,11 +1561,11 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::CompactMKL,
+        __do_gemm_parallel_batched<SerialSimdBatchDim3Tag, Algo::Gemm::CompactMKL,
                                    default_device>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialTag, Algo::Gemm::CompactMKL,
+        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::CompactMKL,
                                             default_device>);
 #else
 #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__)

From 2401e9dbde85183f7655cc955e4375f2f48d34f3 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 3 Mar 2021 09:01:55 -0700
Subject: [PATCH 25/47] perf_test/blas/blas3: Added operators for SerialSimd

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index d646653697..c2f3f58ced 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -654,6 +654,16 @@ struct parallel_batched_gemm {
                                                     svC);
         });
   }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const SerialSimdTag &, const MemberType &member) const {
+    Kokkos::abort("SerialSimdTag not supported using RangePolicy.");
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const {
+    Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy.");
+  }
 };
 
 template <class TransAType, class TransBType, class BlockingType, class AlgoTag,

From 950e05531233dfdcaa09c4c1542aadb53664503a Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 3 Mar 2021 15:34:21 -0700
Subject: [PATCH 26/47] perf_test/blas/blas3: Fix compactMKL batch size

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index c2f3f58ced..b575bc186b 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -677,15 +677,23 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options,
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
+  auto batch_size = options.start.c.k;
   Kokkos::Timer timer;
 
   STATUS;
 
   functor_type parallel_batched_gemm_functor(gemm_args);
 
+  if (std::is_same<AlgoTag, SerialSimdTag>::value ||
+      std::is_same<AlgoTag, SerialSimdBatchDim3Tag>::value) {
+    batch_size = options.blas_args.batch_size_last_dim
+      ? gemm_args.Cv.vec_3d.extent(2)
+      : gemm_args.Cv.vec_3d.extent(0);
+  }	
+
   for (uint32_t i = 0; i < warm_up_n; i++) {
     Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
-                         policy_type(0, options.start.c.k),
+                         policy_type(0, batch_size),
                          parallel_batched_gemm_functor);
     Kokkos::fence();
   }
@@ -693,7 +701,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options,
   timer.reset();
   for (uint32_t i = 0; i < n; i++) {
     Kokkos::parallel_for("parallelBatchedTimedLoopGemm",
-                         policy_type(0, options.start.c.k),
+                         policy_type(0, batch_size),
                          parallel_batched_gemm_functor);
     Kokkos::fence();
   }

From 1eab5b4f04754ddbe18038a13733fec5bbc6176f Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 3 Mar 2021 12:56:01 -0700
Subject: [PATCH 27/47] perf_test/blas: Fix internal function names

---
 perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp  | 6 +++---
 perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index 32626cfba5..e6abeaefc4 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -82,7 +82,7 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
    * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks
    * of the A matrix. a_m subblocks are selected.
    */
-static inline int trtri_impl_flop_count(int a_m, int a_n) {
+static inline int __trtri_impl_flop_count(int a_m, int a_n) {
   int flop_count = 0;
   int flops_per_div, flops_per_mul, flops_per_add;
 
@@ -109,7 +109,7 @@ static inline int trtri_impl_flop_count(int a_m, int a_n) {
 }
 
 // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int trtri_flop_count(int a_m, int a_n) {
+static inline int __trtri_flop_count(int a_m, int a_n) {
   int flops;
   int flops_per_mul;
   int flops_per_add;
@@ -151,7 +151,7 @@ static std::string trtri_csv_header_str =
 /*************************** Internal helper fns **************************/
 static void __trtri_output_csv_row(options_t options, trtri_args_t trtri_args,
                                    double time_in_seconds) {
-  double flops = trtri_args.A.extent(0) * trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2));
+  double flops = trtri_args.A.extent(0) * __trtri_flop_count(trtri_args.A.extent(1), trtri_args.A.extent(2));
   double gflops = flops / 1e9;
   double average_time = time_in_seconds / options.n;
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index f84479d26e..bd6392cf06 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -82,7 +82,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
  * LHS giving us this flop count: flops = columns_LHS * (columns_LHS + 1) flops
  * = (flops / 2) * 2 flops = flops * rows_LHS
  */
-static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
+static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
                                        int a_n) {
   int flops;
 
@@ -106,7 +106,7 @@ static inline int trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
 
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int trmm_flop_count(char side, int b_m, int b_n, int a_m,
+static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m,
                                   int a_n) {
   int flops;
 
@@ -146,7 +146,7 @@ static std::string trmm_csv_header_str =
 static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                                   double time_in_seconds) {
   double flops = trmm_args.A.extent(0) *
-                 trmm_flop_count(trmm_args.side, trmm_args.B.extent(1),
+                 __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1),
                                  trmm_args.B.extent(2), trmm_args.A.extent(1),
                                  trmm_args.A.extent(2));
   double gflops           = flops / 1e9;

From c7e4f5437c31c7f9c52928adcc8d31260b6418ea Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 3 Mar 2021 15:36:31 -0700
Subject: [PATCH 28/47] perf_test/blas/blas3: Apply clang-format

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 44 ++++++++++---------
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp |  8 ++--
 2 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index b575bc186b..d7f2143dc6 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -471,12 +471,12 @@ struct parallel_batched_gemm_range_policy {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const SerialSimdTag &, const int &i) const {
-    auto svA = Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(),
-                                Kokkos::ALL());
-    auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(),
-                                Kokkos::ALL());
-    auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(),
-                                Kokkos::ALL());
+    auto svA =
+        Kokkos::subview(gemm_args_.Av.vec_3d, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svB =
+        Kokkos::subview(gemm_args_.Bv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL());
+    auto svC =
+        Kokkos::subview(gemm_args_.Cv.vec_3d, i, Kokkos::ALL(), Kokkos::ALL());
 
     KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
@@ -484,12 +484,12 @@ struct parallel_batched_gemm_range_policy {
 
   KOKKOS_INLINE_FUNCTION
   void operator()(const SerialSimdBatchDim3Tag &, const int &i) const {
-    auto svA = Kokkos::subview(gemm_args_.Av.vec_3d,
-                                Kokkos::ALL(), Kokkos::ALL(), i);
-    auto svB = Kokkos::subview(gemm_args_.Bv.vec_3d,
-                                Kokkos::ALL(), Kokkos::ALL(), i);
-    auto svC = Kokkos::subview(gemm_args_.Cv.vec_3d,
-                                Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svA =
+        Kokkos::subview(gemm_args_.Av.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svB =
+        Kokkos::subview(gemm_args_.Bv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i);
+    auto svC =
+        Kokkos::subview(gemm_args_.Cv.vec_3d, Kokkos::ALL(), Kokkos::ALL(), i);
 
     KokkosBatched::SerialGemm<TransAType, TransBType, BlockingType>::invoke(
         gemm_args_.alpha, svA, svB, gemm_args_.beta, svC);
@@ -661,7 +661,8 @@ struct parallel_batched_gemm {
   }
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(const SerialSimdBatchDim3Tag &, const MemberType &member) const {
+  void operator()(const SerialSimdBatchDim3Tag &,
+                  const MemberType &member) const {
     Kokkos::abort("SerialSimdBatchDim3Tag not supported using RangePolicy.");
   }
 };
@@ -677,7 +678,7 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options,
 
   uint32_t warm_up_n = options.warm_up_n;
   uint32_t n         = options.n;
-  auto batch_size = options.start.c.k;
+  auto batch_size    = options.start.c.k;
   Kokkos::Timer timer;
 
   STATUS;
@@ -687,9 +688,9 @@ void __do_gemm_parallel_batched_template_range_policy(options_t options,
   if (std::is_same<AlgoTag, SerialSimdTag>::value ||
       std::is_same<AlgoTag, SerialSimdBatchDim3Tag>::value) {
     batch_size = options.blas_args.batch_size_last_dim
-      ? gemm_args.Cv.vec_3d.extent(2)
-      : gemm_args.Cv.vec_3d.extent(0);
-  }	
+                     ? gemm_args.Cv.vec_3d.extent(2)
+                     : gemm_args.Cv.vec_3d.extent(0);
+  }
 
   for (uint32_t i = 0; i < warm_up_n; i++) {
     Kokkos::parallel_for("parallelBatchedWarmUpLoopGemm",
@@ -1579,12 +1580,13 @@ void do_gemm_serial_batched_compact_mkl_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<SerialSimdBatchDim3Tag, Algo::Gemm::CompactMKL,
-                                   default_device>);
+        __do_gemm_parallel_batched<SerialSimdBatchDim3Tag,
+                                   Algo::Gemm::CompactMKL, default_device>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::CompactMKL,
-                                            default_device>);
+        options,
+        __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::CompactMKL,
+                                   default_device>);
 #else
 #if !defined(__KOKKOSBATCHED_ENABLE_INTEL_MKL__)
   std::cerr
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index bd6392cf06..86714b7e30 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -83,7 +83,7 @@ void (*do_trmm_invoke[LOOP_N][TEST_N])(options_t) = {
  * = (flops / 2) * 2 flops = flops * rows_LHS
  */
 static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
-                                       int a_n) {
+                                         int a_n) {
   int flops;
 
   if (side == 'L' || side == 'l') {
@@ -107,7 +107,7 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
 static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m,
-                                  int a_n) {
+                                    int a_n) {
   int flops;
 
   if (side == 'L' || side == 'l') {
@@ -147,8 +147,8 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
                                   double time_in_seconds) {
   double flops = trmm_args.A.extent(0) *
                  __trmm_flop_count(trmm_args.side, trmm_args.B.extent(1),
-                                 trmm_args.B.extent(2), trmm_args.A.extent(1),
-                                 trmm_args.A.extent(2));
+                                   trmm_args.B.extent(2), trmm_args.A.extent(1),
+                                   trmm_args.A.extent(2));
   double gflops           = flops / 1e9;
   double average_time     = time_in_seconds / options.n;
   double gbytes_in_matrix = (trmm_args.B.extent(0) * trmm_args.B.extent(1) *

From 147783e45bf8aeab0e8e6e37ee5952538fd9904b Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 4 Mar 2021 10:26:14 -0700
Subject: [PATCH 29/47] perf_test/blas/blas3: Fix -d 1 for team and serial simd

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index d7f2143dc6..b7be38fdb9 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -639,7 +639,7 @@ struct parallel_batched_gemm {
                   const MemberType &member) const {
     auto i = member.league_rank();
     Kokkos::parallel_for(
-        Kokkos::ThreadVectorRange(member, simd_vector_size),
+        Kokkos::ThreadVectorRange(member, gemm_args_.Cv.ivec_4d.extent(0)),
         [&](const int &vector_lane) {
           auto svA = Kokkos::subview(gemm_args_.Av.ivec_4d, vector_lane,
                                      Kokkos::ALL(), Kokkos::ALL(), i);

From e3efd455be26670110cadb517724111dc86c3ba0 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 4 Mar 2021 11:24:44 -0700
Subject: [PATCH 30/47] perf_test/blas/blas3: Update serial simd to use
 RangePolicy

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index b7be38fdb9..09c3d27465 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1549,11 +1549,11 @@ void do_gemm_serial_simd_batched_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
+        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Unblocked,
                                    default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
+        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::Unblocked,
                                             default_device, Mode::Serial>);
   return;
 }
@@ -1563,11 +1563,11 @@ void do_gemm_serial_simd_batched_blocked_parallel(options_t options) {
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
+        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Blocked,
                                    default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
+        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::Blocked,
                                             default_device, Mode::Serial>);
   return;
 }

From 0127243a0363dd3bceb4dac90a95054a98656e6f Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 4 Mar 2021 11:31:09 -0700
Subject: [PATCH 31/47] perf_test/blas: Update flop counts to use double

---
 perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp | 14 +++++++-------
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp      |  2 +-
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp      |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
index e6abeaefc4..d60f15b92b 100644
--- a/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
+++ b/perf_test/blas/blas/KokkosBlas_trtri_perf_test.hpp
@@ -82,9 +82,9 @@ void (*do_trtri_invoke[LOOP_N][TEST_N])(options_t) = {
    * The KokkosBatched::SerialTrtri implementation performs trmm and scal on subblocks
    * of the A matrix. a_m subblocks are selected.
    */
-static inline int __trtri_impl_flop_count(int a_m, int a_n) {
-  int flop_count = 0;
-  int flops_per_div, flops_per_mul, flops_per_add;
+static inline double __trtri_impl_flop_count(double a_m, double a_n) {
+  double flop_count = 0;
+  double flops_per_div, flops_per_mul, flops_per_add;
 
     if (std::is_same<double, default_scalar>::value ||
         std::is_same<float, default_scalar>::value ||
@@ -109,10 +109,10 @@ static inline int __trtri_impl_flop_count(int a_m, int a_n) {
 }
 
 // Flop count formula from lapack working note 41: http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int __trtri_flop_count(int a_m, int a_n) {
-  int flops;
-  int flops_per_mul;
-  int flops_per_add;
+static inline double __trtri_flop_count(double a_m, double a_n) {
+  double flops;
+  double flops_per_mul;
+  double flops_per_add;
 
   if (a_m != a_n) {
     fprintf(stderr, "%s:%d:ERROR: a_m != a_n.\n", __FILE__, __LINE__);
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 09c3d27465..36132db261 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -217,7 +217,7 @@ static std::string gemm_csv_header_str =
 /*************************** Internal helper fns **************************/
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int __gemm_flop_count(int a_m, int a_n, int b_n) {
+static inline double __gemm_flop_count(double a_m, double a_n, double b_n) {
   if (std::is_same<double, default_scalar>::value ||
       std::is_same<float, default_scalar>::value ||
       std::is_same<Kokkos::Experimental::half_t, default_scalar>::value)
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 86714b7e30..6d67e96bd1 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -106,9 +106,9 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
 
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline int __trmm_flop_count(char side, int b_m, int b_n, int a_m,
-                                    int a_n) {
-  int flops;
+static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m,
+                                      double a_n) {
+  double flops;
 
   if (side == 'L' || side == 'l') {
     flops = b_m * b_m * b_n;

From 4acdaf51142081f32b3139dcfca6aa24f8bf8ccc Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Fri, 5 Mar 2021 11:59:04 -0700
Subject: [PATCH 32/47] perf_test/blas/blas3: Added verify option

  - Implemented verify checks in gemm. Simd verify is still failing when
  the batch_size is not divisible by the simd_vector_len.
---
 perf_test/blas/blas3/CMakeLists.txt           |   1 +
 perf_test/blas/blas3/KokkosBlas3_common.hpp   |   4 +
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 166 +++++++++++++++++-
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  19 +-
 test_common/KokkosKernels_TestUtils.hpp       |  15 ++
 5 files changed, 197 insertions(+), 8 deletions(-)

diff --git a/perf_test/blas/blas3/CMakeLists.txt b/perf_test/blas/blas3/CMakeLists.txt
index c1e3a117fa..8f83bd6b99 100644
--- a/perf_test/blas/blas3/CMakeLists.txt
+++ b/perf_test/blas/blas3/CMakeLists.txt
@@ -1,5 +1,6 @@
 KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
 KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/tpls/gtest)
 
 KOKKOSKERNELS_ADD_EXECUTABLE(
     KokkosBlas3_perf_test
diff --git a/perf_test/blas/blas3/KokkosBlas3_common.hpp b/perf_test/blas/blas3/KokkosBlas3_common.hpp
index a991efe61e..2103d0d57e 100644
--- a/perf_test/blas/blas3/KokkosBlas3_common.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_common.hpp
@@ -63,6 +63,7 @@
 #define DEFAULT_VECTOR_LEN 1
 #define DEFAULT_USE_AUTO 0
 #define DEFAULT_BATCH_SIZE_LAST_DIM 0
+#define DEFAULT_VERIFY 1
 
 /************************ blas routine structure definitions **********/
 struct perf_test_trmm_args {
@@ -192,6 +193,8 @@ typedef struct matrix_dims matrix_dims_t;
  * @var out_file:      The file to write csv data to. Defaults to stdout.
  * @var blas_args:     Arguments for each supported blas routine.
  * @var blas_routines: Selects which supported blas routines to test.
+ * @var verify:        Performs verification of the blas routine for each input
+ *                     before timing it.
  */
 struct perf_test_options {
   test_e test;
@@ -205,6 +208,7 @@ struct perf_test_options {
   std::string out_file;
   blas_args_t blas_args;
   std::string blas_routines;
+  bool verify;
 };
 typedef struct perf_test_options options_t;
 
diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 36132db261..df08e30aaa 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -56,6 +56,8 @@
 //#include "KokkosBatched_Gemm_Team_Impl.hpp"
 //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp"
 #include "KokkosBatched_Util.hpp"
+#include "gtest/gtest.h" // EXPECT_NEAR
+#include "KokkosKernels_TestUtils.hpp"
 
 //#define GEMM_PERF_TEST_DEBUG
 
@@ -252,6 +254,9 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   double gflops;
   double average_time = time_in_seconds / options.n;
 
+  if (options.verify)
+    return;
+
   flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m,
                                                  gemm_args.dims.a.n,
                                                  gemm_args.dims.b.n);
@@ -360,8 +365,8 @@ void __do_gemm_serial_batched_template(options_t options,
 template <class scalar_type, class vta, class vtb, class vtc, class device_type,
           class algo_type>
 void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) {
-  char a  = gemm_args.transA;
-  char b  = gemm_args.transB;
+  char a  = toupper(gemm_args.transA);
+  char b  = toupper(gemm_args.transB);
   using N = Trans::NoTranspose;
   using T = Trans::Transpose;
   // using C = Trans::ConjTranspose;
@@ -1333,6 +1338,154 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
+/**
+ * Check that fabs(expected(i,j,k) - actual(i,j,k)) does not exceed epsilon.
+ * @var expected, actual: The views holding the expected and actual results.
+ * @var i, j, k:          The indexes of the element compared in both views.
+ * @var epsilon:          The tolerance to use when comparing.
+ * @return true (after printing both values and invoking FATAL_ERROR) if the comparison fails and false if it succeeds.
+ */
+static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) {
+  STATUS;
+  auto diff = static_cast<double>(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k)));
+
+  if (diff > epsilon) {
+    printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", 
+            i,j,k,static_cast<double>(expected(i,j,k)), 
+            i,j,k,static_cast<double>(actual(i,j,k)), 
+            diff,
+            epsilon);
+    FATAL_ERROR("Comparison failure!");
+    return true;
+  }
+  return false;
+}
+
+/**
+ * Compare every element of expected against the matching element of actual.
+ * @var expected: the expected results
+ * @var actual:   the actual results, indexed with the extents of expected
+ * @return false if expected matches actual within epsilon, otherwise true.
+ */
+template <class ScalarType, class LayoutType>
+static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) {
+  double epsilon = Test::epsilon<ScalarType>::value;
+  STATUS;
+
+  // View indexes are logical, so (i,j,k) is used for every layout; LayoutType
+  // is kept only so existing callers keep compiling unchanged.
+  for (size_t i = 0; i < expected.extent(0); i++) {
+    for (size_t j = 0; j < expected.extent(1); j++) {
+      for (size_t k = 0; k < expected.extent(2); k++) {
+        // Stop only on a mismatch; returning unconditionally here would end
+        // the comparison after the first element.
+        if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
+          return true;
+      }
+    }
+  }
+  return false;
+}
+
+/**
+ * Compare expected with actual.mat_4d's data viewed with expected's extents.
+ * @var expected: the expected results
+ * @var actual:   the simd results; extents of actual.mat_4d are printed first
+ * @return the result of the view_type_3d overload of __gemm_do_compare.
+ */
+template <class ScalarType, class LayoutType>
+static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) {
+  std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl;
+  decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2));
+  STATUS;
+  return __gemm_do_compare<ScalarType, LayoutType>(expected, actual_data);
+}
+
+template <class ScalarType, class LayoutType, class DeviceType>
+static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) {
+  using execution_space = typename DeviceType::execution_space;
+  // Verify fn against Test::Functor_BatchedVanillaGEMM, then run the timed test. The "expected" views use the non-simd types.
+  decltype(gemm_args.C) C_expected;
+  decltype(gemm_args.A) A_expected;
+  decltype(gemm_args.B) B_expected;
+  STATUS;
+
+  if (options.blas_args.batch_size_last_dim) {
+    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k);
+    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k);
+    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k);
+  } else {
+    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n);
+    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n);
+    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n);
+  }
+
+  // Initialize "expected" matrices from whichever inputs are non-null: the plain views (C) or the simd views (Cv).
+  if (gemm_args.C.data() != nullptr) {
+    Kokkos::deep_copy(C_expected, gemm_args.C);
+    Kokkos::deep_copy(A_expected, gemm_args.A);
+    Kokkos::deep_copy(B_expected, gemm_args.B);
+
+    Kokkos::fence(); // Ensure that deep_copy has completed
+
+    // Check that initial values match before anything is computed
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.C))
+      FATAL_ERROR("Inital values mismatch!");
+  } else if (gemm_args.Cv.vec_3d.data() != nullptr) {
+    // TODO: Debug this when batch_size % simd_vector_len != 0. NOTE(review): memcpy presumably needs host-accessible view memory - confirm.
+    memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n);
+    memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n);
+    memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n);
+
+    // Check that initial values match before anything is computed
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv))
+      FATAL_ERROR("Inital values mismatch!");
+  } else {
+    FATAL_ERROR("Input arguments are empty!");
+  }
+
+  // Compute the reference result into C_expected via Test::Functor_BatchedVanillaGEMM
+  Test::Functor_BatchedVanillaGEMM<decltype(A_expected), decltype(B_expected), decltype(C_expected), execution_space> vgemm;
+  vgemm.A_t = toupper(gemm_args.transA) == 'T';
+  vgemm.B_t = toupper(gemm_args.transB) == 'T';
+  vgemm.A_c = vgemm.B_c = false;
+  vgemm.A = A_expected;
+  vgemm.B = B_expected;
+  vgemm.C = C_expected;
+  vgemm.alpha = gemm_args.alpha;
+  vgemm.beta = gemm_args.beta;
+  vgemm.run(); // Compute C_expected
+
+  // Run routine with warm_up_n = 1 and n = 0; the saved values are restored before the timed run.
+  auto warm_up_n_bak = options.warm_up_n;
+  options.warm_up_n = 1;
+  auto n_bak = options.n;
+  options.n = 0;
+  fn(options, gemm_args);
+
+  Kokkos::fence(); // Redundant fence.
+
+  // Check the result of fn against C_expected for whichever output view is non-null
+  if (gemm_args.C.data() != nullptr) {
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.C))
+      FATAL_ERROR("Result value mismatch!");
+  }
+
+  if (gemm_args.Cv.vec_3d.data() != nullptr) {
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv))
+      FATAL_ERROR("Result value mismatch!");
+  }
+
+  // Run actual timed test with the caller's warm_up_n and n.
+  options.verify = false; // Set verify to false for csv output (__gemm_output_csv_row returns early while verify is set).
+  options.warm_up_n = warm_up_n_bak;
+  options.n = n_bak;
+  fn(options, gemm_args);
+
+  // Reset verify for next matrix size. NOTE(review): options is a by-value parameter, so this write is not visible to the caller.
+  options.verify = true;
+}
+
 /*************************** Internal setup fns **************************/
 template <class scalar_type, class vta, class vtb, class vtc, class device_type>
 gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
@@ -1457,6 +1610,8 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   gemm_args.bp.team_size  = options.blas_args.team_size;
   gemm_args.bp.vector_len = options.blas_args.vector_len;
 
+  Kokkos::fence(); // Ensure that fill_random has completed.
+
   return gemm_args;
 }
 
@@ -1484,7 +1639,12 @@ void __do_loop_and_invoke(options_t options,
       cur_dims.c.m += options.step, cur_dims.c.n += options.step) {
     gemm_args = __do_setup<default_scalar, view_type_3d, view_type_3d,
                            view_type_3d, default_device>(options, cur_dims);
-    fn(options, gemm_args);
+
+    if (options.verify) {
+      __gemm_do_verify<default_scalar, default_layout, default_device>(options, gemm_args, fn);
+    } else {
+      fn(options, gemm_args);
+    }
   }
   return;
 }
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index daf68180c2..73f5a18452 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -68,6 +68,7 @@ static struct option long_options[] = {
     {"vector_len", required_argument, 0, 'n'},
     {"batch_size", required_argument, 0, 'k'},
     {"batch_size_last_dim", required_argument, 0, 'd'},
+    {"verify", required_argument, 0, 'v'},
     {0, 0, 0, 0}};
 
 static void __print_help_blas3_perf_test() {
@@ -122,23 +123,23 @@ static void __print_help_blas3_perf_test() {
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n",
          DEFAULT_VECTOR_LEN);
 
-  printf("\t-u, --use_auto={0,1}\n");
+  printf("\t-u, --use_auto=AUTO\n");
   printf(
       "\t\tWhether to use Kokkos::AUTO for vector_len and team_size "
       "(Heirarchical parallelism).\n");
   printf(
-      "\t\t\t1 to use Kokkos::AUTO, otherwise --vector_len and --team_size "
-      "will be used. (default: %d)\n",
+      "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size "
+      "instead. (default: %d)\n",
       DEFAULT_USE_AUTO);
 
   printf("\t-k, --batch_size=LEN\n");
   printf("\t\tBatch size. Adds third dimension to matrices A, B, and C.\n");
   printf("\t\t\tThe value of LEN as an integer. (default: %d)\n", DEFAULT_K);
 
-  printf("\t-d, --batch_size_last_dim={0,1}\n");
+  printf("\t-d, --batch_size_last_dim=LAST_DIM\n");
   printf("\t\tHow to allocate the batch_size in the matrices.\n");
   printf(
-      "\t\t\t1 make the batch_size the last dimension, otherwise batch_size is "
+      "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size "
       "the first dimension (default: %d)\n",
       DEFAULT_BATCH_SIZE_LAST_DIM);
 
@@ -207,6 +208,13 @@ static void __print_help_blas3_perf_test() {
       "\t\t\tValid value for ROUTINES is one of more valid blas3 routines "
       "delimited by a comma. (default: %s)\n",
       DEFAULT_BLAS_ROUTINES);
+
+  printf("\t-v, --verify=VERIFY\n");
+  printf("\t\tVerification selection. (untimed)\n");
+  printf(
+      "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. "
+      "(default: %d)\n",
+      DEFAULT_VERIFY);
 }
 
 static void __blas3_perf_test_input_error(char **argv, char short_opt,
@@ -258,6 +266,7 @@ int main(int argc, char **argv) {
   options.blas_args.vector_len          = DEFAULT_VECTOR_LEN;
   options.blas_args.use_auto            = DEFAULT_USE_AUTO;
   options.blas_args.batch_size_last_dim = DEFAULT_BATCH_SIZE_LAST_DIM;
+  options.verify                        = DEFAULT_VERIFY;
 
   options.blas_args.trmm.trmm_args = DEFAULT_TRMM_ARGS;
   options.blas_args.trmm.alpha     = DEFAULT_TRMM_ALPHA;
diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 20a568bbc1..8ad7fe22af 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -202,5 +202,20 @@ namespace Test {
           *this);
     }
   };
+
+  template<class T>
+  class epsilon {
+    public:
+      constexpr static double value = std::numeric_limits<T>::epsilon();
+  };
+
+  // explicit epsilon specializations
+  #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
+  template<Kokkos::Experimental::half_t>
+  class epsilon {
+    public:
+      constexpr static double value = 0009765625F;
+  };
+  #endif // KOKKOS_HALF_T_IS_FLOAT
 }
 #endif

From 0de685f74269557950aae2271c74d52d26d5c94f Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Fri, 5 Mar 2021 12:49:54 -0700
Subject: [PATCH 33/47] test_common: Fix half_t epsilon specialization

---
 test_common/KokkosKernels_TestUtils.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 8ad7fe22af..64b3902ec7 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -211,8 +211,8 @@ namespace Test {
 
   // explicit epsilon specializations
   #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT
-  template<Kokkos::Experimental::half_t>
-  class epsilon {
+  template<>
+  class epsilon<Kokkos::Experimental::half_t> {
     public:
       constexpr static double value = 0009765625F;
   };

From 29322e8cac1f82484ba3c01120e16d9c706035d3 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Fri, 5 Mar 2021 14:45:00 -0700
Subject: [PATCH 34/47] perf_test/blas/blas3: Use TeamPolicy for serial simd

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index df08e30aaa..77d5850fab 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1706,28 +1706,32 @@ void do_gemm_serial_batched_blocked_parallel(options_t options) {
 
 void do_gemm_serial_simd_batched_parallel(options_t options) {
   STATUS;
+  // SerialBatchDim3Tag
+  // SerialSimdTag
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Unblocked,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Unblocked,
                                    default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::Unblocked,
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Unblocked,
                                             default_device, Mode::Serial>);
   return;
 }
 
 void do_gemm_serial_simd_batched_blocked_parallel(options_t options) {
   STATUS;
+  // SerialBatchDim3Tag
+  // SerialSimdTag
   if (options.blas_args.batch_size_last_dim)
     __do_loop_and_invoke(
         options,
-        __do_gemm_parallel_batched<SerialBatchDim3Tag, Algo::Gemm::Blocked,
+        __do_gemm_parallel_batched<TeamSimdBatchDim4Tag, Algo::Gemm::Blocked,
                                    default_device, Mode::Serial>);
   else
     __do_loop_and_invoke(
-        options, __do_gemm_parallel_batched<SerialSimdTag, Algo::Gemm::Blocked,
+        options, __do_gemm_parallel_batched<TeamSimdTag, Algo::Gemm::Blocked,
                                             default_device, Mode::Serial>);
   return;
 }

From 80ca02ebc6efc905bccf7036a419f1fce6ee414e Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 9 Mar 2021 11:49:49 -0700
Subject: [PATCH 35/47] perf_test/blas/blas3: Process verify option

---
 perf_test/blas/blas3/KokkosBlas3_perf_test.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 73f5a18452..7e1cdf0f2f 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -275,7 +275,7 @@ int main(int argc, char **argv) {
   options.blas_args.gemm.alpha     = DEFAULT_GEMM_ALPHA;
   options.blas_args.gemm.beta      = DEFAULT_GEMM_BETA;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:",
+  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:",
                             long_options, &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas3_perf_test(); return 0;
@@ -397,6 +397,7 @@ int main(int argc, char **argv) {
                 atoi(optarg);
         break;
       case 'd': options.blas_args.batch_size_last_dim = atoi(optarg); break;
+      case 'v': options.verify = atoi(optarg); break;
       case 'z': options.blas_args.team_size = atoi(optarg); break;
       case 'n': options.blas_args.vector_len = atoi(optarg); break;
       case 'u': options.blas_args.use_auto = atoi(optarg); break;

From 239d44de0fbee7a17a51d3d8c28cc7f1497ea0e0 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 9 Mar 2021 12:02:24 -0700
Subject: [PATCH 36/47] perf_test/blas/blas3: Relax epsilon

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 77d5850fab..114cc49422 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1369,7 +1369,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type
  */
 template <class ScalarType, class LayoutType>
 static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) {
-  double epsilon = Test::epsilon<ScalarType>::value;
+  double epsilon = Test::epsilon<ScalarType>::value * 1e3;
   STATUS;
 
   for (size_t i = 0; i < expected.extent(0); i++) {

From 55e3eb30670202eb2eab54d261799fe2c3c5c84e Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Tue, 9 Mar 2021 12:05:25 -0700
Subject: [PATCH 37/47] perf_test/blas/blas3: Add TODO for bug

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 114cc49422..d38bfccd60 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -312,6 +312,7 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) {
       auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
       auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL());
 
+      // TODO: Debug this when starting a matrix sizes <= 10x10
       KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha,
                        A, B, _gemm_args.beta, C);
     }

From 53aa6536ca7643a111383d4f7ae1fe4d65af5857 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 09:03:41 -0700
Subject: [PATCH 38/47] perf_test/blas/blas3: Fix verify for simd when
 batch_size is first dim

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 41 +++++++++++++++++--
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index d38bfccd60..ad01d9acad 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -131,6 +131,8 @@ using view_type_3d =
     Kokkos::View<default_scalar ***, default_layout, default_device>;
 using view_type_4d =
     Kokkos::View<default_scalar ****, default_layout, default_device>;
+using view_type_5d =
+    Kokkos::View<default_scalar *****, default_layout, default_device>;
 
 // Construct the vector type
 using memory_space = typename default_device::execution_space::memory_space;
@@ -1402,6 +1404,38 @@ static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t act
   return __gemm_do_compare<ScalarType, LayoutType>(expected, actual_data);
 }
 
+template <class dstViewType>
+static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) {
+  using scalar_type = typename dstViewType::value_type;
+  view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
+
+  if (options.blas_args.batch_size_last_dim) {
+    exit(255); // TODO
+  } else {
+    size_t remainder = dst.extent(0) % simd_vector_size;
+    if (remainder > 0) {
+      // The below loops map a given 2-rank gemm within the simd view back to the
+      // 3-rank view.
+      for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
+        auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) {
+          auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+          for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) {
+            auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx);
+            for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
+              for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
+                dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n);
+              }
+            }
+          }
+        }
+      }
+    } else {
+      memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2));
+    }
+  }
+}
+
 template <class ScalarType, class LayoutType, class DeviceType>
 static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) {
   using execution_space = typename DeviceType::execution_space;
@@ -1433,10 +1467,9 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
     if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.C))
       FATAL_ERROR("Inital values mismatch!");
   } else if (gemm_args.Cv.vec_3d.data() != nullptr) {
-    // TODO: Debug this when batch_size % simd_vector_len != 0.
-    memcpy(C_expected.data(), gemm_args.Cv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.c.k * gemm_args.dims.c.m * gemm_args.dims.c.n);
-    memcpy(A_expected.data(), gemm_args.Av.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.a.k * gemm_args.dims.a.m * gemm_args.dims.a.n);
-    memcpy(B_expected.data(), gemm_args.Bv.vec_3d.data(), sizeof(default_scalar) * gemm_args.dims.b.k * gemm_args.dims.b.m * gemm_args.dims.b.n);
+    __gemm_copy_simd_view_to_3d_view<decltype(C_expected)>(gemm_args.Cv, C_expected, options);
+    __gemm_copy_simd_view_to_3d_view<decltype(A_expected)>(gemm_args.Av, A_expected, options);
+    __gemm_copy_simd_view_to_3d_view<decltype(B_expected)>(gemm_args.Bv, B_expected, options);
 
     // Check that initial values match
     if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv))

From 192fde6a76d975ed5c324f97bb46ca9e0545b24e Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 10:49:00 -0700
Subject: [PATCH 39/47] perf_test/blas/blas3: Complete verify for batch_size in
 first dimension

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 67 ++++++++++++-------
 1 file changed, 43 insertions(+), 24 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index ad01d9acad..a1e870e4c0 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1375,33 +1375,29 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual)
   double epsilon = Test::epsilon<ScalarType>::value * 1e3;
   STATUS;
 
-  for (size_t i = 0; i < expected.extent(0); i++) {
-    for (size_t j = 0; j < expected.extent(1); j++) {
-      for (size_t k = 0; k < expected.extent(2); k++) {
-        if (std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
-          return __gemm_print_compare_failure(expected, actual, i, j, k, epsilon);
+  if (std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
+    for (size_t i = 0; i < expected.extent(0); i++) {
+      for (size_t j = 0; j < expected.extent(1); j++) {
+        for (size_t k = 0; k < expected.extent(2); k++) {
+          if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
+            return true;
         }
-        if (std::is_same<LayoutType, Kokkos::LayoutLeft>::value) {
-          return __gemm_print_compare_failure(expected, actual, k, j, i, epsilon);
+      }
+    }
+  }
+
+  if (std::is_same<LayoutType, Kokkos::LayoutLeft>::value) {
+    for (size_t k = 0; k < expected.extent(2); k++) {
+      for (size_t j = 0; j < expected.extent(1); j++) {
+          for (size_t i = 0; i < expected.extent(0); i++) {
+          if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
+            return true;
         }
       }
     }
   }
-  return false;
-}
 
-/**
- * Compare all values of expected with all values of actual.
- * @var expected: the expected results
- * @var actual:   the actual results
- * @return false if expected matches actual within epsilon, otherwise true.
- */
-template <class ScalarType, class LayoutType>
-static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual) {
-  std::cout << actual.mat_4d.extent(0) << "x" << actual.mat_4d.extent(1) << "x" << actual.mat_4d.extent(2) << "x" << actual.mat_4d.extent(3) << std::endl;
-  decltype(expected) actual_data(actual.mat_4d.data(), expected.extent(0), expected.extent(1), expected.extent(2));
-  STATUS;
-  return __gemm_do_compare<ScalarType, LayoutType>(expected, actual_data);
+  return false;
 }
 
 template <class dstViewType>
@@ -1414,7 +1410,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie
   } else {
     size_t remainder = dst.extent(0) % simd_vector_size;
     if (remainder > 0) {
-      // The below loops map a given 2-rank gemm within the simd view back to the
+      // The below loops copies each corresponding 2-rank matrix within the simd view back to the
       // 3-rank view.
       for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
         auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
@@ -1431,11 +1427,34 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie
         }
       }
     } else {
+      // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location
+      // and the data can simply be copied.
       memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2));
     }
   }
 }
 
+/**
+ * Compare all values of expected with all values of actual.
+ * @var expected: the expected results
+ * @var actual:   the actual results
+ * @return false if expected matches actual within epsilon, otherwise true.
+ */
+template <class ScalarType, class LayoutType>
+static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) {
+  decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2));
+
+  STATUS;
+
+  // Copy the simd view to a 3d view for comparision.
+  // NOTE: The raw results are different when batch_size % simd_vector_size != 0.
+  // Also note that when batch_size % simd_vector_size != 0, the simd operation
+  // calculates results that we do not require.
+  // So, we end up running an extra batch_size % simd_vector_size GEMMs!
+  __gemm_copy_simd_view_to_3d_view(actual, actual_data, options);
+  return __gemm_do_compare<ScalarType, LayoutType>(expected, actual_data);
+}
+
 template <class ScalarType, class LayoutType, class DeviceType>
 static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) {
   using execution_space = typename DeviceType::execution_space;
@@ -1472,7 +1491,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
     __gemm_copy_simd_view_to_3d_view<decltype(B_expected)>(gemm_args.Bv, B_expected, options);
 
     // Check that initial values match
-    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv))
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv, options))
       FATAL_ERROR("Inital values mismatch!");
   } else {
     FATAL_ERROR("Input arguments are empty!");
@@ -1506,7 +1525,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
   }
 
   if (gemm_args.Cv.vec_3d.data() != nullptr) {
-    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv))
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv, options))
       FATAL_ERROR("Result value mismatch!");
   }
 

From e4351716f2cf7fc4daebcfb60933e488b50b1d1e Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 13:01:35 -0700
Subject: [PATCH 40/47] test_common: Update VanillaGEMM with
 batch_size_last_dim member

---
 test_common/KokkosKernels_TestUtils.hpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 64b3902ec7..1d383ffd35 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -159,7 +159,7 @@ namespace Test {
   // C(i,:,:) = alpha * (A(i,:,:) * B(i,:,:)) + beta * C(i,:,:)
   template<class ViewTypeA, class ViewTypeB, class ViewTypeC, class ExecutionSpace>
   struct Functor_BatchedVanillaGEMM {
-    bool A_t, B_t, A_c, B_c;
+    bool A_t, B_t, A_c, B_c, batch_size_last_dim = false;
     ViewTypeA A;
     ViewTypeB B;
     ViewTypeC C;
@@ -177,15 +177,20 @@ namespace Test {
       auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
       auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
       auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
+      if (batch_size_last_dim) {
+        _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i);
+        _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i);
+        _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i);
+      }
       using SubviewTypeA = decltype(_A);
       using SubviewTypeB = decltype(_B);
       using SubviewTypeC = decltype(_C);
       struct SharedVanillaGEMM<SubviewTypeA,SubviewTypeB,SubviewTypeC,ExecutionSpace> vgemm;
       vgemm.A_t = A_t; vgemm.B_t = B_t;
       vgemm.A_c = A_c; vgemm.B_c = B_c;
-      vgemm.C_rows = C.extent(1);
-      vgemm.C_cols = C.extent(2);    
-      vgemm.A_cols = A_t?A.extent(1):A.extent(2);
+      vgemm.C_rows = batch_size_last_dim ? C.extent(0) : C.extent(1);
+      vgemm.C_cols = batch_size_last_dim ? C.extent(1) : C.extent(2);
+      vgemm.A_cols = batch_size_last_dim ? (A_t?A.extent(0):A.extent(1)) : (A_t?A.extent(1):A.extent(2));
       vgemm.A = _A;
       vgemm.B = _B;
       vgemm.C = _C;
@@ -198,7 +203,7 @@ namespace Test {
     void run() {
       Kokkos::parallel_for(
           "Test::VanillaGEMM",
-          Kokkos::TeamPolicy<ExecutionSpace>(C.extent(0), Kokkos::AUTO, 16),
+          Kokkos::TeamPolicy<ExecutionSpace>(batch_size_last_dim ? C.extent(2) : C.extent(0), Kokkos::AUTO, 16),
           *this);
     }
   };

From 07906733755fda24433b974df27e97cc9bf080ca Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 13:01:47 -0700
Subject: [PATCH 41/47] perf_test/blas/blas3: Add batch_size_last_dim to vgemm

---
 perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index a1e870e4c0..8fca4e76b2 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1502,6 +1502,7 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
   vgemm.A_t = toupper(gemm_args.transA) == 'T';
   vgemm.B_t = toupper(gemm_args.transB) == 'T';
   vgemm.A_c = vgemm.B_c = false;
+  vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim;
   vgemm.A = A_expected;
   vgemm.B = B_expected;
   vgemm.C = C_expected;

From 137adccbbee971bfcbf073f6374cdad90a719874 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 13:45:38 -0700
Subject: [PATCH 42/47] perf_test/blas/blas3: Update compare routines

  - Handle simd with batch_size in last dimension
  - Work with device views
---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 36 +++++++++++++++----
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 8fca4e76b2..f560690e54 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -1350,12 +1350,14 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
  */
 static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) {
   STATUS;
-  auto diff = static_cast<double>(Kokkos::Experimental::fabs(expected(i,j,k) - actual(i,j,k)));
+  typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected);
+  typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual);
+  auto diff = static_cast<double>(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k)));
 
   if (diff > epsilon) {
     printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", 
-            i,j,k,static_cast<double>(expected(i,j,k)), 
-            i,j,k,static_cast<double>(actual(i,j,k)), 
+            i,j,k,static_cast<double>(h_expected(i,j,k)), 
+            i,j,k,static_cast<double>(h_actual(i,j,k)), 
             diff,
             epsilon);
     FATAL_ERROR("Comparison failure!");
@@ -1403,17 +1405,39 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual)
 template <class dstViewType>
 static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) {
   using scalar_type = typename dstViewType::value_type;
-  view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
 
   if (options.blas_args.batch_size_last_dim) {
-    exit(255); // TODO
+    view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
+    typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
+    size_t remainder = dst.extent(2) % simd_vector_size;
+    remainder = remainder == 0 ? simd_internal_vector_size : remainder;
+
+    // The below loops copies each corresponding 2-rank matrix within the simd view back to the
+    // 3-rank view.
+    for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
+      auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+      for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) {
+        auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) {
+          auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx);
+          for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
+            for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
+              dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n);
+            }
+          }
+        }
+      }
+    }
   } else {
+    view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
+    typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
     size_t remainder = dst.extent(0) % simd_vector_size;
+
     if (remainder > 0) {
       // The below loops copies each corresponding 2-rank matrix within the simd view back to the
       // 3-rank view.
       for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
-        auto sv0 = Kokkos::subview(src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
         for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) {
           auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
           for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) {

From 891f4bd178b3c90a94d52eb10b05ccd8611d4454 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 13:54:56 -0700
Subject: [PATCH 43/47] test_common: Fix half_t epsilon

---
 test_common/KokkosKernels_TestUtils.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index 1d383ffd35..ad546fe0b4 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -219,7 +219,7 @@ namespace Test {
   template<>
   class epsilon<Kokkos::Experimental::half_t> {
     public:
-      constexpr static double value = 0009765625F;
+      constexpr static double value = 0.0009765625F;
   };
   #endif // KOKKOS_HALF_T_IS_FLOAT
 }

From a7558b5eaccfdbcb27de174e86ef1048a3d2f531 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 14:06:36 -0700
Subject: [PATCH 44/47] perf_test/blas/blas3: Update serial loops

  - Update serial loops for batch_size_last_dim option
  - Remove dead code
---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 97 ++++++-------------
 1 file changed, 30 insertions(+), 67 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index f560690e54..4ee8a676dd 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -308,22 +308,28 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) {
 
   STATUS;
 
-  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) {
+  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) {
     for (uint32_t i = 0; i < n; ++i) {
-      auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-      auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
-      auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL());
+      for (int j = 0; j < _gemm_args.dims.c.k; j++) {
+        auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL());
+        auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL());
+        auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL());
+        if (batch_size_last_dim) {
+          A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j);
+          B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j);
+          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);  
+        }
 
-      // TODO: Debug this when starting a matrix sizes <= 10x10
-      KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha,
-                       A, B, _gemm_args.beta, C);
+        KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha,
+                        A, B, _gemm_args.beta, C);
+      }
     }
   };
-  __do_loop(options.warm_up_n, gemm_args);
+  __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim);
   Kokkos::fence();
 
   timer.reset();
-  __do_loop(options.n, gemm_args);
+  __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim);
   Kokkos::fence();
 
   __gemm_output_csv_row(options, gemm_args, timer.seconds());
@@ -341,22 +347,29 @@ void __do_gemm_serial_batched_template(options_t options,
 #if !defined(KOKKOS_ENABLE_CUDA)
   Kokkos::Timer timer;
 
-  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args) {
+  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) {
     for (uint32_t i = 0; i < n; ++i) {
-      auto A = Kokkos::subview(_gemm_args.A, i, Kokkos::ALL(), Kokkos::ALL());
-      auto B = Kokkos::subview(_gemm_args.B, i, Kokkos::ALL(), Kokkos::ALL());
-      auto C = Kokkos::subview(_gemm_args.C, i, Kokkos::ALL(), Kokkos::ALL());
+      for (int j = 0; j < _gemm_args.dims.c.k; j++) {
+        auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL());
+        auto B = Kokkos::subview(_gemm_args.B, j, Kokkos::ALL(), Kokkos::ALL());
+        auto C = Kokkos::subview(_gemm_args.C, j, Kokkos::ALL(), Kokkos::ALL());
+        if (batch_size_last_dim) {
+          A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j);
+          B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j);
+          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);  
+        }
 
-      SerialGemm<TransAType, TransBType, AlgoType>::invoke(
-          _gemm_args.alpha, A, B, _gemm_args.beta, C);
+        SerialGemm<TransAType, TransBType, AlgoType>::invoke(
+            _gemm_args.alpha, A, B, _gemm_args.beta, C);
+      }
     }
   };
 
-  __do_loop(options.warm_up_n, gemm_args);
+  __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim);
   Kokkos::fence();
 
   timer.reset();
-  __do_loop(options.n, gemm_args);
+  __do_loop(options.n, gemm_args, options.blas_args.batch_size_last_dim);
   Kokkos::fence();
   __gemm_output_csv_row(options, gemm_args, timer.seconds());
 #else
@@ -400,56 +413,6 @@ void __do_gemm_serial_batched(options_t options, gemm_args_t gemm_args) {
   return;
 }
 
-#if !defined(KOKKOS_ENABLE_CUDA)
-template <class ExecutionSpace>
-struct parallel_blas_gemm {
-  gemm_args_t gemm_args_;
-
-  parallel_blas_gemm(gemm_args_t gemm_args) : gemm_args_(gemm_args) {}
-
-  KOKKOS_INLINE_FUNCTION
-  void operator()(const int &i) const {
-    auto svA = Kokkos::subview(gemm_args_.A, i, Kokkos::ALL(), Kokkos::ALL());
-    auto svB = Kokkos::subview(gemm_args_.B, i, Kokkos::ALL(), Kokkos::ALL());
-    auto svC = Kokkos::subview(gemm_args_.C, i, Kokkos::ALL(), Kokkos::ALL());
-
-    KokkosBlas::gemm(&gemm_args_.transA, &gemm_args_.transB, gemm_args_.alpha,
-                     svA, svB, gemm_args_.beta, svC);
-  }
-};
-#endif  // !KOKKOS_ENABLE_CUDA
-
-template <class scalar_type, class vta, class vtb, class device_type>
-void __do_gemm_parallel_blas(options_t options, gemm_args_t gemm_args) {
-#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP)
-  uint32_t warm_up_n = options.warm_up_n;
-  uint32_t n         = options.n;
-  Kokkos::Timer timer;
-  using execution_space = typename device_type::execution_space;
-  using functor_type    = parallel_blas_gemm<execution_space>;
-  functor_type parallel_blas_gemm_functor(gemm_args);
-
-  STATUS;
-
-  Kokkos::parallel_for("parallelBlasWarmUpLoopGemm",
-                       Kokkos::RangePolicy<execution_space>(0, warm_up_n),
-                       parallel_blas_gemm_functor);
-  Kokkos::fence();
-
-  timer.reset();
-  Kokkos::parallel_for("parallelBlasTimedLoopGemm",
-                       Kokkos::RangePolicy<execution_space>(0, n),
-                       parallel_blas_gemm_functor);
-  Kokkos::fence();
-  __gemm_output_csv_row(options, gemm_args, timer.seconds());
-#else
-  std::cerr << std::string(__func__)
-            << " disabled since KOKKOS_ENABLE_CUDA is defined." << std::endl;
-  __gemm_output_csv_row(options, gemm_args, -1);
-#endif  // !KOKKOS_ENABLE_CUDA
-  return;
-}
-
 template <class TransAType, class TransBType, class BlockingType>
 struct parallel_batched_gemm_range_policy {
   gemm_args_t gemm_args_;

From 4ea0e4c863d3eab7de25c42ea6a90b05b19f4492 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Wed, 10 Mar 2021 15:46:18 -0700
Subject: [PATCH 45/47] test_common: Update VanillaGemm

  - Fix VanillaGemm to work with batch_size_last_dim=true
  when Cuda is enabled.
---
 test_common/KokkosKernels_TestUtils.hpp | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp
index ad546fe0b4..43f2d48460 100644
--- a/test_common/KokkosKernels_TestUtils.hpp
+++ b/test_common/KokkosKernels_TestUtils.hpp
@@ -167,24 +167,29 @@ namespace Test {
     using ScalarA = typename ViewTypeA::value_type;
     using ScalarB = typename ViewTypeB::value_type;
     using ScalarC = typename ViewTypeC::value_type;
+    using SubviewTypeA = typename Kokkos::View<ScalarA**, Kokkos::LayoutStride, typename ViewTypeA::device_type>;
+    using SubviewTypeB = typename Kokkos::View<ScalarB**, Kokkos::LayoutStride, typename ViewTypeA::device_type>;
+    using SubviewTypeC = typename Kokkos::View<ScalarC**, Kokkos::LayoutStride, typename ViewTypeA::device_type>;
+
     ScalarA alpha;
     ScalarC beta;
 
     KOKKOS_INLINE_FUNCTION
     void operator()(const typename Kokkos::TeamPolicy<ExecutionSpace>::member_type& team) const {
       int i = team.league_rank();
+      SubviewTypeA _A;
+      SubviewTypeB _B;
+      SubviewTypeC _C;
 
-      auto _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
-      auto _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
-      auto _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
       if (batch_size_last_dim) {
         _A = Kokkos::subview(A, Kokkos::ALL(), Kokkos::ALL(), i);
         _B = Kokkos::subview(B, Kokkos::ALL(), Kokkos::ALL(), i);
         _C = Kokkos::subview(C, Kokkos::ALL(), Kokkos::ALL(), i);
+      } else {
+        _A = Kokkos::subview(A, i, Kokkos::ALL(), Kokkos::ALL());
+        _B = Kokkos::subview(B, i, Kokkos::ALL(), Kokkos::ALL());
+        _C = Kokkos::subview(C, i, Kokkos::ALL(), Kokkos::ALL());
       }
-      using SubviewTypeA = decltype(_A);
-      using SubviewTypeB = decltype(_B);
-      using SubviewTypeC = decltype(_C);
       struct SharedVanillaGEMM<SubviewTypeA,SubviewTypeB,SubviewTypeC,ExecutionSpace> vgemm;
       vgemm.A_t = A_t; vgemm.B_t = B_t;
       vgemm.A_c = A_c; vgemm.B_c = B_c;

From 8522c914cbe58b14927903c690daa0904206df92 Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 11 Mar 2021 11:55:01 -0700
Subject: [PATCH 46/47] perf_test/blas/blas3: Updates for half_t

src/batched: Allow compile with half_t
---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 58 +++++++++++++------
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp | 24 +++++---
 src/batched/KokkosBatched_Util.hpp            |  3 +-
 3 files changed, 57 insertions(+), 28 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index 4ee8a676dd..ffb13819b6 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -266,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   gflops = flops / 1e9;
 
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
-                 << options.blas_args.gemm.alpha << ","
-                 << options.blas_args.gemm.beta << "," << ts << "," << vlen
+                 << static_cast<double>(options.blas_args.gemm.alpha) << ","
+                 << static_cast<double>(options.blas_args.gemm.beta) << "," << ts << "," << vlen
                  << "," << loop_e_str[options.loop] << ","
                  << __gemm_output_dim_string(options, gemm_args.dims.a) << ","
                  << __gemm_output_dim_string(options, gemm_args.dims.b) << ","
@@ -1315,7 +1315,7 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type
   STATUS;
   typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected);
   typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual);
-  auto diff = static_cast<double>(Kokkos::Experimental::fabs(h_expected(i,j,k) - h_actual(i,j,k)));
+  auto diff = static_cast<double>(Kokkos::Experimental::fabs(static_cast<double>(h_expected(i,j,k) - h_actual(i,j,k))));
 
   if (diff > epsilon) {
     printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", 
@@ -1367,10 +1367,11 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual)
 
 template <class dstViewType>
 static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) {
-  using scalar_type = typename dstViewType::value_type;
+  using dst_scalar_type = typename dstViewType::value_type;
+  using src_scalar_type = typename view_type_5d::value_type;
 
   if (options.blas_args.batch_size_last_dim) {
-    view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
+    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
     typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
     size_t remainder = dst.extent(2) % simd_vector_size;
     remainder = remainder == 0 ? simd_internal_vector_size : remainder;
@@ -1392,7 +1393,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie
       }
     }
   } else {
-    view_type_5d src_raw((double *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
+    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
     typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
     size_t remainder = dst.extent(0) % simd_vector_size;
 
@@ -1416,7 +1417,7 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie
     } else {
       // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location
       // and the data can simply be copied.
-      memcpy(dst.data(), src.ivec_4d.data(), sizeof(scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2));
+      memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2));
     }
   }
 }
@@ -1616,15 +1617,24 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
 
     // Use the non-simd 4-rank view type to randomly populate the gemm simd
     // arguments
-    Kokkos::fill_random(gemm_args.Av.mat_4d, rand_pool,
+    using tmp_view_type_4d = Kokkos::View<double ****, default_layout, default_device>;
+    tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3));
+    Kokkos::fill_random(tmpA, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
-    Kokkos::fill_random(gemm_args.Bv.mat_4d, rand_pool,
+                                     double>::max());
+    tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3));
+    Kokkos::fill_random(tmpB, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
-    Kokkos::fill_random(gemm_args.Cv.mat_4d, rand_pool,
+                                     double>::max());
+    tmp_view_type_4d tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3));
+    Kokkos::fill_random(tmpC, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
+                                     double>::max());
+    Kokkos::fence();
+    Kokkos::deep_copy(gemm_args.Av.mat_4d, tmpA);
+    Kokkos::deep_copy(gemm_args.Bv.mat_4d, tmpB);
+    Kokkos::deep_copy(gemm_args.Cv.mat_4d, tmpC);
+    Kokkos::fence();
   } else {
     if (options.blas_args.batch_size_last_dim) {
       gemm_args.A = vta("gemm_args.A", dims.a.m, dims.a.n, dims.a.k);
@@ -1636,15 +1646,25 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
       gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n);
     }
 
-    Kokkos::fill_random(gemm_args.A, rand_pool,
+    using tmp_view_type_3d = Kokkos::View<double ***, default_layout, default_device>;
+    tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2));
+    Kokkos::fill_random(tmpA, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
-    Kokkos::fill_random(gemm_args.B, rand_pool,
+                                     double>::max());
+    tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2));
+    Kokkos::fill_random(tmpB, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
-    Kokkos::fill_random(gemm_args.C, rand_pool,
+                                     double>::max());
+    tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2));
+    Kokkos::fill_random(tmpC, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                     scalar_type>::max());
+                                     double>::max());
+
+    Kokkos::fence();
+    Kokkos::deep_copy(gemm_args.A, tmpA);
+    Kokkos::deep_copy(gemm_args.B, tmpB);
+    Kokkos::deep_copy(gemm_args.C, tmpC);
+    Kokkos::fence();
   }
   gemm_args.alpha         = options.blas_args.gemm.alpha;
   gemm_args.beta          = options.blas_args.gemm.beta;
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 6d67e96bd1..0a6741c603 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -175,7 +175,7 @@ static void __trmm_output_csv_row(options_t options, trmm_args_t trmm_args,
 
   options.out[0] << test_e_str[options.test] << ","
                  << options.blas_args.trmm.trmm_args << ","
-                 << options.blas_args.trmm.alpha << ","
+                 << static_cast<double>(options.blas_args.trmm.alpha) << ","
                  << loop_e_str[options.loop] << "," << trmm_args.A.extent(0)
                  << "x" << trmm_args.A.extent(1) << "x" << trmm_args.A.extent(2)
                  << "," << trmm_args.B.extent(0) << "x" << trmm_args.B.extent(1)
@@ -624,10 +624,14 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   trmm_args.alpha = options.blas_args.trmm.alpha;
   host_A          = Kokkos::create_mirror_view(trmm_args.A);
 
-  Kokkos::fill_random(trmm_args.A, rand_pool,
-                      Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                   scalar_type>::max());
-  Kokkos::deep_copy(host_A, trmm_args.A);
+
+  {
+    Kokkos::View<double ***, default_layout, default_device> tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2));
+    Kokkos::fill_random(tmp, rand_pool,
+			Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+			double>::max());
+    Kokkos::deep_copy(host_A, tmp);
+  }
 
   if (trmm_args.uplo == 'U' || trmm_args.uplo == 'u') {
     // Make A upper triangular
@@ -663,9 +667,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   }
   Kokkos::deep_copy(trmm_args.A, host_A);
 
-  Kokkos::fill_random(trmm_args.B, rand_pool,
-                      Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-                                   scalar_type>::max());
+  {
+    Kokkos::View<double ***, default_layout, default_device> tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2));
+    Kokkos::fill_random(tmp, rand_pool,
+			Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+			double>::max());
+    Kokkos::deep_copy(trmm_args.B, tmp);
+  }
 
   return trmm_args;
 }
diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp
index 3253b6ce12..4a5c17d1df 100644
--- a/src/batched/KokkosBatched_Util.hpp
+++ b/src/batched/KokkosBatched_Util.hpp
@@ -204,7 +204,8 @@ namespace KokkosBatched {
                    std::is_same<T,Kokkos::complex<float> >::value  ||
                    std::is_same<T,std::complex<float> >::value     ||
                    std::is_same<T,Kokkos::complex<double> >::value ||
-                   std::is_same<T,std::complex<double> >::value,
+                   std::is_same<T,std::complex<double> >::value    ||
+		   std::is_same<T,Kokkos::Experimental::half_t>::value,
                    "KokkosKernels:: Invalid SIMD<> type." );
     using value_type = T;
   };

From 4f9dafa854357505c1243cebea6afcec24405dff Mon Sep 17 00:00:00 2001
From: Evan Harvey <eharvey@sandia.gov>
Date: Thu, 11 Mar 2021 13:16:36 -0700
Subject: [PATCH 47/47] perf_test/blas: Apply clang-format

---
 .../blas/blas3/KokkosBlas3_gemm_perf_test.hpp | 263 +++++++++++-------
 .../blas/blas3/KokkosBlas3_perf_test.cpp      |  14 +-
 .../blas/blas3/KokkosBlas3_trmm_perf_test.hpp |  21 +-
 src/batched/KokkosBatched_Util.hpp            |   2 +-
 4 files changed, 188 insertions(+), 112 deletions(-)

diff --git a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
index ffb13819b6..081b01bb58 100644
--- a/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_gemm_perf_test.hpp
@@ -56,7 +56,7 @@
 //#include "KokkosBatched_Gemm_Team_Impl.hpp"
 //#include "KokkosBatched_Gemm_TeamVector_Impl.hpp"
 #include "KokkosBatched_Util.hpp"
-#include "gtest/gtest.h" // EXPECT_NEAR
+#include "gtest/gtest.h"  // EXPECT_NEAR
 #include "KokkosKernels_TestUtils.hpp"
 
 //#define GEMM_PERF_TEST_DEBUG
@@ -256,8 +256,7 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
   double gflops;
   double average_time = time_in_seconds / options.n;
 
-  if (options.verify)
-    return;
+  if (options.verify) return;
 
   flops = gemm_args.dims.a.k * __gemm_flop_count(gemm_args.dims.a.m,
                                                  gemm_args.dims.a.n,
@@ -267,8 +266,8 @@ static void __gemm_output_csv_row(options_t options, gemm_args_t gemm_args,
 
   options.out[0] << algo_name << "," << options.blas_args.gemm.gemm_args << ","
                  << static_cast<double>(options.blas_args.gemm.alpha) << ","
-                 << static_cast<double>(options.blas_args.gemm.beta) << "," << ts << "," << vlen
-                 << "," << loop_e_str[options.loop] << ","
+                 << static_cast<double>(options.blas_args.gemm.beta) << ","
+                 << ts << "," << vlen << "," << loop_e_str[options.loop] << ","
                  << __gemm_output_dim_string(options, gemm_args.dims.a) << ","
                  << __gemm_output_dim_string(options, gemm_args.dims.b) << ","
                  << __gemm_output_dim_string(options, gemm_args.dims.c) << ","
@@ -308,7 +307,8 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) {
 
   STATUS;
 
-  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) {
+  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args,
+                      bool batch_size_last_dim) {
     for (uint32_t i = 0; i < n; ++i) {
       for (int j = 0; j < _gemm_args.dims.c.k; j++) {
         auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL());
@@ -317,15 +317,16 @@ void __do_gemm_serial_blas(options_t options, gemm_args_t gemm_args) {
         if (batch_size_last_dim) {
           A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j);
           B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j);
-          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);  
+          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);
         }
 
-        KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB, _gemm_args.alpha,
-                        A, B, _gemm_args.beta, C);
+        KokkosBlas::gemm(&_gemm_args.transA, &_gemm_args.transB,
+                         _gemm_args.alpha, A, B, _gemm_args.beta, C);
       }
     }
   };
-  __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim);
+  __do_loop(options.warm_up_n, gemm_args,
+            options.blas_args.batch_size_last_dim);
   Kokkos::fence();
 
   timer.reset();
@@ -347,7 +348,8 @@ void __do_gemm_serial_batched_template(options_t options,
 #if !defined(KOKKOS_ENABLE_CUDA)
   Kokkos::Timer timer;
 
-  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args, bool batch_size_last_dim) {
+  auto __do_loop = [](uint32_t n, gemm_args_t _gemm_args,
+                      bool batch_size_last_dim) {
     for (uint32_t i = 0; i < n; ++i) {
       for (int j = 0; j < _gemm_args.dims.c.k; j++) {
         auto A = Kokkos::subview(_gemm_args.A, j, Kokkos::ALL(), Kokkos::ALL());
@@ -356,7 +358,7 @@ void __do_gemm_serial_batched_template(options_t options,
         if (batch_size_last_dim) {
           A = Kokkos::subview(_gemm_args.A, Kokkos::ALL(), Kokkos::ALL(), j);
           B = Kokkos::subview(_gemm_args.B, Kokkos::ALL(), Kokkos::ALL(), j);
-          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);  
+          C = Kokkos::subview(_gemm_args.C, Kokkos::ALL(), Kokkos::ALL(), j);
         }
 
         SerialGemm<TransAType, TransBType, AlgoType>::invoke(
@@ -365,7 +367,8 @@ void __do_gemm_serial_batched_template(options_t options,
     }
   };
 
-  __do_loop(options.warm_up_n, gemm_args, options.blas_args.batch_size_last_dim);
+  __do_loop(options.warm_up_n, gemm_args,
+            options.blas_args.batch_size_last_dim);
   Kokkos::fence();
 
   timer.reset();
@@ -1311,18 +1314,22 @@ void __do_gemm_parallel_experiment6(options_t options, gemm_args_t gemm_args) {
  * @var epsilon:  The tolerance to use when comparing.
  * @return true if the comparison fails and false if the comparison succeeds.
  */
-static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type_3d actual, int i, int j, int k, double epsilon) {
+static inline bool __gemm_print_compare_failure(view_type_3d expected,
+                                                view_type_3d actual, int i,
+                                                int j, int k, double epsilon) {
   STATUS;
-  typename view_type_3d::HostMirror h_expected = Kokkos::create_mirror_view(expected);
-  typename view_type_3d::HostMirror h_actual = Kokkos::create_mirror_view(actual);
-  auto diff = static_cast<double>(Kokkos::Experimental::fabs(static_cast<double>(h_expected(i,j,k) - h_actual(i,j,k))));
+  typename view_type_3d::HostMirror h_expected =
+      Kokkos::create_mirror_view(expected);
+  typename view_type_3d::HostMirror h_actual =
+      Kokkos::create_mirror_view(actual);
+  auto diff = static_cast<double>(Kokkos::Experimental::fabs(
+      static_cast<double>(h_expected(i, j, k) - h_actual(i, j, k))));
 
   if (diff > epsilon) {
-    printf("fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n", 
-            i,j,k,static_cast<double>(h_expected(i,j,k)), 
-            i,j,k,static_cast<double>(h_actual(i,j,k)), 
-            diff,
-            epsilon);
+    printf(
+        "fabs(expected(%d,%d,%d):%g - actual(%d,%d,%d):%g):%g > epsilon:%g\n",
+        i, j, k, static_cast<double>(h_expected(i, j, k)), i, j, k,
+        static_cast<double>(h_actual(i, j, k)), diff, epsilon);
     FATAL_ERROR("Comparison failure!");
     return true;
   }
@@ -1336,7 +1343,8 @@ static inline bool __gemm_print_compare_failure(view_type_3d expected, view_type
  * @return false if expected matches actual within epsilon, otherwise true.
  */
 template <class ScalarType, class LayoutType>
-static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual) {
+static inline bool __gemm_do_compare(view_type_3d expected,
+                                     view_type_3d actual) {
   double epsilon = Test::epsilon<ScalarType>::value * 1e3;
   STATUS;
 
@@ -1354,7 +1362,7 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual)
   if (std::is_same<LayoutType, Kokkos::LayoutLeft>::value) {
     for (size_t k = 0; k < expected.extent(2); k++) {
       for (size_t j = 0; j < expected.extent(1); j++) {
-          for (size_t i = 0; i < expected.extent(0); i++) {
+        for (size_t i = 0; i < expected.extent(0); i++) {
           if (__gemm_print_compare_failure(expected, actual, i, j, k, epsilon))
             return true;
         }
@@ -1366,58 +1374,90 @@ static inline bool __gemm_do_compare(view_type_3d expected, view_type_3d actual)
 }
 
 template <class dstViewType>
-static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstViewType dst, options_t options) {
+static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src,
+                                                    dstViewType dst,
+                                                    options_t options) {
   using dst_scalar_type = typename dstViewType::value_type;
   using src_scalar_type = typename view_type_5d::value_type;
 
   if (options.blas_args.batch_size_last_dim) {
-    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
-    typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
+    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(),
+                         simd_internal_vector_size, src.ivec_4d.extent(0),
+                         src.ivec_4d.extent(1), src.ivec_4d.extent(2),
+                         src.ivec_4d.extent(3));
+    typename view_type_5d::HostMirror h_src_raw =
+        Kokkos::create_mirror_view(src_raw);
     size_t remainder = dst.extent(2) % simd_vector_size;
-    remainder = remainder == 0 ? simd_internal_vector_size : remainder;
-
-    // The below loops copies each corresponding 2-rank matrix within the simd view back to the
-    // 3-rank view.
-    for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
-      auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
-      for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) {
-        auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
-        for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(3); simd_batch_size_idx++) {
-          auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), simd_batch_size_idx);
+    remainder        = remainder == 0 ? simd_internal_vector_size : remainder;
+
+    // The loops below copy each corresponding 2-rank matrix within the simd
+    // view back to the 3-rank view.
+    for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder;
+         simd_internal_vec_idx++) {
+      auto sv0 =
+          Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(),
+                          Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+      for (size_t vector_batch_idx = 0;
+           vector_batch_idx < src.ivec_4d.extent(0); vector_batch_idx++) {
+        auto sv1 = Kokkos::subview(sv0, vector_batch_idx, Kokkos::ALL(),
+                                   Kokkos::ALL(), Kokkos::ALL());
+        for (size_t simd_batch_size_idx = 0;
+             simd_batch_size_idx < src.ivec_4d.extent(3);
+             simd_batch_size_idx++) {
+          auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(),
+                                     simd_batch_size_idx);
           for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
             for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
-              dst(m, n, simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx) = sv2(m, n);
+              dst(m, n,
+                  simd_internal_vec_idx + simd_batch_size_idx +
+                      vector_batch_idx) = sv2(m, n);
             }
           }
         }
       }
     }
   } else {
-    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(), simd_internal_vector_size, src.ivec_4d.extent(0), src.ivec_4d.extent(1), src.ivec_4d.extent(2), src.ivec_4d.extent(3));
-    typename view_type_5d::HostMirror h_src_raw = Kokkos::create_mirror_view(src_raw);
+    view_type_5d src_raw((src_scalar_type *)src.ivec_4d.data(),
+                         simd_internal_vector_size, src.ivec_4d.extent(0),
+                         src.ivec_4d.extent(1), src.ivec_4d.extent(2),
+                         src.ivec_4d.extent(3));
+    typename view_type_5d::HostMirror h_src_raw =
+        Kokkos::create_mirror_view(src_raw);
     size_t remainder = dst.extent(0) % simd_vector_size;
 
     if (remainder > 0) {
-      // The below loops copies each corresponding 2-rank matrix within the simd view back to the
-      // 3-rank view.
-      for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder; simd_internal_vec_idx++) {
-        auto sv0 = Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
-        for (size_t simd_batch_size_idx = 0; simd_batch_size_idx < src.ivec_4d.extent(0); simd_batch_size_idx++) {
-          auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
-          for (size_t vector_batch_idx = 0; vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) {
-            auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(), vector_batch_idx);
+      // The loops below copy each corresponding 2-rank matrix within the simd
+      // view back to the 3-rank view.
+      for (size_t simd_internal_vec_idx = 0; simd_internal_vec_idx < remainder;
+           simd_internal_vec_idx++) {
+        auto sv0 =
+            Kokkos::subview(h_src_raw, simd_internal_vec_idx, Kokkos::ALL(),
+                            Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL());
+        for (size_t simd_batch_size_idx = 0;
+             simd_batch_size_idx < src.ivec_4d.extent(0);
+             simd_batch_size_idx++) {
+          auto sv1 = Kokkos::subview(sv0, simd_batch_size_idx, Kokkos::ALL(),
+                                     Kokkos::ALL(), Kokkos::ALL());
+          for (size_t vector_batch_idx = 0;
+               vector_batch_idx < src.ivec_4d.extent(3); vector_batch_idx++) {
+            auto sv2 = Kokkos::subview(sv1, Kokkos::ALL(), Kokkos::ALL(),
+                                       vector_batch_idx);
             for (size_t m = 0; m < src.ivec_4d.extent(1); m++) {
               for (size_t n = 0; n < src.ivec_4d.extent(2); n++) {
-                dst(simd_internal_vec_idx + simd_batch_size_idx + vector_batch_idx, m, n) = sv2(m, n);
+                dst(simd_internal_vec_idx + simd_batch_size_idx +
+                        vector_batch_idx,
+                    m, n) = sv2(m, n);
               }
             }
           }
         }
       }
     } else {
-      // When the batch_size is a multiple of the simd_vector_size, each 2-rank matrix lies in the correct location
-      // and the data can simply be copied.
-      memcpy(dst.data(), src.ivec_4d.data(), sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) * dst.extent(2));
+      // When the batch_size is a multiple of the simd_vector_size, each 2-rank
+      // matrix lies in the correct location and the data can simply be copied.
+      memcpy(dst.data(), src.ivec_4d.data(),
+             sizeof(dst_scalar_type) * dst.extent(0) * dst.extent(1) *
+                 dst.extent(2));
     }
   }
 }
@@ -1429,22 +1469,26 @@ static inline void __gemm_copy_simd_view_to_3d_view(gemm_simd_args_t src, dstVie
  * @return false if expected matches actual within epsilon, otherwise true.
  */
 template <class ScalarType, class LayoutType>
-static inline bool __gemm_do_compare(view_type_3d expected, gemm_simd_args_t actual, options_t options) {
-  decltype(expected) actual_data("actual_data", expected.extent(0), expected.extent(1), expected.extent(2));
+static inline bool __gemm_do_compare(view_type_3d expected,
+                                     gemm_simd_args_t actual,
+                                     options_t options) {
+  decltype(expected) actual_data("actual_data", expected.extent(0),
+                                 expected.extent(1), expected.extent(2));
 
   STATUS;
 
   // Copy the simd view to a 3d view for comparision.
-  // NOTE: The raw results are different when batch_size % simd_vector_size != 0.
-  // Also note that when batch_size % simd_vector_size != 0, the simd operation
-  // calculates results that we do not require.
-  // So, we end up running an extra batch_size % simd_vector_size GEMMs!
+  // NOTE: The raw results are different when batch_size % simd_vector_size !=
+  // 0. Also note that when batch_size % simd_vector_size != 0, the simd
+  // operation calculates results that we do not require. So, we end up running
+  // an extra batch_size % simd_vector_size GEMMs!
   __gemm_copy_simd_view_to_3d_view(actual, actual_data, options);
   return __gemm_do_compare<ScalarType, LayoutType>(expected, actual_data);
 }
 
 template <class ScalarType, class LayoutType, class DeviceType>
-static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, void (*fn)(options_t, gemm_args_t)) {
+static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args,
+                                    void (*fn)(options_t, gemm_args_t)) {
   using execution_space = typename DeviceType::execution_space;
   // Just create "expected" types using non-simd types.
   decltype(gemm_args.C) C_expected;
@@ -1453,13 +1497,19 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
   STATUS;
 
   if (options.blas_args.batch_size_last_dim) {
-    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m, gemm_args.dims.c.n, gemm_args.dims.c.k);
-    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m, gemm_args.dims.a.n, gemm_args.dims.a.k);
-    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m, gemm_args.dims.b.n, gemm_args.dims.b.k);
+    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.m,
+                                      gemm_args.dims.c.n, gemm_args.dims.c.k);
+    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.m,
+                                      gemm_args.dims.a.n, gemm_args.dims.a.k);
+    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.m,
+                                      gemm_args.dims.b.n, gemm_args.dims.b.k);
   } else {
-    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k, gemm_args.dims.c.m, gemm_args.dims.c.n);
-    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k, gemm_args.dims.a.m, gemm_args.dims.a.n);
-    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k, gemm_args.dims.b.m, gemm_args.dims.b.n);
+    C_expected = decltype(C_expected)("C_expected", gemm_args.dims.c.k,
+                                      gemm_args.dims.c.m, gemm_args.dims.c.n);
+    A_expected = decltype(A_expected)("A_expected", gemm_args.dims.a.k,
+                                      gemm_args.dims.a.m, gemm_args.dims.a.n);
+    B_expected = decltype(B_expected)("B_expected", gemm_args.dims.b.k,
+                                      gemm_args.dims.b.m, gemm_args.dims.b.n);
   }
 
   // Initialize "expected" matrices.
@@ -1468,44 +1518,50 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
     Kokkos::deep_copy(A_expected, gemm_args.A);
     Kokkos::deep_copy(B_expected, gemm_args.B);
 
-    Kokkos::fence(); // Ensure that deep_copy has completed
+    Kokkos::fence();  // Ensure that deep_copy has completed
 
     // Check that initial values match
     if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.C))
       FATAL_ERROR("Inital values mismatch!");
   } else if (gemm_args.Cv.vec_3d.data() != nullptr) {
-    __gemm_copy_simd_view_to_3d_view<decltype(C_expected)>(gemm_args.Cv, C_expected, options);
-    __gemm_copy_simd_view_to_3d_view<decltype(A_expected)>(gemm_args.Av, A_expected, options);
-    __gemm_copy_simd_view_to_3d_view<decltype(B_expected)>(gemm_args.Bv, B_expected, options);
+    __gemm_copy_simd_view_to_3d_view<decltype(C_expected)>(gemm_args.Cv,
+                                                           C_expected, options);
+    __gemm_copy_simd_view_to_3d_view<decltype(A_expected)>(gemm_args.Av,
+                                                           A_expected, options);
+    __gemm_copy_simd_view_to_3d_view<decltype(B_expected)>(gemm_args.Bv,
+                                                           B_expected, options);
 
     // Check that initial values match
-    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv, options))
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv,
+                                                  options))
       FATAL_ERROR("Inital values mismatch!");
   } else {
     FATAL_ERROR("Input arguments are empty!");
   }
 
   // Populate "expected" matrices via VanillaGemm
-  Test::Functor_BatchedVanillaGEMM<decltype(A_expected), decltype(B_expected), decltype(C_expected), execution_space> vgemm;
+  Test::Functor_BatchedVanillaGEMM<decltype(A_expected), decltype(B_expected),
+                                   decltype(C_expected), execution_space>
+      vgemm;
   vgemm.A_t = toupper(gemm_args.transA) == 'T';
   vgemm.B_t = toupper(gemm_args.transB) == 'T';
-  vgemm.A_c = vgemm.B_c = false;
+  vgemm.A_c = vgemm.B_c     = false;
   vgemm.batch_size_last_dim = options.blas_args.batch_size_last_dim;
-  vgemm.A = A_expected;
-  vgemm.B = B_expected;
-  vgemm.C = C_expected;
-  vgemm.alpha = gemm_args.alpha;
-  vgemm.beta = gemm_args.beta;
-  vgemm.run(); // Compute C_expected
-
-  // Run routine with warm_up_n = 1 and n = 0. 
+  vgemm.A                   = A_expected;
+  vgemm.B                   = B_expected;
+  vgemm.C                   = C_expected;
+  vgemm.alpha               = gemm_args.alpha;
+  vgemm.beta                = gemm_args.beta;
+  vgemm.run();  // Compute C_expected
+
+  // Run routine with warm_up_n = 1 and n = 0.
   auto warm_up_n_bak = options.warm_up_n;
-  options.warm_up_n = 1;
-  auto n_bak = options.n;
-  options.n = 0;
+  options.warm_up_n  = 1;
+  auto n_bak         = options.n;
+  options.n          = 0;
   fn(options, gemm_args);
 
-  Kokkos::fence(); // Redundant fence.
+  Kokkos::fence();  // Redundant fence.
 
   // Check the result
   if (gemm_args.C.data() != nullptr) {
@@ -1514,14 +1570,15 @@ static inline void __gemm_do_verify(options_t options, gemm_args_t gemm_args, vo
   }
 
   if (gemm_args.Cv.vec_3d.data() != nullptr) {
-    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv, options))
+    if (__gemm_do_compare<ScalarType, LayoutType>(C_expected, gemm_args.Cv,
+                                                  options))
       FATAL_ERROR("Result value mismatch!");
   }
 
   // Run actual timed test.
-  options.verify = false; // Set verify to false for csv output.
+  options.verify    = false;  // Set verify to false for csv output.
   options.warm_up_n = warm_up_n_bak;
-  options.n = n_bak;
+  options.n         = n_bak;
   fn(options, gemm_args);
 
   // Reset verify for next matrix size.
@@ -1617,16 +1674,23 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
 
     // Use the non-simd 4-rank view type to randomly populate the gemm simd
     // arguments
-    using tmp_view_type_4d = Kokkos::View<double ****, default_layout, default_device>;
-    tmp_view_type_4d tmpA("tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1), gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3));
+    using tmp_view_type_4d =
+        Kokkos::View<double ****, default_layout, default_device>;
+    tmp_view_type_4d tmpA(
+        "tmpA", gemm_args.Av.mat_4d.extent(0), gemm_args.Av.mat_4d.extent(1),
+        gemm_args.Av.mat_4d.extent(2), gemm_args.Av.mat_4d.extent(3));
     Kokkos::fill_random(tmpA, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
-    tmp_view_type_4d tmpB("tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1), gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3));
+    tmp_view_type_4d tmpB(
+        "tmpB", gemm_args.Bv.mat_4d.extent(0), gemm_args.Bv.mat_4d.extent(1),
+        gemm_args.Bv.mat_4d.extent(2), gemm_args.Bv.mat_4d.extent(3));
     Kokkos::fill_random(tmpB, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
-    tmp_view_type_4d tmpC("tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1), gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3));
+    tmp_view_type_4d tmpC(
+        "tmpC", gemm_args.Cv.mat_4d.extent(0), gemm_args.Cv.mat_4d.extent(1),
+        gemm_args.Cv.mat_4d.extent(2), gemm_args.Cv.mat_4d.extent(3));
     Kokkos::fill_random(tmpC, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
@@ -1646,16 +1710,20 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
       gemm_args.C = vtc("gemm_args.C", dims.c.k, dims.c.m, dims.c.n);
     }
 
-    using tmp_view_type_3d = Kokkos::View<double ***, default_layout, default_device>;
-    tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1), gemm_args.A.extent(2));
+    using tmp_view_type_3d =
+        Kokkos::View<double ***, default_layout, default_device>;
+    tmp_view_type_3d tmpA("tmpA", gemm_args.A.extent(0), gemm_args.A.extent(1),
+                          gemm_args.A.extent(2));
     Kokkos::fill_random(tmpA, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
-    tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1), gemm_args.B.extent(2));
+    tmp_view_type_3d tmpB("tmpB", gemm_args.B.extent(0), gemm_args.B.extent(1),
+                          gemm_args.B.extent(2));
     Kokkos::fill_random(tmpB, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
-    tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1), gemm_args.C.extent(2));
+    tmp_view_type_3d tmpC("tmpC", gemm_args.C.extent(0), gemm_args.C.extent(1),
+                          gemm_args.C.extent(2));
     Kokkos::fill_random(tmpC, rand_pool,
                         Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
                                      double>::max());
@@ -1671,7 +1739,7 @@ gemm_args_t __do_setup(options_t options, matrix_dims_t dims) {
   gemm_args.bp.team_size  = options.blas_args.team_size;
   gemm_args.bp.vector_len = options.blas_args.vector_len;
 
-  Kokkos::fence(); // Ensure that fill_random has completed.
+  Kokkos::fence();  // Ensure that fill_random has completed.
 
   return gemm_args;
 }
@@ -1702,7 +1770,8 @@ void __do_loop_and_invoke(options_t options,
                            view_type_3d, default_device>(options, cur_dims);
 
     if (options.verify) {
-      __gemm_do_verify<default_scalar, default_layout, default_device>(options, gemm_args, fn);
+      __gemm_do_verify<default_scalar, default_layout, default_device>(
+          options, gemm_args, fn);
     } else {
       fn(options, gemm_args);
     }
diff --git a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
index 7e1cdf0f2f..149cc00fd1 100644
--- a/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
+++ b/perf_test/blas/blas3/KokkosBlas3_perf_test.cpp
@@ -128,7 +128,8 @@ static void __print_help_blas3_perf_test() {
       "\t\tWhether to use Kokkos::AUTO for vector_len and team_size "
       "(Heirarchical parallelism).\n");
   printf(
-      "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use --vector_len and --team_size "
+      "\t\t\tValid values for AUTO are 1 to use Kokkos::AUTO and 0 to use "
+      "--vector_len and --team_size "
       "instead. (default: %d)\n",
       DEFAULT_USE_AUTO);
 
@@ -139,7 +140,8 @@ static void __print_help_blas3_perf_test() {
   printf("\t-d, --batch_size_last_dim=LAST_DIM\n");
   printf("\t\tHow to allocate the batch_size in the matrices.\n");
   printf(
-      "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last dimension and 0 to make the batch_size "
+      "\t\t\tValid values for LAST_DIM are 1 make the batch_size the last "
+      "dimension and 0 to make the batch_size "
       "the first dimension (default: %d)\n",
       DEFAULT_BATCH_SIZE_LAST_DIM);
 
@@ -212,7 +214,8 @@ static void __print_help_blas3_perf_test() {
   printf("\t-v, --verify=VERIFY\n");
   printf("\t\tVerification selection. (untimed)\n");
   printf(
-      "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to verify before timing. "
+      "\t\t\tValid values for VERIFY are either 0 to skip verification or 1 to "
+      "verify before timing. "
       "(default: %d)\n",
       DEFAULT_VERIFY);
 }
@@ -275,8 +278,9 @@ int main(int argc, char **argv) {
   options.blas_args.gemm.alpha     = DEFAULT_GEMM_ALPHA;
   options.blas_args.gemm.beta      = DEFAULT_GEMM_BETA;
 
-  while ((ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:",
-                            long_options, &option_idx)) != -1) {
+  while (
+      (ret = getopt_long(argc, argv, "ht:l:b:e:s:w:i:o:a:c:r:g:z:n:k:u:p:d:v:",
+                         long_options, &option_idx)) != -1) {
     switch (ret) {
       case 'h': __print_help_blas3_perf_test(); return 0;
       case 't':
diff --git a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
index 0a6741c603..de2bbd9ce9 100644
--- a/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
+++ b/perf_test/blas/blas3/KokkosBlas3_trmm_perf_test.hpp
@@ -106,8 +106,8 @@ static inline int __trmm_impl_flop_count(char side, int b_m, int b_n, int a_m,
 
 // Flop count formula from lapack working note 41:
 // http://www.icl.utk.edu/~mgates3/docs/lawn41.pdf
-static inline double __trmm_flop_count(char side, double b_m, double b_n, double a_m,
-                                      double a_n) {
+static inline double __trmm_flop_count(char side, double b_m, double b_n,
+                                       double a_m, double a_n) {
   double flops;
 
   if (side == 'L' || side == 'l') {
@@ -624,12 +624,13 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   trmm_args.alpha = options.blas_args.trmm.alpha;
   host_A          = Kokkos::create_mirror_view(trmm_args.A);
 
-
   {
-    Kokkos::View<double ***, default_layout, default_device> tmp("tmp", trmm_args.A.extent(0), trmm_args.A.extent(1), trmm_args.A.extent(2));
+    Kokkos::View<double***, default_layout, default_device> tmp(
+        "tmp", trmm_args.A.extent(0), trmm_args.A.extent(1),
+        trmm_args.A.extent(2));
     Kokkos::fill_random(tmp, rand_pool,
-			Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-			double>::max());
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                     double>::max());
     Kokkos::deep_copy(host_A, tmp);
   }
 
@@ -668,10 +669,12 @@ trmm_args_t __do_setup(options_t options, matrix_dims_t dim) {
   Kokkos::deep_copy(trmm_args.A, host_A);
 
   {
-    Kokkos::View<double ***, default_layout, default_device> tmp("tmp", trmm_args.B.extent(0), trmm_args.B.extent(1), trmm_args.B.extent(2));
+    Kokkos::View<double***, default_layout, default_device> tmp(
+        "tmp", trmm_args.B.extent(0), trmm_args.B.extent(1),
+        trmm_args.B.extent(2));
     Kokkos::fill_random(tmp, rand_pool,
-			Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
-			double>::max());
+                        Kokkos::rand<Kokkos::Random_XorShift64<execution_space>,
+                                     double>::max());
     Kokkos::deep_copy(trmm_args.B, tmp);
   }
 
diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp
index 4a5c17d1df..eb9883c425 100644
--- a/src/batched/KokkosBatched_Util.hpp
+++ b/src/batched/KokkosBatched_Util.hpp
@@ -205,7 +205,7 @@ namespace KokkosBatched {
                    std::is_same<T,std::complex<float> >::value     ||
                    std::is_same<T,Kokkos::complex<double> >::value ||
                    std::is_same<T,std::complex<double> >::value    ||
-		   std::is_same<T,Kokkos::Experimental::half_t>::value,
+                   std::is_same<T,Kokkos::Experimental::half_t>::value,
                    "KokkosKernels:: Invalid SIMD<> type." );
     using value_type = T;
   };