From 78dac8e419a452faa24f803468b927f62f2cdcdd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= <mikolaj.zuzek@ng-analytics.com>
Date: Mon, 14 Jun 2021 15:01:51 +0200
Subject: [PATCH] #9 Tests refactored + CSV output

---
 .../wiki/sparse/KokkosSparse_nga_bspmv.cpp    | 575 +++++++++---------
 1 file changed, 283 insertions(+), 292 deletions(-)
diff --git a/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp b/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp
index eea39ad6f7..d7edce89f4 100644
--- a/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp
+++ b/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp
@@ -38,172 +38,6 @@ using MultiVector_Internal = typename Kokkos::View< Scalar**, Layout, device_typ
 
 namespace details {
 
-const Scalar SC_ONE  = Kokkos::ArithTraits<Scalar>::one();
-const Scalar SC_ZERO = Kokkos::ArithTraits<Scalar>::zero();
-
-/// \brief Generate a CrsMatrix object for a matrix
-/// "with multiple DOFs per node"
-///
-/// \tparam mat_structure
-/// \param stencil
-/// \param structure
-/// \param blockSize
-/// \param mat_rowmap
-/// \param mat_colidx
-/// \param mat_val
-/// \return
-template <typename mat_structure>
-crs_matrix_t_ generate_crs_matrix(const std::string stencil,
-                                  const mat_structure &structure,
-                                  const int blockSize,
-                                  std::vector<Ordinal> &mat_rowmap,
-                                  std::vector<Ordinal> &mat_colidx,
-                                  std::vector<Scalar> &mat_val) {
-  crs_matrix_t_ mat_b1 =
-      Test::generate_structured_matrix2D<crs_matrix_t_>(stencil, structure);
-
-  if (blockSize == 1) return mat_b1;
-
-  //
-  // Fill blocks with random values
-  //
-
-  int nRow   = blockSize * mat_b1.numRows();
-  int nCol   = blockSize * mat_b1.numCols();
-  size_t nnz = blockSize * blockSize * mat_b1.nnz();
-
-  mat_val.resize(nnz);
-  Scalar *val_ptr = &mat_val[0];
-
-  for (size_t ii = 0; ii < nnz; ++ii)
-    val_ptr[ii] =
-        static_cast<Scalar>(std::rand() / (RAND_MAX + static_cast<Scalar>(1)));
-
-  mat_rowmap.resize(nRow + 1);
-  int *rowmap = &mat_rowmap[0];
-  rowmap[0]   = 0;
-
-  mat_colidx.resize(nnz);
-  int *cols = &mat_colidx[0];
-
-  for (int ir = 0; ir < mat_b1.numRows(); ++ir) {
-    auto mat_b1_row = mat_b1.rowConst(ir);
-    for (int ib = 0; ib < blockSize; ++ib) {
-      int my_row         = ir * blockSize + ib;
-      rowmap[my_row + 1] = rowmap[my_row] + mat_b1_row.length * blockSize;
-      for (int ijk = 0; ijk < mat_b1_row.length; ++ijk) {
-        int col0 = mat_b1_row.colidx(ijk);
-        for (int jb = 0; jb < blockSize; ++jb) {
-          cols[rowmap[my_row] + ijk * blockSize + jb] = col0 * blockSize + jb;
-        }
-      }  // for (int ijk = 0; ijk < mat_row.length; ++ijk)
-    }
-  }  // for (int ir = 0; ir < mat_b1.numRows(); ++ir)
-
-  return crs_matrix_t_("new_crs_matr", nRow, nCol, nnz, val_ptr, rowmap, cols);
-}
-
-/// \brief Convert a CrsMatrix object to a BlockCrsMatrix object
-///
-/// \param mat_crs
-/// \param blockSize
-/// \return
-///
-/// \note We assume that each block has a constant block size
-/// (in both directions, i.e. row and column)
-/// \note The numerical values for each individual block are stored
-/// contiguously in a (blockSize * blockSize) space
-/// in a row-major fashion.
-/// (2021/06/08) This storage is different from the BlockCrsMatrix constructor
-/// ```  BlockCrsMatrix (const KokkosSparse::CrsMatrix<SType, OType, DType,
-/// MTType, IType> &crs_mtx,
-///                      const OrdinalType blockDimIn) ```
-bcrs_matrix_t_ to_block_crs_matrix(const crs_matrix_t_ &mat_crs,
-                                   const int blockSize) {
-  if (blockSize == 1) {
-    bcrs_matrix_t_ bmat(mat_crs, blockSize);
-    return bmat;
-  }
-
-  if ((mat_crs.numRows() % blockSize > 0) ||
-      (mat_crs.numCols() % blockSize > 0)) {
-    std::cerr
-        << "\n !!! Matrix Dimensions Do Not Match Block Structure !!! \n\n";
-    exit(-123);
-  }
-
-  // block_rows will accumulate the number of blocks per row - this is NOT the
-  // row_map with cum sum!!
-  Ordinal nbrows = mat_crs.numRows() / blockSize;
-  std::vector<Ordinal> block_rows(nbrows, 0);
-
-  Ordinal nbcols = mat_crs.numCols() / blockSize;
-
-  Ordinal numBlocks = 0;
-  for (Ordinal i = 0; i < mat_crs.numRows(); i += blockSize) {
-    Ordinal current_blocks = 0;
-    for (Ordinal j = 0; j < blockSize; ++j) {
-      auto n_entries = mat_crs.graph.row_map(i + 1 + j) -
-                       mat_crs.graph.row_map(i + j) + blockSize - 1;
-      current_blocks = std::max(current_blocks, n_entries / blockSize);
-    }
-    numBlocks += current_blocks;                 // cum sum
-    block_rows[i / blockSize] = current_blocks;  // frequency counts
-  }
-
-  Kokkos::View<Ordinal *, Kokkos::LayoutLeft, device_type> rows("new_row",
-                                                                nbrows + 1);
-  rows(0) = 0;
-  for (Ordinal i = 0; i < nbrows; ++i) rows(i + 1) = rows(i) + block_rows[i];
-
-  Kokkos::View<Ordinal *, Kokkos::LayoutLeft, device_type> cols("new_col",
-                                                                rows[nbrows]);
-  cols(0) = 0;
-
-  for (Ordinal ib = 0; ib < nbrows; ++ib) {
-    auto ir_start = ib * blockSize;
-    auto ir_stop  = (ib + 1) * blockSize;
-    std::set<Ordinal> col_set;
-    for (Ordinal ir = ir_start; ir < ir_stop; ++ir) {
-      for (Ordinal jk = mat_crs.graph.row_map(ir);
-           jk < mat_crs.graph.row_map(ir + 1); ++jk) {
-        col_set.insert(mat_crs.graph.entries(jk) / blockSize);
-      }
-    }
-    assert(col_set.size() == block_rows[ib]);
-    Ordinal icount = 0;
-    auto *col_list = &cols(rows(ib));
-    for (auto col_block : col_set) col_list[icount++] = col_block;
-  }
-
-  Ordinal annz = numBlocks * blockSize * blockSize;
-  bcrs_matrix_t_::values_type vals("values", annz);
-  for (Ordinal i = 0; i < annz; ++i) vals(i) = 0.0;
-
-  for ( Ordinal ir = 0; ir < mat_crs.numRows(); ++ir ) {
-    const auto iblock = ir / blockSize;
-    const auto ilocal = ir % blockSize;
-    Ordinal lda = blockSize * (rows[iblock + 1] - rows[iblock]);
-    for (Ordinal jk = mat_crs.graph.row_map(ir); jk < mat_crs.graph.row_map(ir+1); ++jk) {
-      const auto jc = mat_crs.graph.entries(jk);
-      const auto jblock = jc / blockSize;
-      const auto jlocal = jc % blockSize;
-      for (Ordinal jkb = rows[iblock]; jkb < rows[iblock + 1]; ++jkb) {
-        if (cols(jkb) == jblock) {
-          Ordinal shift = rows[iblock] * blockSize * blockSize
-              + blockSize * (jkb - rows[iblock]);
-          vals(shift + jlocal + ilocal * lda) = mat_crs.values(jk);
-          break;
-        }
-      }
-    }
-  }
-
-  bcrs_matrix_t_ bmat("newblock", nbrows, nbcols, annz, vals, rows, cols,
-                      blockSize);
-  return bmat;
-}
-
 //
 // spmv: version for GPU execution
 //
@@ -1171,7 +1005,175 @@ void spmv(KokkosKernels::Experimental::Controls controls,
   }
 }
 
+} // namespace details
 
+namespace test {
+
+const Scalar SC_ONE  = Kokkos::ArithTraits<Scalar>::one();
+const Scalar SC_ZERO = Kokkos::ArithTraits<Scalar>::zero();
+
+/// \brief Generate a CrsMatrix object for a matrix
+/// "with multiple DOFs per node"
+///
+/// \tparam mat_structure
+/// \param stencil
+/// \param structure
+/// \param blockSize
+/// \param mat_rowmap
+/// \param mat_colidx
+/// \param mat_val
+/// \return
+template <typename mat_structure>
+crs_matrix_t_ generate_crs_matrix(const std::string stencil,
+                                  const mat_structure &structure,
+                                  const int blockSize,
+                                  std::vector<Ordinal> &mat_rowmap,
+                                  std::vector<Ordinal> &mat_colidx,
+                                  std::vector<Scalar> &mat_val) {
+  crs_matrix_t_ mat_b1 =
+      Test::generate_structured_matrix2D<crs_matrix_t_>(stencil, structure);
+
+  if (blockSize == 1) return mat_b1;
+
+  //
+  // Fill blocks with random values
+  //
+
+  int nRow   = blockSize * mat_b1.numRows();
+  int nCol   = blockSize * mat_b1.numCols();
+  size_t nnz = blockSize * blockSize * mat_b1.nnz();
+
+  mat_val.resize(nnz);
+  Scalar *val_ptr = &mat_val[0];
+
+  for (size_t ii = 0; ii < nnz; ++ii)
+    val_ptr[ii] =
+        static_cast<Scalar>(std::rand() / (RAND_MAX + static_cast<Scalar>(1)));
+
+  mat_rowmap.resize(nRow + 1);
+  int *rowmap = &mat_rowmap[0];
+  rowmap[0]   = 0;
+
+  mat_colidx.resize(nnz);
+  int *cols = &mat_colidx[0];
+
+  for (int ir = 0; ir < mat_b1.numRows(); ++ir) {
+    auto mat_b1_row = mat_b1.rowConst(ir);
+    for (int ib = 0; ib < blockSize; ++ib) {
+      int my_row         = ir * blockSize + ib;
+      rowmap[my_row + 1] = rowmap[my_row] + mat_b1_row.length * blockSize;
+      for (int ijk = 0; ijk < mat_b1_row.length; ++ijk) {
+        int col0 = mat_b1_row.colidx(ijk);
+        for (int jb = 0; jb < blockSize; ++jb) {
+          cols[rowmap[my_row] + ijk * blockSize + jb] = col0 * blockSize + jb;
+        }
+      }  // for (int ijk = 0; ijk < mat_row.length; ++ijk)
+    }
+  }  // for (int ir = 0; ir < mat_b1.numRows(); ++ir)
+
+  return crs_matrix_t_("new_crs_matr", nRow, nCol, nnz, val_ptr, rowmap, cols);
+}
+
+/// \brief Convert a CrsMatrix object to a BlockCrsMatrix object
+///
+/// \param mat_crs
+/// \param blockSize
+/// \return
+///
+/// \note We assume that each block has a constant block size
+/// (in both directions, i.e. row and column)
+/// \note The numerical values for each individual block are stored
+/// contiguously in a (blockSize * blockSize) space
+/// in a row-major fashion.
+/// (2021/06/08) This storage is different from the BlockCrsMatrix constructor
+/// ```  BlockCrsMatrix (const KokkosSparse::CrsMatrix<SType, OType, DType,
+/// MTType, IType> &crs_mtx,
+///                      const OrdinalType blockDimIn) ```
+bcrs_matrix_t_ to_block_crs_matrix(const crs_matrix_t_ &mat_crs,
+                                   const int blockSize) {
+  if (blockSize == 1) {
+    bcrs_matrix_t_ bmat(mat_crs, blockSize);
+    return bmat;
+  }
+
+  if ((mat_crs.numRows() % blockSize > 0) ||
+      (mat_crs.numCols() % blockSize > 0)) {
+    std::cerr
+        << "\n !!! Matrix Dimensions Do Not Match Block Structure !!! \n\n";
+    exit(-123);
+  }
+
+  // block_rows will accumulate the number of blocks per row - this is NOT the
+  // row_map with cum sum!!
+  Ordinal nbrows = mat_crs.numRows() / blockSize;
+  std::vector<Ordinal> block_rows(nbrows, 0);
+
+  Ordinal nbcols = mat_crs.numCols() / blockSize;
+
+  Ordinal numBlocks = 0;
+  for (Ordinal i = 0; i < mat_crs.numRows(); i += blockSize) {
+    Ordinal current_blocks = 0;
+    for (Ordinal j = 0; j < blockSize; ++j) {
+      auto n_entries = mat_crs.graph.row_map(i + 1 + j) -
+                       mat_crs.graph.row_map(i + j) + blockSize - 1;
+      current_blocks = std::max(current_blocks, n_entries / blockSize);
+    }
+    numBlocks += current_blocks;                 // cum sum
+    block_rows[i / blockSize] = current_blocks;  // frequency counts
+  }
+
+  Kokkos::View<Ordinal *, Kokkos::LayoutLeft, device_type> rows("new_row",
+                                                                nbrows + 1);
+  rows(0) = 0;
+  for (Ordinal i = 0; i < nbrows; ++i) rows(i + 1) = rows(i) + block_rows[i];
+
+  Kokkos::View<Ordinal *, Kokkos::LayoutLeft, device_type> cols("new_col",
+                                                                rows[nbrows]);
+  cols(0) = 0;
+
+  for (Ordinal ib = 0; ib < nbrows; ++ib) {
+    auto ir_start = ib * blockSize;
+    auto ir_stop  = (ib + 1) * blockSize;
+    std::set<Ordinal> col_set;
+    for (Ordinal ir = ir_start; ir < ir_stop; ++ir) {
+      for (Ordinal jk = mat_crs.graph.row_map(ir);
+           jk < mat_crs.graph.row_map(ir + 1); ++jk) {
+        col_set.insert(mat_crs.graph.entries(jk) / blockSize);
+      }
+    }
+    assert(col_set.size() == block_rows[ib]);
+    Ordinal icount = 0;
+    auto *col_list = &cols(rows(ib));
+    for (auto col_block : col_set) col_list[icount++] = col_block;
+  }
+
+  Ordinal annz = numBlocks * blockSize * blockSize;
+  bcrs_matrix_t_::values_type vals("values", annz);
+  for (Ordinal i = 0; i < annz; ++i) vals(i) = 0.0;
+
+  for ( Ordinal ir = 0; ir < mat_crs.numRows(); ++ir ) {
+    const auto iblock = ir / blockSize;
+    const auto ilocal = ir % blockSize;
+    Ordinal lda = blockSize * (rows[iblock + 1] - rows[iblock]);
+    for (Ordinal jk = mat_crs.graph.row_map(ir); jk < mat_crs.graph.row_map(ir+1); ++jk) {
+      const auto jc = mat_crs.graph.entries(jk);
+      const auto jblock = jc / blockSize;
+      const auto jlocal = jc % blockSize;
+      for (Ordinal jkb = rows[iblock]; jkb < rows[iblock + 1]; ++jkb) {
+        if (cols(jkb) == jblock) {
+          Ordinal shift = rows[iblock] * blockSize * blockSize
+              + blockSize * (jkb - rows[iblock]);
+          vals(shift + jlocal + ilocal * lda) = mat_crs.values(jk);
+          break;
+        }
+      }
+    }
+  }
+
+  bcrs_matrix_t_ bmat("newblock", nbrows, nbcols, annz, vals, rows, cols,
+                      blockSize);
+  return bmat;
+}
 
 /////////////////////////////////////////////////////
 
@@ -1303,106 +1305,55 @@ void compare(const char fOp[],
   }
 }
 
-template <typename mtx_t>
-void test_matrix(const mtx_t &myMatrix, const int blockSize, const int repeat) {
-  const Scalar alpha = details::SC_ONE;
-  const Scalar beta  = details::SC_ZERO;
-
-  auto const numRows = myMatrix.numRows();
-
-  //
-  // Use BlockCrsMatrix format
-  //
-  bcrs_matrix_t_ myBlockMatrix = to_block_crs_matrix(myMatrix, blockSize);
-
-  double error = 0.0, maxNorm = 0.0;
-
-  //
-  // Assess y <- A * x
-  //
-  compare("N", myMatrix, myBlockMatrix, alpha, beta, error, maxNorm);
-
-  std::cout << " Error BlockCrsMatrix " << error << " maxNorm " << maxNorm
-            << "\n";
-  std::cout << " ------------------------ \n";
-
-  //---
-
-  std::chrono::duration<double> dt_crs = measure("N", myMatrix, alpha, beta, repeat);
-
-  std::cout << " Total time for Crs Mat-Vec " << dt_crs.count() << " Avg. "
-            << dt_crs.count() / static_cast<double>(repeat);
-  std::cout << " Flops (mult only) "
-            << myMatrix.nnz() * static_cast<double>(repeat / dt_crs.count())
-            << "\n";
-  std::cout << " ------------------------ \n";
-
-  //---
-  std::chrono::duration<double> dt_bcrs =
-      measure_block("N", myBlockMatrix, alpha, beta, repeat);
-
-  std::cout << " Total time for BlockCrs Mat-Vec " << dt_bcrs.count()
-            << " Avg. " << dt_bcrs.count() / static_cast<double>(repeat);
-  std::cout << " Flops (mult only) "
-            << myMatrix.nnz() * static_cast<double>(repeat / dt_bcrs.count());
-  std::cout << "\n";
-  //
-  std::cout << " Ratio = " << dt_bcrs.count() / dt_crs.count();
+template<typename mtx_t = crs_matrix_t_, typename bmtx_t = bcrs_matrix_t_>
+class TestCase {
+public:
+  using time_t = std::chrono::duration<double>;
+
+  struct RunInfo {
+    // options
+    const char *mode = "N"; // N/T/C/H
+    const Scalar alpha = SC_ONE;
+    const Scalar beta = SC_ZERO;
+    // results
+    double error = 0.0;
+    double maxNorm = 0.0;
+    time_t dt_crs;
+    time_t dt_bcrs;
+  };
 
-  if (dt_bcrs.count() < dt_crs.count()) {
-    std::cout << " --- GOOD --- ";
-  } else {
-    std::cout << " ((( Not Faster ))) ";
+public:
+  TestCase(std::string name, crs_matrix_t_ myMatrix, const int blockSize, const int repeat = 1024):
+    name_(name),
+    myMatrix_(std::move(myMatrix)),
+    blockSize_(blockSize),
+    repeat_(repeat)
+  {
+    myBlockMatrix_ = to_block_crs_matrix(myMatrix_, blockSize_); // Use BlockCrsMatrix format
   }
-  std::cout << "\n";
-
-  std::cout << " ======================== \n";
 
   //
   // Assess y <- A * x
   //
-  compare("T", myMatrix, myBlockMatrix, alpha, beta, error, maxNorm);
-
-  std::cout << " Error Transpose BlockCrsMatrix " << error << " maxNorm " << maxNorm
-            << "\n";
-  std::cout << " ------------------------ \n";
-
-  //---
-
-  dt_crs = measure("T", myMatrix, alpha, beta, repeat);
-
-  std::cout << " Total time for Crs Mat^T-Vec " << dt_crs.count() << " Avg. "
-            << dt_crs.count() / static_cast<double>(repeat);
-  std::cout << " Flops (mult only) "
-            << myMatrix.nnz() * static_cast<double>(repeat / dt_crs.count())
-            << "\n";
-  std::cout << " ------------------------ \n";
-
-  //---
-  dt_bcrs = measure_block("T", myBlockMatrix, alpha, beta, repeat);
-
-  std::cout << " Total time for BlockCrs Mat^T-Vec " << dt_bcrs.count()
-            << " Avg. " << dt_bcrs.count() / static_cast<double>(repeat);
-  std::cout << " Flops (mult only) "
-            << myMatrix.nnz() * static_cast<double>(repeat / dt_bcrs.count());
-  std::cout << "\n";
-  //
-  std::cout << " Ratio = " << dt_bcrs.count() / dt_crs.count();
-
-  if (dt_bcrs.count() < dt_crs.count()) {
-    std::cout << " --- GOOD --- ";
-  } else {
-    std::cout << " ((( Not Faster ))) ";
+  bool execute(RunInfo &run)
+  {
+    compare(run.mode, myMatrix_, myBlockMatrix_, run.alpha, run.beta, run.error, run.maxNorm);
+    run.dt_crs = measure(run.mode, myMatrix_, run.alpha, run.beta, repeat_);
+    run.dt_bcrs = measure_block(run.mode, myBlockMatrix_, run.alpha, run.beta, repeat_);
+    return true;
   }
-  std::cout << "\n";
 
-  std::cout << " ======================== \n";
+// private:
+  mtx_t myMatrix_;
+  bmtx_t myBlockMatrix_; // derived
+  int blockSize_;
+  int repeat_;
+  std::string name_;
+};
 
-}
-
-int test_random(const int repeat = 1024, const int minBlockSize = 1,
+template<typename test_t>
+void test_random(std::vector<test_t> &samples, const int repeat = 1024, const int minBlockSize = 1,
                 const int maxBlockSize = 12) {
-  int return_value = 0;
 
   // The mat_structure view is used to generate a matrix using
   // finite difference (FD) or finite element (FE) discretization
@@ -1426,23 +1377,18 @@ int test_random(const int repeat = 1024, const int minBlockSize = 1,
     std::vector<int> mat_rowmap, mat_colidx;
     std::vector<double> mat_val;
 
-    crs_matrix_t_ myMatrix = details::generate_crs_matrix(
-        "FD", mat_structure, blockSize, mat_rowmap, mat_colidx, mat_val);
-
-    std::cout << " ======================== \n";
-    std::cout << " Block Size " << blockSize;
-    std::cout << " Matrix Size " << myMatrix.numRows() << " nnz "
-              << myMatrix.nnz() << "\n";
-
-    test_matrix(myMatrix, blockSize, repeat);
+    samples.push_back({
+      "rand-" + std::to_string(blockSize),
+      generate_crs_matrix(
+        "FD", mat_structure, blockSize, mat_rowmap, mat_colidx, mat_val),
+      blockSize,
+      repeat
+    });
   }
-  return return_value;
 }
 
-int test_samples(const int repeat = 3000) {
-  int return_value = 0;
-
-  srand(17312837);
+template<typename test_t>
+void test_samples(std::vector<test_t> &samples, const int repeat = 3000) {
 
   const std::vector<std::tuple<const char *, int> > SAMPLES{
       // std::tuple(char* fileName, int blockSize)
@@ -1474,36 +1420,81 @@ int test_samples(const int repeat = 3000) {
   };
 
   // Loop over sample matrix files
-  std::for_each(SAMPLES.begin(), SAMPLES.end(), [=](auto const &sample) {
+  std::for_each(SAMPLES.begin(), SAMPLES.end(), [&](auto const &sample) {
     const char *fileName = std::get<0>(sample);
-    const int blockSize  = std::get<1>(sample);
-    auto myMatrix =
-        KokkosKernels::Impl::read_kokkos_crst_matrix<crs_matrix_t_>(fileName);
-
-    std::cout << " ======================== \n";
-    std::cout << " Sample: '" << fileName << "', Block Size " << blockSize;
-    std::cout << " Matrix Size " << myMatrix.numCols() << " x "
-              << myMatrix.numRows() << ", nnz " << myMatrix.nnz() << "\n";
-
-    test_matrix(myMatrix, blockSize, repeat);
+    samples.push_back({
+      std::string(fileName),
+      KokkosKernels::Impl::read_kokkos_crst_matrix<crs_matrix_t_>(fileName),
+      std::get<1>(sample),
+      repeat
+    });
   });
-  return return_value;
 }
 
-}  // namespace details
-
-#define TEST_RANDOM_BSPMV
+} // namespace test
 
 int main() {
+  const char* const sep = "\t"; // CSV separator
   Kokkos::initialize();
+  bool failed = false;
+  srand(17312837);
 
-#ifdef TEST_RANDOM_BSPMV
-  int return_value = details::test_random();
-#else
-  int return_value = details::test_samples();
-#endif
+  {
+    // Prepare samples
+    using test_t = test::TestCase<crs_matrix_t_, bcrs_matrix_t_>;
+    std::vector<test_t> samples;
+    test::test_random(samples);
+    test::test_samples(samples);
+
+    // Prepare variants
+    std::vector<test_t::RunInfo> variants;
+    variants.push_back({"N"});
+    variants.push_back({"T"});
+
+    // Run samples
+    int sample_id = 0;
+    std::cout << "no." << sep << "name" << sep << "size" << sep << "block" << sep << "nnz" // sample info
+              << sep << "mode" << sep << "alpha" << sep << "beta" // run info
+              << sep << "error" << sep << "maxNorm"
+              << sep << "crsTime" << sep << "crsAvg" << sep << "crsGFlops"
+              << sep << "bcrsTime" << sep << "bcrsAvg" << sep << "bcrsGFlops"
+              << sep << "ratio" << sep << "remarks"
+              << std::endl;
+    std::for_each(samples.begin(), samples.end(), [&](test_t &test) {
+      bool first = true;
+      sample_id += 1;
+      int variant_id = 0;
+      std::for_each(variants.begin(), variants.end(), [&](test_t::RunInfo &run) {
+
+        // Show run info
+        ++variant_id;
+        std::cout << sample_id << ":" << variant_id << "/" << samples.size() << sep;
+        if (first) {
+          std::cout << sep << test.name_ << sep << test.myMatrix_.numRows() << sep << test.blockSize_ << sep << test.myMatrix_.nnz();
+          first = false;
+        } else
+          std::cout << sep << "^" << sep << "^" << sep << "^" << sep << "^";
+        std::cout << sep << run.mode << sep << run.alpha << sep << run.beta;
+
+        // Execute
+        bool pass = test.execute(run);
+        failed = failed || !pass;
+
+        // Show results
+        std::cout << sep << run.error << sep << run.maxNorm;
+        auto const avg = (run.dt_crs.count() / static_cast<double>(test.repeat_));
+        auto const flops = test.myMatrix_.nnz() * static_cast<double>(test.repeat_ / run.dt_crs.count());
+        std::cout << sep << run.dt_crs.count() << sep << avg << sep << (flops * 1e-9);
+        auto const avg2 = (run.dt_bcrs.count() / static_cast<double>(test.repeat_));
+        auto const flops2 = test.myMatrix_.nnz() * static_cast<double>(test.repeat_ / run.dt_bcrs.count());
+        std::cout << sep << run.dt_bcrs.count() << sep << avg2 << sep << (flops2 * 1e-9);
+        auto const remarks = (run.dt_bcrs.count() < run.dt_crs.count()) ? "good" : "NOT_faster";
+        std::cout << sep << (run.dt_bcrs.count() / run.dt_crs.count()) << sep << remarks;
+        std::cout << std::endl;
+      });
+    });
+  }
 
   Kokkos::finalize();
-
-  return return_value;
+  return failed ? 1 : 0;
 }