From 78dac8e419a452faa24f803468b927f62f2cdcdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miko=C5=82aj=20Zuzek?= Date: Mon, 14 Jun 2021 15:01:51 +0200 Subject: [PATCH] #9 Tests refactored + CSV output --- .../wiki/sparse/KokkosSparse_nga_bspmv.cpp | 575 +++++++++--------- 1 file changed, 283 insertions(+), 292 deletions(-) diff --git a/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp b/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp index eea39ad6f7..d7edce89f4 100644 --- a/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp +++ b/example/wiki/sparse/KokkosSparse_nga_bspmv.cpp @@ -38,172 +38,6 @@ using MultiVector_Internal = typename Kokkos::View< Scalar**, Layout, device_typ namespace details { -const Scalar SC_ONE = Kokkos::ArithTraits::one(); -const Scalar SC_ZERO = Kokkos::ArithTraits::zero(); - -/// \brief Generate a CrsMatrix object for a matrix -/// "with multiple DOFs per node" -/// -/// \tparam mat_structure -/// \param stencil -/// \param structure -/// \param blockSize -/// \param mat_rowmap -/// \param mat_colidx -/// \param mat_val -/// \return -template -crs_matrix_t_ generate_crs_matrix(const std::string stencil, - const mat_structure &structure, - const int blockSize, - std::vector &mat_rowmap, - std::vector &mat_colidx, - std::vector &mat_val) { - crs_matrix_t_ mat_b1 = - Test::generate_structured_matrix2D(stencil, structure); - - if (blockSize == 1) return mat_b1; - - // - // Fill blocks with random values - // - - int nRow = blockSize * mat_b1.numRows(); - int nCol = blockSize * mat_b1.numCols(); - size_t nnz = blockSize * blockSize * mat_b1.nnz(); - - mat_val.resize(nnz); - Scalar *val_ptr = &mat_val[0]; - - for (size_t ii = 0; ii < nnz; ++ii) - val_ptr[ii] = - static_cast(std::rand() / (RAND_MAX + static_cast(1))); - - mat_rowmap.resize(nRow + 1); - int *rowmap = &mat_rowmap[0]; - rowmap[0] = 0; - - mat_colidx.resize(nnz); - int *cols = &mat_colidx[0]; - - for (int ir = 0; ir < mat_b1.numRows(); ++ir) { - auto mat_b1_row = mat_b1.rowConst(ir); - for (int ib = 0; ib < blockSize; ++ib) { - int my_row = ir * blockSize + ib; - rowmap[my_row + 1] = rowmap[my_row] + mat_b1_row.length * blockSize; - for (int ijk = 0; ijk < mat_b1_row.length; ++ijk) { - int col0 = mat_b1_row.colidx(ijk); - for (int jb = 0; jb < blockSize; ++jb) { - cols[rowmap[my_row] + ijk * blockSize + jb] = col0 * blockSize + jb; - } - } // for (int ijk = 0; ijk < mat_row.length; ++ijk) - } - } // for (int ir = 0; ir < mat_b1.numRows(); ++ir) - - return crs_matrix_t_("new_crs_matr", nRow, nCol, nnz, val_ptr, rowmap, cols); -} - -/// \brief Convert a CrsMatrix object to a BlockCrsMatrix object -/// -/// \param mat_crs -/// \param blockSize -/// \return -/// -/// \note We assume that each block has a constant block size -/// (in both directions, i.e. row and column) -/// \note The numerical values for each individual block are stored -/// contiguously in a (blockSize * blockSize) space -/// in a row-major fashion. -/// (2021/06/08) This storage is different from the BlockCrsMatrix constructor -/// ``` BlockCrsMatrix (const KokkosSparse::CrsMatrix &crs_mtx, -/// const OrdinalType blockDimIn) ``` -bcrs_matrix_t_ to_block_crs_matrix(const crs_matrix_t_ &mat_crs, - const int blockSize) { - if (blockSize == 1) { - bcrs_matrix_t_ bmat(mat_crs, blockSize); - return bmat; - } - - if ((mat_crs.numRows() % blockSize > 0) || - (mat_crs.numCols() % blockSize > 0)) { - std::cerr - << "\n !!! Matrix Dimensions Do Not Match Block Structure !!! \n\n"; - exit(-123); - } - - // block_rows will accumulate the number of blocks per row - this is NOT the - // row_map with cum sum!! - Ordinal nbrows = mat_crs.numRows() / blockSize; - std::vector block_rows(nbrows, 0); - - Ordinal nbcols = mat_crs.numCols() / blockSize; - - Ordinal numBlocks = 0; - for (Ordinal i = 0; i < mat_crs.numRows(); i += blockSize) { - Ordinal current_blocks = 0; - for (Ordinal j = 0; j < blockSize; ++j) { - auto n_entries = mat_crs.graph.row_map(i + 1 + j) - - mat_crs.graph.row_map(i + j) + blockSize - 1; - current_blocks = std::max(current_blocks, n_entries / blockSize); - } - numBlocks += current_blocks; // cum sum - block_rows[i / blockSize] = current_blocks; // frequency counts - } - - Kokkos::View rows("new_row", - nbrows + 1); - rows(0) = 0; - for (Ordinal i = 0; i < nbrows; ++i) rows(i + 1) = rows(i) + block_rows[i]; - - Kokkos::View cols("new_col", - rows[nbrows]); - cols(0) = 0; - - for (Ordinal ib = 0; ib < nbrows; ++ib) { - auto ir_start = ib * blockSize; - auto ir_stop = (ib + 1) * blockSize; - std::set col_set; - for (Ordinal ir = ir_start; ir < ir_stop; ++ir) { - for (Ordinal jk = mat_crs.graph.row_map(ir); - jk < mat_crs.graph.row_map(ir + 1); ++jk) { - col_set.insert(mat_crs.graph.entries(jk) / blockSize); - } - } - assert(col_set.size() == block_rows[ib]); - Ordinal icount = 0; - auto *col_list = &cols(rows(ib)); - for (auto col_block : col_set) col_list[icount++] = col_block; - } - - Ordinal annz = numBlocks * blockSize * blockSize; - bcrs_matrix_t_::values_type vals("values", annz); - for (Ordinal i = 0; i < annz; ++i) vals(i) = 0.0; - - for ( Ordinal ir = 0; ir < mat_crs.numRows(); ++ir ) { - const auto iblock = ir / blockSize; - const auto ilocal = ir % blockSize; - Ordinal lda = blockSize * (rows[iblock + 1] - rows[iblock]); - for (Ordinal jk = mat_crs.graph.row_map(ir); jk < mat_crs.graph.row_map(ir+1); ++jk) { - const auto jc = mat_crs.graph.entries(jk); - const auto jblock = jc / blockSize; - const auto jlocal = jc % blockSize; - for (Ordinal jkb = rows[iblock]; jkb < rows[iblock + 1]; ++jkb) { - if (cols(jkb) == jblock) { - Ordinal shift = rows[iblock] * blockSize * blockSize - + blockSize * (jkb - rows[iblock]); - vals(shift + jlocal + ilocal * lda) = mat_crs.values(jk); - break; - } - } - } - } - - bcrs_matrix_t_ bmat("newblock", nbrows, nbcols, annz, vals, rows, cols, - blockSize); - return bmat; -} - // // spmv: version for GPU execution // @@ -1171,7 +1005,175 @@ void spmv(KokkosKernels::Experimental::Controls controls, } } +} // namespace details +namespace test { + +const Scalar SC_ONE = Kokkos::ArithTraits::one(); +const Scalar SC_ZERO = Kokkos::ArithTraits::zero(); + +/// \brief Generate a CrsMatrix object for a matrix +/// "with multiple DOFs per node" +/// +/// \tparam mat_structure +/// \param stencil +/// \param structure +/// \param blockSize +/// \param mat_rowmap +/// \param mat_colidx +/// \param mat_val +/// \return +template +crs_matrix_t_ generate_crs_matrix(const std::string stencil, + const mat_structure &structure, + const int blockSize, + std::vector &mat_rowmap, + std::vector &mat_colidx, + std::vector &mat_val) { + crs_matrix_t_ mat_b1 = + Test::generate_structured_matrix2D(stencil, structure); + + if (blockSize == 1) return mat_b1; + + // + // Fill blocks with random values + // + + int nRow = blockSize * mat_b1.numRows(); + int nCol = blockSize * mat_b1.numCols(); + size_t nnz = blockSize * blockSize * mat_b1.nnz(); + + mat_val.resize(nnz); + Scalar *val_ptr = &mat_val[0]; + + for (size_t ii = 0; ii < nnz; ++ii) + val_ptr[ii] = + static_cast(std::rand() / (RAND_MAX + static_cast(1))); + + mat_rowmap.resize(nRow + 1); + int *rowmap = &mat_rowmap[0]; + rowmap[0] = 0; + + mat_colidx.resize(nnz); + int *cols = &mat_colidx[0]; + + for (int ir = 0; ir < mat_b1.numRows(); ++ir) { + auto mat_b1_row = mat_b1.rowConst(ir); + for (int ib = 0; ib < blockSize; ++ib) { + int my_row = ir * blockSize + ib; + rowmap[my_row + 1] = rowmap[my_row] + mat_b1_row.length * blockSize; + for (int ijk = 0; ijk < mat_b1_row.length; ++ijk) { + int col0 = mat_b1_row.colidx(ijk); + for (int jb = 0; jb < blockSize; ++jb) { + cols[rowmap[my_row] + ijk * blockSize + jb] = col0 * blockSize + jb; + } + } // for (int ijk = 0; ijk < mat_row.length; ++ijk) + } + } // for (int ir = 0; ir < mat_b1.numRows(); ++ir) + + return crs_matrix_t_("new_crs_matr", nRow, nCol, nnz, val_ptr, rowmap, cols); +} + +/// \brief Convert a CrsMatrix object to a BlockCrsMatrix object +/// +/// \param mat_crs +/// \param blockSize +/// \return +/// +/// \note We assume that each block has a constant block size +/// (in both directions, i.e. row and column) +/// \note The numerical values for each individual block are stored +/// contiguously in a (blockSize * blockSize) space +/// in a row-major fashion. +/// (2021/06/08) This storage is different from the BlockCrsMatrix constructor +/// ``` BlockCrsMatrix (const KokkosSparse::CrsMatrix &crs_mtx, +/// const OrdinalType blockDimIn) ``` +bcrs_matrix_t_ to_block_crs_matrix(const crs_matrix_t_ &mat_crs, + const int blockSize) { + if (blockSize == 1) { + bcrs_matrix_t_ bmat(mat_crs, blockSize); + return bmat; + } + + if ((mat_crs.numRows() % blockSize > 0) || + (mat_crs.numCols() % blockSize > 0)) { + std::cerr + << "\n !!! Matrix Dimensions Do Not Match Block Structure !!! \n\n"; + exit(-123); + } + + // block_rows will accumulate the number of blocks per row - this is NOT the + // row_map with cum sum!! + Ordinal nbrows = mat_crs.numRows() / blockSize; + std::vector block_rows(nbrows, 0); + + Ordinal nbcols = mat_crs.numCols() / blockSize; + + Ordinal numBlocks = 0; + for (Ordinal i = 0; i < mat_crs.numRows(); i += blockSize) { + Ordinal current_blocks = 0; + for (Ordinal j = 0; j < blockSize; ++j) { + auto n_entries = mat_crs.graph.row_map(i + 1 + j) - + mat_crs.graph.row_map(i + j) + blockSize - 1; + current_blocks = std::max(current_blocks, n_entries / blockSize); + } + numBlocks += current_blocks; // cum sum + block_rows[i / blockSize] = current_blocks; // frequency counts + } + + Kokkos::View rows("new_row", + nbrows + 1); + rows(0) = 0; + for (Ordinal i = 0; i < nbrows; ++i) rows(i + 1) = rows(i) + block_rows[i]; + + Kokkos::View cols("new_col", + rows[nbrows]); + cols(0) = 0; + + for (Ordinal ib = 0; ib < nbrows; ++ib) { + auto ir_start = ib * blockSize; + auto ir_stop = (ib + 1) * blockSize; + std::set col_set; + for (Ordinal ir = ir_start; ir < ir_stop; ++ir) { + for (Ordinal jk = mat_crs.graph.row_map(ir); + jk < mat_crs.graph.row_map(ir + 1); ++jk) { + col_set.insert(mat_crs.graph.entries(jk) / blockSize); + } + } + assert(col_set.size() == block_rows[ib]); + Ordinal icount = 0; + auto *col_list = &cols(rows(ib)); + for (auto col_block : col_set) col_list[icount++] = col_block; + } + + Ordinal annz = numBlocks * blockSize * blockSize; + bcrs_matrix_t_::values_type vals("values", annz); + for (Ordinal i = 0; i < annz; ++i) vals(i) = 0.0; + + for ( Ordinal ir = 0; ir < mat_crs.numRows(); ++ir ) { + const auto iblock = ir / blockSize; + const auto ilocal = ir % blockSize; + Ordinal lda = blockSize * (rows[iblock + 1] - rows[iblock]); + for (Ordinal jk = mat_crs.graph.row_map(ir); jk < mat_crs.graph.row_map(ir+1); ++jk) { + const auto jc = mat_crs.graph.entries(jk); + const auto jblock = jc / blockSize; + const auto jlocal = jc % blockSize; + for (Ordinal jkb = rows[iblock]; jkb < rows[iblock + 1]; ++jkb) { + if (cols(jkb) == jblock) { + Ordinal shift = rows[iblock] * blockSize * blockSize + + blockSize * (jkb - rows[iblock]); + vals(shift + jlocal + ilocal * lda) = mat_crs.values(jk); + break; + } + } + } + } + + bcrs_matrix_t_ bmat("newblock", nbrows, nbcols, annz, vals, rows, cols, + blockSize); + return bmat; +} ///////////////////////////////////////////////////// @@ -1303,106 +1305,55 @@ void compare(const char fOp[], } } -template -void test_matrix(const mtx_t &myMatrix, const int blockSize, const int repeat) { - const Scalar alpha = details::SC_ONE; - const Scalar beta = details::SC_ZERO; - - auto const numRows = myMatrix.numRows(); - - // - // Use BlockCrsMatrix format - // - bcrs_matrix_t_ myBlockMatrix = to_block_crs_matrix(myMatrix, blockSize); - - double error = 0.0, maxNorm = 0.0; - - // - // Assess y <- A * x - // - compare("N", myMatrix, myBlockMatrix, alpha, beta, error, maxNorm); - - std::cout << " Error BlockCrsMatrix " << error << " maxNorm " << maxNorm - << "\n"; - std::cout << " ------------------------ \n"; - - //--- - - std::chrono::duration dt_crs = measure("N", myMatrix, alpha, beta, repeat); - - std::cout << " Total time for Crs Mat-Vec " << dt_crs.count() << " Avg. " - << dt_crs.count() / static_cast(repeat); - std::cout << " Flops (mult only) " - << myMatrix.nnz() * static_cast(repeat / dt_crs.count()) - << "\n"; - std::cout << " ------------------------ \n"; - - //--- - std::chrono::duration dt_bcrs = - measure_block("N", myBlockMatrix, alpha, beta, repeat); - - std::cout << " Total time for BlockCrs Mat-Vec " << dt_bcrs.count() - << " Avg. " << dt_bcrs.count() / static_cast(repeat); - std::cout << " Flops (mult only) " - << myMatrix.nnz() * static_cast(repeat / dt_bcrs.count()); - std::cout << "\n"; - // - std::cout << " Ratio = " << dt_bcrs.count() / dt_crs.count(); +template +class TestCase { +public: + using time_t = std::chrono::duration; + + struct RunInfo { + // options + const char *mode = "N"; // N/T/C/H + const Scalar alpha = SC_ONE; + const Scalar beta = SC_ZERO; + // results + double error = 0.0; + double maxNorm = 0.0; + time_t dt_crs; + time_t dt_bcrs; + }; - if (dt_bcrs.count() < dt_crs.count()) { - std::cout << " --- GOOD --- "; - } else { - std::cout << " ((( Not Faster ))) "; +public: + TestCase(std::string name, crs_matrix_t_ myMatrix, const int blockSize, const int repeat = 1024): + name_(name), + myMatrix_(std::move(myMatrix)), + blockSize_(blockSize), + repeat_(repeat) + { + myBlockMatrix_ = to_block_crs_matrix(myMatrix_, blockSize_); // Use BlockCrsMatrix format } - std::cout << "\n"; - - std::cout << " ======================== \n"; // // Assess y <- A * x // - compare("T", myMatrix, myBlockMatrix, alpha, beta, error, maxNorm); - - std::cout << " Error Transpose BlockCrsMatrix " << error << " maxNorm " << maxNorm - << "\n"; - std::cout << " ------------------------ \n"; - - //--- - - dt_crs = measure("T", myMatrix, alpha, beta, repeat); - - std::cout << " Total time for Crs Mat^T-Vec " << dt_crs.count() << " Avg. " - << dt_crs.count() / static_cast(repeat); - std::cout << " Flops (mult only) " - << myMatrix.nnz() * static_cast(repeat / dt_crs.count()) - << "\n"; - std::cout << " ------------------------ \n"; - - //--- - dt_bcrs = measure_block("T", myBlockMatrix, alpha, beta, repeat); - - std::cout << " Total time for BlockCrs Mat^T-Vec " << dt_bcrs.count() - << " Avg. " << dt_bcrs.count() / static_cast(repeat); - std::cout << " Flops (mult only) " - << myMatrix.nnz() * static_cast(repeat / dt_bcrs.count()); - std::cout << "\n"; - // - std::cout << " Ratio = " << dt_bcrs.count() / dt_crs.count(); - - if (dt_bcrs.count() < dt_crs.count()) { - std::cout << " --- GOOD --- "; - } else { - std::cout << " ((( Not Faster ))) "; + bool execute(RunInfo &run) + { + compare(run.mode, myMatrix_, myBlockMatrix_, run.alpha, run.beta, run.error, run.maxNorm); + run.dt_crs = measure(run.mode, myMatrix_, run.alpha, run.beta, repeat_); + run.dt_bcrs = measure_block(run.mode, myBlockMatrix_, run.alpha, run.beta, repeat_); + return true; } - std::cout << "\n"; - std::cout << " ======================== \n"; +// private: + mtx_t myMatrix_; + bmtx_t myBlockMatrix_; // derived + int blockSize_; + int repeat_; + std::string name_; +}; -} - -int test_random(const int repeat = 1024, const int minBlockSize = 1, +template +void test_random(std::vector &samples, const int repeat = 1024, const int minBlockSize = 1, const int maxBlockSize = 12) { - int return_value = 0; // The mat_structure view is used to generate a matrix using // finite difference (FD) or finite element (FE) discretization @@ -1426,23 +1377,18 @@ int test_random(const int repeat = 1024, const int minBlockSize = 1, std::vector mat_rowmap, mat_colidx; std::vector mat_val; - crs_matrix_t_ myMatrix = details::generate_crs_matrix( - "FD", mat_structure, blockSize, mat_rowmap, mat_colidx, mat_val); - - std::cout << " ======================== \n"; - std::cout << " Block Size " << blockSize; - std::cout << " Matrix Size " << myMatrix.numRows() << " nnz " - << myMatrix.nnz() << "\n"; - - test_matrix(myMatrix, blockSize, repeat); + samples.push_back({ + "rand-" + std::to_string(blockSize), + generate_crs_matrix( + "FD", mat_structure, blockSize, mat_rowmap, mat_colidx, mat_val), + blockSize, + repeat + }); } - return return_value; } -int test_samples(const int repeat = 3000) { - int return_value = 0; - - srand(17312837); +template +void test_samples(std::vector &samples, const int repeat = 3000) { const std::vector > SAMPLES{ // std::tuple(char* fileName, int blockSize) @@ -1474,36 +1420,81 @@ int test_samples(const int repeat = 3000) { }; // Loop over sample matrix files - std::for_each(SAMPLES.begin(), SAMPLES.end(), [=](auto const &sample) { + std::for_each(SAMPLES.begin(), SAMPLES.end(), [&](auto const &sample) { const char *fileName = std::get<0>(sample); - const int blockSize = std::get<1>(sample); - auto myMatrix = - KokkosKernels::Impl::read_kokkos_crst_matrix(fileName); - - std::cout << " ======================== \n"; - std::cout << " Sample: '" << fileName << "', Block Size " << blockSize; - std::cout << " Matrix Size " << myMatrix.numCols() << " x " - << myMatrix.numRows() << ", nnz " << myMatrix.nnz() << "\n"; - - test_matrix(myMatrix, blockSize, repeat); + samples.push_back({ + std::string(fileName), + KokkosKernels::Impl::read_kokkos_crst_matrix(fileName), + std::get<1>(sample), + repeat + }); }); - return return_value; } -} // namespace details - -#define TEST_RANDOM_BSPMV +} // namespace test int main() { + const char* const sep = "\t"; // CSV separator Kokkos::initialize(); + bool failed = false; + srand(17312837); -#ifdef TEST_RANDOM_BSPMV - int return_value = details::test_random(); -#else - int return_value = details::test_samples(); -#endif + { + // Prepare samples + using test_t = test::TestCase; + std::vector samples; + test::test_random(samples); + test::test_samples(samples); + + // Prepare variants + std::vector variants; + variants.push_back({"N"}); + variants.push_back({"T"}); + + // Run samples + int sample_id = 0; + std::cout << "no." << sep << "name" << sep << "size" << sep << "block" << sep << "nnz" // sample info + << sep << "mode" << sep << "alpha" << sep << "beta" // run info + << sep << "error" << sep << "maxNorm" + << sep << "crsTime" << sep << "crsAvg" << sep << "crsGFlops" + << sep << "bcrsTime" << sep << "bcrsAvg" << sep << "bcrsGFlops" + << sep << "ratio" << sep << "remarks" + << std::endl; + std::for_each(samples.begin(), samples.end(), [&](test_t &test) { + bool first = true; + sample_id += 1; + int variant_id = 0; + std::for_each(variants.begin(), variants.end(), [&](test_t::RunInfo &run) { + + // Show run info + ++variant_id; + std::cout << sample_id << ":" << variant_id << "/" << samples.size() << sep; + if (first) { + std::cout << sep << test.name_ << sep << test.myMatrix_.numRows() << sep << test.blockSize_ << sep << test.myMatrix_.nnz(); + first = false; + } else + std::cout << sep << "^" << sep << "^" << sep << "^" << sep << "^"; + std::cout << sep << run.mode << sep << run.alpha << sep << run.beta; + + // Execute + bool pass = test.execute(run); + failed = failed || !pass; + + // Show results + std::cout << sep << run.error << sep << run.maxNorm; + auto const avg = (run.dt_crs.count() / static_cast(test.repeat_)); + auto const flops = test.myMatrix_.nnz() * static_cast(test.repeat_ / run.dt_crs.count()); + std::cout << sep << run.dt_crs.count() << sep << avg << sep << (flops * 1e-9); + auto const avg2 = (run.dt_bcrs.count() / static_cast(test.repeat_)); + auto const flops2 = test.myMatrix_.nnz() * static_cast(test.repeat_ / run.dt_bcrs.count()); + std::cout << sep << run.dt_bcrs.count() << sep << avg2 << sep << (flops2 * 1e-9); + auto const remarks = (run.dt_bcrs.count() < run.dt_crs.count()) ? "good" : "NOT_faster"; + std::cout << sep << (run.dt_bcrs.count() / run.dt_crs.count()) << sep << remarks; + std::cout << std::endl; + }); + }); + } Kokkos::finalize(); - - return return_value; + return failed ? 1 : 0; }