From b82cfaa06702dd691ac10e3994ae54341c63990c Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 17 Feb 2016 13:08:16 -0800 Subject: [PATCH 3/3] Tpetra: Call MKL for single-vec (double,int) SpMV @trilinos/tpetra If the MKL TPL is enabled, TpetraKernels now has the option to call MKL for sparse matrix-vector multiply, if Scalar = double, LocalOrdinal = int, OffsetType = int, and the execution space is Kokkos::OpenMP. You may control this using an environment variable, TPETRA_USE_MKL_SPMV. If this is "1" (sans quotes), then TpetraKernels uses MKL for that combination of types. Furthermore, Tpetra::CrsGraph (and therefore Tpetra::CrsMatrix) now uses OffsetType = LocalOrdinal, instead of OffsetType = size_t. This means that Tpetra::CrsMatrix actually calls MKL for the Scalar = double and LocalOrdinal = int case. --- .../core/example/Lesson07-Kokkos-Fill/05_solve.cpp | 3 +- packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 3 +- packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp | 38 ++-- packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 23 ++- .../src/Tpetra_Experimental_BlockCrsMatrix_def.hpp | 6 +- .../src/impl/Kokkos_Sparse_V_impl_spmv_Cuda.cpp | 16 +- .../src/impl/Kokkos_Sparse_V_impl_spmv_OpenMP.cpp | 6 +- .../src/impl/Kokkos_Sparse_V_impl_spmv_Serial.cpp | 6 +- .../src/impl/Kokkos_Sparse_V_impl_spmv_Threads.cpp | 6 +- .../kernels/src/impl/Kokkos_Sparse_impl_spmv.hpp | 217 +++++++++++++++++---- 10 files changed, 238 insertions(+), 86 deletions(-) diff --git a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp index 14794d1..5e77a09 100644 --- a/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp +++ b/packages/tpetra/core/example/Lesson07-Kokkos-Fill/05_solve.cpp @@ -229,7 +229,8 @@ int main (int argc, char* argv[]) { // Use a parallel scan (prefix sum) over the array of row counts, to // compute the array of row offsets for the sparse graph. - Kokkos::View rowOffsets ("row offsets", numLclRows+1); + typedef Tpetra::CrsMatrix<>::local_graph_type::size_type offset_type; + Kokkos::View rowOffsets ("row offsets", numLclRows+1); Kokkos::parallel_scan (numLclRows+1, KOKKOS_LAMBDA (const LO lclRows, size_t& update, const bool final) { if (final) { diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 77be8b3..8efe0fa 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -299,7 +299,8 @@ namespace Tpetra { //! The type of the part of the sparse graph on each MPI process. typedef Kokkos::StaticCrsGraph local_graph_type; + execution_space, + LocalOrdinal> local_graph_type; //! DEPRECATED; use local_graph_type (above) instead. typedef local_graph_type LocalStaticCrsGraphType TPETRA_DEPRECATED; diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 5a6ad37..44dbb80 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -490,8 +490,9 @@ namespace Tpetra { TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( colMap.is_null (), std::runtime_error, ": The input column Map must be nonnull."); - TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( - k_local_graph_.numRows () != rowMap->getNodeNumElements (), + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (static_cast (k_local_graph_.numRows ()) != + static_cast (rowMap->getNodeNumElements ()), std::runtime_error, ": The input row Map and the input local graph need to have the same " "number of rows. The row Map claims " << rowMap->getNodeNumElements () @@ -560,7 +561,9 @@ namespace Tpetra { if (localRow < largestCol) { lowerTriangular_ = false; } - for (size_t i = d_ptrs(rlcid); i < d_ptrs(rlcid + 1); ++i) { + + typedef typename std::decay::type index_type; + for (index_type i = d_ptrs(rlcid); i < d_ptrs(rlcid + 1); ++i) { if (d_inds(i) == rlcid) { ++nodeNumDiags_; } @@ -2310,14 +2313,18 @@ namespace Tpetra { TEUCHOS_TEST_FOR_EXCEPTION( isGloballyIndexed () && k_rowPtrs_.dimension_0 () != 0 && - (static_cast (k_rowPtrs_.dimension_0 ()) != getNodeNumRows () + 1 || - k_rowPtrs_(getNodeNumRows ()) != static_cast (k_gblInds1D_.dimension_0 ())), + (static_cast (k_rowPtrs_.dimension_0 ()) != + static_cast (getNodeNumRows () + 1) || + static_cast (k_rowPtrs_(getNodeNumRows ())) != + static_cast (k_gblInds1D_.dimension_0 ())), std::logic_error, err ); TEUCHOS_TEST_FOR_EXCEPTION( isLocallyIndexed () && k_rowPtrs_.dimension_0 () != 0 && - (static_cast (k_rowPtrs_.dimension_0 ()) != getNodeNumRows () + 1 || - k_rowPtrs_(getNodeNumRows ()) != static_cast (k_lclInds1D_.dimension_0 ())), + (static_cast (k_rowPtrs_.dimension_0 ()) != + static_cast (getNodeNumRows () + 1) || + static_cast (k_rowPtrs_(getNodeNumRows ())) != + static_cast (k_lclInds1D_.dimension_0 ())), std::logic_error, err ); // If profile is dynamic and indices are allocated, then 2-D @@ -4014,8 +4021,8 @@ namespace Tpetra { << h_ptrs.dimension_0 () << " != (lclNumRows+1) = " << (lclNumRows+1) << "."); // FIXME (mfh 08 Aug 2014) This assumes UVM. - TEUCHOS_TEST_FOR_EXCEPTION( - k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_ptrs(lclNumRows)) != lclTotalNumEntries, std::logic_error, "Tpetra::CrsGraph::fillLocalGraph: In DynamicProfile branch, after " "packing k_ptrs, k_ptrs(lclNumRows = " << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number of entries on the calling " @@ -4065,7 +4072,8 @@ namespace Tpetra { // FIXME (mfh 08 Aug 2014) This relies on UVM. TEUCHOS_TEST_FOR_EXCEPTION( numOffsets != 0 && - k_lclInds1D_.dimension_0 () != k_rowPtrs_(numOffsets-1), + static_cast (k_lclInds1D_.dimension_0 ()) != + static_cast (k_rowPtrs_(numOffsets-1)), std::logic_error, "Tpetra::CrsGraph::fillLocalGraph: " "numOffsets = " << numOffsets << " != 0 and " "k_lclInds1D_.dimension_0() = " << k_lclInds1D_.dimension_0 () @@ -4091,8 +4099,9 @@ namespace Tpetra { if (k_rowPtrs_.dimension_0 () != 0) { const size_t numOffsets = static_cast (k_rowPtrs_.dimension_0 ()); - TEUCHOS_TEST_FOR_EXCEPTION( - k_rowPtrs_(numOffsets-1) != static_cast (k_lclInds1D_.dimension_0 ()), + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_rowPtrs_(numOffsets-1)) != + static_cast (k_lclInds1D_.dimension_0 ()), std::logic_error, "Tpetra::CrsGraph::fillLocalGraph: " "In StaticProfile branch, before allocating or packing, " "k_rowPtrs_(" << (numOffsets-1) << ") = " @@ -4135,8 +4144,9 @@ namespace Tpetra { "k_ptrs.dimension_0() = " << k_ptrs.dimension_0 () << " != " "lclNumRows+1 = " << (lclNumRows+1) << "."); // FIXME (mfh 08 Aug 2014) This assumes UVM. - TEUCHOS_TEST_FOR_EXCEPTION( - k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_ptrs(lclNumRows)) != + static_cast (lclTotalNumEntries), std::logic_error, "Tpetra::CrsGraph::fillLocalGraph: In StaticProfile unpacked " "branch, after filling k_ptrs, k_ptrs(lclNumRows=" << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number of entries on " diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 82200da..586e246 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -897,8 +897,9 @@ namespace Tpetra { " = " << h_ptrs.dimension_0 () << " != (lclNumRows+1) = " << (lclNumRows+1) << "."); // FIXME (mfh 08 Aug 2014) This assumes UVM. - TEUCHOS_TEST_FOR_EXCEPTION( - k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_ptrs(lclNumRows)) != + static_cast (lclTotalNumEntries), std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In DynamicProfile branch, " "after packing k_ptrs, k_ptrs(lclNumRows = " << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number of entries on the calling " @@ -966,7 +967,8 @@ namespace Tpetra { // FIXME (mfh 06 Aug 2014) This relies on UVM. TEUCHOS_TEST_FOR_EXCEPTION( numOffsets != 0 && - myGraph_->k_lclInds1D_.dimension_0 () != curRowOffsets(numOffsets - 1), + static_cast (myGraph_->k_lclInds1D_.dimension_0 ()) != + static_cast (curRowOffsets(numOffsets - 1)), std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: " "numOffsets = " << numOffsets << " != 0 and " "myGraph_->k_lclInds1D_.dimension_0() = " @@ -993,8 +995,9 @@ namespace Tpetra { if (curRowOffsets.dimension_0 () != 0) { const size_t numOffsets = static_cast (curRowOffsets.dimension_0 ()); - TEUCHOS_TEST_FOR_EXCEPTION( - curRowOffsets(numOffsets-1) != static_cast (k_values1D_.dimension_0 ()), + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (curRowOffsets(numOffsets-1)) != + static_cast (k_values1D_.dimension_0 ()), std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: " "In StaticProfile branch, before allocating or packing, " "curRowOffsets(" << (numOffsets-1) << ") = " @@ -1054,8 +1057,9 @@ namespace Tpetra { "k_ptrs.dimension_0() = " << k_ptrs.dimension_0 () << " != " "lclNumRows+1 = " << (lclNumRows+1) << "."); // FIXME (mfh 06 Aug 2014) This assumes UVM. - TEUCHOS_TEST_FOR_EXCEPTION( - k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_ptrs(lclNumRows)) != + static_cast (lclTotalNumEntries), std::logic_error, "Tpetra::CrsMatrix::fillLocalGraphAndMatrix: In StaticProfile " "unpacked-but-pack branch, after filling k_ptrs, k_ptrs(lclNumRows=" << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number " @@ -1342,8 +1346,9 @@ namespace Tpetra { " = " << h_ptrs.dimension_0 () << " != (lclNumRows+1) = " << (lclNumRows+1) << "."); // FIXME (mfh 08 Aug 2014) This assumes UVM. - TEUCHOS_TEST_FOR_EXCEPTION( - k_ptrs(lclNumRows) != lclTotalNumEntries, std::logic_error, + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (k_ptrs(lclNumRows)) != + static_cast (lclTotalNumEntries), std::logic_error, "Tpetra::CrsMatrix::fillLocalMatrix: In DynamicProfile branch, " "after packing k_ptrs, k_ptrs(lclNumRows = " << lclNumRows << ") = " << k_ptrs(lclNumRows) << " != total number of entries on the calling " diff --git a/packages/tpetra/core/src/Tpetra_Experimental_BlockCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_Experimental_BlockCrsMatrix_def.hpp index c77dae1..d28aa13 100644 --- a/packages/tpetra/core/src/Tpetra_Experimental_BlockCrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Experimental_BlockCrsMatrix_def.hpp @@ -1270,7 +1270,7 @@ namespace Experimental { BlockCrsMatrix:: getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const { - if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) { + if (absBlockOffset >= static_cast (ptr_[rowMeshMap_.getNodeNumElements ()])) { // An empty block signifies an error. We don't expect to see // this error in correct code, but it's helpful for avoiding // memory corruption in case there is a bug. @@ -1313,7 +1313,7 @@ namespace Experimental { BlockCrsMatrix:: getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const { - if (absBlockOffset >= ptr_[rowMeshMap_.getNodeNumElements ()]) { + if (absBlockOffset >= static_cast (ptr_[rowMeshMap_.getNodeNumElements ()])) { // An empty block signifies an error. We don't expect to see // this error in correct code, but it's helpful for avoiding // memory corruption in case there is a bug. @@ -1330,7 +1330,7 @@ namespace Experimental { BlockCrsMatrix:: getLocalBlock (const LO localRowInd, const LO localColInd) const { - const size_t absRowBlockOffset = ptr_[localRowInd]; + const size_t absRowBlockOffset = static_cast (ptr_[localRowInd]); const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd); diff --git a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Cuda.cpp b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Cuda.cpp index 7a2cc75..5dceb4d 100644 --- a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Cuda.cpp +++ b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Cuda.cpp @@ -46,19 +46,19 @@ namespace KokkosSparse { namespace Impl { -#ifdef KOKKOS_HAVE_CUDA +// #ifdef KOKKOS_HAVE_CUDA -KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) +// KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) +// KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) +// KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace ) -#endif // KOKKOS_HAVE_CUDA +// #endif // KOKKOS_HAVE_CUDA #ifdef KOKKOS_HAVE_CUDA -KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, size_t, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaUVMSpace ) #endif // KOKKOS_HAVE_CUDA diff --git a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_OpenMP.cpp b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_OpenMP.cpp index 1c1b7c8..f3762fc 100644 --- a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_OpenMP.cpp +++ b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_OpenMP.cpp @@ -48,9 +48,9 @@ namespace Impl { #ifdef KOKKOS_HAVE_OPENMP -KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, size_t, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, size_t, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, size_t, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, int, Kokkos::LayoutLeft, Kokkos::OpenMP, Kokkos::HostSpace ) /*#define KOKKOSSPARSE_IMPL_MV_EXEC_SPACE Kokkos::OpenMP #define KOKKOSSPARSE_IMPL_MV_MEM_SPACE Kokkos::HostSpace diff --git a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Serial.cpp b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Serial.cpp index 841ee7a..7481700 100644 --- a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Serial.cpp +++ b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Serial.cpp @@ -48,9 +48,9 @@ namespace Impl { #ifdef KOKKOS_HAVE_SERIAL -KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, size_t, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, size_t, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, size_t, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, int, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, int, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, int, Kokkos::LayoutLeft, Kokkos::Serial, Kokkos::HostSpace ) #endif // KOKKOS_HAVE_SERIAL diff --git a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Threads.cpp b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Threads.cpp index 6dd450e..6261167 100644 --- a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Threads.cpp +++ b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_V_impl_spmv_Threads.cpp @@ -48,9 +48,9 @@ namespace Impl { #ifdef KOKKOS_HAVE_PTHREAD -KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, size_t, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, size_t, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) -KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, size_t, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( int, int, int, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( long, int, int, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) +KOKKOSSPARSE_IMPL_SPMV_DEF( double, int, int, Kokkos::LayoutLeft, Kokkos::Threads, Kokkos::HostSpace ) #endif // KOKKOS_HAVE_PTHREAD diff --git a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_impl_spmv.hpp b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_impl_spmv.hpp index eed79a2..ccc1e37 100644 --- a/packages/tpetra/kernels/src/impl/Kokkos_Sparse_impl_spmv.hpp +++ b/packages/tpetra/kernels/src/impl/Kokkos_Sparse_impl_spmv.hpp @@ -55,6 +55,131 @@ #define KOKKOSSPARSE_ETI_ONLY #endif +#ifdef HAVE_TPETRAKERNELS_MKL +#include +#include // getenv +#include // strncmp +#endif // HAVE_TPETRAKERNELS_MKL + +namespace { // (anonymous) + + template + struct TryMklOneVec { + typedef KokkosSparse::CrsMatrix, + OffsetType> matrix_type; + typedef Kokkos::View > + x_vec_type; + typedef Kokkos::View > y_vec_type; + + // Return value: Whether we actually used MKL. + // If not, use our native implementation. + static bool + tryMkl (const char mode[], + const ValueType& alpha, + const matrix_type& A, + const x_vec_type& x, + const ValueType& beta, + const y_vec_type& y) + { + // Default is not to use MKL, because MKL only supports certain types. + return false; + } + }; + +#if defined(HAVE_TPETRAKERNELS_MKL) && defined(KOKKOS_HAVE_OPENMP) + template<> + struct TryMklOneVec > { + typedef double ValueType; + typedef MKL_INT IndexType; + typedef MKL_INT OffsetType; + typedef Kokkos::Device DeviceType; + + typedef KokkosSparse::CrsMatrix, + OffsetType> matrix_type; + typedef Kokkos::View > + x_vec_type; + typedef Kokkos::View > y_vec_type; + + static bool + tryMkl (const char mode[], + const ValueType& alpha, + const matrix_type& A, + const x_vec_type& x, + const ValueType& beta, + const y_vec_type& y) + { + // WARNING (17 Feb 2016) POSIX does not require getenv() to be + // reentrant. Thus, it might not be safe to call this function + // concurrently on multiple threads, even with different data on + // different threads. As a result, it's OK for us to make this + // function nonreentrant, so that we can avoid the overhead of + // calling getenv after the first call. + static bool notFirstCall = false; + static bool useMkl = false; + + if (! notFirstCall) { + const char envKey[] = "TPETRA_USE_MKL_SPMV"; + const char* const envVal = getenv (envKey); + if (envVal == NULL) { + useMkl = false; + } + else { + if (strncmp (envVal, "1", 1) || + strncmp (envVal, "ON", 2) || + strncmp (envVal, "YES", 3) || + strncmp (envVal, "TRUE", 4)) { + useMkl = true; + } + else { + useMkl = false; + } + } + notFirstCall = true; + } + + if (! useMkl) { + return false; + } + else { // useMkl + const char* mklMode = (mode[0] == 'H' || mode[0] == 'h') ? "C" : mode; + const MKL_INT numRows = A.numRows (); + // If numRows == 0, then the ptr array may be empty (instead of + // having one entry as it should). Don't try to access ptr[1] + // in that case. + if (numRows > 0) { + char matdesc[4]; + matdesc[0] = 'G'; + matdesc[1] = 'N'; // neither upper nor lower tri (mat-vec ignores this) + matdesc[2] = 'N'; // explicitly stored diagonal + matdesc[3] = 'C'; // zero-based (C style) indexing + + auto rowBeg = A.graph.row_map; + const std::pair range (1, A.graph.row_map.dimension_0 ()); + auto rowEnd = Kokkos::subview (A.graph.row_map, range); + const MKL_INT numCols = A.numCols (); + mkl_dcsrmv (mklMode, &numRows, &numCols, &alpha, matdesc, + A.values.ptr_on_device (), + A.graph.entries.ptr_on_device (), + rowBeg.ptr_on_device (), rowEnd.ptr_on_device (), + x.ptr_on_device (), &beta, y.ptr_on_device ()); + } + return true; // successfully used MKL + } + } + }; +#endif // HAVE_TPETRAKERNELS_MKL +} // namespace (anonymous) + + + namespace KokkosSparse { namespace Impl { @@ -534,19 +659,24 @@ struct SPMV spmv (const char mode[], const Scalar& alpha, const AMatrix& A, const XVector& x, const Scalar& beta, const YVector& y) { - if (alpha == Kokkos::Details::ArithTraits::zero ()) { - spmv_alpha (mode, alpha, A, x, beta, y); - return; - } - if (alpha == Kokkos::Details::ArithTraits::one ()) { - spmv_alpha (mode, alpha, A, x, beta, y); - return; - } - if (alpha == -Kokkos::Details::ArithTraits::one ()) { - spmv_alpha (mode, alpha, A, x, beta, y); - return; + typedef TryMklOneVec try_mkl_type; + const bool didMkl = try_mkl_type::tryMkl (mode, alpha, A, x, beta, y); + + if (! didMkl) { + if (alpha == Kokkos::Details::ArithTraits::zero ()) { + spmv_alpha (mode, alpha, A, x, beta, y); + return; + } + if (alpha == Kokkos::Details::ArithTraits::one ()) { + spmv_alpha (mode, alpha, A, x, beta, y); + return; + } + if (alpha == -Kokkos::Details::ArithTraits::one ()) { + spmv_alpha (mode, alpha, A, x, beta, y); + return; + } + spmv_alpha (mode, alpha, A, x, beta, y); } - spmv_alpha (mode, alpha, A, x, beta, y); } } #endif @@ -608,41 +738,41 @@ struct SPMV::zero ()) { \ - spmv_alpha (mode, alpha, A, x, beta, y); \ - return; \ - } \ - if (alpha == Kokkos::Details::ArithTraits::one ()) { \ - spmv_alpha (mode, alpha, A, x, beta, y); \ - return; \ - } \ - if (alpha == -Kokkos::Details::ArithTraits::one ()) { \ - spmv_alpha (mode, alpha, A, x, beta, y); \ - return; \ + typedef TryMklOneVec > try_mkl_type; \ + const bool didMkl = try_mkl_type::tryMkl (mode, alpha, A, x, beta, y); \ + \ + if (! didMkl) { \ + if (alpha == Kokkos::Details::ArithTraits::zero ()) { \ + spmv_alpha (mode, alpha, A, x, beta, y); \ + return; \ + } \ + if (alpha == Kokkos::Details::ArithTraits::one ()) { \ + spmv_alpha (mode, alpha, A, x, beta, y); \ + return; \ + } \ + if (alpha == -Kokkos::Details::ArithTraits::one ()) { \ + spmv_alpha (mode, alpha, A, x, beta, y); \ + return; \ + } \ + spmv_alpha (mode, alpha, A, x, beta, y); \ } \ - spmv_alpha (mode, alpha, A, x, beta, y); \ } -- 2.1.3