diff --git a/packages/tpetra/core/src/Tpetra_Experimental_BlockMultiVector_def.hpp b/packages/tpetra/core/src/Tpetra_Experimental_BlockMultiVector_def.hpp
index 10beeb513b8b..92d154ad89fb 100644
--- a/packages/tpetra/core/src/Tpetra_Experimental_BlockMultiVector_def.hpp
+++ b/packages/tpetra/core/src/Tpetra_Experimental_BlockMultiVector_def.hpp
@@ -424,6 +424,18 @@
 BlockMultiVector<Scalar, LO, GO, Node>::
 getLocalBlock (const LO localRowIndex, const LO colIndex) const
 {
+  // NOTE (mfh 07 Jul 2016) It should be correct to add the
+  // commented-out test below.  However, I've conservatively commented
+  // it out, since users might not realize that they need to have
+  // things sync'd correctly.
+
+// #ifdef HAVE_TPETRA_DEBUG
+//   TEUCHOS_TEST_FOR_EXCEPTION
+//     (mv_.template need_sync<Kokkos::HostSpace> (), std::runtime_error,
+//      "Tpetra::Experimental::BlockMultiVector::getLocalBlock: This method "
+//      "accesses host data, but the object is not in sync on host." );
+// #endif // HAVE_TPETRA_DEBUG
+
   if (! isValidLocalMeshIndex (localRowIndex)) {
     return typename little_vec_type::HostMirror ();
   } else {
@@ -699,8 +711,113 @@ createBlockWiseMultiply (const int block_size, const Scalar& alpha,
                          const ViewY& Y, const ViewD& D, const ViewX& X) {
   return BlockWiseMultiply<ViewY, Scalar, ViewD, ViewX> (block_size, alpha, Y, D, X);
 }
+
+template<class ViewY,
+         class Scalar,
+         class ViewD,
+         class ViewZ,
+         class LO = typename ViewD::size_type>
+class BlockJacobiUpdate {
+private:
+  typedef typename ViewD::device_type Device;
+  typedef typename ViewD::non_const_value_type ImplScalar;
+  typedef Kokkos::MemoryTraits<Kokkos::Unmanaged> Unmanaged;
+
+  template<class ViewType>
+  using UnmanagedView = Kokkos::View<typename ViewType::data_type,
+                                     typename ViewType::array_layout,
+                                     typename ViewType::device_type,
+                                     Unmanaged>;
+  typedef UnmanagedView<ViewY> UnMViewY;
+  typedef UnmanagedView<ViewD> UnMViewD;
+  typedef UnmanagedView<ViewZ> UnMViewZ;
+
+  const LO blockSize_;
+  UnMViewY Y_;
+  const Scalar alpha_;
+  UnMViewD D_;
+  UnMViewZ Z_;
+  const Scalar beta_;
+
+public:
+  BlockJacobiUpdate (const ViewY& Y,
+                     const Scalar& alpha,
+                     const ViewD& D,
+                     const ViewZ& Z,
+                     const Scalar& beta) :
+    blockSize_ (D.dimension_1 ()),
+    // numVecs_ (static_cast<int> (ViewY::rank) == 1 ? static_cast<LO> (1) : static_cast<LO> (Y_.dimension_1 ())),
+    Y_ (Y),
+    alpha_ (alpha),
+    D_ (D),
+    Z_ (Z),
+    beta_ (beta)
+  {
+    static_assert (static_cast<int> (ViewY::rank) == 1,
+                   "Y must have rank 1.");
+    static_assert (static_cast<int> (ViewD::rank) == 3, "D must have rank 3.");
+    static_assert (static_cast<int> (ViewZ::rank) == 1,
+                   "Z must have rank 1.");
+    // static_assert (static_cast<int> (ViewZ::rank) ==
+    //                static_cast<int> (ViewY::rank),
+    //                "Z must have the same rank as Y.");
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const LO& k) const
+  {
+    using Kokkos::ALL;
+    using Kokkos::subview;
+    typedef Kokkos::pair<LO, LO> range_type;
+    typedef Kokkos::Details::ArithTraits<Scalar> KAT;
+
+    // We only have to implement the alpha != 0 case.
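+    //
+    // Each parallel iteration k handles one mesh row: rows
+    // [k*blockSize_, (k+1)*blockSize_) of the flat vectors Y_ and Z_,
+    // together with D_(k,:,:), the k-th blockSize_ x blockSize_ block
+    // of D_.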
+
+    auto D_curBlk = subview (D_, k, ALL (), ALL ());
+    const range_type kslice (k*blockSize_, (k+1)*blockSize_);
+
+    // Z.update (STS::one (), X, -STS::one ()); // assume this is done
+
+    auto Z_curBlk = subview (Z_, kslice);
+    auto Y_curBlk = subview (Y_, kslice);
+    // Y_curBlk := beta * Y_curBlk + alpha * D_curBlk * Z_curBlk
+    if (beta_ == KAT::zero ()) {
+      Tpetra::Experimental::FILL (Y_curBlk, KAT::zero ());
+    }
+    else if (beta_ != KAT::one ()) {
+      Tpetra::Experimental::SCAL (beta_, Y_curBlk);
+    }
+    Tpetra::Experimental::GEMV (alpha_, D_curBlk, Z_curBlk, Y_curBlk);
+  }
+};
+
+template<class ViewY,
+         class Scalar,
+         class ViewD,
+         class ViewZ,
+         class LO = typename ViewD::size_type>
+void
+blockJacobiUpdate (const ViewY& Y,
+                   const Scalar& alpha,
+                   const ViewD& D,
+                   const ViewZ& Z,
+                   const Scalar& beta)
+{
+  static_assert (Kokkos::Impl::is_view<ViewY>::value, "Y must be a Kokkos::View.");
+  static_assert (Kokkos::Impl::is_view<ViewD>::value, "D must be a Kokkos::View.");
+  static_assert (Kokkos::Impl::is_view<ViewZ>::value, "Z must be a Kokkos::View.");
+  static_assert (static_cast<int> (ViewY::rank) == static_cast<int> (ViewZ::rank),
+                 "Y and Z must have the same rank.");
+  static_assert (static_cast<int> (ViewD::rank) == 3, "D must have rank 3.");
+
+  const auto lclNumMeshRows = D.dimension_0 ();
+  BlockJacobiUpdate<ViewY, Scalar, ViewD, ViewZ, LO> functor (Y, alpha, D, Z, beta);
+  Kokkos::RangePolicy<typename ViewY::execution_space, LO> range (0, lclNumMeshRows);
+  Kokkos::parallel_for (range, functor);
 }
 
+} // namespace Impl
+
 template<class Scalar, class LO, class GO, class Node>
 void BlockMultiVector<Scalar, LO, GO, Node>::
 blockWiseMultiply (const Scalar& alpha,
@@ -709,18 +826,27 @@ blockWiseMultiply (const Scalar& alpha,
                    const BlockMultiVector<Scalar, LO, GO, Node>& X)
 {
   using Kokkos::ALL;
+  typedef typename device_type::execution_space execution_space;
+  typedef typename device_type::memory_space memory_space;
   const LO lclNumMeshRows = meshMap_.getNodeNumElements ();
 
   if (alpha == STS::zero ()) {
     this->putScalar (STS::zero ());
   }
   else { // alpha != 0
-    auto bwm = Impl::createBlockWiseMultiply (
-      getBlockSize(), alpha,
-      this->getMultiVectorView().template getLocalView<device_type>(),
-      D,
-      const_cast<BlockMultiVector<Scalar, LO, GO, Node>&>(X).getMultiVectorView().template getLocalView<device_type>());
-    Kokkos::parallel_for (lclNumMeshRows, bwm);
+    const LO blockSize = this->getBlockSize ();
+    const impl_scalar_type alphaImpl = static_cast<impl_scalar_type> (alpha);
+    auto X_lcl = X.mv_.template getLocalView<memory_space> ();
+    auto Y_lcl = this->mv_.template getLocalView<memory_space> ();
+    auto bwm = Impl::createBlockWiseMultiply (blockSize, alphaImpl, Y_lcl, D, X_lcl);
+
+    // Use an explicit RangePolicy with the desired execution space.
+    // Otherwise, if you just use a number, it runs on the default
+    // execution space.  For example, if the default execution space
+    // is Cuda but the current execution space is Serial, using just a
+    // number would incorrectly run with Cuda.
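+    // Here execution_space is device_type::execution_space and LO is
+    // this BlockMultiVector's local ordinal type; bwm does the work
+    // for one mesh row per index in the range.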
+    Kokkos::RangePolicy<execution_space, LO> range (0, lclNumMeshRows);
+    Kokkos::parallel_for (range, bwm);
   }
 }
 
@@ -734,29 +860,31 @@ blockJacobiUpdate (const Scalar& alpha,
                    const Scalar& beta)
 {
   using Kokkos::ALL;
-  const impl_scalar_type zero = static_cast<impl_scalar_type> (STS::zero ());
+  using Kokkos::subview;
+  typedef typename device_type::memory_space memory_space;
+  typedef impl_scalar_type IST;
+
+  const IST zero = Kokkos::Details::ArithTraits<IST>::zero ();
+  const IST one = Kokkos::Details::ArithTraits<IST>::one ();
+  const IST alphaImpl = static_cast<IST> (alpha);
+  const IST betaImpl = static_cast<IST> (beta);
   const LO lclNumMeshRows = meshMap_.getNodeNumElements ();
   const LO numVecs = mv_.getNumVectors ();
 
+  auto X_lcl = X.mv_.template getLocalView<memory_space> ();
+  auto Y_lcl = this->mv_.template getLocalView<memory_space> ();
+  auto Z_lcl = Z.mv_.template getLocalView<memory_space> ();
+
   if (alpha == STS::zero ()) { // Y := beta * Y
     this->scale (beta);
   }
   else { // alpha != 0
     Z.update (STS::one (), X, -STS::one ());
-    for (LO i = 0; i < numVecs; ++i) {
-      for (LO k = 0; k < lclNumMeshRows; ++k) {
-        auto D_curBlk = Kokkos::subview (D, k, ALL (), ALL ());
-        auto Z_curBlk = Z.getLocalBlock (k, i);
-        auto Y_curBlk = this->getLocalBlock (k, i);
-        // Y_curBlk := beta * Y_curBlk + alpha * D_curBlk * Z_curBlk
-        if (beta == STS::zero ()) {
-          Tpetra::Experimental::FILL (Y_curBlk, zero);
-        }
-        else if (beta != STS::one ()) {
-          Tpetra::Experimental::SCAL (beta, Y_curBlk);
-        }
-        Tpetra::Experimental::GEMV (alpha, D_curBlk, Z_curBlk, Y_curBlk);
-      }
+    for (LO j = 0; j < numVecs; ++j) {
+      auto X_lcl_j = subview (X_lcl, ALL (), j);
+      auto Y_lcl_j = subview (Y_lcl, ALL (), j);
+      auto Z_lcl_j = subview (Z_lcl, ALL (), j);
+      Impl::blockJacobiUpdate (Y_lcl_j, alphaImpl, D, Z_lcl_j, betaImpl);
     }
   }
 }
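
For reference, below is a minimal standalone sketch of the update that Impl::blockJacobiUpdate parallelizes over mesh rows: for each block row k, Y[k*b : (k+1)*b] := beta * Y[k*b : (k+1)*b] + alpha * D(k,:,:) * Z[k*b : (k+1)*b], where b is the block size. It is not Tpetra code and not part of this patch; it uses plain Kokkos views and nested loops in place of Tpetra's FILL/SCAL/GEMV helpers, and the function name simpleBlockJacobiUpdate and the toy dimensions are invented for illustration.

// Standalone sketch of the per-block Jacobi update, assuming plain Kokkos.
#include <Kokkos_Core.hpp>

template<class ViewY, class ViewD, class ViewZ, class Scalar>
void simpleBlockJacobiUpdate (const ViewY& Y, const Scalar alpha,
                              const ViewD& D, const ViewZ& Z,
                              const Scalar beta)
{
  using LO = int;
  const LO numBlocks = static_cast<LO> (D.extent (0));
  const LO b = static_cast<LO> (D.extent (1)); // block size

  Kokkos::RangePolicy<typename ViewY::execution_space, LO> range (0, numBlocks);
  Kokkos::parallel_for ("simpleBlockJacobiUpdate", range,
    KOKKOS_LAMBDA (const LO k) {
      // Y[k*b + i] := beta * Y[k*b + i] + alpha * sum_j D(k,i,j) * Z[k*b + j]
      for (LO i = 0; i < b; ++i) {
        Scalar acc = 0.0;
        for (LO j = 0; j < b; ++j) {
          acc += D(k, i, j) * Z(k*b + j);
        }
        Y(k*b + i) = beta * Y(k*b + i) + alpha * acc;
      }
    });
}

int main (int argc, char* argv[])
{
  Kokkos::initialize (argc, argv);
  {
    const int numBlocks = 4, b = 3;
    Kokkos::View<double*>   Y ("Y", numBlocks * b);
    Kokkos::View<double***> D ("D", numBlocks, b, b);
    Kokkos::View<double*>   Z ("Z", numBlocks * b);

    Kokkos::deep_copy (Y, 2.0);
    Kokkos::deep_copy (Z, 1.0);
    // Make each D(k,:,:) an identity block.
    Kokkos::parallel_for ("fillD", numBlocks, KOKKOS_LAMBDA (const int k) {
      for (int i = 0; i < b; ++i) {
        D(k, i, i) = 1.0;
      }
    });

    // Y := 0.5 * Y + 1.0 * D * Z, so every entry of Y becomes 2.0.
    simpleBlockJacobiUpdate (Y, 1.0, D, Z, 0.5);
  }
  Kokkos::finalize ();
  return 0;
}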