From 7fe526cb85f4923fa5c1f92cee4b001a09f705f3 Mon Sep 17 00:00:00 2001
From: Christian Glusa
Date: Tue, 22 Jun 2021 15:47:48 -0600
Subject: [PATCH] Tpetra residual: Add options for skipping of local copy and
 comm/comp overlap

---
 .../core/src/Tpetra_Details_Behavior.cpp |  23 +
 .../core/src/Tpetra_Details_Behavior.hpp |  12 +
 .../core/src/Tpetra_Details_residual.hpp | 588 +++++++++++++-----
 3 files changed, 465 insertions(+), 158 deletions(-)

diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
index bc34f1972585..6a12004ad642 100644
--- a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
+++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
@@ -549,6 +549,29 @@ bool Behavior::hierarchicalUnpack ()
                                                 defaultValue);
 }
 
+bool Behavior::skipCopyAndPermuteIfPossible ()
+{
+  constexpr char envVarName[] = "TPETRA_SKIP_COPY_AND_PERMUTE";
+  constexpr bool defaultValue(false);
+
+  static bool value_ = defaultValue;
+  static bool initialized_ = false;
+  return idempotentlyGetEnvironmentVariableAsBool
+    (value_, initialized_, envVarName, defaultValue);
+}
+
+bool Behavior::overlapCommunicationAndComputation ()
+{
+  constexpr char envVarName[] = "TPETRA_OVERLAP";
+  constexpr bool defaultValue(false);
+
+  static bool value_ = defaultValue;
+  static bool initialized_ = false;
+  return idempotentlyGetEnvironmentVariableAsBool
+    (value_, initialized_, envVarName, defaultValue);
+}
+
+
 } // namespace Details
 } // namespace Tpetra
 
diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
index a5094cb80c57..b2d5dd26c9c1 100644
--- a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
@@ -263,6 +263,12 @@ class Behavior {
   /// environment variable.
   static bool profilingRegionUseKokkosProfiling();
 
+  /// \brief Skip copyAndPermute if possible.
+  ///
+  /// This is disabled by default. You may control this at run time via the
+  /// TPETRA_SKIP_COPY_AND_PERMUTE environment variable.
+  static bool skipCopyAndPermuteIfPossible();
+
+  /// \brief Overlap communication and computation.
+  ///
+  /// This is disabled by default. You may control this at run time via the
+  /// TPETRA_OVERLAP environment variable.
+  static bool overlapCommunicationAndComputation();
+
 };
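Both getters follow Behavior's usual caching pattern: the environment variable is read once and the result is reused, so toggling it after the first query has no effect. A minimal usage sketch (the assert mirrors the one this patch adds to Tpetra::Details::residual below; running with TPETRA_SKIP_COPY_AND_PERMUTE=1 TPETRA_OVERLAP=1 assumes Tpetra's usual boolean environment-variable parsing):

    // Query the new run-time options; results are cached after the first call.
    const bool skip    = Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
    const bool overlap = Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
    // Overlapping comm/comp builds on the restricted-mode import that skips
    // copyAndPermute, so the second option requires the first.
    if (overlap)
      TEUCHOS_ASSERT(skip);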
diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
index beff28c317c3..7c532a030d82 100644
--- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
@@ -61,6 +61,218 @@ namespace Tpetra {
 namespace Details {
 
+/// \brief Functor for computing the residual
+///
+/// The template parameters decide what is actually computed.
+/// is_MV: whether the multivectors actually have more than one vector
+/// restrictedMode: when true, read on-rank RHS data out of X_domainmap_lcl instead of X_colmap_lcl
+/// skipOffRank: when true, only compute the local part of the residual
+template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
+struct LocalResidualFunctor {
+
+  using execution_space = typename AMatrix::execution_space;
+  using LO = typename AMatrix::non_const_ordinal_type;
+  using value_type = typename AMatrix::non_const_value_type;
+  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
+  using team_member = typename team_policy::member_type;
+  using ATV = Kokkos::ArithTraits<value_type>;
+
+  AMatrix A_lcl;
+  ConstMV X_colmap_lcl;
+  ConstMV B_lcl;
+  MV R_lcl;
+  int rows_per_team;
+  Offsets offsets;
+  ConstMV X_domainmap_lcl;
+
+
+  LocalResidualFunctor (const AMatrix& A_lcl_,
+                        const ConstMV& X_colmap_lcl_,
+                        const ConstMV& B_lcl_,
+                        const MV& R_lcl_,
+                        const int rows_per_team_,
+                        const Offsets& offsets_,
+                        const ConstMV& X_domainmap_lcl_) :
+    A_lcl(A_lcl_),
+    X_colmap_lcl(X_colmap_lcl_),
+    B_lcl(B_lcl_),
+    R_lcl(R_lcl_),
+    rows_per_team(rows_per_team_),
+    offsets(offsets_),
+    X_domainmap_lcl(X_domainmap_lcl_)
+  { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member& dev) const
+  {
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
+        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
+
+        if (lclRow >= A_lcl.numRows ()) {
+          return;
+        }
+
+        if (!is_MV) { // MultiVectors only have a single column
+
+          value_type A_x = ATV::zero ();
+
+          if (!restrictedMode) {
+            const auto A_row = A_lcl.rowConst(lclRow);
+            const LO row_length = static_cast<LO> (A_row.length);
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_row.value(iEntry);
+                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
+              }, A_x);
+
+          }
+          else {
+
+            const LO offRankOffset = offsets(lclRow);
+            const size_t start = A_lcl.graph.row_map(lclRow);
+            const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_lcl.values(iEntry);
+                const auto lclCol = A_lcl.graph.entries(iEntry);
+                if (iEntry < offRankOffset)
+                  lsum += A_val * X_domainmap_lcl(lclCol,0);
+                else if (!skipOffRank)
+                  lsum += A_val * X_colmap_lcl(lclCol,0);
+              }, A_x);
+          }
+
+          Kokkos::single(Kokkos::PerThread(dev),[&] () {
+              R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
+            });
+        }
+        else { // MultiVectors have more than one column
+
+          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
+
+          for(LO v=0; v<numVectors; v++) {
+
+            value_type A_x = ATV::zero ();
+
+            if (!restrictedMode) {
+
+              const auto A_row = A_lcl.rowConst(lclRow);
+              const LO row_length = static_cast<LO> (A_row.length);
+
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
+                  const auto A_val = A_row.value(iEntry);
+                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
+                }, A_x);
+            }
+            else {
+              const LO offRankOffset = offsets(lclRow);
+              const size_t start = A_lcl.graph.row_map(lclRow);
+              const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
+                  const auto A_val = A_lcl.values(iEntry);
+                  const auto lclCol = A_lcl.graph.entries(iEntry);
+                  if (iEntry < offRankOffset)
+                    lsum += A_val * X_domainmap_lcl(lclCol,v);
+                  else if (!skipOffRank)
+                    lsum += A_val * X_colmap_lcl(lclCol,v);
+                }, A_x);
+            }
+
+            Kokkos::single(Kokkos::PerThread(dev),[&] () {
+                R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
+              });
+
+          }//end for numVectors
+        }
+      });//end parallel_for TeamThreadRange
+  }
+};
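Because is_MV, restrictedMode, and skipOffRank are compile-time flags, each combination instantiates a separate, branch-free kernel. For orientation, these are the combinations instantiated further down (type arguments abbreviated; matrix_t, mv_t, const_mv_t, and offsets_t stand for the local_matrix_device_type, local_view_device_type, const_local_view_device_type, and offset_type aliases introduced in localResidual):

    // localResidual, no X_domainmap passed: read everything from X_colmap_lcl
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, false, false>
    // localResidual, restricted mode: on-rank entries come from X_domainmap_lcl
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, true, false>
    // localResidualWithCommCompOverlap: off-rank entries deferred to OffRankUpdateFunctor
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, true, true>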
+
+
+/// \brief Functor for computing R -= A_offRank*X_colmap
+template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
+struct OffRankUpdateFunctor {
+
+  using execution_space = typename AMatrix::execution_space;
+  using LO = typename AMatrix::non_const_ordinal_type;
+  using value_type = typename AMatrix::non_const_value_type;
+  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
+  using team_member = typename team_policy::member_type;
+  using ATV = Kokkos::ArithTraits<value_type>;
+
+  AMatrix A_lcl;
+  ConstMV X_colmap_lcl;
+  MV R_lcl;
+  int rows_per_team;
+  Offsets offsets;
+
+
+  OffRankUpdateFunctor (const AMatrix& A_lcl_,
+                        const ConstMV& X_colmap_lcl_,
+                        const MV& R_lcl_,
+                        const int rows_per_team_,
+                        const Offsets& offsets_) :
+    A_lcl(A_lcl_),
+    X_colmap_lcl(X_colmap_lcl_),
+    R_lcl(R_lcl_),
+    rows_per_team(rows_per_team_),
+    offsets(offsets_)
+  { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member& dev) const
+  {
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
+        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
+
+        if (lclRow >= A_lcl.numRows ()) {
+          return;
+        }
+
+        const LO offRankOffset = offsets(lclRow);
+        const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+        if (!is_MV) { // MultiVectors only have a single column
+
+          value_type A_x = ATV::zero ();
+
+          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
+              const auto A_val = A_lcl.values(iEntry);
+              const auto lclCol = A_lcl.graph.entries(iEntry);
+              lsum += A_val * X_colmap_lcl(lclCol,0);
+            }, A_x);
+
+          Kokkos::single(Kokkos::PerThread(dev),[&] () {
+              R_lcl(lclRow,0) -= A_x;
+            });
+        }
+        else { // MultiVectors have more than one column
+
+          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
+
+          for(LO v=0; v<numVectors; v++) {
+
+            value_type A_x = ATV::zero ();
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_lcl.values(iEntry);
+                const auto lclCol = A_lcl.graph.entries(iEntry);
+                lsum += A_val * X_colmap_lcl(lclCol,v);
+              }, A_x);
+
+            Kokkos::single(Kokkos::PerThread(dev),[&] () {
+                R_lcl(lclRow,v) -= A_x;
+              });
+
+          }//end for numVectors
+        }
+      });//end parallel_for TeamThreadRange
+  }
+};
+
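Both functors rely on the same layout assumption: within each row of the local matrix, entries whose columns are on-rank (present in the domain map) precede the off-rank entries, and offsets(lclRow) records the split point:

    row_map(lclRow)          offsets(lclRow)            row_map(lclRow+1)
        |    on-rank entries      |     off-rank entries      |

LocalResidualFunctor with skipOffRank = true therefore only touches [row_map(lclRow), offsets(lclRow)), and OffRankUpdateFunctor completes each row over [offsets(lclRow), row_map(lclRow+1)). The offsets view itself is declared in residual() below and populated from the matrix graph by existing code that this diff leaves unchanged (hence not shown).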
 
 template<class SC, class LO, class GO, class NO>
 void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                    const MultiVector<SC,LO,GO,NO> & X_colmap,
@@ -72,10 +284,16 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
   using Teuchos::NO_TRANS;
   ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");
 
-  auto A_lcl = A.getLocalMatrixDevice ();
-  auto X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
-  auto B_lcl = B.getLocalViewDevice(Access::ReadOnly);
-  auto R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
+  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
+  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
+  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
+
+  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
+  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
+  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
+  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  const_local_view_device_type X_domainmap_lcl;
 
   const bool debug = ::Tpetra::Details::Behavior::debug ();
   if (debug) {
@@ -83,11 +301,6 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
       (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
        "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
        "R.getNumVectors() = " << R.getNumVectors () << ".");
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
-       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
-       "R.getNumVectors() = " << R.getNumVectors () << ".");
-
     TEUCHOS_TEST_FOR_EXCEPTION
       (X_colmap.getLocalLength () !=
        A.getColMap ()->getNodeNumElements (), std::runtime_error,
@@ -133,7 +346,6 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
     A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
     R.update(one,B,negone);
 #else
-  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
 
   if (A_lcl.numRows() == 0) {
     return;
@@ -160,15 +372,12 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
   int team_size = -1;
   int vector_length = -1;
   int64_t rows_per_thread = -1;
-
-  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
-  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
 
+  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
   using policy_type = typename Kokkos::TeamPolicy<execution_space>;
-  using team_member = typename policy_type::member_type;
-
-  using residual_value_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_value_type;
-  using KAT = Kokkos::ArithTraits<residual_value_type>;
+  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
+  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
 
   policy_type policy (1, 1);
   if (team_size < 0) {
@@ -183,135 +392,188 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
 
   if(is_vector) {
+
     if (X_domainmap == nullptr) {
-
-      // Vector case
-      // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
-      Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-
-              if (lclRow >= A_lcl.numRows ()) {
-                return;
-              }
-
-              const auto A_row = A_lcl.rowConst(lclRow);
-              const LO row_length = static_cast<LO> (A_row.length);
-              residual_value_type A_x = KAT::zero ();
-
-              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
-                  const auto A_val = A_row.value(iEntry);
-                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
-                }, A_x);
-
-              Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                  R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
-                });
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-vector"
-    } else {
-      auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
-
-      Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-              const LO numRows = A_lcl.numRows ();
-              if (lclRow >= numRows) {
-                return;
-              }
-              const LO offRankOffset = offsets(lclRow);
-              const size_t start = A_lcl.graph.row_map(lclRow);
-              const size_t end = A_lcl.graph.row_map(lclRow+1);
-              residual_value_type A_x = KAT::zero ();
-
-              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) {
-                  const auto A_val = A_lcl.values(iEntry);
-                  const auto lclCol = A_lcl.graph.entries(iEntry);
-                  if (iEntry < offRankOffset)
-                    lsum += A_val * X_domainmap_lcl(lclCol,0);
-                  else
-                    lsum += A_val * X_colmap_lcl(lclCol,0);
-                }, A_x);
-
-              Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                  R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
-                });
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-vector"
+
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-vector",policy,func);
+
+    }
+    else {
+
+      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-vector",policy,func);
     }
-  } else {
-    // MultiVector case
+  }
+  else {
+
     if (X_domainmap == nullptr) {
 
-      // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
-      Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          // NOTE: It looks like I should be able to get this data up above, but if I try to
-          // we get internal compiler errors.  Who knew that gcc tried to "gimplify"?
-          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-
-              if (lclRow >= A_lcl.numRows ()) {
-                return;
-              }
-              const auto A_row = A_lcl.rowConst(lclRow);
-              const LO row_length = static_cast<LO> (A_row.length);
-              for(LO v=0; v<numVectors; v++) {
-
-                residual_value_type A_x = KAT::zero ();
-
-                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
-                    const auto A_val = A_row.value(iEntry);
-                    lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
-                  }, A_x);
-
-                Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                    R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
-                  });
-
-              }//end for numVectors
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-multivector"
-    } else {
-      auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
-
-      Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          // NOTE: It looks like I should be able to get this data up above, but if I try to
-          // we get internal compiler errors.  Who knew that gcc tried to "gimplify"?
-          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-              const LO numRows = A_lcl.numRows ();
-              if (lclRow >= numRows) {
-                return;
-              }
-
-              const LO offRankOffset = offsets(lclRow);
-              const size_t start = A_lcl.graph.row_map(lclRow);
-              const size_t end = A_lcl.graph.row_map(lclRow+1);
-
-              for(LO v=0; v<numVectors; v++) {
-
-                residual_value_type A_x = KAT::zero ();
-
-                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) {
-                    const auto A_val = A_lcl.values(iEntry);
-                    const auto lclCol = A_lcl.graph.entries(iEntry);
-                    if (iEntry < offRankOffset)
-                      lsum += A_val * X_domainmap_lcl(lclCol,v);
-                    else
-                      lsum += A_val * X_colmap_lcl(lclCol,v);
-                  }, A_x);
-
-                Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                    R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
-                  });
-              }
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-multivector"
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-multivector",policy,func);
+
+    }
+    else {
+
+      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-multivector",policy,func);
     }
-  }// end else
+  }
 #endif
 }
-
+
+
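The workset count used above is a plain ceiling division, so the last team may be ragged; for example, with 10000 local rows and rows_per_team = 64 as picked by spmv_launch_parameters:

    int64_t worksets = (10000 + 64 - 1) / 64;  // = 157: 156 full teams plus one 16-row remainder

which is why every functor guards each row with lclRow >= A_lcl.numRows() before doing any work.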
+template<class SC, class LO, class GO, class NO>
+void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
+                                      MultiVector<SC,LO,GO,NO> & X_colmap,
+                                      const MultiVector<SC,LO,GO,NO> & B,
+                                      MultiVector<SC,LO,GO,NO> & R,
+                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
+                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
+  using Tpetra::Details::ProfilingRegion;
+  using Teuchos::NO_TRANS;
+  using Teuchos::RCP;
+  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
+
+  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
+
+  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
+  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
+  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
+  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
+
+  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
+  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
+  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
+  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);
+
+  const bool debug = ::Tpetra::Details::Behavior::debug ();
+  if (debug) {
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
+       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
+       "R.getNumVectors() = " << R.getNumVectors () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (X_colmap.getLocalLength () !=
+       A.getColMap ()->getNodeNumElements (), std::runtime_error,
+       "X has the wrong number of local rows.  "
+       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
+       "A.getColMap()->getNodeNumElements() = " <<
+       A.getColMap ()->getNodeNumElements () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (R.getLocalLength () !=
+       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
+       "R has the wrong number of local rows.  "
+       "R.getLocalLength() = " << R.getLocalLength () << " != "
+       "A.getRowMap()->getNodeNumElements() = " <<
+       A.getRowMap ()->getNodeNumElements () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (B.getLocalLength () !=
+       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
+       "B has the wrong number of local rows.  "
+       "B.getLocalLength() = " << B.getLocalLength () << " != "
+       "A.getRowMap()->getNodeNumElements() = " <<
+       A.getRowMap ()->getNodeNumElements () << ".");
+
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
+       "fill complete.  You must call fillComplete() (possibly with "
+       "domain and range Map arguments) without an intervening "
+       "resumeFill() call before you may call this method.");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
+       std::runtime_error, "X, R, and B must be constant stride.");
+    // If the two pointers are NULL, then they don't alias one
+    // another, even though they are equal.
+    TEUCHOS_TEST_FOR_EXCEPTION
+      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
+       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
+       std::runtime_error, "X, B, and R may not alias one another.");
+  }
+
+  if (A_lcl.numRows() == 0) {
+    return;
+  }
+
+  int64_t numLocalRows = A_lcl.numRows ();
+  int64_t myNnz = A_lcl.nnz();
+
+  // //Check for imbalanced rows.  If the logic for SPMV to use merge path is triggered,
+  // //use it and follow the reference residual code
+  // LO maxRowImbalance = 0;
+  // if(numLocalRows != 0)
+  //   maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
+  // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
+  //   {
+  //     //note: lclOp will be wrapped in shared_ptr
+  //     auto lclOp = A.getLocalMultiplyOperator();
+  //     //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
+  //     lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
+  //     R.update(one,B,negone);
+  //     return;
+  //   }
+
+  int team_size = -1;
+  int vector_length = -1;
+  int64_t rows_per_thread = -1;
+
+  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
+  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
+
+  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
+  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
+
+  policy_type policy (1, 1);
+  if (team_size < 0) {
+    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
+  }
+  else {
+    policy = policy_type (worksets, team_size, vector_length);
+  }
+
+  bool is_vector = (X_colmap_lcl.extent(1) == 1);
+
+  if(is_vector) {
+
+    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
+    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+    Kokkos::parallel_for("residual-vector",policy,func);
+
+    RCP<const import_type> importer = A.getGraph ()->getImporter ();
+    X_colmap.endImport (X_domainmap, *importer, INSERT, true);
+
+    Kokkos::fence();
+
+    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
+    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
+    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
+
+  }
+  else {
+
+    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
+    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+    Kokkos::parallel_for("residual-multivector",policy,func);
+
+    RCP<const import_type> importer = A.getGraph ()->getImporter ();
+    X_colmap.endImport (X_domainmap, *importer, INSERT, true);
+
+    Kokkos::fence();
+
+    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
+    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
+    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
+
+  }
+}
+
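Taken together, this function is the compute half of a split-phase halo exchange: residual() posts the communication with beginImport, the on-rank part of the kernel runs while messages are in flight, and only then does the function wait. Schematically (condensed from the vector branch above):

    // in residual():  X_colMap->beginImport (X_in, *importer, INSERT, true);   // post halo exchange
    Kokkos::parallel_for("residual-vector", policy, func);          // on-rank work, skipOffRank = true
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);      // finish halo exchange
    Kokkos::fence();                                                // conservatively order unpack vs. next kernel
    Kokkos::parallel_for("residual-vector-offrank", policy, func2); // R -= A_offRank * X_colmap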
 
 //! Computes R = B - A * X
 template<class SC, class LO, class GO, class NO>
 void residual(const Operator<SC,LO,GO,NO> & Aop,
@@ -326,6 +588,10 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   using Teuchos::rcpFromRef;
 
   const bool debug = ::Tpetra::Details::Behavior::debug ();
+  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
+  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
+  if (overlapCommunicationAndComputation)
+    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);
 
   // Whether we are using restrictedMode in the import from domain to
   // column map. Restricted mode skips the copy and permutation of the
@@ -348,7 +614,6 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
   using MV = MultiVector<SC,LO,GO,NO>;
   using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
-  using local_graph_type = typename graph_type::local_graph_type;
   using offset_type = typename graph_type::offset_device_view_type;
 
   // We treat the case of a replicated MV output specially.
@@ -367,7 +632,7 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   // Temporary MV for Import operation.  After the block of code
   // below, this will be an (Imported if necessary) column Map MV
   // ready to give to localApply(...).
-  RCP<const MV> X_colMap;
+  RCP<MV> X_colMap;
   if (importer.is_null ()) {
     if (! X_in.isConstantStride ()) {
       // Not all sparse mat-vec kernels can handle an input MV with
@@ -376,14 +641,14 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
       // copy of X_in, we force creation of the column (== domain)
       // Map MV (if it hasn't already been created, else fetch the
       // cached copy).  This avoids creating a new MV each time.
-      RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in, true);
-      Tpetra::deep_copy (*X_colMapNonConst, X_in);
-      X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
+      X_colMap = A.getColumnMapMultiVector (X_in, true);
+      Tpetra::deep_copy (*X_colMap, X_in);
     }
     else {
       // The domain and column Maps are the same, so do the local
       // multiply using the domain Map input MV X_in.
-      X_colMap = rcpFromRef (X_in);
+      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in));
     }
   }
   else { // need to Import source (multi)vector
@@ -392,10 +657,10 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     // elements of the domain Map MV X_in into a separate column Map
     // MV.  Thus, we don't have to worry whether X_in is constant
    // stride.
-    RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in);
+    X_colMap = A.getColumnMapMultiVector (X_in);
 
     // Do we want to use restrictedMode?
-    restrictedMode = importer->isLocallyFitted();
+    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();
 
     if (debug && restrictedMode) {
       TEUCHOS_TEST_FOR_EXCEPTION
@@ -404,8 +669,7 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     }
 
     // Import from the domain Map MV to the column Map MV.
-    X_colMapNonConst->doImport (X_in, *importer, INSERT, restrictedMode);
-    X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
+    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
   }
 
   offset_type offsets;
@@ -458,7 +722,8 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   // constant-stride version of R_in in this case, because we had to
   // make a constant stride R_rowMap MV and do an Export anyway.
   if (! exporter.is_null ()) {
-
+    if ( ! importer.is_null ())
+      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
     if (restrictedMode && !importer.is_null ())
       localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
     else
       localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
@@ -472,23 +737,30 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     }
   }
   else { // Don't do an Export: row Map and range Map are the same.
+
+    if (restrictedMode)
+      if (overlapCommunicationAndComputation) {
+        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
+      } else {
+        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
+        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
+      }
+    else {
+      if ( ! importer.is_null ())
+        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
+      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
+    }
+
     //
     // If R_in does not have constant stride,
-    // then we can't let the kernel write directly to R_in. 
+    // then we can't let the kernel write directly to R_in.
     // Instead, we have to use the cached row (== range)
     // Map MV as temporary storage.
     //
     if (! R_in.isConstantStride () ) {
       // We need to be sure to do a copy out in this case.
-      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
       Tpetra::deep_copy (R_in, *R_rowMap);
     }
-    else {
-      if (restrictedMode && !importer.is_null ())
-        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
-      else
-        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
-    }
   }
 
   // If the range Map is a locally replicated Map, sum up