From 7fe526cb85f4923fa5c1f92cee4b001a09f705f3 Mon Sep 17 00:00:00 2001
From: Christian Glusa
Date: Tue, 22 Jun 2021 15:47:48 -0600
Subject: [PATCH] Tpetra residual: Add options for skipping of local copy and
 comm/comp overlap

---
 .../core/src/Tpetra_Details_Behavior.cpp |  23 +
 .../core/src/Tpetra_Details_Behavior.hpp |  12 +
 .../core/src/Tpetra_Details_residual.hpp | 588 +++++++++++++-----
 3 files changed, 465 insertions(+), 158 deletions(-)

diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
index bc34f1972585..6a12004ad642 100644
--- a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
+++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
@@ -549,6 +549,29 @@ bool Behavior::hierarchicalUnpack ()
                                                 defaultValue);
 }
 
+bool Behavior::skipCopyAndPermuteIfPossible ()
+{
+  constexpr char envVarName[] = "TPETRA_SKIP_COPY_AND_PERMUTE";
+  constexpr bool defaultValue(false);
+
+  static bool value_ = defaultValue;
+  static bool initialized_ = false;
+  return idempotentlyGetEnvironmentVariableAsBool
+    (value_, initialized_, envVarName, defaultValue);
+}
+
+bool Behavior::overlapCommunicationAndComputation ()
+{
+  constexpr char envVarName[] = "TPETRA_OVERLAP";
+  constexpr bool defaultValue(false);
+
+  static bool value_ = defaultValue;
+  static bool initialized_ = false;
+  return idempotentlyGetEnvironmentVariableAsBool
+    (value_, initialized_, envVarName, defaultValue);
+}
+
+
 } // namespace Details
 } // namespace Tpetra
 
diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
index a5094cb80c57..b2d5dd26c9c1 100644
--- a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
@@ -263,6 +263,12 @@ class Behavior {
   /// environment variable.
   static bool profilingRegionUseKokkosProfiling();
 
+  /// \brief Skip copyAndPermute if possible.
+  ///
+  /// This is disabled by default. You may control this at run time via the
+  /// TPETRA_SKIP_COPY_AND_PERMUTE environment variable.
+  static bool skipCopyAndPermuteIfPossible();
+
+  /// \brief Overlap communication and computation.
+  ///
+  /// This is disabled by default. You may control this at run time via the
+  /// TPETRA_OVERLAP environment variable.
+  static bool overlapCommunicationAndComputation();
+
 };
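Both getters follow Behavior's usual caching pattern: the environment variable is read once and the result is reused, so toggling it after the first query has no effect. A minimal usage sketch (the assert mirrors the one this patch adds to Tpetra::Details::residual below; running with TPETRA_SKIP_COPY_AND_PERMUTE=1 TPETRA_OVERLAP=1 assumes Tpetra's usual boolean environment-variable parsing):

    // Query the new run-time options; results are cached after the first call.
    const bool skip    = Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
    const bool overlap = Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
    // Overlapping comm/comp builds on the restricted-mode import that skips
    // copyAndPermute, so the second option requires the first.
    if (overlap)
      TEUCHOS_ASSERT(skip);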
diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
index beff28c317c3..7c532a030d82 100644
--- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp
+++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp
@@ -61,6 +61,218 @@ namespace Tpetra {
 namespace Details {
 
+/// \brief Functor for computing the residual
+///
+/// The template parameters decide what is actually computed.
+/// is_MV: whether the multivectors actually have more than one vector
+/// restrictedMode: when true, read on-rank RHS data out of X_domainmap_lcl instead of X_colmap_lcl
+/// skipOffRank: when true, only compute the local part of the residual
+template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV, bool restrictedMode, bool skipOffRank>
+struct LocalResidualFunctor {
+
+  using execution_space = typename AMatrix::execution_space;
+  using LO = typename AMatrix::non_const_ordinal_type;
+  using value_type = typename AMatrix::non_const_value_type;
+  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
+  using team_member = typename team_policy::member_type;
+  using ATV = Kokkos::ArithTraits<value_type>;
+
+  AMatrix A_lcl;
+  ConstMV X_colmap_lcl;
+  ConstMV B_lcl;
+  MV R_lcl;
+  int rows_per_team;
+  Offsets offsets;
+  ConstMV X_domainmap_lcl;
+
+
+  LocalResidualFunctor (const AMatrix& A_lcl_,
+                        const ConstMV& X_colmap_lcl_,
+                        const ConstMV& B_lcl_,
+                        const MV& R_lcl_,
+                        const int rows_per_team_,
+                        const Offsets& offsets_,
+                        const ConstMV& X_domainmap_lcl_) :
+    A_lcl(A_lcl_),
+    X_colmap_lcl(X_colmap_lcl_),
+    B_lcl(B_lcl_),
+    R_lcl(R_lcl_),
+    rows_per_team(rows_per_team_),
+    offsets(offsets_),
+    X_domainmap_lcl(X_domainmap_lcl_)
+  { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member& dev) const
+  {
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
+        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
+
+        if (lclRow >= A_lcl.numRows ()) {
+          return;
+        }
+
+        if (!is_MV) { // MultiVectors only have a single column
+
+          value_type A_x = ATV::zero ();
+
+          if (!restrictedMode) {
+            const auto A_row = A_lcl.rowConst(lclRow);
+            const LO row_length = static_cast<LO> (A_row.length);
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_row.value(iEntry);
+                lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
+              }, A_x);
+
+          }
+          else {
+
+            const LO offRankOffset = offsets(lclRow);
+            const size_t start = A_lcl.graph.row_map(lclRow);
+            const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_lcl.values(iEntry);
+                const auto lclCol = A_lcl.graph.entries(iEntry);
+                if (iEntry < offRankOffset)
+                  lsum += A_val * X_domainmap_lcl(lclCol,0);
+                else if (!skipOffRank)
+                  lsum += A_val * X_colmap_lcl(lclCol,0);
+              }, A_x);
+          }
+
+          Kokkos::single(Kokkos::PerThread(dev),[&] () {
+              R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
+            });
+        }
+        else { // MultiVectors have more than one column
+
+          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
+
+          for(LO v=0; v<numVectors; v++) {
+
+            value_type A_x = ATV::zero ();
+
+            if (!restrictedMode) {
+
+              const auto A_row = A_lcl.rowConst(lclRow);
+              const LO row_length = static_cast<LO> (A_row.length);
+
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) {
+                  const auto A_val = A_row.value(iEntry);
+                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
+                }, A_x);
+            }
+            else {
+              const LO offRankOffset = offsets(lclRow);
+              const size_t start = A_lcl.graph.row_map(lclRow);
+              const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) {
+                  const auto A_val = A_lcl.values(iEntry);
+                  const auto lclCol = A_lcl.graph.entries(iEntry);
+                  if (iEntry < offRankOffset)
+                    lsum += A_val * X_domainmap_lcl(lclCol,v);
+                  else if (!skipOffRank)
+                    lsum += A_val * X_colmap_lcl(lclCol,v);
+                }, A_x);
+            }
+
+            Kokkos::single(Kokkos::PerThread(dev),[&] () {
+                R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
+              });
+
+          }//end for numVectors
+        }
+      });//end parallel_for TeamThreadRange
+  }
+};
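Because is_MV, restrictedMode, and skipOffRank are compile-time flags, each combination instantiates a separate, branch-free kernel. For orientation, these are the combinations instantiated further down (type arguments abbreviated; matrix_t, mv_t, const_mv_t, and offsets_t stand for the local_matrix_device_type, local_view_device_type, const_local_view_device_type, and offset_type aliases introduced in localResidual):

    // localResidual, no X_domainmap passed: read everything from X_colmap_lcl
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, false, false>
    // localResidual, restricted mode: on-rank entries come from X_domainmap_lcl
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, true, false>
    // localResidualWithCommCompOverlap: off-rank entries deferred to OffRankUpdateFunctor
    LocalResidualFunctor<matrix_t, mv_t, const_mv_t, offsets_t, is_MV, true, true>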
+
+
+/// \brief Functor for computing R -= A_offRank*X_colmap
+template<class AMatrix, class MV, class ConstMV, class Offsets, bool is_MV>
+struct OffRankUpdateFunctor {
+
+  using execution_space = typename AMatrix::execution_space;
+  using LO = typename AMatrix::non_const_ordinal_type;
+  using value_type = typename AMatrix::non_const_value_type;
+  using team_policy = typename Kokkos::TeamPolicy<execution_space>;
+  using team_member = typename team_policy::member_type;
+  using ATV = Kokkos::ArithTraits<value_type>;
+
+  AMatrix A_lcl;
+  ConstMV X_colmap_lcl;
+  MV R_lcl;
+  int rows_per_team;
+  Offsets offsets;
+
+
+  OffRankUpdateFunctor (const AMatrix& A_lcl_,
+                        const ConstMV& X_colmap_lcl_,
+                        const MV& R_lcl_,
+                        const int rows_per_team_,
+                        const Offsets& offsets_) :
+    A_lcl(A_lcl_),
+    X_colmap_lcl(X_colmap_lcl_),
+    R_lcl(R_lcl_),
+    rows_per_team(rows_per_team_),
+    offsets(offsets_)
+  { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member& dev) const
+  {
+
+    Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
+        const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
+
+        if (lclRow >= A_lcl.numRows ()) {
+          return;
+        }
+
+        const LO offRankOffset = offsets(lclRow);
+        const size_t end = A_lcl.graph.row_map(lclRow+1);
+
+        if (!is_MV) { // MultiVectors only have a single column
+
+          value_type A_x = ATV::zero ();
+
+          Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
+              const auto A_val = A_lcl.values(iEntry);
+              const auto lclCol = A_lcl.graph.entries(iEntry);
+              lsum += A_val * X_colmap_lcl(lclCol,0);
+            }, A_x);
+
+          Kokkos::single(Kokkos::PerThread(dev),[&] () {
+              R_lcl(lclRow,0) -= A_x;
+            });
+        }
+        else { // MultiVectors have more than one column
+
+          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
+
+          for(LO v=0; v<numVectors; v++) {
+
+            value_type A_x = ATV::zero ();
+
+            Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) {
+                const auto A_val = A_lcl.values(iEntry);
+                const auto lclCol = A_lcl.graph.entries(iEntry);
+                lsum += A_val * X_colmap_lcl(lclCol,v);
+              }, A_x);
+
+            Kokkos::single(Kokkos::PerThread(dev),[&] () {
+                R_lcl(lclRow,v) -= A_x;
+              });
+
+          }//end for numVectors
+        }
+      });//end parallel_for TeamThreadRange
+  }
+};
+
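Both functors rely on the same layout assumption: within each row of the local matrix, entries whose columns are on-rank (present in the domain map) precede the off-rank entries, and offsets(lclRow) records the split point:

    row_map(lclRow)          offsets(lclRow)            row_map(lclRow+1)
        |    on-rank entries      |     off-rank entries      |

LocalResidualFunctor with skipOffRank = true therefore only touches [row_map(lclRow), offsets(lclRow)), and OffRankUpdateFunctor completes each row over [offsets(lclRow), row_map(lclRow+1)). The offsets view itself is declared in residual() below and populated from the matrix graph by existing code that this diff leaves unchanged (hence not shown).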
 
 template<class SC, class LO, class GO, class NO>
 void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
                    const MultiVector<SC,LO,GO,NO> & X_colmap,
@@ -72,10 +284,16 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
   using Teuchos::NO_TRANS;
   ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual");
 
-  auto A_lcl = A.getLocalMatrixDevice ();
-  auto X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
-  auto B_lcl = B.getLocalViewDevice(Access::ReadOnly);
-  auto R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
+  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
+  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
+  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
+
+  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
+  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
+  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
+  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  const_local_view_device_type X_domainmap_lcl;
 
   const bool debug = ::Tpetra::Details::Behavior::debug ();
   if (debug) {
@@ -83,11 +301,6 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
       (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
        "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
        "R.getNumVectors() = " << R.getNumVectors () << ".");
-    TEUCHOS_TEST_FOR_EXCEPTION
-      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
-       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
-       "R.getNumVectors() = " << R.getNumVectors () << ".");
-
     TEUCHOS_TEST_FOR_EXCEPTION
       (X_colmap.getLocalLength () !=
        A.getColMap ()->getNodeNumElements (), std::runtime_error,
@@ -133,7 +346,6 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
     A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero);
     R.update(one,B,negone);
 #else
-  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
 
   if (A_lcl.numRows() == 0) {
     return;
@@ -160,15 +372,12 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
   int team_size = -1;
   int vector_length = -1;
   int64_t rows_per_thread = -1;
-
-  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
-  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
 
+  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
   using policy_type = typename Kokkos::TeamPolicy<execution_space>;
-  using team_member = typename policy_type::member_type;
-
-  using residual_value_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_value_type;
-  using KAT = Kokkos::ArithTraits<residual_value_type>;
+  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
+  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
 
   policy_type policy (1, 1);
   if (team_size < 0) {
@@ -183,135 +392,188 @@ void localResidual(const CrsMatrix<SC,LO,GO,NO> & A,
 
   if(is_vector) {
+
     if (X_domainmap == nullptr) {
-
-      // Vector case
-      // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
-      Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-
-              if (lclRow >= A_lcl.numRows ()) {
-                return;
-              }
-
-              const auto A_row = A_lcl.rowConst(lclRow);
-              const LO row_length = static_cast<LO> (A_row.length);
-              residual_value_type A_x = KAT::zero ();
-
-              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
-                  const auto A_val = A_row.value(iEntry);
-                  lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0);
-                }, A_x);
-
-              Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                  R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
-                });
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-vector"
-    } else {
-      auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
-
-      Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-              const LO numRows = A_lcl.numRows ();
-              if (lclRow >= numRows) {
-                return;
-              }
-              const LO offRankOffset = offsets(lclRow);
-              const size_t start = A_lcl.graph.row_map(lclRow);
-              const size_t end = A_lcl.graph.row_map(lclRow+1);
-              residual_value_type A_x = KAT::zero ();
-
-              Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) {
-                  const auto A_val = A_lcl.values(iEntry);
-                  const auto lclCol = A_lcl.graph.entries(iEntry);
-                  if (iEntry < offRankOffset)
-                    lsum += A_val * X_domainmap_lcl(lclCol,0);
-                  else
-                    lsum += A_val * X_colmap_lcl(lclCol,0);
-                }, A_x);
-
-              Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                  R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x;
-                });
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-vector"
+
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,false,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-vector",policy,func);
+
+    }
+    else {
+
+      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-vector",policy,func);
     }
-  } else {
-    // MultiVector case
+  }
+  else {
+
     if (X_domainmap == nullptr) {
 
-      // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp
-      Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          // NOTE: It looks like I should be able to get this data up above, but if I try to
-          // we get internal compiler errors.  Who knew that gcc tried to "gimplify"?
-          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-
-              if (lclRow >= A_lcl.numRows ()) {
-                return;
-              }
-              const auto A_row = A_lcl.rowConst(lclRow);
-              const LO row_length = static_cast<LO> (A_row.length);
-              for(LO v=0; v<numVectors; v++) {
-
-                residual_value_type A_x = KAT::zero ();
-
-                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) {
-                    const auto A_val = A_row.value(iEntry);
-                    lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v);
-                  }, A_x);
-
-                Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                    R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
-                  });
-
-              }//end for numVectors
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-multivector"
-    } else {
-      auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
-
-      Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) {
-          // NOTE: It looks like I should be able to get this data up above, but if I try to
-          // we get internal compiler errors.  Who knew that gcc tried to "gimplify"?
-          const LO numVectors = static_cast<LO>(X_colmap_lcl.extent(1));
-          Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) {
-              const LO lclRow = static_cast<LO> (dev.league_rank ()) * rows_per_team + loop;
-              const LO numRows = A_lcl.numRows ();
-              if (lclRow >= numRows) {
-                return;
-              }
-
-              const LO offRankOffset = offsets(lclRow);
-              const size_t start = A_lcl.graph.row_map(lclRow);
-              const size_t end = A_lcl.graph.row_map(lclRow+1);
-
-              for(LO v=0; v<numVectors; v++) {
-
-                residual_value_type A_x = KAT::zero ();
-
-                Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) {
-                    const auto A_val = A_lcl.values(iEntry);
-                    const auto lclCol = A_lcl.graph.entries(iEntry);
-                    if (iEntry < offRankOffset)
-                      lsum += A_val * X_domainmap_lcl(lclCol,v);
-                    else
-                      lsum += A_val * X_colmap_lcl(lclCol,v);
-                  }, A_x);
-
-                Kokkos::single(Kokkos::PerThread(dev),[&] () {
-                    R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x;
-                  });
-              }
-            });//end parallel_for TeamThreadRange
-        });//end parallel_for "residual-multivector"
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,false,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-multivector",policy,func);
+
+    }
+    else {
+
+      X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly);
+      using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,false>;
+      functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+      Kokkos::parallel_for("residual-multivector",policy,func);
     }
-  }// end else
+  }
 #endif
 }
-
+
+
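The workset count used above is a plain ceiling division, so the last team may be ragged; for example, with 10000 local rows and rows_per_team = 64 as picked by spmv_launch_parameters:

    int64_t worksets = (10000 + 64 - 1) / 64;  // = 157: 156 full teams plus one 16-row remainder

which is why every functor guards each row with lclRow >= A_lcl.numRows() before doing any work.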
+template<class SC, class LO, class GO, class NO>
+void localResidualWithCommCompOverlap(const CrsMatrix<SC,LO,GO,NO> & A,
+                                      MultiVector<SC,LO,GO,NO> & X_colmap,
+                                      const MultiVector<SC,LO,GO,NO> & B,
+                                      MultiVector<SC,LO,GO,NO> & R,
+                                      const Kokkos::View<const size_t*, typename NO::device_type>& offsets,
+                                      const MultiVector<SC,LO,GO,NO> & X_domainmap) {
+  using Tpetra::Details::ProfilingRegion;
+  using Teuchos::NO_TRANS;
+  using Teuchos::RCP;
+  using import_type = typename CrsMatrix<SC,LO,GO,NO>::import_type;
+
+  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap");
+
+  using local_matrix_device_type = typename CrsMatrix<SC,LO,GO,NO>::local_matrix_device_type;
+  using local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::non_const_type;
+  using const_local_view_device_type = typename MultiVector<SC,LO,GO,NO>::dual_view_type::t_dev::const_type;
+  using offset_type = Kokkos::View<const size_t*, typename NO::device_type>;
+
+  local_matrix_device_type A_lcl = A.getLocalMatrixDevice ();
+  const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly);
+  const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly);
+  local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll);
+  const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);
+
+  const bool debug = ::Tpetra::Details::Behavior::debug ();
+  if (debug) {
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error,
+       "X.getNumVectors() = " << X_colmap.getNumVectors () << " != "
+       "R.getNumVectors() = " << R.getNumVectors () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (X_colmap.getLocalLength () !=
+       A.getColMap ()->getNodeNumElements (), std::runtime_error,
+       "X has the wrong number of local rows.  "
+       "X.getLocalLength() = " << X_colmap.getLocalLength () << " != "
+       "A.getColMap()->getNodeNumElements() = " <<
+       A.getColMap ()->getNodeNumElements () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (R.getLocalLength () !=
+       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
+       "R has the wrong number of local rows.  "
+       "R.getLocalLength() = " << R.getLocalLength () << " != "
+       "A.getRowMap()->getNodeNumElements() = " <<
+       A.getRowMap ()->getNodeNumElements () << ".");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (B.getLocalLength () !=
+       A.getRowMap ()->getNodeNumElements (), std::runtime_error,
+       "B has the wrong number of local rows.  "
+       "B.getLocalLength() = " << B.getLocalLength () << " != "
+       "A.getRowMap()->getNodeNumElements() = " <<
+       A.getRowMap ()->getNodeNumElements () << ".");
+
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (! A.isFillComplete (), std::runtime_error, "The matrix A is not "
+       "fill complete.  You must call fillComplete() (possibly with "
+       "domain and range Map arguments) without an intervening "
+       "resumeFill() call before you may call this method.");
+    TEUCHOS_TEST_FOR_EXCEPTION
+      (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (),
+       std::runtime_error, "X, R, and B must be constant stride.");
+    // If the two pointers are NULL, then they don't alias one
+    // another, even though they are equal.
+    TEUCHOS_TEST_FOR_EXCEPTION
+      ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) ||
+       (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr),
+       std::runtime_error, "X, B, and R may not alias one another.");
+  }
+
+  if (A_lcl.numRows() == 0) {
+    return;
+  }
+
+  int64_t numLocalRows = A_lcl.numRows ();
+  int64_t myNnz = A_lcl.nnz();
+
+  // //Check for imbalanced rows.  If the logic for SPMV to use merge path is triggered,
+  // //use it and follow the reference residual code
+  // LO maxRowImbalance = 0;
+  // if(numLocalRows != 0)
+  //   maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows);
+  // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
+  //   {
+  //     //note: lclOp will be wrapped in shared_ptr
+  //     auto lclOp = A.getLocalMultiplyOperator();
+  //     //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator
+  //     lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero);
+  //     R.update(one,B,negone);
+  //     return;
+  //   }
+
+  int team_size = -1;
+  int vector_length = -1;
+  int64_t rows_per_thread = -1;
+
+  using execution_space = typename CrsMatrix<SC,LO,GO,NO>::execution_space;
+  using policy_type = typename Kokkos::TeamPolicy<execution_space>;
+
+  int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
+  int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team;
+
+  policy_type policy (1, 1);
+  if (team_size < 0) {
+    policy = policy_type (worksets, Kokkos::AUTO, vector_length);
+  }
+  else {
+    policy = policy_type (worksets, team_size, vector_length);
+  }
+
+  bool is_vector = (X_colmap_lcl.extent(1) == 1);
+
+  if(is_vector) {
+
+    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false,true,true>;
+    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+    Kokkos::parallel_for("residual-vector",policy,func);
+
+    RCP<const import_type> importer = A.getGraph ()->getImporter ();
+    X_colmap.endImport (X_domainmap, *importer, INSERT, true);
+
+    Kokkos::fence();
+
+    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,false>;
+    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
+    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
+
+  }
+  else {
+
+    using functor_type = LocalResidualFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true,true,true>;
+    functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl);
+    Kokkos::parallel_for("residual-multivector",policy,func);
+
+    RCP<const import_type> importer = A.getGraph ()->getImporter ();
+    X_colmap.endImport (X_domainmap, *importer, INSERT, true);
+
+    Kokkos::fence();
+
+    using functor_type2 = OffRankUpdateFunctor<local_matrix_device_type,local_view_device_type,const_local_view_device_type,offset_type,true>;
+    functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets);
+    Kokkos::parallel_for("residual-vector-offrank",policy,func2);
+
+  }
+}
+
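Taken together, this function is the compute half of a split-phase halo exchange: residual() posts the communication with beginImport, the on-rank part of the kernel runs while messages are in flight, and only then does the function wait. Schematically (condensed from the vector branch above):

    // in residual():  X_colMap->beginImport (X_in, *importer, INSERT, true);   // post halo exchange
    Kokkos::parallel_for("residual-vector", policy, func);          // on-rank work, skipOffRank = true
    X_colmap.endImport (X_domainmap, *importer, INSERT, true);      // finish halo exchange
    Kokkos::fence();                                                // conservatively order unpack vs. next kernel
    Kokkos::parallel_for("residual-vector-offrank", policy, func2); // R -= A_offRank * X_colmap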
 
 //! Computes R = B - A * X
 template<class SC, class LO, class GO, class NO>
 void residual(const Operator<SC,LO,GO,NO> & Aop,
@@ -326,6 +588,10 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   using Teuchos::rcpFromRef;
 
   const bool debug = ::Tpetra::Details::Behavior::debug ();
+  const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible ();
+  const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation ();
+  if (overlapCommunicationAndComputation)
+    TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible);
 
   // Whether we are using restrictedMode in the import from domain to
   // column map. Restricted mode skips the copy and permutation of the
@@ -348,7 +614,6 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   using export_type = typename CrsMatrix<SC,LO,GO,NO>::export_type;
   using MV = MultiVector<SC,LO,GO,NO>;
   using graph_type = Tpetra::CrsGraph<LO,GO,NO>;
-  using local_graph_type = typename graph_type::local_graph_type;
   using offset_type = typename graph_type::offset_device_view_type;
 
   // We treat the case of a replicated MV output specially.
@@ -367,7 +632,7 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   // Temporary MV for Import operation.  After the block of code
   // below, this will be an (Imported if necessary) column Map MV
   // ready to give to localApply(...).
-  RCP<const MV> X_colMap;
+  RCP<MV> X_colMap;
   if (importer.is_null ()) {
     if (! X_in.isConstantStride ()) {
       // Not all sparse mat-vec kernels can handle an input MV with
@@ -376,14 +641,14 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
       // copy of X_in, we force creation of the column (== domain)
       // Map MV (if it hasn't already been created, else fetch the
       // cached copy).  This avoids creating a new MV each time.
-      RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in, true);
-      Tpetra::deep_copy (*X_colMapNonConst, X_in);
-      X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
+      X_colMap = A.getColumnMapMultiVector (X_in, true);
+      Tpetra::deep_copy (*X_colMap, X_in);
     }
     else {
       // The domain and column Maps are the same, so do the local
       // multiply using the domain Map input MV X_in.
-      X_colMap = rcpFromRef (X_in);
+      X_colMap = rcp_const_cast<MV> (rcpFromRef (X_in));
     }
   }
   else { // need to Import source (multi)vector
@@ -392,10 +657,10 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     // elements of the domain Map MV X_in into a separate column Map
     // MV.  Thus, we don't have to worry whether X_in is constant
    // stride.
-    RCP<MV> X_colMapNonConst = A.getColumnMapMultiVector (X_in);
+    X_colMap = A.getColumnMapMultiVector (X_in);
 
     // Do we want to use restrictedMode?
-    restrictedMode = importer->isLocallyFitted();
+    restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted();
 
     if (debug && restrictedMode) {
       TEUCHOS_TEST_FOR_EXCEPTION
@@ -404,8 +669,7 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     }
 
     // Import from the domain Map MV to the column Map MV.
-    X_colMapNonConst->doImport (X_in, *importer, INSERT, restrictedMode);
-    X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
+    X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode);
   }
 
   offset_type offsets;
@@ -458,7 +722,8 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
   // constant-stride version of R_in in this case, because we had to
   // make a constant stride R_rowMap MV and do an Export anyway.
   if (! exporter.is_null ()) {
-
+    if ( ! importer.is_null ())
+      X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
     if (restrictedMode && !importer.is_null ())
       localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
     else
       localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
@@ -472,23 +737,30 @@ void residual(const Operator<SC,LO,GO,NO> & Aop,
     }
   }
   else { // Don't do an Export: row Map and range Map are the same.
+
+    if (restrictedMode)
+      if (overlapCommunicationAndComputation) {
+        localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in);
+      } else {
+        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
+        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
+      }
+    else {
+      if ( ! importer.is_null ())
+        X_colMap->endImport (X_in, *importer, INSERT, restrictedMode);
+      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
+    }
+
     //
     // If R_in does not have constant stride,
-    // then we can't let the kernel write directly to R_in. 
+    // then we can't let the kernel write directly to R_in.
     // Instead, we have to use the cached row (== range)
     // Map MV as temporary storage.
     //
     if (! R_in.isConstantStride () ) {
       // We need to be sure to do a copy out in this case.
-      localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
       Tpetra::deep_copy (R_in, *R_rowMap);
     }
-    else {
-      if (restrictedMode && !importer.is_null ())
-        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in);
-      else
-        localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets);
-    }
   }
 
   // If the range Map is a locally replicated Map, sum up