diff --git a/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
new file mode 100644
index 000000000000..5f4d08aca595
--- /dev/null
+++ b/packages/ifpack2/src/Ifpack2_Details_GaussSeidel.hpp
@@ -0,0 +1,335 @@
+/*@HEADER
+// ***********************************************************************
+//
+//       Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
+//                 Copyright (2009) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// ***********************************************************************
+//@HEADER
+*/
+
+#ifndef IFPACK2_GAUSS_SEIDEL_HPP
+#define IFPACK2_GAUSS_SEIDEL_HPP
+
+namespace Ifpack2
+{
+namespace Details
+{
+  template<typename Scalar, typename LO, typename GO, typename NT>
+  struct GaussSeidel
+  {
+    using crs_matrix_type = Tpetra::CrsMatrix<Scalar, LO, GO, NT>;
+    using bcrs_matrix_type = Tpetra::BlockCrsMatrix<Scalar, LO, GO, NT>;
+    using row_matrix_type = Tpetra::RowMatrix<Scalar, LO, GO, NT>;
+    using local_matrix_type = typename crs_matrix_type::local_matrix_type;
+    using vector_type = Tpetra::Vector<Scalar, LO, GO, NT>;
+    using multivector_type = Tpetra::MultiVector<Scalar, LO, GO, NT>;
+    using block_multivector_type = Tpetra::BlockMultiVector<Scalar, LO, GO, NT>;
+    using mem_space_t = typename local_matrix_type::memory_space;
+    using rowmap_t = typename local_matrix_type::row_map_type::HostMirror;
+    using entries_t = typename local_matrix_type::index_type::HostMirror;
+    using values_t = typename local_matrix_type::values_type::HostMirror;
+    using Offset = typename rowmap_t::non_const_value_type;
+    using IST = typename crs_matrix_type::impl_scalar_type;
+    using KAT = Kokkos::ArithTraits<IST>;
+    //Type of view representing inverse diagonal blocks, and its HostMirror.
+    using InverseBlocks = Kokkos::View<IST***, typename NT::device_type>;
+    using InverseBlocksHost = typename InverseBlocks::HostMirror;
+
+    //Setup for CrsMatrix
+    GaussSeidel(const crs_matrix_type& A, Teuchos::RCP<vector_type>& inverseDiagVec_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      inverseDiagVec = inverseDiagVec_;
+      inverseDiagVec->sync_host();
+      applyRows = applyRows_;
+      blockSize = 1;
+      omega = omega_;
+      auto Alocal = A.getLocalMatrix();
+      Arowmap = Kokkos::create_mirror_view(Alocal.graph.row_map);
+      Aentries = Kokkos::create_mirror_view(Alocal.graph.entries);
+      Avalues = Kokkos::create_mirror_view(Alocal.values);
+      Kokkos::deep_copy(Arowmap, Alocal.graph.row_map);
+      Kokkos::deep_copy(Aentries, Alocal.graph.entries);
+      Kokkos::deep_copy(Avalues, Alocal.values);
+    }
+
+    GaussSeidel(const row_matrix_type& A, Teuchos::RCP<vector_type>& inverseDiagVec_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      inverseDiagVec = inverseDiagVec_;
+      inverseDiagVec->sync_host();
+      applyRows = applyRows_;
+      blockSize = 1;
+      omega = omega_;
+      //Here, need to make a deep CRS copy to avoid slow access via getLocalRowCopy
+      Arowmap = rowmap_t(Kokkos::ViewAllocateWithoutInitializing("Arowmap"), numRows + 1);
+      Aentries = entries_t(Kokkos::ViewAllocateWithoutInitializing("Aentries"), A.getNodeNumEntries());
+      Avalues = values_t(Kokkos::ViewAllocateWithoutInitializing("Avalues"), A.getNodeNumEntries());
+      size_t maxDegree = A.getNodeMaxNumRowEntries();
+      Teuchos::Array<Scalar> rowValues(maxDegree);
+      Teuchos::Array<LO> rowEntries(maxDegree);
+      size_t accum = 0;
+      for(LO i = 0; i <= numRows; i++)
+      {
+        Arowmap(i) = accum;
+        if(i == numRows)
+          break;
+        size_t degree;
+        A.getLocalRowCopy(i, rowEntries(), rowValues(), degree);
+        accum += degree;
+        size_t rowBegin = Arowmap(i);
+        for(size_t j = 0; j < degree; j++)
+        {
+          Aentries(rowBegin + j) = rowEntries[j];
+          Avalues(rowBegin + j) = rowValues[j];
+        }
+      }
+    }
+
+    GaussSeidel(const bcrs_matrix_type& A, const InverseBlocks& inverseBlockDiag_, Teuchos::ArrayRCP<LO>& applyRows_, Scalar omega_)
+    {
+      numRows = A.getNodeNumRows();
+      //note: next 2 lines are no-ops if inverseBlockDiag_ is already host-accessible
+      inverseBlockDiag = Kokkos::create_mirror_view(inverseBlockDiag_);
+      Kokkos::deep_copy(inverseBlockDiag, inverseBlockDiag_);
+      applyRows = applyRows_;
+      omega = omega_;
+      auto AlocalGraph = A.getCrsGraph().getLocalGraph();
+      //A.sync_host(); //note: this only syncs values, not graph
+      Avalues = A.getValuesHost();
+      Arowmap = Kokkos::create_mirror_view(AlocalGraph.row_map);
+      Aentries = Kokkos::create_mirror_view(AlocalGraph.entries);
+      Kokkos::deep_copy(Arowmap, AlocalGraph.row_map);
+      Kokkos::deep_copy(Aentries, AlocalGraph.entries);
+      blockSize = A.getBlockSize();
+    }
+
+    template<bool useApplyRows, bool multipleRHS, bool omegaNotOne>
+    void applyImpl(multivector_type& x, const multivector_type& b, Tpetra::ESweepDirection direction)
+    {
+      //note: direction is either Forward or Backward (Symmetric is handled in apply())
+      LO numApplyRows = useApplyRows ? (LO) applyRows.size() : numRows;
+      //note: inverseDiagVec always has only one column
+      auto inverseDiag = inverseDiagVec->get1dView();
+      bool forward = direction == Tpetra::Forward;
+      if(multipleRHS)
+      {
+        LO numVecs = x.getNumVectors();
+        Teuchos::Array<IST> accum(numVecs);
+        auto xlcl = x.get2dViewNonConst();
+        auto blcl = b.get2dView();
+        for(LO i = 0; i < numApplyRows; i++)
+        {
+          LO row;
+          if(forward)
+            row = useApplyRows ? applyRows[i] : i;
+          else
+            row = useApplyRows ?
applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + for(LO k = 0; k < numVecs; k++) + { + accum[k] = blcl[k][row]; + } + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + LO col = Aentries(j); + IST val = Avalues(j); + for(LO k = 0; k < numVecs; k++) + { + accum[k] -= val * IST(xlcl[k][col]); + } + } + //Update x + IST dinv = inverseDiag[row]; + for(LO k = 0; k < numVecs; k++) + { + if(omegaNotOne) + xlcl[k][row] += Scalar(omega * dinv * accum[k]); + else + xlcl[k][row] += Scalar(dinv * accum[k]); + } + } + } + else + { + auto xlcl = Kokkos::subview(x.getLocalViewHost(), Kokkos::ALL(), 0); + auto blcl = Kokkos::subview(b.getLocalViewHost(), Kokkos::ALL(), 0); + auto dlcl = Kokkos::subview(inverseDiagVec->getLocalViewHost(), Kokkos::ALL(), 0); + for(LO i = 0; i < numApplyRows; i++) + { + LO row; + if(forward) + row = useApplyRows ? applyRows[i] : i; + else + row = useApplyRows ? applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + IST sum = blcl(row); + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + sum -= Avalues(j) * xlcl(Aentries(j)); + } + //Update x + IST dinv = dlcl(row); + if(omegaNotOne) + xlcl(row) += omega * dinv * sum; + else + xlcl(row) += dinv * sum; + } + } + } + + void applyBlock(block_multivector_type& x, const block_multivector_type& b, Tpetra::ESweepDirection direction) + { + if(direction == Tpetra::Symmetric) + { + applyBlock(x, b, Tpetra::Forward); + applyBlock(x, b, Tpetra::Backward); + return; + } + //Number of scalars in Avalues per block entry. + Offset bs2 = blockSize * blockSize; + LO numVecs = x.getNumVectors(); + Kokkos::View accum( + Kokkos::ViewAllocateWithoutInitializing("BlockGaussSeidel Accumulator"), blockSize, numVecs); + Kokkos::View dinv_accum( + Kokkos::ViewAllocateWithoutInitializing("BlockGaussSeidel A_ii^-1*Accumulator"), blockSize, numVecs); + bool useApplyRows = !applyRows.is_null(); + bool forward = direction == Tpetra::Forward; + LO numApplyRows = useApplyRows ? applyRows.size() : numRows; + for(LO i = 0; i < numApplyRows; i++) + { + LO row; + if(forward) + row = useApplyRows ? applyRows[i] : i; + else + row = useApplyRows ? 
applyRows[numApplyRows - 1 - i] : numApplyRows - 1 - i; + for(LO v = 0; v < numVecs; v++) + { + auto bRow = b.getLocalBlock (row, v); + for(LO k = 0; k < blockSize; k++) + { + accum(k, v) = bRow(k); + } + } + Offset rowBegin = Arowmap(row); + Offset rowEnd = Arowmap(row + 1); + for(Offset j = rowBegin; j < rowEnd; j++) + { + LO col = Aentries(j); + IST* blk = &Avalues(j * bs2); + for(LO v = 0; v < numVecs; v++) + { + auto xCol = x.getLocalBlock (col, v); + for(LO br = 0; br < blockSize; br++) + { + for(LO bc = 0; bc < blockSize; bc++) + { + IST Aval = blk[br * blockSize + bc]; + accum(br, v) -= Aval * xCol(bc); + } + } + } + } + //Update x: term is omega * Aii^-1 * accum, where omega is scalar, Aii^-1 is bs*bs, and accum is bs*nv + auto invBlock = Kokkos::subview(inverseBlockDiag, row, Kokkos::ALL(), Kokkos::ALL()); + Kokkos::deep_copy(dinv_accum, KAT::zero()); + for(LO v = 0; v < numVecs; v++) + { + for(LO br = 0; br < blockSize; br++) + { + for(LO bc = 0; bc < blockSize; bc++) + { + dinv_accum(br, v) += invBlock(br, bc) * accum(bc, v); + } + } + } + //Update x + for(LO v = 0; v < numVecs; v++) + { + auto xRow = x.getLocalBlock (row, v); + for(LO k = 0; k < blockSize; k++) + { + xRow(k) += omega * dinv_accum(k, v); + } + } + } + } + + void apply(multivector_type& x, const multivector_type& b, Tpetra::ESweepDirection direction) + { + if(direction == Tpetra::Symmetric) + { + apply(x, b, Tpetra::Forward); + apply(x, b, Tpetra::Backward); + return; + } + else if(applyRows.is_null()) + { + if(x.getNumVectors() > 1) + this->template applyImpl(x, b, direction); + else + { + //Optimize for the all-rows, single vector, omega = 1 case + if(omega == KAT::one()) + this->template applyImpl(x, b, direction); + else + this->template applyImpl(x, b, direction); + } + } + else + { + this->template applyImpl(x, b, direction); + } + } + + values_t Avalues; //length = Aentries.extent(0) * blockSize * blockSize + rowmap_t Arowmap; + entries_t Aentries; + LO numRows; + IST omega; + //If set up with BlockCrsMatrix, the block size. Otherwise 1. + LO blockSize; + //If using a non-block matrix, the inverse diagonal. + Teuchos::RCP inverseDiagVec; + //If using a block matrix, the inverses of all diagonal blocks. + InverseBlocksHost inverseBlockDiag; + //If null, apply over all rows in natural order. Otherwise, apply for each row listed in order. + Teuchos::ArrayRCP applyRows; + }; +} +} + +#endif diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp index 73c1d48c3613..5d6805b64610 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_decl.hpp @@ -50,7 +50,7 @@ #include "Tpetra_BlockCrsMatrix.hpp" #include #include - +#include "Ifpack2_Details_GaussSeidel.hpp" #ifndef DOXYGEN_SHOULD_SKIP_THIS namespace Ifpack2 { @@ -261,6 +261,9 @@ class Relaxation : //! The Node type used by the input MatrixType. typedef typename MatrixType::node_type node_type; + //! The Kokkos device type used by the input MatrixType. + typedef typename MatrixType::node_type::device_type device_type; + //! The type of the magnitude (absolute value) of a matrix entry. typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; @@ -583,17 +586,26 @@ class Relaxation : typedef Teuchos::ScalarTraits STS; typedef Teuchos::ScalarTraits STM; + typedef typename Kokkos::ArithTraits::val_type impl_scalar_type; + /// \brief Tpetra::CrsMatrix specialization used by this class. 
/// /// We use this for dynamic casts to dispatch to the most efficient /// implementation of various relaxation kernels. typedef Tpetra::CrsMatrix crs_matrix_type; + typedef Tpetra::CrsGraph crs_graph_type; + typedef Tpetra::MultiVector multivector_type; typedef Tpetra::BlockCrsMatrix block_crs_matrix_type; typedef Tpetra::BlockMultiVector block_multivector_type; + typedef Tpetra::Map map_type; + typedef Tpetra::Import import_type; + //@} //! \name Implementation of multithreaded Gauss-Seidel. @@ -611,6 +623,13 @@ class Relaxation : MyExecSpace, TemporaryWorkSpace,PersistentWorkSpace > mt_kernel_handle_type; Teuchos::RCP mtKernelHandle_; + //@} + //! \name Implementation of serial Gauss-Seidel. + //@{ + + using SerialGaussSeidel = Ifpack2::Details::GaussSeidel; + Teuchos::RCP serialGaussSeidel_; + //@} //! \name Unimplemented methods that you are syntactically forbidden to call. //@{ @@ -646,38 +665,36 @@ class Relaxation : const Tpetra::MultiVector& X, Tpetra::MultiVector& Y) const; - //! Apply Gauss-Seidel to X, returning the result in Y. - void ApplyInverseGS( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply multi-threaded Gauss-Seidel to X, returning the result in Y. + //! Apply multi-threaded Gauss-Seidel for solving AX = B. void ApplyInverseMTGS_CrsMatrix( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const; - //! Apply Gauss-Seidel for a Tpetra::RowMatrix specialization. - void ApplyInverseGS_RowMatrix( + //! Apply Gauss-Seidel to X, returning the result in Y. Calls one of the Crs/Row/BlockCrs specializations below. + void ApplyInverseSerialGS( const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const; //! Apply Gauss-Seidel for a Tpetra::CrsMatrix specialization. - void - ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + void ApplyInverseSerialGS_CrsMatrix (const crs_matrix_type& A, + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const; - //! Apply Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. - void - ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y); + //! Apply Gauss-Seidel for a Tpetra::RowMatrix specialization. + void ApplyInverseSerialGS_RowMatrix( + const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const; - //! Apply symmetric Gauss-Seidel to X, returning the result in Y. - void ApplyInverseSGS( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; + //! Apply Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. + void ApplyInverseSerialGS_BlockCrsMatrix( + const block_crs_matrix_type& A, + const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction); //! Apply symmetric multi-threaded Gauss-Seidel to X, returning the result in Y. void ApplyInverseMTSGS_CrsMatrix( @@ -689,23 +706,6 @@ class Relaxation : Tpetra::MultiVector& X, const Tpetra::ESweepDirection direction) const; - //! Apply symmetric Gauss-Seidel for a Tpetra::RowMatrix specialization. - void ApplyInverseSGS_RowMatrix( - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply symmetric Gauss-Seidel for a Tpetra::CrsMatrix specialization. 
- void - ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const; - - //! Apply symmetric Gauss-Seidel for a Tpetra::BlockCrsMatrix specialization. - void - ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y); - void computeBlockCrs (); //! A service routine for updating the cached MultiVector @@ -728,9 +728,9 @@ class Relaxation : Teuchos::RCP A_; //! Time object to track timing (setup). //! Importer for parallel Gauss-Seidel and symmetric Gauss-Seidel. - Teuchos::RCP > Importer_; + Teuchos::RCP Importer_; //! Importer for block multivector versions of GS and SGS - Teuchos::RCP > pointImporter_; + Teuchos::RCP pointImporter_; //! Contains the diagonal elements of \c A_. Teuchos::RCP > Diagonal_; //! MultiVector for caching purposes (so apply doesn't need to allocate one on each call) diff --git a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp index afb0253ee507..c06e17dea6d2 100644 --- a/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp +++ b/packages/ifpack2/src/Ifpack2_Relaxation_def.hpp @@ -199,10 +199,10 @@ setMatrix(const Teuchos::RCP& A) Diagonal_ = Teuchos::null; // ??? what if this comes from the user??? isInitialized_ = false; IsComputed_ = false; - using device_type = typename node_type::device_type; diagOffsets_ = Kokkos::View(); savedDiagOffsets_ = false; hasBlockCrsMatrix_ = false; + serialGaussSeidel_ = Teuchos::null; if (! A.is_null ()) { IsParallel_ = (A->getRowMap ()->getComm ()->getSize () > 1); } @@ -576,18 +576,18 @@ apply (const Tpetra::MultiVector::initialize () hasBlockCrsMatrix_ = ! A_bcrs.is_null (); } + serialGaussSeidel_ = Teuchos::null; + if (PrecType_ == Details::MTGS || PrecType_ == Details::MTSGS || PrecType_ == Details::GS2 || PrecType_ == Details::SGS2) { const crs_matrix_type* crsMat = @@ -806,7 +808,6 @@ void Relaxation::computeBlockCrs () using Teuchos::rcp_dynamic_cast; using Teuchos::reduceAll; typedef local_ordinal_type LO; - typedef typename node_type::device_type device_type; const std::string timerName ("Ifpack2::Relaxation::computeBlockCrs"); Teuchos::RCP timer = Teuchos::TimeMonitor::lookupCounter (timerName); @@ -930,6 +931,7 @@ void Relaxation::computeBlockCrs () "Tpetra::BlockCrsMatrix, one or more diagonal block LU factorizations " "failed on one or more (MPI) processes."); #endif // HAVE_IFPACK2_DEBUG + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*blockCrsA, blockDiag_, localSmoothingIndices_, DampingFactor_)); } // end TimeMonitor scope ComputeTime_ += (timer->wallTime() - startTime); @@ -956,7 +958,6 @@ void Relaxation::compute () using LO = local_ordinal_type; using vector_type = Tpetra::Vector; - using device_type = typename vector_type::device_type; using IST = typename vector_type::impl_scalar_type; using KAT = Kokkos::ArithTraits; @@ -993,6 +994,10 @@ void Relaxation::compute () { // Timing of compute starts here. Teuchos::TimeMonitor timeMon (*timer); + + const crs_matrix_type* crsMat = + dynamic_cast (A_.get()); + TEUCHOS_TEST_FOR_EXCEPTION (NumSweeps_ < 0, std::logic_error, methodName << ": NumSweeps_ = " << NumSweeps_ << " < 0. " @@ -1020,8 +1025,6 @@ void Relaxation::compute () // method (not inherited from RowMatrix's interface). It's // perfectly valid to do relaxation on a RowMatrix which is not // a CrsMatrix. - const crs_matrix_type* crsMat = - dynamic_cast (A_.getRawPtr ()); if (crsMat == nullptr || ! 
crsMat->isStaticGraph ()) { A_->getLocalDiagCopy (*Diagonal_); // slow path } @@ -1324,8 +1327,6 @@ void Relaxation::compute () //KokkosKernels GaussSeidel Initialization. - const crs_matrix_type* crsMat = - dynamic_cast (A_.get()); TEUCHOS_TEST_FOR_EXCEPTION (crsMat == nullptr, std::logic_error, methodName << ": " "Multithreaded Gauss-Seidel methods currently only work " @@ -1347,6 +1348,12 @@ void Relaxation::compute () diagView_1d, is_matrix_structurally_symmetric_); } + else if(PrecType_ == Ifpack2::Details::GS || PrecType_ == Ifpack2::Details::SGS) { + if(crsMat) + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*crsMat, Diagonal_, localSmoothingIndices_, DampingFactor_)); + else + serialGaussSeidel_ = rcp(new SerialGaussSeidel(*A_, Diagonal_, localSmoothingIndices_, DampingFactor_)); + } } // end TimeMonitor scope ComputeTime_ += (timer->wallTime() - startTime); @@ -1556,8 +1563,9 @@ ApplyInverseJacobi_BlockCrsMatrix (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const +ApplyInverseSerialGS (const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const { using this_type = Relaxation; // The CrsMatrix version is faster, because it can access the sparse @@ -1574,13 +1582,13 @@ ApplyInverseGS (const Tpetra::MultiVector (A_.getRawPtr ()); if (blockCrsMat != nullptr) { - const_cast (*this).ApplyInverseGS_BlockCrsMatrix (*blockCrsMat, X, Y); + const_cast (*this).ApplyInverseSerialGS_BlockCrsMatrix (*blockCrsMat, X, Y, direction); } else if (crsMat != nullptr) { - ApplyInverseGS_CrsMatrix (*crsMat, X, Y); + ApplyInverseSerialGS_CrsMatrix (*crsMat, X, Y, direction); } else { - ApplyInverseGS_RowMatrix (X, Y); + ApplyInverseSerialGS_RowMatrix (X, Y, direction); } } @@ -1588,9 +1596,9 @@ ApplyInverseGS (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS_RowMatrix (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ +ApplyInverseSerialGS_RowMatrix (const Tpetra::MultiVector& X, + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) const { using Teuchos::Array; using Teuchos::ArrayRCP; using Teuchos::ArrayView; @@ -1607,20 +1615,7 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVectorgetNodeMaxNumRowEntries (); - Array Indices (maxLength); - Array Values (maxLength); - - // Local smoothing stuff - const size_t numMyRows = A_->getNodeNumRows(); - const local_ordinal_type* rowInd = 0; - size_t numActive = numMyRows; - bool do_local = ! localSmoothingIndices_.is_null (); - if (do_local) { - rowInd = localSmoothingIndices_.getRawPtr (); - numActive = localSmoothingIndices_.size (); - } + size_t NumVectors = X.getNumVectors(); RCP Y2; if (IsParallel_) { @@ -1636,147 +1631,29 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVector d_rcp = Diagonal_->get1dView (); - ArrayView d_ptr = d_rcp(); - - // Constant stride check - bool constant_stride = X.isConstantStride() && Y2->isConstantStride(); - - if (constant_stride) { - // extract 1D RCPs - size_t x_stride = X.getStride(); - size_t y2_stride = Y2->getStride(); - ArrayRCP y2_rcp = Y2->get1dViewNonConst(); - ArrayRCP x_rcp = X.get1dView(); - ArrayView y2_ptr = y2_rcp(); - ArrayView x_ptr = x_rcp(); - Array dtemp(NumVectors,STS::zero()); - - for (int j = 0; j < NumSweeps_; ++j) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // FIXME (mfh 27 May 2019) This doesn't deep copy -- not - // clear if this is correct. Reevaluate at some point. 
- - *Y2 = Y; // just copy, since domain and column Maps are the same - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - - if (! DoBackwardGS_) { // Forward sweep - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - } - else { // Backward sweep - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign (NumVectors, STS::zero ()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } + const_cast(X).sync_host(); + for (int j = 0; j < NumSweeps_; ++j) { + // data exchange is here, once per sweep + if (IsParallel_) { + if (Importer_.is_null ()) { + // FIXME (mfh 27 May 2019) This doesn't deep copy -- not + // clear if this is correct. Reevaluate at some point. - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - } - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); + *Y2 = Y; // just copy, since domain and column Maps are the same + } else { + Y2->doImport (Y, *Importer_, Tpetra::INSERT); } } - } - else { - // extract 2D RCPS - ArrayRCP > y2_ptr = Y2->get2dViewNonConst (); - ArrayRCP > x_ptr = X.get2dView (); - - for (int j = 0; j < NumSweeps_; ++j) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - *Y2 = Y; // just copy, since domain and column Maps are the same - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } + Y2->sync_host(); + serialGaussSeidel_->apply(*Y2, X, direction); + Y2->modify_host(); - if (! DoBackwardGS_) { // Forward sweep - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as (do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp); - } - } - } - else { // Backward sweep - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as (do_local ? 
rowInd[ii] : ii); - - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * d_ptr[i] * (x_local[i] - dtemp); - } - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } + // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. + if (IsParallel_) { + Tpetra::deep_copy (Y, *Y2); } } - // See flop count discussion in implementation of ApplyInverseGS_CrsMatrix(). const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; const double numVectors = as (X.getNumVectors ()); @@ -1790,21 +1667,173 @@ ApplyInverseGS_RowMatrix (const Tpetra::MultiVector void Relaxation:: -ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const +ApplyInverseSerialGS_CrsMatrix(const crs_matrix_type& A, + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const { - using Teuchos::as; - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - if (localSmoothingIndices_.is_null ()) { - A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); + using Teuchos::null; + using Teuchos::RCP; + using Teuchos::rcp; + using Teuchos::rcpFromRef; + using Teuchos::rcp_const_cast; + typedef scalar_type Scalar; + const char prefix[] = "Ifpack2::Relaxation::SerialGS: "; + const scalar_type ZERO = Teuchos::ScalarTraits::zero (); + + TEUCHOS_TEST_FOR_EXCEPTION( + ! A.isFillComplete (), std::runtime_error, + prefix << "The matrix is not fill complete."); + TEUCHOS_TEST_FOR_EXCEPTION( + NumSweeps_ < 0, std::invalid_argument, + prefix << "The number of sweeps must be nonnegative, " + "but you provided numSweeps = " << NumSweeps_ << " < 0."); + + if (NumSweeps_ == 0) { + return; } - else { - A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (), - DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); + + RCP importer = A.getGraph ()->getImporter (); + + RCP domainMap = A.getDomainMap (); + RCP rangeMap = A.getRangeMap (); + RCP rowMap = A.getGraph ()->getRowMap (); + RCP colMap = A.getGraph ()->getColMap (); + +#ifdef HAVE_IFPACK2_DEBUG + { + // The relation 'isSameAs' is transitive. It's also a + // collective, so we don't have to do a "shared" test for + // exception (i.e., a global reduction on the test value). + TEUCHOS_TEST_FOR_EXCEPTION( + ! X.getMap ()->isSameAs (*domainMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "multivector X be in the domain Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "B be in the range Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! Diagonal_->getMap ()->isSameAs (*rowMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " + "D be in the row Map of the matrix."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! 
rowMap->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the " + "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); + TEUCHOS_TEST_FOR_EXCEPTION( + ! domainMap->isSameAs (*rangeMap), std::runtime_error, + "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and " + "the range Map of the matrix be the same."); + } +#endif + + // Fetch a (possibly cached) temporary column Map multivector + // X_colMap, and a domain Map view X_domainMap of it. Both have + // constant stride by construction. We know that the domain Map + // must include the column Map, because our Gauss-Seidel kernel + // requires that the row Map, domain Map, and range Map are all + // the same, and that each process owns all of its own diagonal + // entries of the matrix. + + RCP X_colMap; + RCP X_domainMap; + bool copyBackOutput = false; + if (importer.is_null ()) { + X_colMap = Teuchos::rcpFromRef (X); + X_domainMap = Teuchos::rcpFromRef (X); + // Column Map and domain Map are the same, so there are no + // remote entries. Thus, if we are not setting the initial + // guess to zero, we don't have to worry about setting remote + // entries to zero, even though we are not doing an Import in + // this case. + if (ZeroStartingSolution_) { + X_colMap->putScalar (ZERO); + } + // No need to copy back to X at end. + } + else { // Column Map and domain Map are _not_ the same. + updateCachedMultiVector(colMap, X.getNumVectors()); + X_colMap = cachedMV_; + X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); + +#ifdef HAVE_TPETRA_DEBUG + auto X_colMap_host_view = X_colMap->getLocalViewHost (); + auto X_domainMap_host_view = X_domainMap->getLocalViewHost (); + + if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { + TEUCHOS_TEST_FOR_EXCEPTION + (X_colMap_host_view.data () != X_domainMap_host_view.data (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to " + "start of column Map view of X is not equal to pointer to start of " + "(domain Map view of) X. This may mean that Tpetra::MultiVector::" + "offsetViewNonConst is broken. " + "Please report this bug to the Tpetra developers."); + } + + TEUCHOS_TEST_FOR_EXCEPTION( + X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || + X_colMap->getLocalLength () < X_domainMap->getLocalLength (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " + "X_colMap has fewer local rows than X_domainMap. " + "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) + << ", X_domainMap_host_view.extent(0) = " + << X_domainMap_host_view.extent (0) + << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () + << ", and X_domainMap->getLocalLength() = " + << X_domainMap->getLocalLength () + << ". This means that Tpetra::MultiVector::offsetViewNonConst " + "is broken. Please report this bug to the Tpetra developers."); + + TEUCHOS_TEST_FOR_EXCEPTION( + X_colMap->getNumVectors () != X_domainMap->getNumVectors (), + std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " + "X_colMap has a different number of columns than X_domainMap. " + "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () + << " != X_domainMap->getNumVectors() = " + << X_domainMap->getNumVectors () + << ". This means that Tpetra::MultiVector::offsetViewNonConst " + "is broken. Please report this bug to the Tpetra developers."); +#endif // HAVE_TPETRA_DEBUG + + if (ZeroStartingSolution_) { + // No need for an Import, since we're filling with zeros. 
+ X_colMap->putScalar (ZERO); + } else { + // We could just copy X into X_domainMap. However, that + // wastes a copy, because the Import also does a copy (plus + // communication). Since the typical use case for + // Gauss-Seidel is a small number of sweeps (2 is typical), we + // don't want to waste that copy. Thus, we do the Import + // here, and skip the first Import in the first sweep. + // Importing directly from X effects the copy into X_domainMap + // (which is a view of X_colMap). + X_colMap->doImport (X, *importer, Tpetra::INSERT); + } + copyBackOutput = true; // Don't forget to copy back at end. + } // if column and domain Maps are (not) the same + + const_cast(B).sync_host(); + for (int sweep = 0; sweep < NumSweeps_; ++sweep) { + if (! importer.is_null () && sweep > 0) { + // We already did the first Import for the zeroth sweep above, + // if it was necessary. + X_colMap->doImport (*X_domainMap, *importer, Tpetra::INSERT); + } + X_colMap->sync_host (); + // Do local Gauss-Seidel (forward, backward or symmetric) + serialGaussSeidel_->apply(*X_colMap, B, direction); + X_colMap->modify_host (); + } + + if (copyBackOutput) { + try { + deep_copy (X , *X_domainMap); // Copy result back into X. + } catch (std::exception& e) { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) " + "threw an exception: " << e.what ()); + } } // For each column of output, for each sweep over the matrix: @@ -1819,9 +1848,9 @@ ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, // Floating-point operations due to the damping factor, per matrix // row, per direction, per columm of output. const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); + const double numVectors = X.getNumVectors (); + const double numGlobalRows = A_->getGlobalNumRows (); + const double numGlobalNonzeros = A_->getGlobalNumEntries (); ApplyFlops_ += NumSweeps_ * numVectors * (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); } @@ -1829,22 +1858,15 @@ ApplyInverseGS_CrsMatrix (const crs_matrix_type& A, template void Relaxation:: -ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, +ApplyInverseSerialGS_BlockCrsMatrix (const block_crs_matrix_type& A, const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) + Tpetra::MultiVector& Y, + Tpetra::ESweepDirection direction) { using Tpetra::INSERT; using Teuchos::RCP; using Teuchos::rcp; using Teuchos::rcpFromRef; - using BMV = Tpetra::BlockMultiVector; - using MV = Tpetra::MultiVector; - using map_type = Tpetra::Map; - using import_type = Tpetra::Import; //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides // @@ -1853,11 +1875,11 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, // does not have constant stride. We should check for that case // here, in case it doesn't work in localGaussSeidel (which is // entirely possible). 
- BMV yBlock(Y, *(A.getGraph ()->getDomainMap()), A.getBlockSize()); - const BMV xBlock(X, *(A.getColMap ()), A.getBlockSize()); + block_multivector_type yBlock(Y, *(A.getGraph ()->getDomainMap()), A.getBlockSize()); + const block_multivector_type xBlock(X, *(A.getColMap ()), A.getBlockSize()); bool performImport = false; - RCP yBlockCol; + RCP yBlockCol; if (Importer_.is_null()) { yBlockCol = rcpFromRef(yBlock); } @@ -1866,7 +1888,7 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, yBlockColumnPointMap_->getNumVectors() != yBlock.getNumVectors() || yBlockColumnPointMap_->getBlockSize() != yBlock.getBlockSize()) { yBlockColumnPointMap_ = - rcp(new BMV(*(A.getColMap()), A.getBlockSize(), + rcp(new block_multivector_type(*(A.getColMap()), A.getBlockSize(), static_cast(yBlock.getNumVectors()))); } yBlockCol = yBlockColumnPointMap_; @@ -1878,9 +1900,9 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, performImport = true; } - MV yBlock_mv; - MV yBlockCol_mv; - RCP yBlockColPointDomain; + multivector_type yBlock_mv; + multivector_type yBlockCol_mv; + RCP yBlockColPointDomain; if (performImport) { // create views (shallow copies) yBlock_mv = yBlock.getMultiVectorView(); yBlockCol_mv = yBlockCol->getMultiVectorView(); @@ -1892,18 +1914,17 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, yBlockCol->putScalar(STS::zero ()); } else if (performImport) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); + yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - + const_cast(xBlock).sync_host(); for (int sweep = 0; sweep < NumSweeps_; ++sweep) { if (performImport && sweep > 0) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); + yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, Tpetra::INSERT); } - A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_, - DampingFactor_, direction); + yBlockCol->sync_host(); + serialGaussSeidel_->applyBlock(*yBlockCol, xBlock, direction); + yBlockCol->modify_host(); if (performImport) { Tpetra::deep_copy(Y, *yBlockColPointDomain); } @@ -1913,9 +1934,10 @@ ApplyInverseGS_BlockCrsMatrix (const block_crs_matrix_type& A, template void Relaxation:: -MTGaussSeidel (const Tpetra::MultiVector& B, - Tpetra::MultiVector& X, - const Tpetra::ESweepDirection direction) const +ApplyInverseMTGS_CrsMatrix( + const Tpetra::MultiVector& B, + Tpetra::MultiVector& X, + Tpetra::ESweepDirection direction) const { using Teuchos::null; using Teuchos::RCP; @@ -1925,9 +1947,6 @@ MTGaussSeidel (const Tpetra::MultiVector::zero (); @@ -1971,15 +1990,9 @@ MTGaussSeidel (const Tpetra::MultiVector MV; - typedef typename crs_matrix_type::import_type import_type; - typedef typename crs_matrix_type::export_type export_type; - typedef typename crs_matrix_type::map_type map_type; - RCP importer = crsMat->getGraph ()->getImporter (); - RCP exporter = crsMat->getGraph ()->getExporter (); TEUCHOS_TEST_FOR_EXCEPTION( - ! exporter.is_null (), std::runtime_error, + ! crsMat->getGraph ()->getExporter ().is_null (), std::runtime_error, "This method's implementation currently requires that the matrix's row, " "domain, and range Maps be the same. 
This cannot be the case, because " "the matrix has a nontrivial Export object."); @@ -2029,8 +2042,8 @@ MTGaussSeidel (const Tpetra::MultiVector X_colMap; - RCP X_domainMap; + RCP X_colMap; + RCP X_domainMap; bool copyBackOutput = false; if (importer.is_null ()) { if (X.isConstantStride ()) { @@ -2147,7 +2160,7 @@ MTGaussSeidel (const Tpetra::MultiVector B_in; + RCP B_in; if (B.isConstantStride ()) { B_in = rcpFromRef (B); } @@ -2155,8 +2168,7 @@ MTGaussSeidel (const Tpetra::MultiVector B_in_nonconst = crsMat->getRowMapMultiVector (B, true); - RCP B_in_nonconst = rcp (new MV (rowMap, B.getNumVectors())); + RCP B_in_nonconst = rcp (new multivector_type(rowMap, B.getNumVectors())); try { deep_copy (*B_in_nonconst, B); } catch (std::exception& e) { @@ -2166,7 +2178,7 @@ MTGaussSeidel (const Tpetra::MultiVector (B_in_nonconst); + B_in = rcp_const_cast (B_in_nonconst); /* TPETRA_EFFICIENCY_WARNING( @@ -2245,382 +2257,6 @@ MTGaussSeidel (const Tpetra::MultiVector -void -Relaxation:: -ApplyInverseMTSGS_CrsMatrix (const Tpetra::MultiVector& B, - Tpetra::MultiVector& X) const -{ - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - this->MTGaussSeidel (B, X, direction); -} - - -template -void Relaxation::ApplyInverseMTGS_CrsMatrix ( - const Tpetra::MultiVector& B, - Tpetra::MultiVector& X) const { - - const Tpetra::ESweepDirection direction = - DoBackwardGS_ ? Tpetra::Backward : Tpetra::Forward; - this->MTGaussSeidel (B, X, direction); -} - -template -void -Relaxation:: -ApplyInverseSGS (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using this_type = Relaxation; - // The CrsMatrix version is faster, because it can access the sparse - // matrix data directly, rather than by copying out each row's data - // in turn. Thus, we check whether the RowMatrix is really a - // CrsMatrix. - // - // FIXME (mfh 07 Jul 2013) See note on crs_matrix_type typedef - // declaration in Ifpack2_Relaxation_decl.hpp header file. The code - // will still be correct if the cast fails, but it will use an - // unoptimized kernel. - const block_crs_matrix_type* blockCrsMat = - dynamic_cast (A_.getRawPtr ()); - const crs_matrix_type* crsMat = - dynamic_cast (A_.getRawPtr ()); - if (blockCrsMat != nullptr) { - const_cast (*this).ApplyInverseSGS_BlockCrsMatrix(*blockCrsMat, X, Y); - } - else if (crsMat != nullptr) { - ApplyInverseSGS_CrsMatrix (*crsMat, X, Y); - } - else { - ApplyInverseSGS_RowMatrix (X, Y); - } -} - - -template -void -Relaxation:: -ApplyInverseSGS_RowMatrix (const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using MV = Tpetra::MultiVector; - - // Tpetra's GS implementation for CrsMatrix handles zeroing out the - // starting multivector itself. The generic RowMatrix version here - // does not, so we have to zero out Y here. 
- if (ZeroStartingSolution_) { - Y.putScalar (STS::zero ()); - } - - const size_t NumVectors = X.getNumVectors (); - const size_t maxLength = A_->getNodeMaxNumRowEntries (); - Array Indices (maxLength); - Array Values (maxLength); - - // Local smoothing stuff - const size_t numMyRows = A_->getNodeNumRows(); - const local_ordinal_type * rowInd = 0; - size_t numActive = numMyRows; - bool do_local = !localSmoothingIndices_.is_null(); - if(do_local) { - rowInd = localSmoothingIndices_.getRawPtr(); - numActive = localSmoothingIndices_.size(); - } - - - RCP Y2; - if (IsParallel_) { - if (Importer_.is_null ()) { // domain and column Maps are the same. - updateCachedMultiVector (Y.getMap (), NumVectors); - } - else { - updateCachedMultiVector (Importer_->getTargetMap (), NumVectors); - } - Y2 = cachedMV_; - } - else { - Y2 = rcpFromRef (Y); - } - - // Diagonal - ArrayRCP d_rcp = Diagonal_->get1dView(); - ArrayView d_ptr = d_rcp(); - - // Constant stride check - bool constant_stride = X.isConstantStride() && Y2->isConstantStride(); - - if(constant_stride) { - // extract 1D RCPs - size_t x_stride = X.getStride(); - size_t y2_stride = Y2->getStride(); - ArrayRCP y2_rcp = Y2->get1dViewNonConst(); - ArrayRCP x_rcp = X.get1dView(); - ArrayView y2_ptr = y2_rcp(); - ArrayView x_ptr = x_rcp(); - Array dtemp(NumVectors,STS::zero()); - - for (int j = 0; j < NumSweeps_; j++) { - // data exchange is here, once per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // just copy, since domain and column Maps are the same - Tpetra::deep_copy (*Y2, Y); - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - size_t NumEntries; - A_->getLocalRowCopy (i, Indices (), Values (), NumEntries); - dtemp.assign(NumVectors,STS::zero()); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - for (size_t m = 0; m < NumVectors; ++m) { - dtemp[m] += Values[k] * y2_ptr[col + y2_stride*m]; - } - } - - for (size_t m = 0; m < NumVectors; ++m) { - y2_ptr[i + y2_stride*m] += DampingFactor_ * d_ptr[i] * (x_ptr[i + x_stride*m] - dtemp[m]); - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } - } - } - else { - // extract 2D RCPs - ArrayRCP > y2_ptr = Y2->get2dViewNonConst (); - ArrayRCP > x_ptr = X.get2dView (); - - for (int iter = 0; iter < NumSweeps_; ++iter) { - // only one data exchange per sweep - if (IsParallel_) { - if (Importer_.is_null ()) { - // just copy, since domain and column Maps are the same - Tpetra::deep_copy (*Y2, Y); - } else { - Y2->doImport (Y, *Importer_, Tpetra::INSERT); - } - } - - for (size_t ii = 0; ii < numActive; ++ii) { - local_ordinal_type i = as(do_local ? 
rowInd[ii] : ii); - const scalar_type diag = d_ptr[i]; - size_t NumEntries; - A_->getLocalRowCopy (as (i), Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag; - } - } - - // ptrdiff_t is the same size as size_t, but is signed. Being - // signed is important so that i >= 0 is not trivially true. - for (ptrdiff_t ii = as (numActive) - 1; ii >= 0; --ii) { - local_ordinal_type i = as(do_local ? rowInd[ii] : ii); - const scalar_type diag = d_ptr[i]; - size_t NumEntries; - A_->getLocalRowCopy (as (i), Indices (), Values (), NumEntries); - - for (size_t m = 0; m < NumVectors; ++m) { - scalar_type dtemp = STS::zero (); - ArrayView x_local = (x_ptr())[m](); - ArrayView y2_local = (y2_ptr())[m](); - - for (size_t k = 0; k < NumEntries; ++k) { - const local_ordinal_type col = Indices[k]; - dtemp += Values[k] * y2_local[col]; - } - y2_local[i] += DampingFactor_ * (x_local[i] - dtemp) * diag; - } - } - - // FIXME (mfh 02 Jan 2013) This is only correct if row Map == range Map. - if (IsParallel_) { - Tpetra::deep_copy (Y, *Y2); - } - } - } - - // See flop count discussion in implementation of ApplyInverseSGS_CrsMatrix(). - const double dampingFlops = (DampingFactor_ == STS::one()) ? 0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); - ApplyFlops_ += 2.0 * NumSweeps_ * numVectors * - (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); -} - - -template -void -Relaxation:: -ApplyInverseSGS_CrsMatrix (const crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) const -{ - using Teuchos::as; - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - if (localSmoothingIndices_.is_null ()) { - A.gaussSeidelCopy (Y, X, *Diagonal_, DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); - } - else { - A.reorderedGaussSeidelCopy (Y, X, *Diagonal_, localSmoothingIndices_ (), - DampingFactor_, direction, - NumSweeps_, ZeroStartingSolution_); - } - - // For each column of output, for each sweep over the matrix: - // - // - One + and one * for each matrix entry - // - One / and one + for each row of the matrix - // - If the damping factor is not one: one * for each row of the - // matrix. (It's not fair to count this if the damping factor is - // one, since the implementation could skip it. Whether it does - // or not is the implementation's choice.) - // - // Each sweep of symmetric Gauss-Seidel / SOR counts as two sweeps, - // one forward and one backward. - - // Floating-point operations due to the damping factor, per matrix - // row, per direction, per columm of output. - const double dampingFlops = (DampingFactor_ == STS::one()) ? 
0.0 : 1.0; - const double numVectors = as (X.getNumVectors ()); - const double numGlobalRows = as (A_->getGlobalNumRows ()); - const double numGlobalNonzeros = as (A_->getGlobalNumEntries ()); - ApplyFlops_ += 2.0 * NumSweeps_ * numVectors * - (2.0 * numGlobalRows + 2.0 * numGlobalNonzeros + dampingFlops); -} - -template -void -Relaxation:: -ApplyInverseSGS_BlockCrsMatrix (const block_crs_matrix_type& A, - const Tpetra::MultiVector& X, - Tpetra::MultiVector& Y) -{ - using Tpetra::INSERT; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using BMV = Tpetra::BlockMultiVector; - using MV = Tpetra::MultiVector; - using map_type = Tpetra::Map; - using import_type = Tpetra::Import; - - //FIXME: (tcf) 8/21/2014 -- may be problematic for multiple right hand sides - // - // NOTE (mfh 12 Sep 2014) I don't think it should be a problem for - // multiple right-hand sides, unless the input or output MultiVector - // does not have constant stride. We should check for that case - // here, in case it doesn't work in localGaussSeidel (which is - // entirely possible). - BMV yBlock (Y, * (A.getGraph ()->getDomainMap ()), A.getBlockSize ()); - const BMV xBlock (X, * (A.getColMap ()), A.getBlockSize ()); - - bool performImport = false; - RCP yBlockCol; - if (Importer_.is_null ()) { - yBlockCol = Teuchos::rcpFromRef (yBlock); - } - else { - if (yBlockColumnPointMap_.is_null () || - yBlockColumnPointMap_->getNumVectors () != yBlock.getNumVectors () || - yBlockColumnPointMap_->getBlockSize () != yBlock.getBlockSize ()) { - yBlockColumnPointMap_ = - rcp (new BMV (* (A.getColMap ()), A.getBlockSize (), - static_cast (yBlock.getNumVectors ()))); - } - yBlockCol = yBlockColumnPointMap_; - if (pointImporter_.is_null()) { - auto srcMap = rcp(new map_type(yBlock.getPointMap())); - auto tgtMap = rcp(new map_type(yBlockCol->getPointMap())); - pointImporter_ = rcp(new import_type(srcMap, tgtMap)); - } - performImport = true; - } - - MV yBlock_mv; - MV yBlockCol_mv; - RCP yBlockColPointDomain; - if (performImport) { // create views (shallow copies) - yBlock_mv = yBlock.getMultiVectorView(); - yBlockCol_mv = yBlockCol->getMultiVectorView(); - yBlockColPointDomain = - yBlockCol_mv.offsetView(A.getDomainMap(), 0); - } - - if (ZeroStartingSolution_) { - yBlockCol->putScalar(STS::zero ()); - } - else if (performImport) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); - } - - // FIXME (mfh 12 Sep 2014) Shouldn't this come from the user's parameter? - const Tpetra::ESweepDirection direction = Tpetra::Symmetric; - - for (int sweep = 0; sweep < NumSweeps_; ++sweep) { - if (performImport && sweep > 0) { - yBlockCol_mv.doImport(yBlock_mv, *pointImporter_, INSERT); - } - A.localGaussSeidel(xBlock, *yBlockCol, blockDiag_, - DampingFactor_, direction); - if (performImport) { - Tpetra::deep_copy(yBlock_mv, *yBlockColPointDomain); - } - } } template @@ -2807,6 +2443,7 @@ describe (Teuchos::FancyOStream &out, } } + } // namespace Ifpack2 #define IFPACK2_RELAXATION_INSTANT(S,LO,GO,N) \ diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp index fed4cda0de9c..cde3ca4f7cf6 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_decl.hpp @@ -351,58 +351,6 @@ class BlockCrsMatrix : const Scalar alpha = Teuchos::ScalarTraits::one (), const Scalar beta = Teuchos::ScalarTraits::zero ()); - /// \brief Version of gaussSeidel(), with fewer requirements on X. 
- /// - /// Not Implemented - void - gaussSeidelCopy (MultiVector &X, - const ::Tpetra::MultiVector &B, - const ::Tpetra::MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Version of reorderedGaussSeidel(), with fewer requirements on X. - /// - /// Not Implemented - void - reorderedGaussSeidelCopy (::Tpetra::MultiVector& X, - const ::Tpetra::MultiVector& B, - const ::Tpetra::MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Local Gauss-Seidel solve, given a factorized diagonal - /// - /// \param Residual [in] The "residual" (right-hand side) block - /// (multi)vector - /// \param Solution [in/out] On input: the initial guess / current - /// approximate solution. On output: the new approximate - /// solution. - /// \param D_inv [in] Block diagonal, the explicit inverse of this - /// matrix's block diagonal (possibly modified for algorithmic - /// reasons). - /// \param omega [in] (S)SOR relaxation coefficient - /// \param direction [in] Forward, Backward, or Symmetric. - /// - /// One may access block i in \c D_inv using the - /// following code: - /// \code - /// auto D_ii = Kokkos::subview(D_inv, i, Kokkos::ALL(), Kokkos::ALL()); - /// \endcode - /// The resulting block is b x b, where b = this->getBlockSize(). - void - localGaussSeidel (const BlockMultiVector& Residual, - BlockMultiVector& Solution, - const Kokkos::View& D_inv, - const Scalar& omega, - const ESweepDirection direction) const; - /// \brief Replace values at the given (mesh, i.e., block) column /// indices, in the given (mesh, i.e., block) row. /// diff --git a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp index 625d2501e18d..3c84d111978f 100644 --- a/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp @@ -1058,158 +1058,6 @@ class GetLocalDiagCopy { graph_.getLocalDiagOffsets (offsets); } - - template - void - BlockCrsMatrix:: - localGaussSeidel (const BlockMultiVector& B, - BlockMultiVector& X, - const Kokkos::View& D_inv, - const Scalar& omega, - const ESweepDirection direction) const - { - using Kokkos::ALL; - const impl_scalar_type zero = - Kokkos::Details::ArithTraits::zero (); - const impl_scalar_type one = - Kokkos::Details::ArithTraits::one (); - const LO numLocalMeshRows = - static_cast (rowMeshMap_.getNodeNumElements ()); - const LO numVecs = static_cast (X.getNumVectors ()); - - // If using (new) Kokkos, replace localMem with thread-local - // memory. Note that for larger block sizes, this will affect the - // two-level parallelization. Look to Stokhos for best practice - // on making this fast for GPUs. - const LO blockSize = getBlockSize (); - Teuchos::Array localMem (blockSize); - Teuchos::Array localMat (blockSize*blockSize); - little_host_vec_type X_lcl (localMem.getRawPtr (), blockSize); - - // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned. 
- LO rowBegin = 0, rowEnd = 0, rowStride = 0; - if (direction == Forward) { - rowBegin = 1; - rowEnd = numLocalMeshRows+1; - rowStride = 1; - } - else if (direction == Backward) { - rowBegin = numLocalMeshRows; - rowEnd = 0; - rowStride = -1; - } - else if (direction == Symmetric) { - this->localGaussSeidel (B, X, D_inv, omega, Forward); - this->localGaussSeidel (B, X, D_inv, omega, Backward); - return; - } - - const Scalar one_minus_omega = Teuchos::ScalarTraits::one()-omega; - const Scalar minus_omega = -omega; - - Kokkos::fence(); - - if (numVecs == 1) { - for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) { - const LO actlRow = lclRow - 1; - - little_host_vec_type B_cur = B.getLocalBlock (actlRow, 0); - COPY (B_cur, X_lcl); - SCAL (static_cast (omega), X_lcl); - - const size_t meshBeg = ptrHost_[actlRow]; - const size_t meshEnd = ptrHost_[actlRow+1]; - for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) { - const LO meshCol = indHost_[absBlkOff]; - const_little_block_type A_cur = - getConstLocalBlockFromAbsOffset (absBlkOff); - little_host_vec_type X_cur = X.getLocalBlock (meshCol, 0); - - // X_lcl += alpha*A_cur*X_cur - const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega; - //X_lcl.matvecUpdate (alpha, A_cur, X_cur); - GEMV (static_cast (alpha), A_cur, X_cur, X_lcl); - } // for each entry in the current local row of the matrix - - // NOTE (mfh 20 Jan 2016) The two input Views here are - // unmanaged already, so we don't have to take unmanaged - // subviews first. - auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ()); - little_host_vec_type X_update = X.getLocalBlock (actlRow, 0); - FILL (X_update, zero); - GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update - } // for each local row of the matrix - } - else { - for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) { - for (LO j = 0; j < numVecs; ++j) { - LO actlRow = lclRow-1; - - little_host_vec_type B_cur = B.getLocalBlock (actlRow, j); - COPY (B_cur, X_lcl); - SCAL (static_cast (omega), X_lcl); - - const size_t meshBeg = ptrHost_[actlRow]; - const size_t meshEnd = ptrHost_[actlRow+1]; - for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) { - const LO meshCol = indHost_[absBlkOff]; - const_little_block_type A_cur = - getConstLocalBlockFromAbsOffset (absBlkOff); - little_host_vec_type X_cur = X.getLocalBlock (meshCol, j); - - // X_lcl += alpha*A_cur*X_cur - const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega; - GEMV (static_cast (alpha), A_cur, X_cur, X_lcl); - } // for each entry in the current local row of the matrx - - auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ()); - auto X_update = X.getLocalBlock (actlRow, j); - FILL (X_update, zero); - GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update - } // for each entry in the current local row of the matrix - } // for each local row of the matrix - } - } - - template - void - BlockCrsMatrix:: - gaussSeidelCopy (MultiVector &/* X */, - const MultiVector &/* B */, - const MultiVector &/* D */, - const Scalar& /* dampingFactor */, - const ESweepDirection /* direction */, - const int /* numSweeps */, - const bool /* zeroInitialGuess */) const - { - // FIXME (mfh 12 Aug 2014) This method has entirely the wrong - // interface for block Gauss-Seidel. 
- TEUCHOS_TEST_FOR_EXCEPTION( - true, std::logic_error, "Tpetra::BlockCrsMatrix::" - "gaussSeidelCopy: Not implemented."); - } - - template - void - BlockCrsMatrix:: - reorderedGaussSeidelCopy (MultiVector& /* X */, - const MultiVector& /* B */, - const MultiVector& /* D */, - const Teuchos::ArrayView& /* rowIndices */, - const Scalar& /* dampingFactor */, - const ESweepDirection /* direction */, - const int /* numSweeps */, - const bool /* zeroInitialGuess */) const - { - // FIXME (mfh 12 Aug 2014) This method has entirely the wrong - // interface for block Gauss-Seidel. - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::logic_error, "Tpetra::BlockCrsMatrix::" - "reorderedGaussSeidelCopy: Not implemented."); - } - - template void BlockCrsMatrix:: diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp index 1cf25b82ff42..a68995ab6700 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_decl.hpp @@ -62,7 +62,6 @@ // DomainScalar and RangeScalar, so we have to include this header // file here, rather than in the _def header file, so that we can get // the interfaces to the corresponding local computational kernels. -#include "KokkosSparse_sor_sequential_impl.hpp" #include // std::shared_ptr @@ -2808,82 +2807,6 @@ namespace Tpetra { /// Tpetra::Backward. ("Symmetric" requires interprocess /// communication (before each sweep), which is not part of the /// local kernel.) - template - void - localGaussSeidel (const MultiVector &B, - MultiVector &X, - const MultiVector &D, - const RangeScalar& dampingFactor, - const ESweepDirection direction) const - { - typedef LocalOrdinal LO; - typedef GlobalOrdinal GO; - typedef Tpetra::MultiVector DMV; - typedef Tpetra::MultiVector MMV; - typedef typename Node::device_type::memory_space dev_mem_space; - typedef typename MMV::dual_view_type::t_host::device_type host_mem_space; - typedef typename Graph::local_graph_type k_local_graph_type; - typedef typename k_local_graph_type::size_type offset_type; - const char prefix[] = "Tpetra::CrsMatrix::localGaussSeidel: "; - - TEUCHOS_TEST_FOR_EXCEPTION - (! this->isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - const size_t lclNumRows = this->getNodeNumRows (); - const size_t numVecs = B.getNumVectors (); - TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != numVecs, std::invalid_argument, - prefix << "B.getNumVectors() = " << numVecs << " != " - "X.getNumVectors() = " << X.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (B.getLocalLength () != lclNumRows, std::invalid_argument, - prefix << "B.getLocalLength() = " << B.getLocalLength () - << " != this->getNodeNumRows() = " << lclNumRows << "."); - - // mfh 28 Aug 2017: The current local Gauss-Seidel kernel only - // runs on host. (See comments below.) Thus, we need to access - // the host versions of these data. 
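      // With D holding the inverse diagonal entries (as the public gaussSeidel
      // interface documents), one forward pass of the sequential kernel
      // corresponds, row by row, to the standard SOR update
      //
      //   x_i <- (1 - omega)*x_i + omega * d_i * ( b_i - sum_{j != i} a_ij*x_j ),
      //
      // with omega = dampingFactor and d_i = 1/a_ii; omega = 1 gives Gauss-Seidel.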
- const_cast (B).sync_host (); - X.sync_host (); - X.modify_host (); - const_cast (D).sync_host (); - - auto B_lcl = B.template getLocalView (); - auto X_lcl = X.template getLocalView (); - auto D_lcl = D.template getLocalView (); - - offset_type B_stride[8], X_stride[8], D_stride[8]; - B_lcl.stride (B_stride); - X_lcl.stride (X_stride); - D_lcl.stride (D_stride); - - local_matrix_type lclMatrix = this->getLocalMatrix (); - k_local_graph_type lclGraph = lclMatrix.graph; - typename local_matrix_type::row_map_type ptr = lclGraph.row_map; - typename local_matrix_type::index_type ind = lclGraph.entries; - typename local_matrix_type::values_type val = lclMatrix.values; - const offset_type* const ptrRaw = ptr.data (); - const LO* const indRaw = ind.data (); - const impl_scalar_type* const valRaw = val.data (); - - const std::string dir ((direction == Forward) ? "F" : "B"); - // NOTE (mfh 28 Aug 2017) This assumes UVM. We can't get around - // that on GPUs without using a GPU-based sparse triangular - // solve to implement Gauss-Seidel. This exists in cuSPARSE, - // but we would need to implement a wrapper with a fall-back - // algorithm for unsupported Scalar and LO types. - KokkosSparse::Impl::Sequential::gaussSeidel (static_cast (lclNumRows), - static_cast (numVecs), - ptrRaw, indRaw, valRaw, - B_lcl.data (), B_stride[1], - X_lcl.data (), X_stride[1], - D_lcl.data (), - static_cast (dampingFactor), - dir.c_str ()); - const_cast (B).template sync (); - X.template sync (); - const_cast (D).template sync (); - } /// \brief Reordered Gauss-Seidel or SOR on \f$B = A X\f$. /// @@ -2911,91 +2834,6 @@ namespace Tpetra { /// Tpetra::Backward. ("Symmetric" requires interprocess /// communication (before each sweep), which is not part of the /// local kernel.) - template - void - reorderedLocalGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const RangeScalar& dampingFactor, - const ESweepDirection direction) const - { - typedef LocalOrdinal LO; - typedef GlobalOrdinal GO; - typedef Tpetra::MultiVector DMV; - typedef Tpetra::MultiVector MMV; - typedef typename Node::device_type::memory_space dev_mem_space; - typedef typename MMV::dual_view_type::t_host::device_type host_mem_space; - typedef typename Graph::local_graph_type k_local_graph_type; - typedef typename k_local_graph_type::size_type offset_type; - const char prefix[] = "Tpetra::CrsMatrix::reorderedLocalGaussSeidel: "; - - TEUCHOS_TEST_FOR_EXCEPTION - (! this->isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - const size_t lclNumRows = this->getNodeNumRows (); - const size_t numVecs = B.getNumVectors (); - TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != numVecs, std::invalid_argument, - prefix << "B.getNumVectors() = " << numVecs << " != " - "X.getNumVectors() = " << X.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (B.getLocalLength () != lclNumRows, std::invalid_argument, - prefix << "B.getLocalLength() = " << B.getLocalLength () - << " != this->getNodeNumRows() = " << lclNumRows << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (static_cast (rowIndices.size ()) < lclNumRows, - std::invalid_argument, prefix << "rowIndices.size() = " - << rowIndices.size () << " < this->getNodeNumRows() = " - << lclNumRows << "."); - - // mfh 28 Aug 2017: The current local Gauss-Seidel kernel only - // runs on host. (See comments below.) Thus, we need to access - // the host versions of these data. 
- const_cast (B).sync_host (); - X.sync_host (); - X.modify_host (); - const_cast (D).sync_host (); - - auto B_lcl = B.template getLocalView (); - auto X_lcl = X.template getLocalView (); - auto D_lcl = D.template getLocalView (); - - offset_type B_stride[8], X_stride[8], D_stride[8]; - B_lcl.stride (B_stride); - X_lcl.stride (X_stride); - D_lcl.stride (D_stride); - - local_matrix_type lclMatrix = this->getLocalMatrix (); - typename Graph::local_graph_type lclGraph = lclMatrix.graph; - typename local_matrix_type::index_type ind = lclGraph.entries; - typename local_matrix_type::row_map_type ptr = lclGraph.row_map; - typename local_matrix_type::values_type val = lclMatrix.values; - const offset_type* const ptrRaw = ptr.data (); - const LO* const indRaw = ind.data (); - const impl_scalar_type* const valRaw = val.data (); - - const std::string dir = (direction == Forward) ? "F" : "B"; - // NOTE (mfh 28 Aug 2017) This assumes UVM. We can't get around - // that on GPUs without using a GPU-based sparse triangular - // solve to implement Gauss-Seidel, and also handling the - // permutations correctly. - KokkosSparse::Impl::Sequential::reorderedGaussSeidel (static_cast (lclNumRows), - static_cast (numVecs), - ptrRaw, indRaw, valRaw, - B_lcl.data (), - B_stride[1], - X_lcl.data (), - X_stride[1], - D_lcl.data (), - rowIndices.getRawPtr (), - static_cast (lclNumRows), - static_cast (dampingFactor), - dir.c_str ()); - const_cast (B).template sync (); - X.template sync (); - const_cast (D).template sync (); - } /// \brief Return another CrsMatrix with the same entries, but /// converted to a different Scalar type \c T. @@ -3048,229 +2886,6 @@ namespace Tpetra { //! @name Other "apply"-like methods //@{ - /// \brief "Hybrid" Jacobi + (Gauss-Seidel or SOR) on \f$B = A X\f$. - /// - /// "Hybrid" means Successive Over-Relaxation (SOR) or - /// Gauss-Seidel within an (MPI) process, but Jacobi between - /// processes. Gauss-Seidel is a special case of SOR, where the - /// damping factor is one. - /// - /// The Forward or Backward sweep directions have their usual SOR - /// meaning within the process. Interprocess communication occurs - /// once before the sweep, as it normally would in Jacobi. - /// - /// The Symmetric sweep option means two sweeps: first Forward, - /// then Backward. Interprocess communication occurs before each - /// sweep, as in Jacobi. Thus, Symmetric results in two - /// interprocess communication steps. - /// - /// \param B [in] Right-hand side(s). - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param D [in] Inverse of diagonal entries of the matrix A. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its Backward - /// sweep) as one. - /// - /// \section Tpetra_KR_CrsMatrix_gaussSeidel_req Requirements - /// - /// This method has the following requirements: - /// - /// 1. X is in the domain Map of the matrix. - /// 2. The domain and row Maps of the matrix are the same. - /// 3. The column Map contains the domain Map, and both start at the same place. - /// 4. The row Map is uniquely owned. - /// 5. D is in the row Map of the matrix. - /// 6. X is actually a view of a column Map multivector. - /// 7. Neither B nor D alias X. 
- /// - /// #1 is just the usual requirement for operators: the input - /// multivector must always be in the domain Map. The - /// Gauss-Seidel kernel imposes additional requirements, since it - /// - /// - overwrites the input multivector with the output (which - /// implies #2), and - /// - uses the same local indices for the input and output - /// multivector (which implies #2 and #3). - /// - /// #3 is reasonable if the matrix constructed the column Map, - /// because the method that does this (CrsGraph::makeColMap) puts - /// the local GIDs (those in the domain Map) in front and the - /// remote GIDs (not in the domain Map) at the end of the column - /// Map. However, if you constructed the column Map yourself, you - /// are responsible for maintaining this invariant. #6 lets us do - /// the Import from the domain Map to the column Map in place. - /// - /// The Gauss-Seidel kernel also assumes that each process has the - /// entire value (not a partial value to sum) of all the diagonal - /// elements in the rows in its row Map. (We guarantee this anyway - /// though the separate D vector.) This is because each element of - /// the output multivector depends nonlinearly on the diagonal - /// elements. Shared ownership of off-diagonal elements would - /// produce different results. - void - gaussSeidel (const MultiVector &B, - MultiVector &X, - const MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const; - - /// \brief Reordered "Hybrid" Jacobi + (Gauss-Seidel or SOR) on \f$B = A X\f$. - /// - /// "Hybrid" means Successive Over-Relaxation (SOR) or - /// Gauss-Seidel within an (MPI) process, but Jacobi between - /// processes. Gauss-Seidel is a special case of SOR, where the - /// damping factor is one. The ordering can be a partial one, in which case the Gauss-Seidel is only - /// executed on a local subset of unknowns. - /// - /// The Forward or Backward sweep directions have their usual SOR - /// meaning within the process. Interprocess communication occurs - /// once before the sweep, as it normally would in Jacobi. - /// - /// The Symmetric sweep option means two sweeps: first Forward, - /// then Backward. Interprocess communication occurs before each - /// sweep, as in Jacobi. Thus, Symmetric results in two - /// interprocess communication steps. - /// - /// \param B [in] Right-hand side(s). - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param D [in] Inverse of diagonal entries of the matrix A. - /// \param rowIndices [in] Ordered list of indices on which to execute GS. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its Backward - /// sweep) as one. - /// - /// \section Tpetra_KR_CrsMatrix_reorderedGaussSeidel_req Requirements - /// - /// This method has the following requirements: - /// - /// 1. X is in the domain Map of the matrix. - /// 2. The domain and row Maps of the matrix are the same. - /// 3. The column Map contains the domain Map, and both start at the same place. - /// 4. The row Map is uniquely owned. - /// 5. D is in the row Map of the matrix. - /// 6. X is actually a view of a column Map multivector. - /// 7. Neither B nor D alias X. 
- /// - /// #1 is just the usual requirement for operators: the input - /// multivector must always be in the domain Map. The - /// Gauss-Seidel kernel imposes additional requirements, since it - /// - /// - overwrites the input multivector with the output (which - /// implies #2), and - /// - uses the same local indices for the input and output - /// multivector (which implies #2 and #3). - /// - /// #3 is reasonable if the matrix constructed the column Map, - /// because the method that does this (CrsGraph::makeColMap) puts - /// the local GIDs (those in the domain Map) in front and the - /// remote GIDs (not in the domain Map) at the end of the column - /// Map. However, if you constructed the column Map yourself, you - /// are responsible for maintaining this invariant. #6 lets us do - /// the Import from the domain Map to the column Map in place. - /// - /// The Gauss-Seidel kernel also assumes that each process has the - /// entire value (not a partial value to sum) of all the diagonal - /// elements in the rows in its row Map. (We guarantee this anyway - /// though the separate D vector.) This is because each element of - /// the output multivector depends nonlinearly on the diagonal - /// elements. Shared ownership of off-diagonal elements would - /// produce different results. - void - reorderedGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const; - - /// \brief Version of gaussSeidel(), with fewer requirements on X. - /// - /// This method is just like gaussSeidel(), except that X need - /// only be in the domain Map. This method does not require that - /// X be a domain Map view of a column Map multivector. As a - /// result, this method must copy X into a domain Map multivector - /// before operating on it. - /// - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param B [in] Right-hand side(s), in the range Map. - /// \param D [in] Inverse of diagonal entries of the matrix, - /// in the row Map. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its - /// Backward sweep) as one. - /// \param zeroInitialGuess [in] If true, this method will fill X - /// with zeros initially. If false, this method will assume - /// that X contains a possibly nonzero initial guess on input. - /// Note that a nonzero initial guess may impose an additional - /// nontrivial communication cost (an additional Import). - /// - /// \pre Domain, range, and row Maps of the sparse matrix are all the same. - /// \pre No other argument aliases X. - void - gaussSeidelCopy (MultiVector &X, - const MultiVector &B, - const MultiVector &D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - - /// \brief Version of reorderedGaussSeidel(), with fewer requirements on X. - /// - /// This method is just like reorderedGaussSeidel(), except that X need - /// only be in the domain Map. This method does not require that - /// X be a domain Map view of a column Map multivector. As a - /// result, this method must copy X into a domain Map multivector - /// before operating on it. 
- /// - /// \param X [in/out] On input: initial guess(es). On output: - /// result multivector(s). - /// \param B [in] Right-hand side(s), in the range Map. - /// \param D [in] Inverse of diagonal entries of the matrix, - /// in the row Map. - /// \param rowIndices [in] Ordered list of indices on which to execute GS. - /// \param dampingFactor [in] SOR damping factor. A damping - /// factor of one results in Gauss-Seidel. - /// \param direction [in] Sweep direction: Forward, Backward, or - /// Symmetric. - /// \param numSweeps [in] Number of sweeps. We count each - /// Symmetric sweep (including both its Forward and its - /// Backward sweep) as one. - /// \param zeroInitialGuess [in] If true, this method will fill X - /// with zeros initially. If false, this method will assume - /// that X contains a possibly nonzero initial guess on input. - /// Note that a nonzero initial guess may impose an additional - /// nontrivial communication cost (an additional Import). - /// - /// \pre Domain, range, and row Maps of the sparse matrix are all the same. - /// \pre No other argument aliases X. - void - reorderedGaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const; - /// \brief Implementation of RowMatrix::add: return alpha*A + beta*this. /// /// This override of the default implementation ensures that, when diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index d46a492f181a..cd995c00e8d4 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -5494,609 +5494,6 @@ namespace Tpetra { } } - template - void - CrsMatrix:: - gaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const - { - reorderedGaussSeidel (B, X, D, Teuchos::null, dampingFactor, direction, numSweeps); - } - - template - void - CrsMatrix:: - reorderedGaussSeidel (const MultiVector& B, - MultiVector& X, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps) const - { - using Teuchos::null; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::rcpFromRef; - typedef Scalar ST; - - TEUCHOS_TEST_FOR_EXCEPTION( - isFillComplete() == false, std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: cannot call this method until " - "fillComplete() has been called."); - TEUCHOS_TEST_FOR_EXCEPTION( - numSweeps < 0, - std::invalid_argument, - "Tpetra::CrsMatrix::gaussSeidel: The number of sweeps must be , " - "nonnegative but you provided numSweeps = " << numSweeps << " < 0."); - - // Translate from global to local sweep direction. - // While doing this, validate the input. - ESweepDirection localDirection; - if (direction == Forward) { - localDirection = Forward; - } - else if (direction == Backward) { - localDirection = Backward; - } - else if (direction == Symmetric) { - // We'll control local sweep direction manually. 
- localDirection = Forward; - } - else { - TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, - "Tpetra::CrsMatrix::gaussSeidel: The 'direction' enum does not have " - "any of its valid values: Forward, Backward, or Symmetric."); - } - - if (numSweeps == 0) { - return; // Nothing to do. - } - - // We don't need the Export object because this method assumes - // that the row, domain, and range Maps are the same. We do need - // the Import object, if there is one, though. - RCP importer = this->getGraph()->getImporter(); - RCP exporter = this->getGraph()->getExporter(); - TEUCHOS_TEST_FOR_EXCEPTION( - ! exporter.is_null (), std::runtime_error, - "Tpetra's gaussSeidel implementation requires that the row, domain, " - "and range Maps be the same. This cannot be the case, because the " - "matrix has a nontrivial Export object."); - - RCP domainMap = this->getDomainMap (); - RCP rangeMap = this->getRangeMap (); - RCP rowMap = this->getGraph ()->getRowMap (); - RCP colMap = this->getGraph ()->getColMap (); - -#ifdef HAVE_TEUCHOS_DEBUG - { - // The relation 'isSameAs' is transitive. It's also a - // collective, so we don't have to do a "shared" test for - // exception (i.e., a global reduction on the test value). - TEUCHOS_TEST_FOR_EXCEPTION( - ! X.getMap ()->isSameAs (*domainMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "multivector X be in the domain Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! B.getMap ()->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "B be in the range Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! D.getMap ()->isSameAs (*rowMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the input " - "D be in the row Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! rowMap->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the row Map and the " - "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! domainMap->isSameAs (*rangeMap), - std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel requires that the domain Map and " - "the range Map of the matrix be the same."); - } -#else - // Forestall any compiler warnings for unused variables. - (void) rangeMap; - (void) rowMap; -#endif // HAVE_TEUCHOS_DEBUG - - // If B is not constant stride, copy it into a constant stride - // multivector. We'l handle the right-hand side B first and deal - // with X right before the sweeps, to improve locality of the - // first sweep. (If the problem is small enough, then that will - // hopefully keep more of the entries of X in cache. This - // optimizes for the typical case of a small number of sweeps.) - RCP B_in; - if (B.isConstantStride()) { - B_in = rcpFromRef (B); - } - else { - // The range Map and row Map are the same in this case, so we - // can use the (possibly cached) row Map multivector to store a - // constant stride copy of B. We don't have to copy back, since - // Gauss-Seidel won't modify B. - RCP B_in_nonconst = getRowMapMultiVector (B, true); - deep_copy (*B_in_nonconst, B); // Copy from B into B_in(_nonconst). - B_in = rcp_const_cast (B_in_nonconst); - - TPETRA_EFFICIENCY_WARNING( - ! B.isConstantStride (), - std::runtime_error, - "gaussSeidel: The current implementation of the Gauss-Seidel kernel " - "requires that X and B both have constant stride. Since B does not " - "have constant stride, we had to make a copy. 
This is a limitation of " - "the current implementation and not your fault, but we still report it " - "as an efficiency warning for your information."); - } - - // If X is not constant stride, copy it into a constant stride - // multivector. Also, make the column Map multivector X_colMap, - // and its domain Map view X_domainMap. (X actually must be a - // domain Map view of a column Map multivector; exploit this, if X - // has constant stride.) - - RCP X_domainMap; - RCP X_colMap; - bool copiedInput = false; - - if (importer.is_null ()) { // Domain and column Maps are the same. - if (X.isConstantStride ()) { - X_domainMap = rcpFromRef (X); - X_colMap = X_domainMap; - copiedInput = false; - } - else { - // Get a temporary column Map multivector, make a domain Map - // view of it, and copy X into the domain Map view. We have - // to copy here because we won't be doing Import operations. - X_colMap = getColumnMapMultiVector (X, true); - X_domainMap = X_colMap; // Domain and column Maps are the same. - deep_copy (*X_domainMap, X); // Copy X into the domain Map view. - copiedInput = true; - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the " - "Gauss-Seidel kernel requires that X and B both have constant " - "stride. Since X does not have constant stride, we had to make a " - "copy. This is a limitation of the current implementation and not " - "your fault, but we still report it as an efficiency warning for " - "your information."); - } - } - else { // We will be doing Import operations in the sweeps. - if (X.isConstantStride ()) { - X_domainMap = rcpFromRef (X); - // This kernel assumes that X is a domain Map view of a column - // Map multivector. We will only check if this is valid if - // the CMake configure Teuchos_ENABLE_DEBUG is ON. - X_colMap = X_domainMap->offsetViewNonConst (colMap, 0); - - // FIXME (mfh 19 Mar 2013) Do we need to fill the remote - // entries of X_colMap with zeros? Do we need to fill all of - // X_domainMap initially with zeros? Ifpack - // (Ifpack_PointRelaxation.cpp, line 906) creates an entirely - // new MultiVector each time. - - // Do the first Import for the first sweep. This simplifies - // the logic in the sweeps. - X_colMap->doImport (X, *importer, INSERT); - copiedInput = false; - } - else { - // Get a temporary column Map multivector X_colMap, and make a - // domain Map view X_domainMap of it. Instead of copying, we - // do an Import from X into X_domainMap. This saves us a - // copy, since the Import has to copy the data anyway. - X_colMap = getColumnMapMultiVector (X, true); - X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); - X_colMap->doImport (X, *importer, INSERT); - copiedInput = true; - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidel: The current implementation of the " - "Gauss-Seidel kernel requires that X and B both have constant stride. " - "Since X does not have constant stride, we had to make a copy. " - "This is a limitation of the current implementation and not your fault, " - "but we still report it as an efficiency warning for your information."); - } - } - - for (int sweep = 0; sweep < numSweeps; ++sweep) { - if (! importer.is_null () && sweep > 0) { - // We already did the first Import for the zeroth sweep. - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - - // Do local Gauss-Seidel. 
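      // Each iteration of this loop is one "hybrid" sweep: the Import above
      // refreshes the off-process (column Map) entries of X in Jacobi fashion,
      // and the calls below run the purely local SOR kernel in the requested
      // direction.  For Symmetric, the Forward and Backward passes run back to
      // back without another Import in between.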
- if (direction != Symmetric) { - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - localDirection); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - localDirection); - } - } - else { // direction == Symmetric - const bool doImportBetweenDirections = false; - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Forward); - // mfh 18 Mar 2013: Aztec's implementation of "symmetric - // Gauss-Seidel" does _not_ do an Import between the forward - // and backward sweeps. This makes sense, because Aztec - // considers "symmetric Gauss-Seidel" a subdomain solver. - if (doImportBetweenDirections) { - // Communicate again before the Backward sweep. - if (! importer.is_null ()) { - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - } - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Backward); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Forward); - if (doImportBetweenDirections) { - // Communicate again before the Backward sweep. - if (! importer.is_null ()) { - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - } - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Backward); - } - } - } - - if (copiedInput) { - deep_copy (X, *X_domainMap); // Copy back from X_domainMap to X. - } - } - - template - void - CrsMatrix:: - gaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const - { - reorderedGaussSeidelCopy (X, B, D, Teuchos::null, dampingFactor, direction, - numSweeps, zeroInitialGuess); - } - - template - void - CrsMatrix:: - reorderedGaussSeidelCopy (MultiVector& X, - const MultiVector& B, - const MultiVector& D, - const Teuchos::ArrayView& rowIndices, - const Scalar& dampingFactor, - const ESweepDirection direction, - const int numSweeps, - const bool zeroInitialGuess) const - { - using Teuchos::null; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcpFromRef; - using Teuchos::rcp_const_cast; - typedef Scalar ST; - const char prefix[] = "Tpetra::CrsMatrix::(reordered)gaussSeidelCopy: "; - const Scalar ZERO = Teuchos::ScalarTraits::zero (); - - TEUCHOS_TEST_FOR_EXCEPTION( - ! isFillComplete (), std::runtime_error, - prefix << "The matrix is not fill complete."); - TEUCHOS_TEST_FOR_EXCEPTION( - numSweeps < 0, std::invalid_argument, - prefix << "The number of sweeps must be nonnegative, " - "but you provided numSweeps = " << numSweeps << " < 0."); - - // Translate from global to local sweep direction. - // While doing this, validate the input. - ESweepDirection localDirection; - if (direction == Forward) { - localDirection = Forward; - } - else if (direction == Backward) { - localDirection = Backward; - } - else if (direction == Symmetric) { - // We'll control local sweep direction manually. - localDirection = Forward; - } - else { - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::invalid_argument, - prefix << "The 'direction' enum does not have any of its valid " - "values: Forward, Backward, or Symmetric."); - } - - if (numSweeps == 0) { - return; - } - - RCP importer = this->getGraph ()->getImporter (); - RCP exporter = this->getGraph ()->getExporter (); - TEUCHOS_TEST_FOR_EXCEPTION( - ! 
exporter.is_null (), std::runtime_error, - "This method's implementation currently requires that the matrix's row, " - "domain, and range Maps be the same. This cannot be the case, because " - "the matrix has a nontrivial Export object."); - - RCP domainMap = this->getDomainMap (); - RCP rangeMap = this->getRangeMap (); - RCP rowMap = this->getGraph ()->getRowMap (); - RCP colMap = this->getGraph ()->getColMap (); - -#ifdef HAVE_TEUCHOS_DEBUG - { - // The relation 'isSameAs' is transitive. It's also a - // collective, so we don't have to do a "shared" test for - // exception (i.e., a global reduction on the test value). - TEUCHOS_TEST_FOR_EXCEPTION( - ! X.getMap ()->isSameAs (*domainMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "multivector X be in the domain Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! B.getMap ()->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "B be in the range Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! D.getMap ()->isSameAs (*rowMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the input " - "D be in the row Map of the matrix."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! rowMap->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the row Map and the " - "range Map be the same (in the sense of Tpetra::Map::isSameAs)."); - TEUCHOS_TEST_FOR_EXCEPTION( - ! domainMap->isSameAs (*rangeMap), std::runtime_error, - "Tpetra::CrsMatrix::gaussSeidelCopy requires that the domain Map and " - "the range Map of the matrix be the same."); - } -#else - // Forestall any compiler warnings for unused variables. - (void) rangeMap; - (void) rowMap; -#endif // HAVE_TEUCHOS_DEBUG - - // Fetch a (possibly cached) temporary column Map multivector - // X_colMap, and a domain Map view X_domainMap of it. Both have - // constant stride by construction. We know that the domain Map - // must include the column Map, because our Gauss-Seidel kernel - // requires that the row Map, domain Map, and range Map are all - // the same, and that each process owns all of its own diagonal - // entries of the matrix. - - RCP X_colMap; - RCP X_domainMap; - bool copyBackOutput = false; - if (importer.is_null ()) { - if (X.isConstantStride ()) { - X_colMap = rcpFromRef (X); - X_domainMap = rcpFromRef (X); - // Column Map and domain Map are the same, so there are no - // remote entries. Thus, if we are not setting the initial - // guess to zero, we don't have to worry about setting remote - // entries to zero, even though we are not doing an Import in - // this case. - if (zeroInitialGuess) { - X_colMap->putScalar (ZERO); - } - // No need to copy back to X at end. - } - else { // We must copy X into a constant stride multivector. - // Just use the cached column Map multivector for that. - // force=true means fill with zeros, so no need to fill - // remote entries (not in domain Map) with zeros. - X_colMap = getColumnMapMultiVector (X, true); - // X_domainMap is always a domain Map view of the column Map - // multivector. In this case, the domain and column Maps are - // the same, so X_domainMap _is_ X_colMap. - X_domainMap = X_colMap; - if (! 
zeroInitialGuess) { // Don't copy if zero initial guess - try { - deep_copy (*X_domainMap , X); // Copy X into constant stride MV - } catch (std::exception& e) { - std::ostringstream os; - os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: " - "deep_copy(*X_domainMap, X) threw an exception: " - << e.what () << "."; - TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ()); - } - } - copyBackOutput = true; // Don't forget to copy back at end. - TPETRA_EFFICIENCY_WARNING( - ! X.isConstantStride (), - std::runtime_error, - "gaussSeidelCopy: The current implementation of the Gauss-Seidel " - "kernel requires that X and B both have constant stride. Since X " - "does not have constant stride, we had to make a copy. This is a " - "limitation of the current implementation and not your fault, but we " - "still report it as an efficiency warning for your information."); - } - } - else { // Column Map and domain Map are _not_ the same. - X_colMap = getColumnMapMultiVector (X); - X_domainMap = X_colMap->offsetViewNonConst (domainMap, 0); - -#ifdef HAVE_TPETRA_DEBUG - auto X_colMap_host_view = X_colMap->getLocalViewHost (); - auto X_domainMap_host_view = X_domainMap->getLocalViewHost (); - - if (X_colMap->getLocalLength () != 0 && X_domainMap->getLocalLength ()) { - TEUCHOS_TEST_FOR_EXCEPTION - (X_colMap_host_view.data () != X_domainMap_host_view.data (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: Pointer to " - "start of column Map view of X is not equal to pointer to start of " - "(domain Map view of) X. This may mean that Tpetra::MultiVector::" - "offsetViewNonConst is broken. " - "Please report this bug to the Tpetra developers."); - } - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap_host_view.extent (0) < X_domainMap_host_view.extent (0) || - X_colMap->getLocalLength () < X_domainMap->getLocalLength (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has fewer local rows than X_domainMap. " - "X_colMap_host_view.extent(0) = " << X_colMap_host_view.extent (0) - << ", X_domainMap_host_view.extent(0) = " - << X_domainMap_host_view.extent (0) - << ", X_colMap->getLocalLength() = " << X_colMap->getLocalLength () - << ", and X_domainMap->getLocalLength() = " - << X_domainMap->getLocalLength () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); - - TEUCHOS_TEST_FOR_EXCEPTION( - X_colMap->getNumVectors () != X_domainMap->getNumVectors (), - std::logic_error, "Tpetra::CrsMatrix::gaussSeidelCopy: " - "X_colMap has a different number of columns than X_domainMap. " - "X_colMap->getNumVectors() = " << X_colMap->getNumVectors () - << " != X_domainMap->getNumVectors() = " - << X_domainMap->getNumVectors () - << ". This means that Tpetra::MultiVector::offsetViewNonConst " - "is broken. Please report this bug to the Tpetra developers."); -#endif // HAVE_TPETRA_DEBUG - - if (zeroInitialGuess) { - // No need for an Import, since we're filling with zeros. - X_colMap->putScalar (ZERO); - } else { - // We could just copy X into X_domainMap. However, that - // wastes a copy, because the Import also does a copy (plus - // communication). Since the typical use case for - // Gauss-Seidel is a small number of sweeps (2 is typical), we - // don't want to waste that copy. Thus, we do the Import - // here, and skip the first Import in the first sweep. - // Importing directly from X effects the copy into X_domainMap - // (which is a view of X_colMap). 
- X_colMap->doImport (X, *importer, INSERT); - } - copyBackOutput = true; // Don't forget to copy back at end. - } // if column and domain Maps are (not) the same - - // The Gauss-Seidel / SOR kernel expects multivectors of constant - // stride. X_colMap is by construction, but B might not be. If - // it's not, we have to make a copy. - RCP B_in; - if (B.isConstantStride ()) { - B_in = rcpFromRef (B); - } - else { - // Range Map and row Map are the same in this case, so we can - // use the cached row Map multivector to store a constant stride - // copy of B. - RCP B_in_nonconst = getRowMapMultiVector (B, true); - try { - deep_copy (*B_in_nonconst, B); - } catch (std::exception& e) { - std::ostringstream os; - os << "Tpetra::CrsMatrix::reorderedGaussSeidelCopy: " - "deep_copy(*B_in_nonconst, B) threw an exception: " - << e.what () << "."; - TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, e.what ()); - } - B_in = rcp_const_cast (B_in_nonconst); - - TPETRA_EFFICIENCY_WARNING( - ! B.isConstantStride (), - std::runtime_error, - "gaussSeidelCopy: The current implementation requires that B have " - "constant stride. Since B does not have constant stride, we had to " - "copy it into a separate constant-stride multivector. This is a " - "limitation of the current implementation and not your fault, but we " - "still report it as an efficiency warning for your information."); - } - - for (int sweep = 0; sweep < numSweeps; ++sweep) { - if (! importer.is_null () && sweep > 0) { - // We already did the first Import for the zeroth sweep above, - // if it was necessary. - X_colMap->doImport (*X_domainMap, *importer, INSERT); - } - - // Do local Gauss-Seidel. - if (direction != Symmetric) { - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - localDirection); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - localDirection); - } - } - else { // direction == Symmetric - if (rowIndices.is_null ()) { - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Forward); - // mfh 18 Mar 2013: Aztec's implementation of "symmetric - // Gauss-Seidel" does _not_ do an Import between the forward - // and backward sweeps. This makes symmetric Gauss-Seidel a - // symmetric preconditioner if the matrix A is symmetric. We - // imitate Aztec's behavior here. - this->template localGaussSeidel (*B_in, *X_colMap, D, - dampingFactor, - Backward); - } - else { - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Forward); - this->template reorderedLocalGaussSeidel (*B_in, *X_colMap, - D, rowIndices, - dampingFactor, - Backward); - - } - } - } - - if (copyBackOutput) { - try { - deep_copy (X , *X_domainMap); // Copy result back into X. - } catch (std::exception& e) { - TEUCHOS_TEST_FOR_EXCEPTION( - true, std::runtime_error, prefix << "deep_copy(X, *X_domainMap) " - "threw an exception: " << e.what ()); - } - } - } template template diff --git a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp index 8260ad2cb414..aca76f9cb091 100644 --- a/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp +++ b/packages/tpetra/core/test/Block/BlockCrsMatrix.cpp @@ -1768,450 +1768,6 @@ namespace { TEST_EQUALITY_CONST( gblSuccess, 1 ); } - // - // Test BlockCrsMatrix's localGaussSeidel with a block diagonal matrix. 
- // - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( BlockCrsMatrix, localGSDiagonalMatrix, Scalar, LO, GO, Node ) - { - using Kokkos::ALL; - typedef Scalar ST; - typedef Tpetra::BlockVector BV; - typedef Tpetra::BlockCrsMatrix BCM; - typedef Tpetra::CrsGraph graph_type; - typedef Tpetra::Map map_type; - typedef typename graph_type::device_type device_type; - typedef typename BCM::impl_scalar_type IST; - typedef Teuchos::ScalarTraits STS; - - const ST two = STS::one () + STS::one (); - const ST three = two + STS::one (); - const auto tol = 10.0 * STS::eps (); - - out << "Testing Tpetra::BlockCrsMatrix::localGaussSeidel " - "with a matrix whose graph is diagonal" << endl; - Teuchos::OSTab tab0 (out); - - RCP > comm = getDefaultComm (); - const GST INVALID = Teuchos::OrdinalTraits::invalid (); - - out << "Creating mesh row Map" << endl; - - const size_t numLocalMeshPoints = 12; - const GO indexBase = 1; - // Use a block size that is not a power of 2, to test correctness - // in case the matrix pads blocks for SIMD-ization. - const LO blockSize = 3; - - // mfh 20 May 2014: Tpetra::CrsGraph still needs the row Map as an - // RCP. Later interface changes will let us pass in the Map by - // const reference and assume view semantics. - RCP meshRowMapPtr = - rcp (new map_type (INVALID, numLocalMeshPoints, indexBase, comm)); - const map_type& meshRowMap = *meshRowMapPtr; - - out << "Creating mesh graph" << endl; - - const size_t maxNumEntPerRow = 1; - graph_type graph (meshRowMapPtr, maxNumEntPerRow, Tpetra::StaticProfile); - - // Fill the graph with only diagonal entries - Teuchos::Array gblColInds (maxNumEntPerRow); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd); - gblColInds[0] = gblRowInd; - graph.insertGlobalIndices (gblRowInd, gblColInds ()); - } - graph.fillComplete (); - - // Get the graph's column Map (the "mesh column Map"). - TEST_ASSERT( ! graph.getColMap ().is_null () ); - map_type meshColMap = * (graph.getColMap ()); - - out << "Creating BlockCrsMatrix" << endl; - - // Construct the BlockCrsMatrix. - BCM blockMat; - TEST_NOTHROW( blockMat = BCM (graph, blockSize) ); - BV residual; - TEST_NOTHROW( residual = BV(meshRowMap, blockSize)); - BV solution; - TEST_NOTHROW( solution = BV(meshRowMap, blockSize)); - - Teuchos::Array basematrix (blockSize*blockSize, STS::zero ()); - basematrix[0] = two; - basematrix[2] = three; - basematrix[3] = three; - basematrix[4] = two; - basematrix[7] = three; - basematrix[8] = two; - - Teuchos::Array baseResidual(blockSize, STS::zero ()); - baseResidual[0] = STS::one (); - baseResidual[1] = three; - baseResidual[2] = -two; - - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). - Teuchos::Array exactSolution(blockSize, STS::zero()); - exactSolution[0] = 43.0/35.0; - exactSolution[1] = -12.0/35.0; - exactSolution[2] = -17.0/35.0; - - // NOTE (mfh 26 May 2016) We may start modifying the matrix on - // host now, because we haven't yet done anything to it on device. 
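    // Where exactSolution comes from (sketch, reading the 3x3 base block row-major):
    //   [ 2 0 3 ]
    //   [ 3 2 0 ]  with base residual [ 1, 3, -2 ]
    //   [ 0 3 2 ]
    // has the solution [ 43/35, -12/35, -17/35 ].  The loop below scales both
    // the block and the residual by an extra factor of two for each successive
    // row, which leaves that solution unchanged, and because the graph is
    // diagonal, a single sweep with omega = 1 and the exact inverse diagonal
    // block solves every block row exactly.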
- - Teuchos::Array lclColInds(1); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - for (LO k = 0; k < blockSize*blockSize; ++k) { - basematrix[k] *= two; - } - for (LO k = 0; k < blockSize; ++k) { - baseResidual[k] *= two; - } - lclColInds[0] = lclRowInd; - blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (), - basematrix.getRawPtr (), 1); - residual.replaceLocalValues (lclRowInd, baseResidual.getRawPtr ()); - solution.replaceLocalValues (lclRowInd, baseResidual.getRawPtr ()); - } - - Kokkos::View diagonalOffsets ("offsets", numLocalMeshPoints); - graph.getLocalDiagOffsets (diagonalOffsets); - - // Sync the matrix to device, since getLocalDiagCopy runs there. - blockMat.template sync (); - - typedef Kokkos::View block_diag_type; - block_diag_type blockDiag ("blockDiag", numLocalMeshPoints, - blockSize, blockSize); - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - - Kokkos::View pivots ("pivots", numLocalMeshPoints, blockSize); - // That's how we found this test: the pivots array was filled with ones. - Kokkos::deep_copy (pivots, 1); - out << "pivots size = " << pivots.extent(0) << endl; - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - - // Make sure that the diagonal block is correct. - Scalar* blkVals = NULL; - const LO* blkColInds = NULL; - LO blkNumEnt = 0; - blockMat.getLocalRowView (lclMeshRow, blkColInds, blkVals, blkNumEnt); - - TEST_EQUALITY( blkNumEnt, static_cast (1) ); - if (blkNumEnt == 1) { - typename BCM::const_little_block_type diagBlock2 ((typename BCM::const_little_block_type::value_type*) blkVals, blockSize, blockSize); - for (LO j = 0; j < blockSize; ++j) { - for (LO i = 0; i < blockSize; ++i) { - TEST_EQUALITY( diagBlock(i,j), diagBlock2(i,j) ); - } - } - } - - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one(), Tpetra::Forward); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - out << "row = " << lclRowInd << endl; - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[k], tol ); - x[k] = -STS::one (); - } - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Backward); - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[k], tol ); - } - } - - // Final output - int lclSuccess = success; - int gblSuccess = 0; - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - } - - // - // Test BlockCrsMatrix's localGaussSeidel with a triangular matrix (???) 
- // - TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( BlockCrsMatrix, localGSTriangularMatrices, ST, LO, GO, Node ) - { - using Kokkos::ALL; - typedef Tpetra::BlockVector BV; - typedef Tpetra::BlockCrsMatrix BCM; - typedef typename BCM::impl_scalar_type IST; - typedef Tpetra::CrsGraph graph_type; - typedef typename graph_type::device_type device_type; - typedef Tpetra::Map map_type; - typedef Teuchos::ScalarTraits STS; - - const ST two = STS::one () + STS::one (); - const ST three = STS::one () + STS::one () + STS::one (); - const auto tol = 10.0 * STS::eps (); - - out << "Testing Tpetra::BlockCrsMatrix::localGaussSeidel " - "with a triangular matrix ( ??? )" << endl; - Teuchos::OSTab tab0 (out); - - RCP > comm = getDefaultComm (); - const GST INVALID = Teuchos::OrdinalTraits::invalid (); - - out << "Creating mesh row Map" << endl; - - const size_t numLocalMeshPoints = 3; - const GO indexBase = 1; - // Use a block size that is not a power of 2, to test correctness - // in case the matrix pads blocks for SIMD-ization. - const LO blockSize = 2; - - // mfh 20 May 2014: Tpetra::CrsGraph still needs the row Map as an - // RCP. Later interface changes will let us pass in the Map by - // const reference and assume view semantics. - RCP meshRowMapPtr = - rcp (new map_type (INVALID, numLocalMeshPoints, indexBase, comm)); - const map_type& meshRowMap = *meshRowMapPtr; - - out << "Creating mesh graph" << endl; - - const size_t maxNumEntPerRow = 3; - graph_type graph (meshRowMapPtr, maxNumEntPerRow, Tpetra::StaticProfile); - - Teuchos::Array gblColInds (maxNumEntPerRow); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - - const GO gblRowInd = meshRowMap.getGlobalElement (lclRowInd); - - for (size_t k = 0; k < maxNumEntPerRow; ++k) { - const LO lclColInd = meshRowMap.getMinLocalIndex () + k; - const GO gblColInd = meshRowMap.getGlobalElement (lclColInd); - gblColInds[k] = gblColInd; - } - graph.insertGlobalIndices (gblRowInd, gblColInds ()); - } - - graph.fillComplete (); - - // Get the graph's column Map (the "mesh column Map"). - TEST_ASSERT( ! graph.getColMap ().is_null () ); - map_type meshColMap = * (graph.getColMap ()); - - out << "Creating BlockCrsMatrix" << endl; - - // Construct the BlockCrsMatrix. - BCM blockMat; - TEST_NOTHROW( blockMat = BCM (graph, blockSize) ); - BV residual; - TEST_NOTHROW( residual = BV (meshRowMap, blockSize)); - BV solution; - TEST_NOTHROW( solution = BV (meshRowMap, blockSize)); - - Teuchos::Array basematrix (maxNumEntPerRow * maxNumEntPerRow, STS::zero ()); - basematrix[0] = two; - basematrix[3] = three; - basematrix[4] = two; - basematrix[6] = STS::one (); - basematrix[7] = three; - basematrix[8] = two; - - Teuchos::Array baseResidual (maxNumEntPerRow, STS::zero ()); - baseResidual[0] = STS::one (); - baseResidual[1] = three; - baseResidual[2] = -two; - - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). 
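    // The exact solution assigned below is forward substitution on the
    // lower-triangular scalar system
    //   [ 2 0 0 ] [x0]   [ 1 ]
    //   [ 3 2 0 ] [x1] = [ 3 ]   =>   x = [ 1/2, 3/4, -19/8 ].
    //   [ 1 3 2 ] [x2]   [-2 ]
    // Each 2x2 block assembled below is the corresponding scalar times the
    // identity, so both components of a block row carry the same value, and one
    // Forward sweep (which amounts to forward substitution here) reproduces x
    // exactly.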
- Teuchos::Array exactSolution (maxNumEntPerRow, STS::zero ()); - exactSolution[0] = 0.5; - exactSolution[1] = 0.75; - exactSolution[2] = -19.0/8.0; - - Teuchos::Array assembleMatrix (blockSize * blockSize, STS::zero ()); - Teuchos::Array assembleResidual (blockSize, STS::zero ()); - - Teuchos::Array lclColInds (1); - - // lower triangular matrix - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = baseResidual[rowOffset]; - } - residual.replaceLocalValues (lclRowInd, &assembleResidual[0]); - - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = STS::zero (); - } - solution.replaceLocalValues (lclRowInd, &assembleResidual[0]); - - for (size_t i = 0; i < maxNumEntPerRow; ++i) { - for (LO j = 0; j < blockSize*blockSize; ++j) { - assembleMatrix[j] = STS::zero (); - } - const size_t indexBaseMatrix = (maxNumEntPerRow)*rowOffset+i; - for (LO j = 0; j < blockSize; ++j) { - assembleMatrix[(blockSize+1)*j] = basematrix[indexBaseMatrix]; - } - - lclColInds[0] = meshRowMap.getMinLocalIndex () + i; - blockMat.replaceLocalValues(lclRowInd, lclColInds.getRawPtr(), &assembleMatrix[0], 1); - } - } - - Kokkos::View diagonalOffsets ("offsets", numLocalMeshPoints); - graph.getLocalDiagOffsets(diagonalOffsets); - - typedef Kokkos::View block_diag_type; - block_diag_type blockDiag ("blockDiag", numLocalMeshPoints, - blockSize, blockSize); - Kokkos::fence (); - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - Kokkos::fence (); - - using Kokkos::view_alloc; - using Kokkos::WithoutInitializing; - Kokkos::View pivots (view_alloc ("pivots", WithoutInitializing), - numLocalMeshPoints, blockSize); - // That's how we found this test: the pivots array was filled with ones. - Kokkos::deep_copy (pivots, 1); - - Kokkos::fence (); - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Forward); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[rowOffset], tol ); - x[k] = -STS::one (); - } - } - - // upper triangular matrix - // - // FIXME (mfh 25 Aug 2014) This will likely only work with Scalar - // = float or double. On the other hand, the author of these - // tests understood that and only instantiated them for - // Scalar=double (see the instantiations list below). 
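    // The re-assembly below uses the transposed coefficient layout, i.e. the
    // upper-triangular scalar system
    //   [ 2 3 1 ] [x0]   [ 1 ]
    //   [ 0 2 3 ] [x1] = [ 3 ]   =>   x = [ -7/2, 3, -1 ]
    //   [ 0 0 2 ] [x2]   [-2 ]
    // by back substitution (the residual keeps its earlier values).  A Symmetric
    // sweep ends with a Backward pass, which for an upper-triangular matrix is
    // exact back substitution regardless of the intermediate Forward result,
    // hence the same exact-solution check.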
- exactSolution[0] = -3.5; - exactSolution[1] = 3.0; - exactSolution[2] = -1.0; - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex (); ++lclRowInd) { - - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - for (size_t i = 0; i < maxNumEntPerRow; ++i) { - for (LO k = 0; k < blockSize; ++k) { - assembleResidual[k] = STS::zero (); - } - solution.replaceLocalValues(lclRowInd, &assembleResidual[0]); - - for (LO j = 0; j < blockSize*blockSize; ++j) { - assembleMatrix[j] = STS::zero (); - } - const size_t indexBaseMatrix = maxNumEntPerRow * i + rowOffset; - for (LO j = 0; j < blockSize; ++j) { - assembleMatrix[(blockSize+1)*j] = basematrix[indexBaseMatrix]; - } - - lclColInds[0] = meshRowMap.getMinLocalIndex () + i; - blockMat.replaceLocalValues (lclRowInd, lclColInds.getRawPtr (), - &assembleMatrix[0], 1); - } - } - - blockMat.getLocalDiagCopy (blockDiag, diagonalOffsets); - Kokkos::fence (); - - for (LO lclMeshRow = 0; lclMeshRow < static_cast (numLocalMeshPoints); ++lclMeshRow) { - auto diagBlock = Kokkos::subview (blockDiag, lclMeshRow, ALL (), ALL ()); - auto ipiv = Kokkos::subview (pivots, lclMeshRow, ALL ()); - int info = 0; - Tpetra::GETF2 (diagBlock, ipiv, info); - TEST_EQUALITY( info, 0 ); - - // GETRI needs workspace. Use host space for now. - Teuchos::Array workVec (blockSize); - Kokkos::View - work (workVec.getRawPtr (), blockSize); - Tpetra::GETRI (diagBlock, ipiv, work, info); - TEST_EQUALITY( info, 0 ); - } - - blockMat.localGaussSeidel (residual, solution, blockDiag, - STS::one (), Tpetra::Symmetric); - Kokkos::fence (); - - for (LO lclRowInd = meshRowMap.getMinLocalIndex (); - lclRowInd <= meshRowMap.getMaxLocalIndex(); ++lclRowInd) { - const LO rowOffset = lclRowInd - meshRowMap.getMinLocalIndex (); - typename BV::little_host_vec_type xlcl = solution.getLocalBlock (lclRowInd); - ST* x = reinterpret_cast (xlcl.data ()); - for (LO k = 0; k < blockSize; ++k) { - TEST_FLOATING_EQUALITY( x[k], exactSolution[rowOffset], tol ); - x[k] = -STS::one (); - } - } - - // Final output - int lclSuccess = success; - int gblSuccess = 0; - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - } - // // Test conversion from CrsMatrix to BlockCrsMatrix. This test is valid only on // 1, 2, or 4 processes. (See note below.) 
@@ -2487,8 +2043,6 @@ namespace { TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, SetAllToScalar, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, ImportCopy, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, ExportDiffRowMaps, SCALAR, LO, GO, NODE ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, localGSDiagonalMatrix, SCALAR, LO, GO, NODE ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, localGSTriangularMatrices, SCALAR, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( BlockCrsMatrix, point2block, SCALAR, LO, GO, NODE ) TPETRA_ETI_MANGLING_TYPEDEFS() diff --git a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt index 5b24627929e8..100631006e67 100644 --- a/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt +++ b/packages/tpetra/core/test/CrsMatrix/CMakeLists.txt @@ -198,17 +198,6 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( STANDARD_PASS_OUTPUT ) -TRIBITS_ADD_EXECUTABLE_AND_TEST( - CrsMatrix_gaussSeidel - SOURCES - CrsMatrix_gaussSeidel - ${TEUCHOS_STD_UNIT_TEST_MAIN} - ARGS - COMM serial mpi - NUM_MPI_PROCS 1-4 - STANDARD_PASS_OUTPUT - ) - # This test only makes sense for exactly 2 MPI processes. TRIBITS_ADD_EXECUTABLE_AND_TEST( CrsMatrix_Bug5978 diff --git a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp b/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp deleted file mode 100644 index 6e870fd29f4f..000000000000 --- a/packages/tpetra/core/test/CrsMatrix/CrsMatrix_gaussSeidel.cpp +++ /dev/null @@ -1,969 +0,0 @@ -/* -// @HEADER -// *********************************************************************** -// -// Tpetra: Templated Linear Algebra Services Package -// Copyright (2008) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) -// -// ************************************************************************ -// @HEADER -*/ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace { - template - typename Teuchos::ScalarTraits::magnitudeType - norm2 (const MV& x) - { - typedef typename MV::scalar_type scalar_type; - typedef typename Teuchos::ScalarTraits::magnitudeType magnitude_type; - - Teuchos::Array normTemp (1); - x.norm2 (normTemp); - return normTemp[0]; - } - - // mfh 15 Oct 2014: Unfortunately, Teuchos::ScalarTraits - // doesn't define eps() if Scalar is int, unsigned int, etc. - // isOrdinal is true for integer types and false for floating-point - // types, so we can select on that. - template::isOrdinal> - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps (); - }; - - // Partial specialization for non-floating-point (i.e., integer) types. - template - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps () { - return static_cast (0); - } - }; - - // Partial specialization for floating-point types. - template - struct ComputeEps { - static typename Teuchos::ScalarTraits::magnitudeType eps () { - typedef Teuchos::ScalarTraits STS; - return STS::eps (); - } - }; - -// -// Tests for Tpetra::CrsMatrix::gaussSeidel(). -// -TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, gaussSeidelSerial, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType ) -{ - using Tpetra::createContigMapWithNode; - using Tpetra::createMultiVector; - using Tpetra::global_size_t; - using Tpetra::Map; - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::av_const_cast; - using Teuchos::broadcast; - using Teuchos::Comm; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::OrdinalTraits; - using Teuchos::outArg; - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::reduceAll; - using Teuchos::REDUCE_SUM; - using Teuchos::REDUCE_MIN; - using Teuchos::ScalarTraits; - using Teuchos::tuple; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::endl; - - typedef ScalarType scalar_type; - typedef LocalOrdinalType local_ordinal_type; - typedef GlobalOrdinalType global_ordinal_type; - typedef NodeType node_type; - - // Typedefs derived from the above canonical typedefs. - typedef ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - typedef ScalarTraits STM; - typedef Map map_type; - - // Abbreviation typedefs. - typedef scalar_type ST; - typedef local_ordinal_type LO; - typedef global_ordinal_type GO; - typedef node_type NT; - - // CrsMatrix specialization to use in this test. - typedef Tpetra::CrsMatrix crs_matrix_type; - - // CrsGraph specialization corresponding to crs_matrix_type (the - // CrsMatrix specialization). - typedef Tpetra::CrsGraph crs_graph_type; - - // MultiVector specialization corresponding to crs_matrix_type. - typedef Tpetra::MultiVector multivector_type; - // Vector specialization corresponding to crs_matrix_type. - typedef Tpetra::Vector vector_type; - - // This controls whether to print to std::cerr instead of out. - // Printing to std::cerr ensures that output appears before Kokkos - // gets the chance to raise errors. - constexpr bool debug = true; - - //////////////////////////////////////////////////////////////////// - // HERE BEGINS THE TEST. 
- //////////////////////////////////////////////////////////////////// - - const global_size_t INVALID = OrdinalTraits::invalid(); - - // Get the default communicator. - RCP > comm = Tpetra::getDefaultComm (); - const int numProcs = comm->getSize (); - const int myRank = comm->getRank (); - - if (debug) { - if (myRank == 0) { - cerr << "Test Tpetra's Gauss-Seidel with " << numProcs << " process" - << (numProcs != 1 ? "es" : "") << endl; - } - } - else { - out << "Test Tpetra's Gauss-Seidel with " << numProcs << " process" - << (numProcs != 1 ? "es" : "") << endl; - } - -#if 0 - // This test doesn't make much sense if there is only one MPI - // process. We let it pass trivially in that case. - if (numProcs == 1) { - out << "Number of processes in world is one; test passes trivially." << endl; - return; - } -#endif // 0 - - // Let's build ourselves the graph of a 2-D Laplacian (Dirichlet - // boundary conditions) on a square numGlobalPoints x - // numGlobalPoints mesh. Each process gets a numLocalPoints x - // numGlobalPoints slab. - - const LO numLocalPoints = 9; - const GO numGlobalPoints = numProcs * numLocalPoints; - - // Number of rows in the matrix owned by each process. - const LO numLocalRows = numLocalPoints * numGlobalPoints; - - //CrT: 4Feb14: the void trick does not seem to work, I get warnings - // Number of (global) rows and columns in the matrix. - //const GO numGlobalRows = numGlobalPoints * numGlobalPoints; - //const GO numGlobalCols = numGlobalRows; - // Prevent compile warning for unused variable. - // (It's not really "variable" if it's const, but oh well.) - //(void) numGlobalCols; - - if (debug) { - if (myRank == 0) { - cerr << "Creating contiguous row Map" << endl; - } - } - else { - out << "Creating contiguous row Map" << endl; - } - - // Create a contiguous row Map, with numLocalRows rows per process. - RCP rowMap = - createContigMapWithNode (INVALID, numLocalRows, comm); - - // The Gauss-Seidel kernel requires that the row, domain, and range - // Maps all be the same. - RCP domainMap = rowMap; - RCP rangeMap = rowMap; - - // Min and max row index of this process. - const GO globalMinRow = rowMap->getMinGlobalIndex (); - const GO globalMaxRow = rowMap->getMaxGlobalIndex (); - const GO globalMinAllRow = rowMap->getMinAllGlobalIndex (); - const GO globalMaxAllRow = rowMap->getMaxAllGlobalIndex (); - - if (debug) { - if (myRank == 0) { - cerr << "Creating graph" << endl; - } - } - else { - out << "Creating graph" << endl; - } - - // Create a numGlobalRows by numGlobalCols graph and set its - // structure. Every process sets its diagonal entries (which it - // owns), and entries 1 and numGlobalPoints away in each direction - // (if valid). - RCP graph; - { - // We have a good upper bound for the number of entries per row, so use static profile. 
- RCP nonconstGraph (new crs_graph_type (rowMap, 5, Tpetra::StaticProfile)); - - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - if (globalRow >= globalMinAllRow + numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - } - indices.push_back (globalRow); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - } - nonconstGraph->insertGlobalIndices (globalRow, indices ()); - } - - nonconstGraph->fillComplete (domainMap, rangeMap); - graph = rcp_const_cast (nonconstGraph); - } - - if (debug) { - if (myRank == 0) { - cerr << "Creating matrix" << endl; - } - } - else { - out << "Creating matrix" << endl; - } - - // Create the matrix, using the above graph. - RCP matrix (new crs_matrix_type (graph)); - { - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - Teuchos::Array values; - if (globalRow >= globalMinAllRow - numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - values.push_back (-STS::one ()); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - values.push_back (-STS::one ()); - } - indices.push_back (globalRow); - values.push_back (as (4)); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - values.push_back (-STS::one ()); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - values.push_back (-STS::one ()); - } - matrix->replaceGlobalValues (globalRow, indices (), values ()); - } - - if (debug) { - if (myRank == 0) { - cerr << "Calling fillComplete on the matrix" << endl; - } - } - else { - out << "Calling fillComplete on the matrix" << endl; - } - matrix->fillComplete (domainMap, rangeMap); - } - - const bool dumpMatrix = false; - if (dumpMatrix) { - const std::string filename ("A.mtx"); - const std::string scalarType = Teuchos::TypeNameTraits::name (); - const std::string matName = "A-" + scalarType; - typedef Tpetra::MatrixMarket::Writer writer_type; - writer_type::writeSparseFile (filename, matrix, matName, - "Gauss-Seidel test matrix"); - } - - if (debug) { - if (myRank == 0) { - cerr << "Extracting inverse diagonal" << endl; - } - } - else { - out << "Extracting inverse diagonal" << endl; - } - RCP D = rcp (new vector_type (rowMap)); - matrix->getLocalDiagCopy (*D); // Get the diagonal entries. - { - // Check whether any diagonal entries are zero, and invert the - // entries in place. It's faster to do the latter with a Kokkos - // kernel, but this is good enough as a test. 
- typedef typename ArrayRCP::size_type size_type; - size_type zeroDiagEltIndex = -1; - ArrayRCP D_data = D->getDataNonConst (); - for (size_type k = 0; k < D_data.size (); ++k) { - if (D_data[k] == STS::zero ()) { - zeroDiagEltIndex = k; - } else { - D_data[k] = STS::one() / D_data[k]; - } - } - TEST_EQUALITY_CONST(zeroDiagEltIndex, static_cast (-1)); - // TEUCHOS_TEST_FOR_EXCEPTION( - // zeroDiagEltIndex != -1, - // std::logic_error, - // "On Process " << comm->getRank () << ", diagonal element " - // << zeroDiagEltIndex << ", possibly among others, is zero."); - } - - if (debug) { - if (myRank == 0) { - cerr << "Making vectors" << endl; - } - } - else { - out << "Making vectors" << endl; - } - - // Make (multi)vectors for the initial guess (which will also be the - // solution vector), the right-hand side, and the residual. We're - // only testing a single right-hand side for now. Make all the - // vectors first in the column Map, with the actual vector given to - // Gauss-Seidel in the domain / range Map. - - RCP colMap = graph->getColMap (); - RCP X_colMap = createMultiVector (colMap, 1); - RCP X_exact_colMap = createMultiVector (colMap, 1); - RCP B_colMap = createMultiVector (colMap, 1); - RCP R_colMap = createMultiVector (colMap, 1); - RCP X = X_colMap->offsetViewNonConst (domainMap, 0); - RCP X_exact = X_exact_colMap->offsetViewNonConst (domainMap, 0); - RCP B = B_colMap->offsetViewNonConst (rangeMap, 0); - RCP R = R_colMap->offsetViewNonConst (rangeMap, 0); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Set (random) exact solution X_exact" - << endl; - cerr << os.str (); - } - else { - out << "Set (random) exact solution X_exact" << endl; - } - X_exact->randomize (); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute right-hand side B" << endl; - cerr << os.str (); - } - else { - out << "Compute right-hand side B" << endl; - } - matrix->apply (*X_exact, *B); - - const int maxNumIters = 10; - Array residNorms (maxNumIters+1); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute initial residual" << endl; - cerr << os.str (); - } - else { - out << "Compute initial residual" << endl; - } - - // Compute initial residual R = B - A * X and ||R||_2. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[0] = norm2 (*R); - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Compute norms of X, D, and B" << endl; - cerr << os.str (); - } - else { - out << "Compute norms of X, D, and B" << endl; - } - - // Compute norms of X, D, and B. - // The norms of D and B must not change. - const magnitude_type X_norm_orig = norm2 (*X); - const magnitude_type D_norm_orig = norm2 (*D); - const magnitude_type B_norm_orig = norm2 (*B); - if (myRank == 0) { - std::ostream& os = debug ? cerr : out; - os << "Before iterating:" << endl - << "- ||R||_2 = " << residNorms[0] << endl - << "- ||X||_2 = " << X_norm_orig << endl - << "- ||B||_2 = " << B_norm_orig << endl - << "- ||D||_2 = " << D_norm_orig << endl; - } - - // Monitor the norms of (X,) D, and B. If the norms of D or B - // change, that means Gauss-Seidel is broken (or we mixed up the - // order of its arguments). 
- magnitude_type X_norm = X_norm_orig; - magnitude_type D_norm = D_norm_orig; - magnitude_type B_norm = B_norm_orig; - - if (debug) { - std::ostringstream os; - os << "Proc " << myRank << ": Test CrsMatrix::gaussSeidel" << endl; - cerr << os.str (); - } - else { - out << "Test CrsMatrix::gaussSeidel" << endl; - } - - int localSuccess = 1; - int globalSuccess = 1; - int smallestFailingRank = 0; - for (int iter = 0; iter < maxNumIters; ++iter) { - std::string exMsg; - try { - matrix->gaussSeidel (*B, *X, *D, STS::one(), Tpetra::Symmetric, 1); - } - catch (std::exception& e) { - exMsg = e.what (); - localSuccess = 0; - } - reduceAll (*comm, REDUCE_SUM, localSuccess, outArg (globalSuccess)); - if (globalSuccess < numProcs) { - // Compute min rank that failed. For procs that didn't fail, - // set their "rank" to numProcs. - const int inRank = (localSuccess == 1) ? numProcs : myRank; - reduceAll (*comm, REDUCE_MIN, inRank, outArg (smallestFailingRank)); - // The min-failing-rank process broadcasts the error message's - // length and the message itself to all the other processes. - int msgLen = as (exMsg.size ()); - broadcast (*comm, smallestFailingRank, outArg (msgLen)); - exMsg.reserve (msgLen); - char* const exMsgRaw = const_cast (exMsg.c_str ()); - broadcast (*comm, smallestFailingRank, msgLen, exMsgRaw); - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, exMsg); - } - - // Compute the new residual R = B - A * X. This is not part of - // Gauss-Seidel itself, but we use it to measure convergence. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[iter+1] = norm2 (*R); - - X_norm = norm2 (*X); - D_norm = norm2 (*D); - B_norm = norm2 (*B); - if (myRank == 0) { - out << "After iteration " << iter+1 << " of " << maxNumIters << ":" << endl - << "- ||R||_2 = " << residNorms[iter+1] << endl - << "- ||X||_2 = " << X_norm << endl - << "- ||B||_2 = " << B_norm << endl - << "- ||D||_2 = " << D_norm << endl; - } - } - - // The test passes if - // - // 1. The norms of B and D did not change. - // 2. The residual norm decreased. - // - // It would be better if we had a specific decrease rate in mind, - // but we'll leave that for later. - // - // FIXME (mfh 01 Jan 2013) This test assumes that norms are computed - // deterministically. This is not necessarily correct, even when - // running in MPI-only (no hybrid parallelism) mode. Thus, we need - // some kind of tolerance for these tests. For the prefactor, - // square root of N (the usual heuristic) was not enough; we had to - // use N instead. - const magnitude_type testTolPrefactor = - static_cast (B->getGlobalLength ()); - const magnitude_type testTol = - testTolPrefactor * ComputeEps::eps (); - - const magnitude_type B_norm_diff = STM::magnitude (B_norm - B_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - B_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of B! |B_norm_orig - B_norm| = " - << B_norm_diff << " > testTol = " << testTol << ". That means either the " - "Gauss-Seidel implementation is broken, or we mixed up the order of its " - "arguments. Original ||B||_2 = " << B_norm_orig << "; new ||B||_2 = " - << B_norm << "."); - - const magnitude_type D_norm_diff = STM::magnitude (D_norm - D_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - D_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of D (the vector of diagonal entries of the " - "matrix)! |D_norm_orig - D_norm| = " << D_norm_diff << " > testTol = " - << testTol << ". 
That means either the Gauss-Seidel implementation is " - "broken, or we mixed up the order of its arguments. Original ||D||_2 = " - << D_norm_orig << "; new ||D||_2 = " << D_norm << "."); - - TEUCHOS_TEST_FOR_EXCEPTION( - maxNumIters > 0 && residNorms[maxNumIters] > residNorms[0], - std::logic_error, - "Gauss-Seidel failed to reduce the residual norm after " << maxNumIters - << " iterations! Original ||R||_2 = " << residNorms[0] << "; final " - "||R||_2 = " << residNorms[maxNumIters] << "."); - - { - const int lclSuccess = success ? 1 : 0; - int gblSuccess = 0; // output argument - reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); - TEST_EQUALITY_CONST( gblSuccess, 1 ); - if (gblSuccess != 1) { - std::ostream& out2 = debug ? cerr : out; - out2 << "Test failed on at least one process!" << endl; - } - } -} - - -// -// Tests for Tpetra::CrsMatrix::gaussSeidel(). -// -TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( CrsMatrix, reorderedGaussSeidelSerial, LocalOrdinalType, GlobalOrdinalType, ScalarType, NodeType ) -{ - using Tpetra::createContigMapWithNode; - using Tpetra::createMultiVector; - using Tpetra::global_size_t; - using Tpetra::Map; - using Teuchos::Array; - using Teuchos::ArrayRCP; - using Teuchos::ArrayView; - using Teuchos::as; - using Teuchos::av_const_cast; - using Teuchos::broadcast; - using Teuchos::Comm; - using Teuchos::RCP; - using Teuchos::rcp; - using Teuchos::rcp_const_cast; - using Teuchos::OrdinalTraits; - using Teuchos::outArg; - using Teuchos::ParameterList; - using Teuchos::parameterList; - using Teuchos::reduceAll; - using Teuchos::REDUCE_SUM; - using Teuchos::REDUCE_MIN; - using Teuchos::ScalarTraits; - using Teuchos::tuple; - using Teuchos::TypeNameTraits; - using std::cerr; - using std::endl; - - typedef ScalarType scalar_type; - typedef LocalOrdinalType local_ordinal_type; - typedef GlobalOrdinalType global_ordinal_type; - typedef NodeType node_type; - - // Typedefs derived from the above canonical typedefs. - typedef ScalarTraits STS; - typedef typename STS::magnitudeType magnitude_type; - typedef ScalarTraits STM; - typedef Map map_type; - - // Abbreviation typedefs. - typedef scalar_type ST; - typedef local_ordinal_type LO; - typedef global_ordinal_type GO; - typedef node_type NT; - - // CrsMatrix specialization to use in this test. - typedef Tpetra::CrsMatrix crs_matrix_type; - - // CrsGraph specialization corresponding to crs_matrix_type (the - // CrsMatrix specialization). - typedef Tpetra::CrsGraph crs_graph_type; - - // MultiVector specialization corresponding to crs_matrix_type. - typedef Tpetra::MultiVector multivector_type; - // Vector specialization corresponding to crs_matrix_type. - typedef Tpetra::Vector vector_type; - - - //////////////////////////////////////////////////////////////////// - // HERE BEGINS THE TEST. - //////////////////////////////////////////////////////////////////// - - const global_size_t INVALID = OrdinalTraits::invalid(); - - // Get the default communicator. - RCP > comm = Tpetra::getDefaultComm (); - const int numProcs = comm->getSize (); - const int myRank = comm->getRank (); - - if (myRank == 0) { - out << "Test with " << numProcs << " process" << (numProcs != 1 ? "es" : "") << endl; - } - -#if 0 - // This test doesn't make much sense if there is only one MPI - // process. We let it pass trivially in that case. - if (numProcs == 1) { - out << "Number of processes in world is one; test passes trivially." 
<< endl; - return; - } -#endif // 0 - - // Let's build ourselves the graph of a 2-D Laplacian (Dirichlet - // boundary conditions) on a square numGlobalPoints x - // numGlobalPoints mesh. Each process gets a numLocalPoints x - // numGlobalPoints slab. - - const LO numLocalPoints = 9; - const GO numGlobalPoints = numProcs * numLocalPoints; - - // Number of rows in the matrix owned by each process. - const LO numLocalRows = numLocalPoints * numGlobalPoints; - - //CrT: 4Feb14: the void trick does not seem to work, I get warnings - // Number of (global) rows and columns in the matrix. - //const GO numGlobalRows = numGlobalPoints * numGlobalPoints; - //const GO numGlobalCols = numGlobalRows; - // Prevent compile warning for unused variable. - // (It's not really "variable" if it's const, but oh well.) - //(void) numGlobalCols; - - if (myRank == 0) { - out << "Creating contiguous row Map" << endl; - } - - // Create a contiguous row Map, with numLocalRows rows per process. - RCP rowMap = createContigMapWithNode (INVALID, numLocalRows, comm); - - // The Gauss-Seidel kernel requires that the row, domain, and range - // Maps all be the same. - RCP domainMap = rowMap; - RCP rangeMap = rowMap; - - // Min and max row index of this process. - const GO globalMinRow = rowMap->getMinGlobalIndex (); - const GO globalMaxRow = rowMap->getMaxGlobalIndex (); - const GO globalMinAllRow = rowMap->getMinAllGlobalIndex (); - const GO globalMaxAllRow = rowMap->getMaxAllGlobalIndex (); - - if (myRank == 0) { - out << "Creating graph" << endl; - } - - // Create a numGlobalRows by numGlobalCols graph and set its - // structure. Every process sets its diagonal entries (which it - // owns), and entries 1 and numGlobalPoints away in each direction - // (if valid). - RCP graph; - { - // We have a good upper bound for the number of entries per row, so use static profile. - RCP nonconstGraph (new crs_graph_type (rowMap, 5, Tpetra::StaticProfile)); - - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - if (globalRow >= globalMinAllRow + numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - } - indices.push_back (globalRow); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - } - nonconstGraph->insertGlobalIndices (globalRow, indices ()); - } - - nonconstGraph->fillComplete (domainMap, rangeMap); - graph = rcp_const_cast (nonconstGraph); - } - - if (myRank == 0) { - out << "Creating matrix" << endl; - } - - // Create the matrix, using the above graph. 
- RCP matrix (new crs_matrix_type (graph)); - { - for (GO globalRow = globalMinRow; globalRow <= globalMaxRow; ++globalRow) { - Teuchos::Array indices; - Teuchos::Array values; - if (globalRow >= globalMinAllRow - numGlobalPoints) { - indices.push_back (globalRow - numGlobalPoints); - values.push_back (-STS::one ()); - } - if (globalRow >= globalMinAllRow + 1) { - indices.push_back (globalRow - 1); - values.push_back (-STS::one ()); - } - indices.push_back (globalRow); - values.push_back (as (4)); - if (globalRow + 1 <= globalMaxAllRow) { - indices.push_back (globalRow + 1); - values.push_back (-STS::one ()); - } - if (globalRow + numGlobalPoints <= globalMaxAllRow) { - indices.push_back (globalRow + numGlobalPoints); - values.push_back (-STS::one ()); - } - matrix->replaceGlobalValues (globalRow, indices (), values ()); - } - - if (myRank == 0) { - out << "Calling fillComplete on the matrix" << endl; - } - matrix->fillComplete (domainMap, rangeMap); - } - - const bool dumpMatrix = false; - if (dumpMatrix) { - const std::string filename ("A.mtx"); - const std::string scalarType = Teuchos::TypeNameTraits::name (); - const std::string matName = "A-" + scalarType; - typedef Tpetra::MatrixMarket::Writer writer_type; - writer_type::writeSparseFile (filename, matrix, matName, "Gauss-Seidel test matrix"); - } - - if (myRank == 0) { - out << "Extracting inverse diagonal" << endl; - } - RCP D = rcp (new vector_type (rowMap)); - matrix->getLocalDiagCopy (*D); // Get the diagonal entries. - { - // Check whether any diagonal entries are zero, and invert the - // entries in place. It's faster to do the latter with a Kokkos - // kernel, but this is good enough as a test. - typedef typename ArrayRCP::size_type size_type; - size_type zeroDiagEltIndex = -1; - ArrayRCP D_data = D->getDataNonConst (); - for (size_type k = 0; k < D_data.size (); ++k) { - if (D_data[k] == STS::zero ()) { - zeroDiagEltIndex = k; - } else { - D_data[k] = STS::one() / D_data[k]; - } - } - TEST_EQUALITY_CONST(zeroDiagEltIndex, static_cast (-1)); - // TEUCHOS_TEST_FOR_EXCEPTION( - // zeroDiagEltIndex != -1, - // std::logic_error, - // "On Process " << comm->getRank () << ", diagonal element " - // << zeroDiagEltIndex << ", possibly among others, is zero."); - } - - if (myRank == 0) { - out << "Making vectors" << endl; - } - - // Make (multi)vectors for the initial guess (which will also be the - // solution vector), the right-hand side, and the residual. We're - // only testing a single right-hand side for now. Make all the - // vectors first in the column Map, with the actual vector given to - // Gauss-Seidel in the domain / range Map. - - RCP colMap = graph->getColMap (); - RCP X_colMap = createMultiVector (colMap, 1); - RCP X_exact_colMap = createMultiVector (colMap, 1); - RCP B_colMap = createMultiVector (colMap, 1); - RCP R_colMap = createMultiVector (colMap, 1); - RCP X = X_colMap->offsetViewNonConst (domainMap, 0); - RCP X_exact = X_exact_colMap->offsetViewNonConst (domainMap, 0); - RCP B = B_colMap->offsetViewNonConst (rangeMap, 0); - RCP R = R_colMap->offsetViewNonConst (rangeMap, 0); - - // Set the exact solution and right-hand side. - X_exact->randomize (); - matrix->apply (*X_exact, *B); - - const int maxNumIters = 10; - Array residNorms (maxNumIters+1); - - - // Compute initial residual R = B - A * X and ||R||_2. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[0] = norm2 (*R); - - // Compute norms of X, D, and B. - // The norms of D and B must not change. 
- const magnitude_type X_norm_orig = norm2 (*X); - const magnitude_type D_norm_orig = norm2 (*D); - const magnitude_type B_norm_orig = norm2 (*B); - if (myRank == 0) { - out << "Before iterating:" << endl - << "- ||R||_2 = " << residNorms[0] << endl - << "- ||X||_2 = " << X_norm_orig << endl - << "- ||B||_2 = " << B_norm_orig << endl - << "- ||D||_2 = " << D_norm_orig << endl; - } - - // Ordering vector - Teuchos::Array rowIndices(matrix->getNodeNumRows()); - for( LO i=0; (size_t)i < matrix->getNodeNumRows(); i++) - rowIndices[i]=i; - - - // Monitor the norms of (X,) D, and B. If the norms of D or B - // change, that means Gauss-Seidel is broken (or we mixed up the - // order of its arguments). - magnitude_type X_norm = X_norm_orig; - magnitude_type D_norm = D_norm_orig; - magnitude_type B_norm = B_norm_orig; - - int localSuccess = 1; - int globalSuccess = 1; - int smallestFailingRank = 0; - for (int iter = 0; iter < maxNumIters; ++iter) { - std::string exMsg; - try { - matrix->reorderedGaussSeidel (*B, *X, *D, rowIndices,STS::one(), Tpetra::Symmetric, 1); - } - catch (std::exception& e) { - exMsg = e.what (); - localSuccess = 0; - } - reduceAll (*comm, REDUCE_SUM, localSuccess, outArg (globalSuccess)); - if (globalSuccess < numProcs) { - // Compute min rank that failed. For procs that didn't fail, - // set their "rank" to numProcs. - const int inRank = (localSuccess == 1) ? numProcs : myRank; - reduceAll (*comm, REDUCE_MIN, inRank, outArg (smallestFailingRank)); - // The min-failing-rank process broadcasts the error message's - // length and the message itself to all the other processes. - int msgLen = as (exMsg.size ()); - broadcast (*comm, smallestFailingRank, outArg (msgLen)); - exMsg.reserve (msgLen); - char* const exMsgRaw = const_cast (exMsg.c_str ()); - broadcast (*comm, smallestFailingRank, msgLen, exMsgRaw); - TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, exMsg); - } - - // Compute the new residual R = B - A * X. This is not part of - // Gauss-Seidel itself, but we use it to measure convergence. - Tpetra::Details::residual(*matrix,*X,*B,*R); - residNorms[iter+1] = norm2 (*R); - - X_norm = norm2 (*X); - D_norm = norm2 (*D); - B_norm = norm2 (*B); - if (myRank == 0) { - out << "After iteration " << iter+1 << " of " << maxNumIters << ":" << endl - << "- ||R||_2 = " << residNorms[iter+1] << endl - << "- ||X||_2 = " << X_norm << endl - << "- ||B||_2 = " << B_norm << endl - << "- ||D||_2 = " << D_norm << endl; - } - } - - // The test passes if - // - // 1. The norms of B and D did not change. - // 2. The residual norm decreased. - // - // It would be better if we had a specific decrease rate in mind, - // but we'll leave that for later. - // - // FIXME (mfh 01 Jan 2013) This test assumes that norms are computed - // deterministically. This is not necessarily correct, even when - // running in MPI-only (no hybrid parallelism) mode. Thus, we - // really need some kind of tolerance for these tests. For the - // prefactor, square root of N (the usual heuristic) was not enough; - // we had to use N instead. - const magnitude_type testTolPrefactor = - static_cast (B->getGlobalLength ()); - const magnitude_type testTol = - testTolPrefactor * ComputeEps::eps (); - - const magnitude_type B_norm_diff = STM::magnitude (B_norm - B_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - B_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of B! |B_norm_orig - B_norm| = " - << B_norm_diff << " > testTol = " << testTol << ". 
That means either the " - "Gauss-Seidel implementation is broken, or we mixed up the order of its " - "arguments. Original ||B||_2 = " << B_norm_orig << "; new ||B||_2 = " - << B_norm << "."); - - const magnitude_type D_norm_diff = STM::magnitude (D_norm - D_norm_orig); - TEUCHOS_TEST_FOR_EXCEPTION( - D_norm_diff > testTol, std::logic_error, - "Gauss-Seidel changed the norm of D (the vector of diagonal entries of the " - "matrix)! |D_norm_orig - D_norm| = " << D_norm_diff << " > testTol = " - << testTol << ". That means either the Gauss-Seidel implementation is " - "broken, or we mixed up the order of its arguments. Original ||D||_2 = " - << D_norm_orig << "; new ||D||_2 = " << D_norm << "."); - - TEUCHOS_TEST_FOR_EXCEPTION( - maxNumIters > 0 && residNorms[maxNumIters] > residNorms[0], - std::logic_error, - "Gauss-Seidel failed to reduce the residual norm after " << maxNumIters - << " iterations! Original ||R||_2 = " << residNorms[0] << "; final " - "||R||_2 = " << residNorms[maxNumIters] << "."); -} - - -////////////////////////////////////////////////////////////////////// -// INSTANTIATE THE TEMPLATED UNIT TESTS -////////////////////////////////////////////////////////////////////// - -typedef Tpetra::Details::DefaultTypes::node_type default_node_type; -#define UNIT_TEST_GROUP( SCALAR, LO, GO ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, gaussSeidelSerial, LO, GO, SCALAR, default_node_type ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_4_INSTANT( CrsMatrix, reorderedGaussSeidelSerial, LO, GO, SCALAR, default_node_type ) - -TPETRA_ETI_MANGLING_TYPEDEFS() - -TPETRA_INSTANTIATE_SLG( UNIT_TEST_GROUP ) - -} // namespace (anonymous) - -
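The deleted gaussSeidelSerial and reorderedGaussSeidelSerial tests assert two properties: the sweeps must leave B and the inverse-diagonal vector D unchanged (to within a tolerance of N * eps, with N the global vector length), and ten symmetric sweeps must not increase ||B - A X||_2. Below is a minimal standalone sketch of that check on a 1-D Laplacian; it is plain C++ with hypothetical names (Csr, gaussSeidelSweep, residualNorm), not the Tpetra gaussSeidel implementation or the deleted test code.

#include <cmath>
#include <cstddef>
#include <vector>

struct Csr {  // scalar CSR matrix
  std::vector<std::size_t> rowptr;
  std::vector<int> colind;
  std::vector<double> vals;
};

// One damped Gauss-Seidel sweep: x_i += omega * dinv_i * (b_i - (A x)_i),
// in forward or backward row order. Only x is modified.
static void gaussSeidelSweep(const Csr& A, const std::vector<double>& dinv,
                             const std::vector<double>& b, std::vector<double>& x,
                             double omega, bool forward) {
  const std::ptrdiff_t n = static_cast<std::ptrdiff_t>(x.size());
  for (std::ptrdiff_t s = 0; s < n; ++s) {
    const std::ptrdiff_t i = forward ? s : n - 1 - s;
    double r = b[i];
    for (std::size_t jj = A.rowptr[i]; jj < A.rowptr[i + 1]; ++jj)
      r -= A.vals[jj] * x[A.colind[jj]];
    x[i] += omega * dinv[i] * r;  // dinv holds 1/a_ii, like the inverted D in the test
  }
}

static double residualNorm(const Csr& A, const std::vector<double>& b,
                           const std::vector<double>& x) {
  double nrm2 = 0.0;
  for (std::size_t i = 0; i + 1 < A.rowptr.size(); ++i) {
    double r = b[i];
    for (std::size_t jj = A.rowptr[i]; jj < A.rowptr[i + 1]; ++jj)
      r -= A.vals[jj] * x[A.colind[jj]];
    nrm2 += r * r;
  }
  return std::sqrt(nrm2);
}

int main() {
  // 1-D Laplacian [-1 2 -1] on 5 points, analogous in spirit to the 2-D
  // Laplacian the deleted tests assemble.
  const int n = 5;
  Csr A;
  A.rowptr.push_back(0);
  for (int i = 0; i < n; ++i) {
    if (i > 0)     { A.colind.push_back(i - 1); A.vals.push_back(-1.0); }
    A.colind.push_back(i); A.vals.push_back(2.0);
    if (i + 1 < n) { A.colind.push_back(i + 1); A.vals.push_back(-1.0); }
    A.rowptr.push_back(A.colind.size());
  }
  std::vector<double> dinv(n, 0.5), b(n, 1.0), x(n, 0.0);
  const double r0 = residualNorm(A, b, x);
  for (int iter = 0; iter < 10; ++iter) {  // symmetric sweep: forward then backward
    gaussSeidelSweep(A, dinv, b, x, 1.0, true);
    gaussSeidelSweep(A, dinv, b, x, 1.0, false);
  }
  return residualNorm(A, b, x) <= r0 ? 0 : 1;  // the invariant the tests assert
}

Here a "symmetric" sweep is a forward pass followed by a backward pass, mirroring the Tpetra::Symmetric direction the tests request, and the diagonal is passed already inverted, matching how the tests invert D_data in place before calling gaussSeidel. Because b and dinv are taken by const reference, the "norm of B and D unchanged" property holds by construction in this sketch.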