diff --git a/packages/ifpack2/example/BlockTriDi.cpp b/packages/ifpack2/example/BlockTriDi.cpp
index 1c7bd3012570..3277179656c1 100644
--- a/packages/ifpack2/example/BlockTriDi.cpp
+++ b/packages/ifpack2/example/BlockTriDi.cpp
@@ -18,7 +18,7 @@ namespace { // (anonymous)
 
 // Values of command-line arguments.
 struct CmdLineArgs {
-  CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),useStackedTimer(false),overlapCommAndComp(false){}
+  CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),sublinesPerLineSchur(1),useStackedTimer(false),overlapCommAndComp(false){}
 
   std::string mapFilename;
   std::string matrixFilename;
@@ -35,6 +35,7 @@ struct CmdLineArgs {
   int my;
   int mz;
   int sublinesPerLine;
+  int sublinesPerLineSchur;
   bool useStackedTimer;
   bool overlapCommAndComp;
   std::string problemName;
@@ -71,6 +72,7 @@ getCmdLineArgs (CmdLineArgs& args, int argc, char* argv[])
 		  "Whether to run with overlapCommAndComp)");
   cmdp.setOption("problemName", &args.problemName, "Human-readable problem name for Watchr plot");
   cmdp.setOption("matrixType", &args.matrixType, "matrixType");
+  cmdp.setOption("sublinesPerLineSchur", &args.sublinesPerLineSchur, "sublinesPerLineSchur");
   auto result = cmdp.parse (argc, argv);
   return result == Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL;
 }
@@ -573,7 +575,7 @@ main (int argc, char* argv[])
 
   {
     Teuchos::TimeMonitor precSetupTimeMon (*precSetupTime);
-    precond = rcp(new BTDC(Ablock,parts,args.overlapCommAndComp));
+    precond = rcp(new BTDC(Ablock,parts,args.sublinesPerLineSchur,args.overlapCommAndComp));
 
     if(rank0) std::cout<<"Initializing preconditioner..."<<std::endl;
     precond->initialize ();
diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp
new file mode 100644
index 000000000000..a50f054a626a
--- /dev/null
+++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp
@@ -0,0 +1,859 @@
+/*@HEADER
+// ***********************************************************************
+//
+//       Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
+//                 Copyright (2009) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+//@HEADER
+*/
+
+#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
+#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP
+
+#include "Ifpack2_BlockHelper.hpp"
+
+namespace Ifpack2 {
+
+  namespace BlockHelperDetails {
+
+    ///
+    /// A - Tridiags(A), i.e., R in the splitting A = D + R.
+    ///
+    template <typename MatrixType>
+    struct AmD {
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
+      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
+      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
+      // rowptr points to the start of each row of A_colindsub.
+      size_type_1d_view rowptr, rowptr_remote;
+      // Indices into A's rows giving the blocks to extract. rowptr(i) points to
+      // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
+      // where g is A's graph, are the columns AmD uses. If seq_method_, then
+      // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
+      // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
+      // contains the remote ones.
+      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
+
+      // Currently always true.
+      bool is_tpetra_block_crs;
+
+      // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
+      impl_scalar_type_1d_view_tpetra tpetra_values;
+
+      AmD() = default;
+      AmD(const AmD &b) = default;
+    };
+
+    template<typename MatrixType>
+    struct PartInterface {
+      using local_ordinal_type = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type;
+      using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view;
+      using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_2d_view;
+
+      PartInterface() = default;
+      PartInterface(const PartInterface &b) = default;
+
+      // Some terms:
+      //   The matrix A is split as A = D + R, where D is the matrix of tridiag
+      // blocks and R is the remainder.
+      //   A part is roughly a synonym for a tridiag. The distinction is that a part
+      // is the set of rows belonging to one tridiag and, equivalently, the off-diag
+      // rows in R associated with that tridiag. In contrast, the term tridiag is
+      // used to refer specifically to tridiag data, such as the pointer into the
+      // tridiag data array.
+      //   Local (lcl) row are the LIDs. lclrow lists the LIDs belonging to each
+      // tridiag, and partptr points to the beginning of each tridiag. This is the
+      // LID space.
+      //   Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
+      // by this ordinal. This is the 'index' space.
+      //   A flat index is the mathematical index into an array. A pack index
+      // accounts for SIMD packing.
+
+      // Local row LIDs. Permutation from caller's index space to tridiag index
+      // space.
+      local_ordinal_type_1d_view lclrow;
+      // partptr_ is the pointer array into lclrow_.
+      local_ordinal_type_1d_view partptr; // np+1
+      local_ordinal_type_2d_view partptr_sub;
+      local_ordinal_type_1d_view partptr_schur;
+      // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
+      // is the start of the i'th pack.
+      local_ordinal_type_1d_view packptr; // npack+1
+      local_ordinal_type_1d_view packptr_sub;
+      local_ordinal_type_1d_view packindices_sub;
+      local_ordinal_type_1d_view packindices_schur;
+      // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
+      // an alias of partptr_ in the case of no overlap.
+      local_ordinal_type_1d_view part2rowidx0; // np+1
+      local_ordinal_type_1d_view part2rowidx0_sub;
+      // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
+      // it's the same as part2rowidx0_; if it's > 1, then the value is combined
+      // with i % vector_length to get the location in the packed data.
+      local_ordinal_type_1d_view part2packrowidx0; // np+1
+      local_ordinal_type_2d_view part2packrowidx0_sub;
+      local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
+      // rowidx2part_ maps the row index to the part index.
+      local_ordinal_type_1d_view rowidx2part; // nr
+      local_ordinal_type_1d_view rowidx2part_sub;
+      // True if lcl{row|col} is at most a constant away from row{idx|col}. In
+      // practice, this knowledge is not particularly useful, as packing for batched
+      // processing is done at the same time as the permutation from LID to index
+      // space. But it's easy to detect, so it's recorded in case an optimization
+      // can be made based on it.
+      bool row_contiguous;
+
+      local_ordinal_type max_partsz;
+      local_ordinal_type max_subpartsz;
+      local_ordinal_type n_subparts_per_part;
+      local_ordinal_type nparts;
+    };
+
+    ///
+    /// compute local residula vector y = b - R x
+    ///
+    static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
+                                                                     const int team_size) {
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  32; // 64
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return total_team_size/team_size;
+    }
+
+    static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
+								    const int team_size) {
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  32; // 64
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return total_team_size/team_size;
+    }
+
+    static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
+								     const int team_size) {
+      int total_team_size(0);
+      if      (blksize <=  5) total_team_size =  32;
+      else if (blksize <=  9) total_team_size =  32; // 64
+      else if (blksize <= 12) total_team_size =  96;
+      else if (blksize <= 16) total_team_size = 128;
+      else if (blksize <= 20) total_team_size = 160;
+      else                    total_team_size = 160;
+      return total_team_size/team_size;
+    }
+
+    template<typename T>
+    static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
+                                                                 const int team_size) {
+      if ( is_cuda<T>::value )
+        return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
+      if ( is_hip<T>::value )
+        return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
+      if ( is_sycl<T>::value )
+        return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
+      return -1;
+    }
+
+    template<typename MatrixType>
+    struct ComputeResidualVector {
+    public:
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
+      using node_device_type = typename impl_type::node_device_type;
+      using execution_space = typename impl_type::execution_space;
+      using memory_space = typename impl_type::memory_space;
+
+      using local_ordinal_type = typename impl_type::local_ordinal_type;
+      using size_type = typename impl_type::size_type;
+      using impl_scalar_type = typename impl_type::impl_scalar_type;
+      using magnitude_type = typename impl_type::magnitude_type;
+      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
+      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
+      /// views
+      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
+      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
+      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
+      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
+      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
+      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
+      static constexpr int vector_length = impl_type::vector_length;
+
+      /// team policy member type (used in cuda)
+      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
+
+      // enum for max blocksize and vector length
+      enum : int { max_blocksize = 32 };
+
+    private:
+      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
+      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
+      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
+      Unmanaged<impl_scalar_type_2d_view_tpetra> y;
+      Unmanaged<vector_type_3d_view> y_packed;
+      Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
+
+      // AmD information
+      const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
+      const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
+      const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
+
+      // block crs graph information
+      // for cuda (kokkos crs graph uses a different size_type from size_t)
+      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
+      const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
+
+      // blocksize
+      const local_ordinal_type blocksize_requested;
+
+      // part interface
+      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
+      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
+      const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
+      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
+      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
+      const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
+      const bool is_dm2cm_active;
+
+    public:
+      template<typename LocalCrsGraphType>
+      ComputeResidualVector(const AmD<MatrixType> &amd,
+                            const LocalCrsGraphType &graph,
+                            const local_ordinal_type &blocksize_requested_,
+                            const PartInterface<MatrixType> &interf,
+                            const local_ordinal_type_1d_view &dm2cm_)
+        : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
+          colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
+          tpetra_values(amd.tpetra_values),
+          A_rowptr(graph.row_map),
+          A_colind(graph.entries),
+          blocksize_requested(blocksize_requested_),
+          part2packrowidx0(interf.part2packrowidx0),
+          part2rowidx0(interf.part2rowidx0),
+          rowidx2part(interf.rowidx2part),
+          partptr(interf.partptr),
+          lclrow(interf.lclrow),
+          dm2cm(dm2cm_),
+          is_dm2cm_active(dm2cm_.span() > 0)
+      {}
+
+      inline
+      void
+      SerialGemv(const local_ordinal_type &blocksize,
+                 const impl_scalar_type * const KOKKOS_RESTRICT AA,
+                 const impl_scalar_type * const KOKKOS_RESTRICT xx,
+                 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
+        using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
+        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
+          impl_scalar_type val = 0;
+#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
+#   pragma ivdep
+#endif
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
+#   pragma unroll
+#endif
+          for (local_ordinal_type k1=0;k1<blocksize;++k1)
+            val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
+          yy[k0] -= val;
+        }
+      }
+
+      template<typename bbViewType, typename yyViewType>
+      KOKKOS_INLINE_FUNCTION
+      void
+      VectorCopy(const member_type &member,
+                 const local_ordinal_type &blocksize,
+                 const bbViewType &bb,
+                 const yyViewType &yy) const {
+        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0)  {
+            yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
+          });
+      }
+
+      template<typename AAViewType, typename xxViewType, typename yyViewType>
+      KOKKOS_INLINE_FUNCTION
+      void
+      TeamVectorGemv(const member_type &member,
+               const local_ordinal_type &blocksize,
+               const AAViewType &AA,
+               const xxViewType &xx,
+               const yyViewType &yy) const {
+        Kokkos::parallel_for
+          (Kokkos::TeamThreadRange(member, blocksize),
+           [&](const local_ordinal_type &k0) {
+            impl_scalar_type val = 0;
+            Kokkos::parallel_for
+              (Kokkos::ThreadVectorRange(member, blocksize),
+               [&](const local_ordinal_type &k1) {
+                val += AA(k0,k1)*xx(k1);
+              });
+            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
+          });
+      }
+
+      template<typename AAViewType, typename xxViewType, typename yyViewType>
+      KOKKOS_INLINE_FUNCTION
+      void
+      VectorGemv(const member_type &member,
+                 const local_ordinal_type &blocksize,
+                 const AAViewType &AA,
+                 const xxViewType &xx,
+                 const yyViewType &yy) const {
+        Kokkos::parallel_for
+          (Kokkos::ThreadVectorRange(member, blocksize),
+           [&](const local_ordinal_type &k0) {
+            impl_scalar_type val(0);
+            for (local_ordinal_type k1=0;k1<blocksize;++k1) {
+              val += AA(k0,k1)*xx(k1);
+            }
+            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
+          });
+      }
+
+      // template<typename AAViewType, typename xxViewType, typename yyViewType>
+      // KOKKOS_INLINE_FUNCTION
+      // void
+      // VectorGemv(const member_type &member,
+      //                 const local_ordinal_type &blocksize,
+      //                 const AAViewType &AA,
+      //                 const xxViewType &xx,
+      //                 const yyViewType &yy) const {
+      //        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
+      //          impl_scalar_type val = 0;
+      //          Kokkos::parallel_for
+      //            (Kokkos::ThreadVectorRange(member, blocksize),
+      //             [&](const local_ordinal_type &k1) {
+      //              val += AA(k0,k1)*xx(k1);
+      //            });
+      //          Kokkos::atomic_fetch_add(&yy(k0), -val);
+      //        }
+      // }
+
+      struct SeqTag {};
+
+      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const SeqTag &, const local_ordinal_type& i) const {
+        const local_ordinal_type blocksize = blocksize_requested;
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        // constants
+        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
+        const local_ordinal_type num_vectors = y.extent(1);
+        const local_ordinal_type row = i*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          // y := b
+          impl_scalar_type *yy = &y(row, col);
+          const impl_scalar_type * const bb = &b(row, col);
+          memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[i];
+          for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
+            const size_type j = A_k0 + colindsub[k];
+            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+            const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
+            SerialGemv(blocksize,AA,xx,yy);
+          }
+        }
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const SeqTag &, const member_type &member) const {
+
+        // constants
+        const local_ordinal_type blocksize = blocksize_requested;
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        const local_ordinal_type lr = member.league_rank();
+        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
+        const local_ordinal_type num_vectors = y.extent(1);
+
+        // subview pattern
+        auto bb = Kokkos::subview(b, block_range, 0);
+        auto xx = bb;
+        auto yy = Kokkos::subview(y, block_range, 0);
+        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+
+        const local_ordinal_type row = lr*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          // y := b
+          yy.assign_data(&y(row, col));
+          bb.assign_data(&b(row, col));
+          if (member.team_rank() == 0)
+            VectorCopy(member, blocksize, bb, yy);
+          member.team_barrier();
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[lr];
+          Kokkos::parallel_for
+            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+             [&](const local_ordinal_type &k) {
+              const size_type j = A_k0 + colindsub[k];
+              A_block.assign_data( &tpetra_values(j*blocksize_square) );
+              xx.assign_data( &x(A_colind[j]*blocksize, col) );
+              VectorGemv(member, blocksize, A_block, xx, yy);
+            });
+        }
+      }
+
+      template<int B>
+      struct AsyncTag {};
+
+      template<int B>
+      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
+        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        // constants
+        const local_ordinal_type partidx = rowidx2part(rowidx);
+        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
+        const local_ordinal_type v = partidx % vector_length;
+
+        const local_ordinal_type num_vectors = y_packed.extent(2);
+        const local_ordinal_type num_local_rows = lclrow.extent(0);
+
+        // temporary buffer for y flat
+        impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
+
+        const local_ordinal_type lr = lclrow(rowidx);
+        const local_ordinal_type row = lr*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          // y := b
+          memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[lr];
+          for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
+            const size_type j = A_k0 + colindsub[k];
+            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+            const local_ordinal_type A_colind_at_j = A_colind[j];
+            if (A_colind_at_j < num_local_rows) {
+              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+              const impl_scalar_type * const xx = &x(loc*blocksize, col);
+              SerialGemv(blocksize, AA,xx,yy);
+            } else {
+              const auto loc = A_colind_at_j - num_local_rows;
+              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+              SerialGemv(blocksize, AA,xx_remote,yy);
+            }
+          }
+          // move yy to y_packed
+          for (local_ordinal_type k=0;k<blocksize;++k)
+            y_packed(pri, k, col)[v] = yy[k];
+        }
+      }
+
+      template<int B>
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const AsyncTag<B> &, const member_type &member) const {
+        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        // constants
+        const local_ordinal_type rowidx = member.league_rank();
+        const local_ordinal_type partidx = rowidx2part(rowidx);
+        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
+        const local_ordinal_type v = partidx % vector_length;
+
+        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
+        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
+        const local_ordinal_type num_local_rows = lclrow.extent(0);
+
+        // subview pattern
+        auto bb = Kokkos::subview(b, block_range, 0);
+        auto xx = Kokkos::subview(x, block_range, 0);
+        auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
+        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
+        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+
+        const local_ordinal_type lr = lclrow(rowidx);
+        const local_ordinal_type row = lr*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          // y := b
+          bb.assign_data(&b(row, col));
+          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
+          if (member.team_rank() == 0)
+            VectorCopy(member, blocksize, bb, yy);
+          member.team_barrier();
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[lr];
+          Kokkos::parallel_for
+            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
+             [&](const local_ordinal_type &k) {
+              const size_type j = A_k0 + colindsub[k];
+              A_block.assign_data( &tpetra_values(j*blocksize_square) );
+
+              const local_ordinal_type A_colind_at_j = A_colind[j];
+              if (A_colind_at_j < num_local_rows) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                xx.assign_data( &x(loc*blocksize, col) );
+                VectorGemv(member, blocksize, A_block, xx, yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                VectorGemv(member, blocksize, A_block, xx_remote, yy);
+              }
+            });
+        }
+      }
+
+      template <int P, int B> struct OverlapTag {};
+
+      template<int P, int B>
+      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
+        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        // constants
+        const local_ordinal_type partidx = rowidx2part(rowidx);
+        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
+        const local_ordinal_type v = partidx % vector_length;
+
+        const local_ordinal_type num_vectors = y_packed.extent(2);
+        const local_ordinal_type num_local_rows = lclrow.extent(0);
+
+        // temporary buffer for y flat
+        impl_scalar_type yy[max_blocksize] = {};
+
+        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
+        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
+
+        const local_ordinal_type lr = lclrow(rowidx);
+        const local_ordinal_type row = lr*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          if (P == 0) {
+            // y := b
+            memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
+          } else {
+            // y (temporary) := 0
+            memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
+          }
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[lr];
+          for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
+            const size_type j = A_k0 + colindsub_used[k];
+            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
+            const local_ordinal_type A_colind_at_j = A_colind[j];
+            if (P == 0) {
+              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+              const impl_scalar_type * const xx = &x(loc*blocksize, col);
+              SerialGemv(blocksize,AA,xx,yy);
+            } else {
+              const auto loc = A_colind_at_j - num_local_rows;
+              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
+              SerialGemv(blocksize,AA,xx_remote,yy);
+            }
+          }
+          // move yy to y_packed
+          if (P == 0) {
+            for (local_ordinal_type k=0;k<blocksize;++k)
+              y_packed(pri, k, col)[v] = yy[k];
+          } else {
+            for (local_ordinal_type k=0;k<blocksize;++k)
+              y_packed(pri, k, col)[v] += yy[k];
+          }
+        }
+      }
+
+      template<int P, int B>
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const OverlapTag<P,B> &, const member_type &member) const {
+        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
+        const local_ordinal_type blocksize_square = blocksize*blocksize;
+
+        // constants
+        const local_ordinal_type rowidx = member.league_rank();
+        const local_ordinal_type partidx = rowidx2part(rowidx);
+        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
+        const local_ordinal_type v = partidx % vector_length;
+
+        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
+        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
+        const local_ordinal_type num_local_rows = lclrow.extent(0);
+
+        // subview pattern
+        auto bb = Kokkos::subview(b, block_range, 0);
+        auto xx = bb; //Kokkos::subview(x, block_range, 0);
+        auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
+        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
+        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
+        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
+        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
+
+        const local_ordinal_type lr = lclrow(rowidx);
+        const local_ordinal_type row = lr*blocksize;
+        for (local_ordinal_type col=0;col<num_vectors;++col) {
+          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
+          if (P == 0) {
+            // y := b
+            bb.assign_data(&b(row, col));
+            if (member.team_rank() == 0)
+              VectorCopy(member, blocksize, bb, yy);
+            member.team_barrier();
+          }
+
+          // y -= Rx
+          const size_type A_k0 = A_rowptr[lr];
+          Kokkos::parallel_for
+            (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
+             [&](const local_ordinal_type &k) {
+              const size_type j = A_k0 + colindsub_used[k];
+              A_block.assign_data( &tpetra_values(j*blocksize_square) );
+
+              const local_ordinal_type A_colind_at_j = A_colind[j];
+              if (P == 0) {
+                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
+                xx.assign_data( &x(loc*blocksize, col) );
+                VectorGemv(member, blocksize, A_block, xx, yy);
+              } else {
+                const auto loc = A_colind_at_j - num_local_rows;
+                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
+                VectorGemv(member, blocksize, A_block, xx_remote, yy);
+              }
+            });
+        }
+      }
+
+      // y = b - Rx; seq method
+      template<typename MultiVectorLocalViewTypeY,
+               typename MultiVectorLocalViewTypeB,
+               typename MultiVectorLocalViewTypeX>
+      void run(const MultiVectorLocalViewTypeY &y_,
+               const MultiVectorLocalViewTypeB &b_,
+               const MultiVectorLocalViewTypeX &x_) {
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
+
+        y = y_; b = b_; x = x_;
+        if constexpr (is_device<execution_space>::value) {
+          const local_ordinal_type blocksize = blocksize_requested;
+          const local_ordinal_type team_size = 8;
+          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
+          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
+          Kokkos::parallel_for
+            ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
+        } else {
+          const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
+          Kokkos::parallel_for
+            ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
+        }
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
+      }
+
+      // y = b - R (x , x_remote)
+      template<typename MultiVectorLocalViewTypeB,
+               typename MultiVectorLocalViewTypeX,
+               typename MultiVectorLocalViewTypeX_Remote>
+      void run(const vector_type_3d_view &y_packed_,
+               const MultiVectorLocalViewTypeB &b_,
+               const MultiVectorLocalViewTypeX &x_,
+               const MultiVectorLocalViewTypeX_Remote &x_remote_) {
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
+
+        b = b_; x = x_; x_remote = x_remote_;
+        if constexpr (is_device<execution_space>::value) {
+          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
+                                                     y_packed_.extent(0),
+                                                     y_packed_.extent(1),
+                                                     y_packed_.extent(2),
+                                                     vector_length);
+        } else {
+          y_packed = y_packed_;
+        }
+
+        if constexpr(is_device<execution_space>::value) {
+          const local_ordinal_type blocksize = blocksize_requested;
+          const local_ordinal_type team_size = 8;
+          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
+          // local_ordinal_type vl_power_of_two = 1;
+          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
+          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
+          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
+#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
+            const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >      \
+              policy(rowidx2part.extent(0), team_size, vector_size);    \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::TeamPolicy::run<AsyncTag>",            \
+               policy, *this); } break
+          switch (blocksize_requested) {
+          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
+          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
+          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
+          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
+          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
+          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
+          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
+          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
+          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
+          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
+          }
+#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
+	} else {
+#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
+            const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::RangePolicy::run<AsyncTag>",           \
+               policy, *this); } break
+          switch (blocksize_requested) {
+          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
+          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
+          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
+          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
+          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
+          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
+          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
+          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
+          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
+          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
+          }
+#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
+        }
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
+      }
+
+      // y = b - R (y , y_remote)
+      template<typename MultiVectorLocalViewTypeB,
+               typename MultiVectorLocalViewTypeX,
+               typename MultiVectorLocalViewTypeX_Remote>
+      void run(const vector_type_3d_view &y_packed_,
+               const MultiVectorLocalViewTypeB &b_,
+               const MultiVectorLocalViewTypeX &x_,
+               const MultiVectorLocalViewTypeX_Remote &x_remote_,
+               const bool compute_owned) {
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
+
+        b = b_; x = x_; x_remote = x_remote_;
+        if constexpr (is_device<execution_space>::value) {
+          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
+                                                     y_packed_.extent(0),
+                                                     y_packed_.extent(1),
+                                                     y_packed_.extent(2),
+                                                     vector_length);
+        } else {
+          y_packed = y_packed_;
+        }
+
+        if constexpr (is_device<execution_space>::value) {
+          const local_ordinal_type blocksize = blocksize_requested;
+          const local_ordinal_type team_size = 8;
+          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
+          // local_ordinal_type vl_power_of_two = 1;
+          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
+          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
+          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
+#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
+          if (compute_owned) {                                          \
+            const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
+              policy(rowidx2part.extent(0), team_size, vector_size);    \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
+          } else {                                                      \
+            const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
+              policy(rowidx2part.extent(0), team_size, vector_size);    \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
+          } break
+          switch (blocksize_requested) {
+          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
+          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
+          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
+          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
+          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
+          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
+          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
+          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
+          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
+          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
+          }
+#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
+        } else {
+#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
+          if (compute_owned) {                                          \
+            const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
+              policy(0, rowidx2part.extent(0));                         \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
+          } else {                                                      \
+            const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
+              policy(0, rowidx2part.extent(0));                         \
+            Kokkos::parallel_for                                        \
+              ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
+          } break
+
+          switch (blocksize_requested) {
+          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
+          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
+          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
+          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
+          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
+          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
+          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
+          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
+          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
+          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
+          }
+#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
+        }
+        IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
+      }
+    };
+
+
+  } // namespace BlockHelperDetails
+
+} // namespace Ifpack2
+
+#endif
diff --git a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp
new file mode 100644
index 000000000000..18d6da71be83
--- /dev/null
+++ b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp
@@ -0,0 +1,539 @@
+/*@HEADER
+// ***********************************************************************
+//
+//       Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
+//                 Copyright (2009) Sandia Corporation
+//
+// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
+// license for use of this work by or on behalf of the U.S. Government.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
+//
+// ***********************************************************************
+//@HEADER
+*/
+
+#ifndef IFPACK2_BLOCKHELPER_IMPL_HPP
+#define IFPACK2_BLOCKHELPER_IMPL_HPP
+
+
+namespace Ifpack2 {
+
+  namespace BlockHelperDetails {
+
+    namespace KB = KokkosBatched;
+
+    ///
+    /// view decorators for unmanaged and const memory
+    ///
+    using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing;
+
+    template <typename MemoryTraitsType, Kokkos::MemoryTraitsFlags flag>
+    using MemoryTraits = Kokkos::MemoryTraits<MemoryTraitsType::is_unmanaged |
+                                              MemoryTraitsType::is_random_access |
+                                              flag>;
+
+    template <typename ViewType>
+    using Unmanaged = Kokkos::View<typename ViewType::data_type,
+                                   typename ViewType::array_layout,
+                                   typename ViewType::device_type,
+                                  MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
+    template <typename ViewType>
+    using Atomic = Kokkos::View<typename ViewType::data_type,
+                                typename ViewType::array_layout,
+                                typename ViewType::device_type,
+                                MemoryTraits<typename ViewType::memory_traits,Kokkos::Atomic> >;
+    template <typename ViewType>
+    using Const = Kokkos::View<typename ViewType::const_data_type,
+                               typename ViewType::array_layout,
+                               typename ViewType::device_type,
+                               typename ViewType::memory_traits>;
+    template <typename ViewType>
+    using ConstUnmanaged = Const<Unmanaged<ViewType> >;
+
+    template <typename ViewType>
+    using AtomicUnmanaged = Atomic<Unmanaged<ViewType> >;
+
+    template <typename ViewType>
+    using Unmanaged = Kokkos::View<typename ViewType::data_type,
+                                   typename ViewType::array_layout,
+                                   typename ViewType::device_type,
+                                   MemoryTraits<typename ViewType::memory_traits,Kokkos::Unmanaged> >;
+
+
+    template <typename ViewType>
+    using Scratch = Kokkos::View<typename ViewType::data_type,
+                                 typename ViewType::array_layout,
+                                 typename ViewType::execution_space::scratch_memory_space,
+                                 MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
+
+    /// 
+    /// tpetra little block index
+    ///
+    template<typename LayoutType> struct TpetraLittleBlock;
+    template<> struct TpetraLittleBlock<Kokkos::LayoutLeft> {
+      template<typename T> KOKKOS_INLINE_FUNCTION
+      static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; }
+    };
+    template<> struct TpetraLittleBlock<Kokkos::LayoutRight> {
+      template<typename T> KOKKOS_INLINE_FUNCTION
+      static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; }
+    };
+
+    ///
+    /// block tridiag scalar type
+    ///
+    template<typename T> struct BlockTridiagScalarType { typedef T type; };
+#if defined(IFPACK2_BLOCKHELPER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG)
+    template<> struct BlockTridiagScalarType<double> { typedef float type; };
+    //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
+#endif
+
+    ///
+    /// cuda specialization
+    ///
+    template<typename T> struct is_cuda                 { enum : bool { value = false }; };
+#if defined(KOKKOS_ENABLE_CUDA)
+    template<> struct is_cuda<Kokkos::Cuda>             { enum : bool { value = true  }; };
+#endif
+
+    ///
+    /// hip specialization
+    ///
+    template<typename T> struct is_hip                  { enum : bool { value = false }; };
+#if defined(KOKKOS_ENABLE_HIP)
+    template<> struct is_hip<Kokkos::HIP> { enum : bool { value = true  }; };
+#endif
+
+    ///
+    /// sycl specialization
+    ///
+    template<typename T> struct is_sycl                  { enum : bool { value = false }; };
+#if defined(KOKKOS_ENABLE_SYCL)
+    template<> struct is_sycl<Kokkos::Experimental::SYCL> { enum : bool { value = true  }; };
+#endif
+
+    template<typename T> struct is_device                  { enum : bool { value = is_cuda<T>::value || is_hip<T>::value || is_sycl<T>::value }; };
+
+    
+    ///
+    /// execution space instance
+    ///
+    template<typename T>
+    struct ExecutionSpaceFactory {
+      static void createInstance(T &exec_instance) {
+        exec_instance = T();
+      }
+#if defined(KOKKOS_ENABLE_CUDA)
+      static void createInstance(const cudaStream_t &s, T &exec_instance) {
+        exec_instance = T();
+      }
+#endif
+    };
+
+#if defined(KOKKOS_ENABLE_CUDA)
+    template<>
+    struct ExecutionSpaceFactory<Kokkos::Cuda> {
+      static void createInstance(Kokkos::Cuda &exec_instance) {
+        exec_instance = Kokkos::Cuda();
+      }
+      static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
+        exec_instance = Kokkos::Cuda(s);
+      }      
+    };
+#endif
+
+#if defined(KOKKOS_ENABLE_HIP)
+    template<>
+    struct ExecutionSpaceFactory<Kokkos::HIP> {
+      static void createInstance(Kokkos::HIP &exec_instance) {
+	exec_instance = Kokkos::HIP();
+      }
+    };
+#endif
+
+#if defined(KOKKOS_ENABLE_SYCL)
+    template<>
+    struct ExecutionSpaceFactory<Kokkos::Experimental::SYCL> {
+      static void createInstance(Kokkos::Experimental::SYCL &exec_instance) {
+	exec_instance = Kokkos::Experimental::SYCL();
+      }
+    };
+#endif
+
+
+#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
+#define IFPACK2_BLOCKHELPER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
+#define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) execution_space().fence();
+#else
+#define IFPACK2_BLOCKHELPER_TIMER(label)
+#define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
+#endif
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_ENABLE_PROFILE)
+#define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN \
+    KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart());
+
+#define IFPACK2_BLOCKHELPER_PROFILER_REGION_END \
+    { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); }
+#else
+    /// later put vtune profiler region
+#define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN
+#define IFPACK2_BLOCKHELPER_PROFILER_REGION_END
+#endif
+
+    
+    ///
+    /// utility functions
+    ///
+    template<typename CommPtrType>
+    std::string get_msg_prefix (const CommPtrType &comm) {
+      const auto rank = comm->getRank();
+      const auto nranks = comm->getSize();
+      std::stringstream ss;
+      ss << "Rank " << rank << " of " << nranks << ": ";
+      return ss.str();
+    }
+
+    ///
+    /// custom multiple varilable reduce and scan
+    ///
+    template<typename T, int N>
+    struct ArrayValueType {
+      T v[N];
+      KOKKOS_INLINE_FUNCTION
+      ArrayValueType() {
+        for (int i=0;i<N;++i)
+          this->v[i] = 0;
+      }
+      KOKKOS_INLINE_FUNCTION
+      ArrayValueType(const ArrayValueType &b) {
+        for (int i=0;i<N;++i)
+          this->v[i] = b.v[i];
+      }
+    };
+    template<typename T, int N>
+    static
+    KOKKOS_INLINE_FUNCTION
+    void
+    operator+=(ArrayValueType<T,N> &a,
+               const ArrayValueType<T,N> &b) {
+      for (int i=0;i<N;++i)
+        a.v[i] += b.v[i];
+    }
+
+    ///
+    /// custom reducer functor for compile time array variable
+    ///
+    template<typename T, int N, typename ExecSpace>
+    struct SumReducer {
+      typedef SumReducer reducer;
+      typedef ArrayValueType<T,N> value_type;
+      typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
+      value_type *value;
+
+      KOKKOS_INLINE_FUNCTION
+      SumReducer(value_type &val) : value(&val) {}
+
+      KOKKOS_INLINE_FUNCTION
+      void join(value_type &dst, value_type const &src) const {
+        for (int i=0;i<N;++i)
+          dst.v[i] += src.v[i];
+      }
+      KOKKOS_INLINE_FUNCTION
+      void init(value_type &val) const {
+        for (int i=0;i<N;++i)
+          val.v[i] = Kokkos::reduction_identity<T>::sum();
+      }
+      KOKKOS_INLINE_FUNCTION
+      value_type& reference() {
+        return *value;
+      }
+      KOKKOS_INLINE_FUNCTION
+      result_view_type view() const {
+        return result_view_type(value);
+      }
+    };
+
+
+    ///
+    /// implementation typedefs
+    ///
+    template <typename MatrixType>
+    struct ImplType {
+      ///
+      /// matrix type derived types
+      ///
+      typedef size_t size_type;
+      typedef typename MatrixType::scalar_type scalar_type;
+      typedef typename MatrixType::local_ordinal_type local_ordinal_type;
+      typedef typename MatrixType::global_ordinal_type global_ordinal_type;
+      typedef typename MatrixType::node_type node_type;
+
+      ///
+      /// kokkos arithmetic traits of scalar_type
+      ///
+      typedef typename Kokkos::Details::ArithTraits<scalar_type>::val_type impl_scalar_type;
+      typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
+
+      typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
+      typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
+
+      ///
+      /// default host execution space
+      ///
+      typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
+
+      ///
+      /// tpetra types
+      ///
+      typedef typename node_type::device_type node_device_type;
+      typedef typename node_device_type::execution_space node_execution_space;
+      typedef typename node_device_type::memory_space node_memory_space;
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_USE_CUDA_SPACE)
+      /// force to use cuda space instead uvm space
+      typedef node_execution_space execution_space;
+      typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
+                                        Kokkos::CudaSpace,
+                                        node_memory_space>::type memory_space;
+      typedef Kokkos::Device<execution_space,memory_space> device_type;
+#else
+      typedef node_device_type device_type;
+      typedef node_execution_space execution_space;
+      typedef node_memory_space memory_space;
+#endif
+
+      typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
+      typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
+      typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
+      typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
+      typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
+      typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
+      typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
+      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
+
+      ///
+      /// simd vectorization
+      ///
+      template<typename T, int l> using Vector = KB::Vector<T,l>;
+      template<typename T> using SIMD = KB::SIMD<T>;
+      template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
+      template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
+
+      static constexpr int vector_length = 1; //DefaultVectorLength<btdm_scalar_type,memory_space>::value;
+      static constexpr int internal_vector_length = 1; //DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
+      typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
+      typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
+
+      ///
+      /// commonly used view types
+      ///
+      typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
+      typedef Kokkos::View<size_type**,device_type> size_type_2d_view;
+      typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
+      typedef Kokkos::View<local_ordinal_type**,device_type> local_ordinal_type_2d_view;
+      // tpetra block crs values
+      typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
+      typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
+
+      // tpetra multivector values (layout left): may need to change the typename more explicitly
+      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
+      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
+
+      // packed data always use layout right
+      typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
+      typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
+      typedef Kokkos::View<vector_type****,Kokkos::LayoutRight,device_type> vector_type_4d_view;
+      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
+      typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
+      typedef Kokkos::View<internal_vector_type*****,Kokkos::LayoutRight,device_type> internal_vector_type_5d_view;
+      typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
+      typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
+      typedef Kokkos::View<btdm_scalar_type*****,Kokkos::LayoutRight,device_type> btdm_scalar_type_5d_view;
+    };
+
+
+    ///
+    /// Manage the distributed part of the computation of residual norms.
+    ///
+    template<typename MatrixType>
+    struct NormManager {
+    public:
+      using impl_type = ImplType<MatrixType>;
+      using host_execution_space = typename impl_type::host_execution_space;
+      using magnitude_type = typename impl_type::magnitude_type;
+
+    private:
+      bool collective_;
+      int sweep_step_, sweep_step_upper_bound_;
+#ifdef HAVE_IFPACK2_MPI
+      MPI_Request mpi_request_;
+      MPI_Comm comm_;
+#endif
+      magnitude_type work_[3];
+
+    public:
+      NormManager() = default;
+      NormManager(const NormManager &b) = default;
+      NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
+        sweep_step_ = 1;
+        sweep_step_upper_bound_ = 1;
+        collective_ = comm->getSize() > 1;
+        if (collective_) {
+#ifdef HAVE_IFPACK2_MPI
+          const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
+          TEUCHOS_ASSERT( ! mpi_comm.is_null());
+          comm_ = *mpi_comm->getRawMpiComm();
+#endif
+        }
+        const magnitude_type zero(0), minus_one(-1);
+        work_[0] = zero;
+        work_[1] = zero;
+        work_[2] = minus_one;
+      }
+
+      // Check the norm every sweep_step sweeps.
+      void setCheckFrequency(const int sweep_step) {
+        TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
+        sweep_step_upper_bound_ = sweep_step;
+        sweep_step_ = 1;
+      }
+
+      // Get the buffer into which to store rank-local squared norms.
+      magnitude_type* getBuffer() { return &work_[0]; }
+
+      // Call MPI_Iallreduce to find the global squared norms.
+      void ireduce(const int sweep, const bool force = false) {
+        if ( ! force && sweep % sweep_step_) return;
+
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::Ireduce");
+
+        work_[1] = work_[0];
+#ifdef HAVE_IFPACK2_MPI
+        auto send_data = &work_[1];
+        auto recv_data = &work_[0];
+        if (collective_) {
+# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
+          MPI_Iallreduce(send_data, recv_data, 1,
+                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
+                         MPI_SUM, comm_, &mpi_request_);
+# else
+          MPI_Allreduce (send_data, recv_data, 1,
+                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
+                         MPI_SUM, comm_);
+# endif
+        }
+#endif
+      }
+
+      // Check if the norm-based termination criterion is met. tol2 is the
+      // tolerance squared. Sweep is the sweep index. If not every iteration is
+      // being checked, this function immediately returns false. If a check must
+      // be done at this iteration, it waits for the reduction triggered by
+      // ireduce to complete, then checks the global norm against the tolerance.
+      bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
+        // early return
+        if (sweep <= 0) return false;
+
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::CheckDone");
+
+        TEUCHOS_ASSERT(sweep >= 1);
+        if ( ! force && (sweep - 1) % sweep_step_) return false;
+        if (collective_) {
+#ifdef HAVE_IFPACK2_MPI
+# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
+          MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
+# else
+          // Do nothing.
+# endif
+#endif
+        }
+        bool r_val = false;
+        if (sweep == 1) {
+          work_[2] = work_[0];
+        } else {
+          r_val = (work_[0] < tol2*work_[2]);
+        }
+
+        // adjust sweep step
+        const auto adjusted_sweep_step = 2*sweep_step_;
+        if (adjusted_sweep_step < sweep_step_upper_bound_) {
+          sweep_step_ = adjusted_sweep_step;
+        } else {
+          sweep_step_ = sweep_step_upper_bound_;
+        }
+        return r_val;
+      }
+
+      // After termination has occurred, finalize the norms for use in
+      // get_norms{0,final}.
+      void finalize () {
+        work_[0] = std::sqrt(work_[0]); // after converged
+        if (work_[2] >= 0)
+          work_[2] = std::sqrt(work_[2]); // first norm
+        // if work_[2] is minus one, then norm is not requested.
+      }
+
+      // Report norms to the caller.
+      const magnitude_type getNorms0 () const { return work_[2]; }
+      const magnitude_type getNormsFinal () const { return work_[0]; }
+    };
+
+    template<typename MatrixType>
+    void reduceVector(const ConstUnmanaged<typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
+                      /* */ typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type *vals) {
+      IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN;
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ReduceVector");
+
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
+      using local_ordinal_type = typename impl_type::local_ordinal_type;
+      using impl_scalar_type = typename impl_type::impl_scalar_type;
+#if 0
+      const auto norm2 = KokkosBlas::nrm1(zz);
+#else
+      impl_scalar_type norm2(0);
+      Kokkos::parallel_reduce
+        ("ReduceMultiVector::Device",
+         Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
+         KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
+          update += zz(i);
+        }, norm2);
+#endif
+      vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
+
+      IFPACK2_BLOCKHELPER_PROFILER_REGION_END;
+    } 
+
+  } // namespace BlockHelperDetails
+
+} // namespace Ifpack2
+
+#endif
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
index 6cbcc0f839d7..b085eb8e8741 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp
@@ -228,6 +228,7 @@ namespace Ifpack2 {
     ///   purposes only.
     BlockTriDiContainer (const Teuchos::RCP<const row_matrix_type>& matrix,
                          const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions,
+                         const int n_subparts_per_part = 1,
                          bool overlapCommAndComp = false, bool useSequentialMethod = false);
 
     //! Destructor (declared virtual for memory safety of derived classes).
@@ -389,11 +390,13 @@ namespace Ifpack2 {
 
     // hide details of impl using ImplObj; finally I understand why AMB did that way.
     Teuchos::RCP<BlockTriDiContainerDetails::ImplObject<MatrixType> > impl_;
+    int n_subparts_per_part_;
     
     // initialize distributed and local objects
     void initInternal (const Teuchos::RCP<const row_matrix_type>& matrix,
                        const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions,
                        const Teuchos::RCP<const import_type> &importer,
+                       const int n_subparts_per_part,
                        const bool overlapCommAndComp,
                        const bool useSeqMethod);
 
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
index 49b4eecaa547..65bc2964405a 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp
@@ -81,34 +81,50 @@ namespace Ifpack2 {
   ::initInternal (const Teuchos::RCP<const row_matrix_type>& matrix,
                   const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions,
                   const Teuchos::RCP<const import_type>& importer,
+                  const int n_subparts_per_part,
                   const bool overlapCommAndComp,
                   const bool useSeqMethod) 
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initInternal");
+    n_subparts_per_part_ = n_subparts_per_part;
+
     // create pointer of impl
-    impl_ = Teuchos::rcp(new BlockTriDiContainerDetails::ImplObject<MatrixType>());
+    {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createImpl");
+      impl_ = Teuchos::rcp(new BlockTriDiContainerDetails::ImplObject<MatrixType>());
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+    }
 
-    using impl_type = BlockTriDiContainerDetails::ImplType<MatrixType>;
+    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
     // using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
 
-    impl_->A = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(matrix);
-    TEUCHOS_TEST_FOR_EXCEPT_MSG
-      (impl_->A.is_null(), "BlockTriDiContainer currently supports Tpetra::BlockCrsMatrix only.");
+    {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA");
+      impl_->A = Teuchos::rcp_dynamic_cast<const block_crs_matrix_type>(matrix);
+      TEUCHOS_TEST_FOR_EXCEPT_MSG
+        (impl_->A.is_null(), "BlockTriDiContainer currently supports Tpetra::BlockCrsMatrix only.");
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+    }
 
     impl_->tpetra_importer = Teuchos::null;
     impl_->async_importer  = Teuchos::null;
     
     if (useSeqMethod)
     {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createBlockCrsTpetraImporter useSeqMethod");
       if (importer.is_null()) // there is no given importer, then create one
         impl_->tpetra_importer = BlockTriDiContainerDetails::createBlockCrsTpetraImporter<MatrixType>(impl_->A);
       else
         impl_->tpetra_importer = importer; // if there is a given importer, use it
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
     else
     {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createBlockCrsTpetraImporter");
       //Leave tpetra_importer null even if user provided an importer.
       //It is not used in the performant codepath (!useSeqMethod)
       impl_->async_importer = BlockTriDiContainerDetails::createBlockCrsAsyncImporter<MatrixType>(impl_->A);
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
 
     // as a result, there are 
@@ -118,13 +134,26 @@ namespace Ifpack2 {
 
     // temporary disabling 
     impl_->overlap_communication_and_computation = overlapCommAndComp;
-    
-    impl_->Z = typename impl_type::tpetra_multivector_type();
-    impl_->W = typename impl_type::impl_scalar_type_1d_view();
 
-    impl_->part_interface  = BlockTriDiContainerDetails::createPartInterface<MatrixType>(impl_->A, partitions);
-    impl_->block_tridiags  = BlockTriDiContainerDetails::createBlockTridiags<MatrixType>(impl_->part_interface);
-    impl_->norm_manager    = BlockTriDiContainerDetails::NormManager<MatrixType>(impl_->A->getComm());
+    {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createZ");
+      impl_->Z = typename impl_type::tpetra_multivector_type();
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+    }
+    {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createW");
+      impl_->W = typename impl_type::impl_scalar_type_1d_view();
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+    }
+
+    {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createPartInterfaceBlockTridiagsNormManager");
+      impl_->part_interface  = BlockTriDiContainerDetails::createPartInterface<MatrixType>(impl_->A, partitions, n_subparts_per_part_);
+      impl_->block_tridiags  = BlockTriDiContainerDetails::createBlockTridiags<MatrixType>(impl_->part_interface);
+      impl_->norm_manager    = BlockHelperDetails::NormManager<MatrixType>(impl_->A->getComm());
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+    }
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -132,11 +161,12 @@ namespace Ifpack2 {
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::clearInternal ()
   {
-    using impl_type = BlockTriDiContainerDetails::ImplType<MatrixType>;
-    using part_interface_type = BlockTriDiContainerDetails::PartInterface<MatrixType>;
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::clearInternal");
+    using impl_type = BlockHelperDetails::ImplType<MatrixType>;
+    using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
     using block_tridiags_type = BlockTriDiContainerDetails::BlockTridiags<MatrixType>;
-    using amd_type = BlockTriDiContainerDetails::AmD<MatrixType>;
-    using norm_manager_type = BlockTriDiContainerDetails::NormManager<MatrixType>;
+    using amd_type = BlockHelperDetails::AmD<MatrixType>;
+    using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
     
     impl_->A = Teuchos::null;
     impl_->tpetra_importer = Teuchos::null;
@@ -152,6 +182,7 @@ namespace Ifpack2 {
     impl_->norm_manager    = norm_manager_type();
 
     impl_ = Teuchos::null;
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -162,19 +193,25 @@ namespace Ifpack2 {
                        bool pointIndexed)
     : Container<MatrixType>(matrix, partitions, pointIndexed)
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer");
     const bool useSeqMethod = false;
     const bool overlapCommAndComp = false;
-    initInternal(matrix, partitions, importer, overlapCommAndComp, useSeqMethod);
+    initInternal(matrix, partitions, importer, 1, overlapCommAndComp, useSeqMethod);
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::BlockTriDiContainer (const Teuchos::RCP<const row_matrix_type>& matrix,
                        const Teuchos::Array<Teuchos::Array<local_ordinal_type> >& partitions,
-                       const bool overlapCommAndComp, const bool useSeqMethod)
+                       const int n_subparts_per_part,
+                       const bool overlapCommAndComp, 
+                       const bool useSeqMethod)
     : Container<MatrixType>(matrix, partitions, false)
   {
-    initInternal(matrix, partitions, Teuchos::null, overlapCommAndComp, useSeqMethod);
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer");
+    initInternal(matrix, partitions, Teuchos::null, n_subparts_per_part, overlapCommAndComp, useSeqMethod);
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -196,6 +233,7 @@ namespace Ifpack2 {
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::initialize ()
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize");
     this->IsInitialized_ = true;
     // We assume that if you called this method, you intend to recompute
     // everything.
@@ -208,6 +246,7 @@ namespace Ifpack2 {
          impl_->a_minus_d, 
          impl_->overlap_communication_and_computation);    
     }
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -215,6 +254,7 @@ namespace Ifpack2 {
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::compute ()
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::compute");
     this->IsComputed_ = false;
     if (!this->isInitialized())
       this->initialize();
@@ -225,6 +265,7 @@ namespace Ifpack2 {
          Kokkos::ArithTraits<magnitude_type>::zero());
     }
     this->IsComputed_ = true;
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -232,10 +273,12 @@ namespace Ifpack2 {
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::clearBlocks ()
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::clearBlocks");
     clearInternal();
     this->IsInitialized_ = false;
     this->IsComputed_ = false;
     Container<MatrixType>::clearBlocks();
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -244,6 +287,7 @@ namespace Ifpack2 {
   ::applyInverseJacobi (const mv_type& X, mv_type& Y, scalar_type dampingFactor,
                         bool zeroStartingSolution, int numSweeps) const
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi");
     const magnitude_type tol = Kokkos::ArithTraits<magnitude_type>::zero();
     const int check_tol_every = 1;
 
@@ -261,6 +305,7 @@ namespace Ifpack2 {
        numSweeps,
        tol,
        check_tol_every);
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -276,6 +321,7 @@ namespace Ifpack2 {
   BlockTriDiContainer<MatrixType, BlockTriDiContainerDetails::ImplSimdTag>
   ::compute (const ComputeParameters& in)
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::compute");
     this->IsComputed_ = false;
     if (!this->isInitialized())
       this->initialize();
@@ -286,6 +332,7 @@ namespace Ifpack2 {
          in.addRadiallyToDiagonal);
     }
     this->IsComputed_ = true;
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
   }
 
   template <typename MatrixType>
@@ -304,6 +351,7 @@ namespace Ifpack2 {
   ::applyInverseJacobi (const mv_type& X, mv_type& Y, 
                         const ApplyParameters& in) const
   {
+    IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi");
     int r_val = 0;
     {
       r_val = BlockTriDiContainerDetails::applyInverseJacobi<MatrixType>
@@ -321,6 +369,7 @@ namespace Ifpack2 {
          in.tolerance,
          in.checkToleranceEvery);
     }
+    IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     return r_val;
   }
 
diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
index 2b2563567eca..7d992cb51d3c 100644
--- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
+++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp
@@ -43,6 +43,9 @@
 #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
 #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP
 
+//#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+//#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+
 #include <Teuchos_Details_MpiTypeTraits.hpp>
 
 #include <Tpetra_Details_extractMpiCommFromTeuchos.hpp>
@@ -78,6 +81,11 @@
 
 #include <memory>
 
+#include "Ifpack2_BlockHelper.hpp"
+#include "Ifpack2_BlockComputeResidualVector.hpp"
+
+//#include <KokkosBlas2_gemv.hpp>
+
 // need to interface this into cmake variable (or only use this flag when it is necessary)
 //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
 //#undef  IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE
@@ -159,19 +167,6 @@ namespace Ifpack2 {
                                  typename ViewType::execution_space::scratch_memory_space,
                                  MemoryTraits<typename ViewType::memory_traits, Kokkos::Unmanaged> >;
 
-    /// 
-    /// tpetra little block index
-    ///
-    template<typename LayoutType> struct TpetraLittleBlock;
-    template<> struct TpetraLittleBlock<Kokkos::LayoutLeft> {
-      template<typename T> KOKKOS_INLINE_FUNCTION
-      static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; }
-    };
-    template<> struct TpetraLittleBlock<Kokkos::LayoutRight> {
-      template<typename T> KOKKOS_INLINE_FUNCTION
-      static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; }
-    };
-
     ///
     /// block tridiag scalar type
     ///
@@ -181,159 +176,6 @@ namespace Ifpack2 {
     //template<> struct SmallScalarType<Kokkos::complex<double> > { typedef Kokkos::complex<float> type; };
 #endif
 
-    ///
-    /// cuda specialization
-    ///
-    template<typename T> struct is_cuda                 { enum : bool { value = false }; };
-#if defined(KOKKOS_ENABLE_CUDA)
-    template<> struct is_cuda<Kokkos::Cuda>             { enum : bool { value = true  }; };
-#endif
-
-    ///
-    /// hip specialization
-    ///
-    template<typename T> struct is_hip                  { enum : bool { value = false }; };
-#if defined(KOKKOS_ENABLE_HIP)
-    template<> struct is_hip<Kokkos::HIP> { enum : bool { value = true  }; };
-#endif
-
-    ///
-    /// sycl specialization
-    ///
-    template<typename T> struct is_sycl                  { enum : bool { value = false }; };
-#if defined(KOKKOS_ENABLE_SYCL)
-    template<> struct is_sycl<Kokkos::Experimental::SYCL> { enum : bool { value = true  }; };
-#endif
-
-    template<typename T> struct is_device                  { enum : bool { value = is_cuda<T>::value || is_hip<T>::value || is_sycl<T>::value }; };
-
-    
-    ///
-    /// execution space instance
-    ///
-    template<typename T>
-    struct ExecutionSpaceFactory {
-      static void createInstance(T &exec_instance) {
-        exec_instance = T();
-      }
-#if defined(KOKKOS_ENABLE_CUDA)
-      static void createInstance(const cudaStream_t &s, T &exec_instance) {
-        exec_instance = T();
-      }
-#endif
-    };
-
-#if defined(KOKKOS_ENABLE_CUDA)
-    template<>
-    struct ExecutionSpaceFactory<Kokkos::Cuda> {
-      static void createInstance(Kokkos::Cuda &exec_instance) {
-        exec_instance = Kokkos::Cuda();
-      }
-      static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) {
-        exec_instance = Kokkos::Cuda(s);
-      }      
-    };
-#endif
-
-#if defined(KOKKOS_ENABLE_HIP)
-    template<>
-    struct ExecutionSpaceFactory<Kokkos::HIP> {
-      static void createInstance(Kokkos::HIP &exec_instance) {
-	exec_instance = Kokkos::HIP();
-      }
-    };
-#endif
-
-#if defined(KOKKOS_ENABLE_SYCL)
-    template<>
-    struct ExecutionSpaceFactory<Kokkos::Experimental::SYCL> {
-      static void createInstance(Kokkos::Experimental::SYCL &exec_instance) {
-	exec_instance = Kokkos::Experimental::SYCL();
-      }
-    };
-#endif
-
-
-    
-    ///
-    /// utility functions
-    ///
-    template<typename CommPtrType>
-    std::string get_msg_prefix (const CommPtrType &comm) {
-      const auto rank = comm->getRank();
-      const auto nranks = comm->getSize();
-      std::stringstream ss;
-      ss << "Rank " << rank << " of " << nranks << ": ";
-      return ss.str();
-    }
-
-    ///
-    /// custom multiple varilable reduce and scan
-    ///
-    template<typename T, int N>
-    struct ArrayValueType {
-      T v[N];
-      KOKKOS_INLINE_FUNCTION
-      ArrayValueType() {
-        for (int i=0;i<N;++i)
-          this->v[i] = 0;
-      }
-      KOKKOS_INLINE_FUNCTION
-      ArrayValueType(const ArrayValueType &b) {
-        for (int i=0;i<N;++i)
-          this->v[i] = b.v[i];
-      }
-    };
-    template<typename T, int N>
-    static
-    KOKKOS_INLINE_FUNCTION
-    void
-    operator+=(ArrayValueType<T,N> &a,
-               const ArrayValueType<T,N> &b) {
-      for (int i=0;i<N;++i)
-        a.v[i] += b.v[i];
-    }
-
-    ///
-    /// custom reducer functor for compile time array variable
-    ///
-    template<typename T, int N, typename ExecSpace>
-    struct SumReducer {
-      typedef SumReducer reducer;
-      typedef ArrayValueType<T,N> value_type;
-      typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
-      value_type *value;
-
-      KOKKOS_INLINE_FUNCTION
-      SumReducer(value_type &val) : value(&val) {}
-
-      KOKKOS_INLINE_FUNCTION
-      void join(value_type &dst, value_type const &src) const {
-        for (int i=0;i<N;++i)
-          dst.v[i] += src.v[i];
-      }
-      KOKKOS_INLINE_FUNCTION
-      void init(value_type &val) const {
-        for (int i=0;i<N;++i)
-          val.v[i] = Kokkos::reduction_identity<T>::sum();
-      }
-      KOKKOS_INLINE_FUNCTION
-      value_type& reference() {
-        return *value;
-      }
-      KOKKOS_INLINE_FUNCTION
-      result_view_type view() const {
-        return result_view_type(value);
-      }
-    };
-
-#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS)
-#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label);
-#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) execution_space().fence();
-#else
-#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label)
-#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
-#endif
 
 #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE)
 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \
@@ -347,106 +189,14 @@ namespace Ifpack2 {
 #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END
 #endif
 
-    ///
-    /// implementation typedefs
-    ///
-    template <typename MatrixType>
-    struct ImplType {
-      ///
-      /// matrix type derived types
-      ///
-      typedef size_t size_type;
-      typedef typename MatrixType::scalar_type scalar_type;
-      typedef typename MatrixType::local_ordinal_type local_ordinal_type;
-      typedef typename MatrixType::global_ordinal_type global_ordinal_type;
-      typedef typename MatrixType::node_type node_type;
-
-      ///
-      /// kokkos arithmetic traits of scalar_type
-      ///
-      typedef typename Kokkos::ArithTraits<scalar_type>::val_type impl_scalar_type;
-      typedef typename Kokkos::ArithTraits<impl_scalar_type>::mag_type magnitude_type;
-
-      typedef typename BlockTridiagScalarType<impl_scalar_type>::type btdm_scalar_type;
-      typedef typename Kokkos::ArithTraits<btdm_scalar_type>::mag_type btdm_magnitude_type;
-
-      ///
-      /// default host execution space
-      ///
-      typedef Kokkos::DefaultHostExecutionSpace host_execution_space;
-
-      ///
-      /// tpetra types
-      ///
-      typedef typename node_type::device_type node_device_type;
-      typedef typename node_device_type::execution_space node_execution_space;
-      typedef typename node_device_type::memory_space node_memory_space;
-
-#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE)
-      /// force to use cuda space instead uvm space
-      typedef node_execution_space execution_space;
-      typedef typename std::conditional<std::is_same<node_memory_space,Kokkos::CudaUVMSpace>::value,
-                                        Kokkos::CudaSpace,
-                                        node_memory_space>::type memory_space;
-      typedef Kokkos::Device<execution_space,memory_space> device_type;
-#else
-      typedef node_device_type device_type;
-      typedef node_execution_space execution_space;
-      typedef node_memory_space memory_space;
-#endif
-
-      typedef Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_multivector_type;
-      typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> tpetra_map_type;
-      typedef Tpetra::Import<local_ordinal_type,global_ordinal_type,node_type> tpetra_import_type;
-      typedef Tpetra::RowMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_row_matrix_type;
-      typedef Tpetra::BlockCrsMatrix<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_crs_matrix_type;
-      typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type;
-      typedef Tpetra::BlockMultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type> tpetra_block_multivector_type;
-      typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type;
-
-      ///
-      /// simd vectorization
-      ///
-      template<typename T, int l> using Vector = KB::Vector<T,l>;
-      template<typename T> using SIMD = KB::SIMD<T>;
-      template<typename T, typename M> using DefaultVectorLength = KB::DefaultVectorLength<T,M>;
-      template<typename T, typename M> using DefaultInternalVectorLength = KB::DefaultInternalVectorLength<T,M>;
-
-      static constexpr int vector_length = DefaultVectorLength<btdm_scalar_type,memory_space>::value;
-      static constexpr int internal_vector_length = DefaultInternalVectorLength<btdm_scalar_type,memory_space>::value;
-      typedef Vector<SIMD<btdm_scalar_type>,vector_length> vector_type;
-      typedef Vector<SIMD<btdm_scalar_type>,internal_vector_length> internal_vector_type;
-
-      ///
-      /// commonly used view types
-      ///
-      typedef Kokkos::View<size_type*,device_type> size_type_1d_view;
-      typedef Kokkos::View<local_ordinal_type*,device_type> local_ordinal_type_1d_view;
-      // tpetra block crs values
-      typedef Kokkos::View<impl_scalar_type*,device_type> impl_scalar_type_1d_view;
-      typedef Kokkos::View<impl_scalar_type*,node_device_type> impl_scalar_type_1d_view_tpetra;
-
-      // tpetra multivector values (layout left): may need to change the typename more explicitly
-      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,device_type> impl_scalar_type_2d_view;
-      typedef Kokkos::View<impl_scalar_type**,Kokkos::LayoutLeft,node_device_type> impl_scalar_type_2d_view_tpetra;
-
-      // packed data always use layout right
-      typedef Kokkos::View<vector_type*,device_type> vector_type_1d_view;
-      typedef Kokkos::View<vector_type***,Kokkos::LayoutRight,device_type> vector_type_3d_view;
-      typedef Kokkos::View<internal_vector_type***,Kokkos::LayoutRight,device_type> internal_vector_type_3d_view;
-      typedef Kokkos::View<internal_vector_type****,Kokkos::LayoutRight,device_type> internal_vector_type_4d_view;
-      typedef Kokkos::View<btdm_scalar_type***,Kokkos::LayoutRight,device_type> btdm_scalar_type_3d_view;
-      typedef Kokkos::View<btdm_scalar_type****,Kokkos::LayoutRight,device_type> btdm_scalar_type_4d_view;
-    };
-
     ///
     /// setup sequential importer
     ///
     template<typename MatrixType>
-    typename Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type>
-    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
-      using impl_type = ImplType<MatrixType>;
+    typename Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type>
+    createBlockCrsTpetraImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter");
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using tpetra_map_type = typename impl_type::tpetra_map_type;
       using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type;
       using tpetra_import_type = typename impl_type::tpetra_import_type;
@@ -455,11 +205,8 @@ namespace Ifpack2 {
       const auto blocksize = A->getBlockSize();
       const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize)));
       const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap()   , blocksize)));
-
-      auto blockCrsTpetraImporter = Teuchos::rcp(new tpetra_import_type(src, tgt));
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
-
-      return blockCrsTpetraImporter;
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+      return Teuchos::rcp(new tpetra_import_type(src, tgt));
     }
 
     // Partial replacement for forward-mode MultiVector::doImport.
@@ -470,7 +217,7 @@ namespace Ifpack2 {
     template<typename MatrixType>
     struct AsyncableImport {
     public:
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
 
     private:
       ///
@@ -794,7 +541,7 @@ namespace Ifpack2 {
       }
 
       void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) {
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
 
 #ifdef HAVE_IFPACK2_MPI
         // constants and reallocate data buffers if necessary
@@ -845,11 +592,11 @@ namespace Ifpack2 {
           MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
         }
 #endif // HAVE_IFPACK2_MPI
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
 
       void syncRecvVar1() {
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
 #ifdef HAVE_IFPACK2_MPI
         // 0. wait for receive async.
         for (local_ordinal_type i=0;i<static_cast<local_ordinal_type>(pids.recv.extent(0));++i) {
@@ -871,6 +618,7 @@ namespace Ifpack2 {
         // 2. cleanup all open comm
         waitall(reqs.send.size(), reqs.send.data());
 #endif // HAVE_IFPACK2_MPI
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
 #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL)
 
@@ -891,7 +639,7 @@ namespace Ifpack2 {
         const local_ordinal_type mv_blocksize = blocksize_*num_vectors;
         const local_ordinal_type idiff = iend_ - ibeg_;
         const auto abase = buffer_.data() + mv_blocksize*ibeg_;
-        if constexpr (is_device<execution_space>::value) {
+        if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
           using team_policy_type = Kokkos::TeamPolicy<execution_space>;
           local_ordinal_type vector_size(0);
           if      (blocksize_ <=  4) vector_size =  4;
@@ -940,7 +688,7 @@ namespace Ifpack2 {
       /// standard comm
       ///
       void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) {
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv");
 
 #ifdef HAVE_IFPACK2_MPI
         // constants and reallocate data buffers if necessary
@@ -978,11 +726,11 @@ namespace Ifpack2 {
           MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat);
         }
 #endif
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
 
       void syncRecvVar0() {
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv");
 #ifdef HAVE_IFPACK2_MPI
         // receive async.
         for (local_ordinal_type i=0,iend=pids.recv.extent(0);i<iend;++i) {
@@ -994,6 +742,7 @@ namespace Ifpack2 {
         // wait on the sends to match all Isends with a cleanup operation.
         waitall(reqs.send.size(), reqs.send.data());
 #endif
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
 
       ///
@@ -1023,10 +772,10 @@ namespace Ifpack2 {
       }
 
       void syncExchange(const impl_scalar_type_2d_view_tpetra &mv) {
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncExchange");
         asyncSendRecv(mv);
         syncRecv();
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
 
       impl_scalar_type_2d_view_tpetra getRemoteMultiVectorLocalView() const { return remote_multivector; }
@@ -1037,8 +786,8 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     Teuchos::RCP<AsyncableImport<MatrixType> >
-    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
-      using impl_type = ImplType<MatrixType>;
+    createBlockCrsAsyncImporter(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A) {
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using tpetra_map_type = typename impl_type::tpetra_map_type;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
       using global_ordinal_type = typename impl_type::global_ordinal_type;
@@ -1094,79 +843,37 @@ namespace Ifpack2 {
       return Teuchos::null;
     }
 
-    template<typename MatrixType>
-    struct PartInterface {
-      using local_ordinal_type = typename ImplType<MatrixType>::local_ordinal_type;
-      using local_ordinal_type_1d_view = typename ImplType<MatrixType>::local_ordinal_type_1d_view;
-
-      PartInterface() = default;
-      PartInterface(const PartInterface &b) = default;
-
-      // Some terms:
-      //   The matrix A is split as A = D + R, where D is the matrix of tridiag
-      // blocks and R is the remainder.
-      //   A part is roughly a synonym for a tridiag. The distinction is that a part
-      // is the set of rows belonging to one tridiag and, equivalently, the off-diag
-      // rows in R associated with that tridiag. In contrast, the term tridiag is
-      // used to refer specifically to tridiag data, such as the pointer into the
-      // tridiag data array.
-      //   Local (lcl) row arge the LIDs. lclrow lists the LIDs belonging to each
-      // tridiag, and partptr points to the beginning of each tridiag. This is the
-      // LID space.
-      //   Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed
-      // by this ordinal. This is the 'index' space.
-      //   A flat index is the mathematical index into an array. A pack index
-      // accounts for SIMD packing.
-
-      // Local row LIDs. Permutation from caller's index space to tridiag index
-      // space.
-      local_ordinal_type_1d_view lclrow;
-      // partptr_ is the pointer array into lclrow_.
-      local_ordinal_type_1d_view partptr; // np+1
-      // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i))
-      // is the start of the i'th pack.
-      local_ordinal_type_1d_view packptr; // npack+1
-      // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's
-      // an alias of partptr_ in the case of no overlap.
-      local_ordinal_type_1d_view part2rowidx0; // np+1
-      // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then
-      // it's the same as part2rowidx0_; if it's > 1, then the value is combined
-      // with i % vector_length to get the location in the packed data.
-      local_ordinal_type_1d_view part2packrowidx0; // np+1
-      local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU.
-      // rowidx2part_ maps the row index to the part index.
-      local_ordinal_type_1d_view rowidx2part; // nr
-      // True if lcl{row|col} is at most a constant away from row{idx|col}. In
-      // practice, this knowledge is not particularly useful, as packing for batched
-      // processing is done at the same time as the permutation from LID to index
-      // space. But it's easy to detect, so it's recorded in case an optimization
-      // can be made based on it.
-      bool row_contiguous;
-
-      local_ordinal_type max_partsz;
-    };
-
     ///
     /// setup part interface using the container partitions array
     ///
     template<typename MatrixType>
-    PartInterface<MatrixType>
-    createPartInterface(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
-                        const Teuchos::Array<Teuchos::Array<typename ImplType<MatrixType>::local_ordinal_type> > &partitions) {
-      using impl_type = ImplType<MatrixType>;
+    BlockHelperDetails::PartInterface<MatrixType>
+    createPartInterface(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+                        const Teuchos::Array<Teuchos::Array<typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type> > &partitions,
+                        const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type n_subparts_per_part) {
+      IFPACK2_BLOCKHELPER_TIMER("createPartInterface");
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
+      using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
+      using size_type = typename impl_type::size_type;
 
       constexpr int vector_length = impl_type::vector_length;
 
       const auto comm = A->getRowMap()->getComm();
 
-      PartInterface<MatrixType> interf;
+      BlockHelperDetails::PartInterface<MatrixType> interf;
 
       const bool jacobi = partitions.size() == 0;
       const local_ordinal_type A_n_lclrows = A->getLocalNumRows();
       const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size();
 
+      // Total number of sub lines:
+      const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part;
+      // Total number of sub lines + the Schur complement blocks.
+      // For a given live 2 sub lines implies one Schur complement, 3 sub lines implies two Schur complements etc.
+      const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1);
+
 #if defined(BLOCKTRIDICONTAINER_DEBUG)
       local_ordinal_type nrows = 0;
       if (jacobi)
@@ -1175,7 +882,7 @@ namespace Ifpack2 {
         for (local_ordinal_type i=0;i<nparts;++i) nrows += partitions[i].size();
 
       TEUCHOS_TEST_FOR_EXCEPT_MSG
-        (nrows != A_n_lclrows, get_msg_prefix(comm) << "The #rows implied by the local partition is not "
+        (nrows != A_n_lclrows, BlockHelperDetails::get_msg_prefix(comm) << "The #rows implied by the local partition is not "
          << "the same as getLocalNumRows: " << nrows << " vs " << A_n_lclrows);
 #endif
 
@@ -1183,6 +890,9 @@ namespace Ifpack2 {
       std::vector<local_ordinal_type> p;
       if (jacobi) {
         interf.max_partsz = 1;
+        interf.max_subpartsz = 0;
+        interf.n_subparts_per_part = 1;
+        interf.nparts = nparts;
       } else {
         // reorder parts to maximize simd packing efficiency
         p.resize(nparts);
@@ -1199,6 +909,14 @@ namespace Ifpack2 {
           p[i] = partsz[i].second;
 
         interf.max_partsz = partsz[0].first;
+
+        constexpr local_ordinal_type connection_length = 2;
+        const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
+        const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
+
+        interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length;
+        interf.n_subparts_per_part = n_subparts_per_part;
+        interf.nparts = nparts;
       }
 
       // allocate parts
@@ -1208,79 +926,319 @@ namespace Ifpack2 {
       interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1);
       interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
 
+      interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1);
+      interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part);
+      interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows);
+
+      interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2);
+
       // mirror to host and compute on host execution space
       const auto partptr = Kokkos::create_mirror_view(interf.partptr);
+      const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub);
+      
       const auto lclrow = Kokkos::create_mirror_view(interf.lclrow);
       const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0);
       const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0);
       const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part);
 
+      const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub);
+      const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub);
+      const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub);
+
       // Determine parts.
       interf.row_contiguous = true;
       partptr(0) = 0;
       part2rowidx0(0) = 0;
       part2packrowidx0(0) = 0;
       local_ordinal_type pack_nrows = 0;
+      local_ordinal_type pack_nrows_sub = 0;
       if (jacobi) {
-	for (local_ordinal_type ip=0;ip<nparts;++ip) {
-	  const local_ordinal_type ipnrows = 1;
-	  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
-				      get_msg_prefix(comm)
-				      << "partition " << p[ip]
-				      << " is empty, which is not allowed.");
-	  //assume No overlap.
-	  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
-	  // Since parts are ordered in nonincreasing size, the size of the first
-	  // part in a pack is the size for all parts in the pack.
-	  if (ip % vector_length == 0) pack_nrows = ipnrows;
-	  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
-	  const local_ordinal_type os = partptr(ip);
-	  for (local_ordinal_type i=0;i<ipnrows;++i) {
-	    const auto lcl_row = ip;
-	    TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
-					get_msg_prefix(comm)
-					<< "partitions[" << p[ip] << "]["
-					<< i << "] = " << lcl_row
-					<< " but input matrix implies limits of [0, " << A_n_lclrows-1
-					<< "].");
-	    lclrow(os+i) = lcl_row;
-	    rowidx2part(os+i) = ip;
-	    if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
-	      interf.row_contiguous = false;
-	  }
-	  partptr(ip+1) = os + ipnrows;
-	}
+        IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)");
+        for (local_ordinal_type ip=0;ip<nparts;++ip) {
+          constexpr local_ordinal_type ipnrows = 1;
+          //assume No overlap.
+          part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
+          // Since parts are ordered in decreasing size, the size of the first
+          // part in a pack is the size for all parts in the pack.
+          if (ip % vector_length == 0) pack_nrows = ipnrows;
+          part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
+          const local_ordinal_type offset = partptr(ip);
+          for (local_ordinal_type i=0;i<ipnrows;++i) {
+            const auto lcl_row = ip;
+            TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
+                BlockHelperDetails::get_msg_prefix(comm)
+                << "partitions[" << p[ip] << "]["
+                << i << "] = " << lcl_row
+                << " but input matrix implies limits of [0, " << A_n_lclrows-1
+                << "].");
+            lclrow(offset+i) = lcl_row;
+            rowidx2part(offset+i) = ip;
+            if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
+              interf.row_contiguous = false;
+          }
+          partptr(ip+1) = offset + ipnrows;
+        }
+        part2rowidx0_sub(0) = 0;
+        partptr_sub(0, 0) = 0;
+
+        for (local_ordinal_type ip=0;ip<nparts;++ip) {
+          constexpr local_ordinal_type ipnrows = 1;
+          const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
+
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (full_line_length != ipnrows, std::logic_error, 
+            "In the part " << ip );  
+
+          constexpr local_ordinal_type connection_length = 2;
+
+          if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
+              TEUCHOS_TEST_FOR_EXCEPTION
+                (true, std::logic_error, 
+                "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");            
+
+          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
+          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
+
+          if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
+
+          for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
+            const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
+            const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
+            if (local_sub_ip != n_subparts_per_part-1) {
+              if (local_sub_ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
+              }
+              else if (ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
+              }
+              partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
+              partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
+              partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
+
+              part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
+              part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
+              printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
+#endif
+            }
+            else {
+              if (local_sub_ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
+              }
+              else if (ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
+              }
+              partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
+
+              part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
+#endif
+            }
+          }
+        }
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+        std::cout << "partptr_sub = " << std::endl;
+        for (size_type i = 0; i < partptr_sub.extent(0); ++i) {
+          for (size_type j = 0; j < partptr_sub.extent(1); ++j) {
+            std::cout << partptr_sub(i,j) << " ";
+          }
+          std::cout << std::endl;
+        }
+        std::cout << "partptr_sub end" << std::endl;
+#endif
+
+        {
+          local_ordinal_type npacks = ceil(float(nparts)/vector_length);
+
+          local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
+          for (local_ordinal_type ip=0;ip<ip_max;++ip) {
+            part2packrowidx0_sub(ip, 0) = 0;
+          }
+          for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
+            if (ipack != 0) {
+              local_ordinal_type ip_min = ipack*vector_length;
+              ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
+              for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
+                part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
+              }
+            }
+
+            for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
+              local_ordinal_type ip_min = ipack*vector_length;
+              ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
+
+              const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
+
+              constexpr local_ordinal_type connection_length = 2;      
+
+              const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
+              const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
+
+              if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
+              if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
+              if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
+
+              part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
+
+              for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
+                part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
+              }
+            }
+          }
+
+          Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
+        }        
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
       } else {
-	for (local_ordinal_type ip=0;ip<nparts;++ip) {
-	  const auto* part = &partitions[p[ip]];
-	  const local_ordinal_type ipnrows = part->size();
-	  TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
-	  TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
-				      get_msg_prefix(comm)
-				      << "partition " << p[ip]
-				      << " is empty, which is not allowed.");
-	  //assume No overlap.
-	  part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
-	  // Since parts are ordered in nonincreasing size, the size of the first
-	  // part in a pack is the size for all parts in the pack.
-	  if (ip % vector_length == 0) pack_nrows = ipnrows;
-	  part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
-	  const local_ordinal_type os = partptr(ip);
-	  for (local_ordinal_type i=0;i<ipnrows;++i) {
-	    const auto lcl_row = (*part)[i];
-	    TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
-					get_msg_prefix(comm)
-					<< "partitions[" << p[ip] << "]["
-					<< i << "] = " << lcl_row
-					<< " but input matrix implies limits of [0, " << A_n_lclrows-1
-					<< "].");
-	    lclrow(os+i) = lcl_row;
-	    rowidx2part(os+i) = ip;
-	    if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row)
-	      interf.row_contiguous = false;
-	  }
-	  partptr(ip+1) = os + ipnrows;
-	}
+        IFPACK2_BLOCKHELPER_TIMER("compute part indices");
+        for (local_ordinal_type ip=0;ip<nparts;++ip) {
+          const auto* part = &partitions[p[ip]];
+          const local_ordinal_type ipnrows = part->size();
+          TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast<local_ordinal_type>(partitions[p[ip-1]].size())));
+          TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0,
+                    BlockHelperDetails::get_msg_prefix(comm)
+                    << "partition " << p[ip]
+                    << " is empty, which is not allowed.");
+          //assume No overlap.
+          part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows;
+          // Since parts are ordered in decreasing size, the size of the first
+          // part in a pack is the size for all parts in the pack.
+          if (ip % vector_length == 0) pack_nrows = ipnrows;
+          part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0);
+          const local_ordinal_type offset = partptr(ip);
+          for (local_ordinal_type i=0;i<ipnrows;++i) {
+            const auto lcl_row = (*part)[i];
+            TEUCHOS_TEST_FOR_EXCEPT_MSG(lcl_row < 0 || lcl_row >= A_n_lclrows,
+                BlockHelperDetails::get_msg_prefix(comm)
+                << "partitions[" << p[ip] << "]["
+                << i << "] = " << lcl_row
+                << " but input matrix implies limits of [0, " << A_n_lclrows-1
+                << "].");
+            lclrow(offset+i) = lcl_row;
+            rowidx2part(offset+i) = ip;
+            if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row)
+              interf.row_contiguous = false;
+          }
+          partptr(ip+1) = offset + ipnrows;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+          printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = os = %d, part->size() = ipnrows = %d;\n", ip, os, ipnrows);
+          printf("partptr(%d+1) = %d\n", ip, partptr(ip+1));
+#endif
+        }
+
+        part2rowidx0_sub(0) = 0;
+        partptr_sub(0, 0) = 0;
+        //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length);
+
+        for (local_ordinal_type ip=0;ip<nparts;++ip) {
+          const auto* part = &partitions[p[ip]];
+          const local_ordinal_type ipnrows = part->size();
+          const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip);
+
+          TEUCHOS_TEST_FOR_EXCEPTION
+            (full_line_length != ipnrows, std::logic_error, 
+            "In the part " << ip );  
+
+          constexpr local_ordinal_type connection_length = 2;
+
+          if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length )
+              TEUCHOS_TEST_FOR_EXCEPTION
+                (true, std::logic_error, 
+                "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts.");            
+
+          const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
+          const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
+
+          if (ip % vector_length == 0) pack_nrows_sub = ipnrows;
+
+          for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part;++local_sub_ip) {
+            const local_ordinal_type sub_ip = nparts*(2*local_sub_ip) + ip;
+            const local_ordinal_type schur_ip = nparts*(2*local_sub_ip+1) + ip;
+            if (local_sub_ip != n_subparts_per_part-1) {
+              if (local_sub_ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
+              }
+              else if (ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
+              }
+              partptr_sub(sub_ip, 1) = sub_line_length + partptr_sub(sub_ip, 0);
+              partptr_sub(schur_ip, 0) = partptr_sub(sub_ip, 1);
+              partptr_sub(schur_ip, 1) = connection_length + partptr_sub(schur_ip, 0);
+
+              part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + sub_line_length;
+              part2rowidx0_sub(sub_ip + 2) = part2rowidx0_sub(sub_ip + 1) + connection_length;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), sub_line_length);
+              printf("Sub Part index Schur = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip + 1, partptr_sub(ip, 2 * local_sub_ip + 1), connection_length);
+#endif
+            }
+            else {
+              if (local_sub_ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*(2*local_sub_ip-1) + ip, 1);
+              }
+              else if (ip != 0) {
+                partptr_sub(sub_ip, 0) = partptr_sub(nparts*2*(n_subparts_per_part-1) + ip - 1, 1);
+              }
+              partptr_sub(sub_ip, 1) = last_sub_line_length + partptr_sub(sub_ip, 0);
+
+              part2rowidx0_sub(sub_ip + 1) = part2rowidx0_sub(sub_ip) + last_sub_line_length;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("Sub Part index = %d, first LID associated to the sub part = %d, sub part size = %d;\n", sub_ip, partptr_sub(ip, 2 * local_sub_ip), last_sub_line_length);
+#endif
+            }
+          }
+        }
+
+        {
+          local_ordinal_type npacks = ceil(float(nparts)/vector_length);
+
+          local_ordinal_type ip_max = nparts > vector_length ? vector_length : nparts;
+          for (local_ordinal_type ip=0;ip<ip_max;++ip) {
+            part2packrowidx0_sub(ip, 0) = 0;
+          }
+          for (local_ordinal_type ipack=0;ipack<npacks;++ipack) {
+            if (ipack != 0) {
+              local_ordinal_type ip_min = ipack*vector_length;
+              ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
+              for (local_ordinal_type ip=ip_min;ip<ip_max;++ip) {
+                part2packrowidx0_sub(ip, 0) = part2packrowidx0_sub(ip-vector_length, part2packrowidx0_sub.extent(1)-1);
+              }
+            }
+
+            for (size_type local_sub_ip=0; local_sub_ip<part2packrowidx0_sub.extent(1)-1;++local_sub_ip) {
+              local_ordinal_type ip_min = ipack*vector_length;
+              ip_max = nparts > (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts;
+
+              const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min);
+
+              constexpr local_ordinal_type connection_length = 2;      
+
+              const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part;
+              const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length);
+
+              if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length;
+              if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length;
+              if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length;
+
+              part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub;
+
+              for (local_ordinal_type ip=ip_min+1;ip<ip_max;++ip) {
+                part2packrowidx0_sub(ip, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip + 1);
+              }
+            }
+          }
+
+          Kokkos::deep_copy(interf.part2packrowidx0_sub, part2packrowidx0_sub);
+        }
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
       }
 #if defined(BLOCKTRIDICONTAINER_DEBUG)
       TEUCHOS_ASSERT(partptr(nparts) == nrows);
@@ -1290,26 +1248,67 @@ namespace Ifpack2 {
       Kokkos::deep_copy(interf.partptr, partptr);
       Kokkos::deep_copy(interf.lclrow, lclrow);
 
+      Kokkos::deep_copy(interf.partptr_sub, partptr_sub);
+
       //assume No overlap. Thus:
       interf.part2rowidx0 = interf.partptr;
       Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0);
 
-      interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1);
+      interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1);
       Kokkos::deep_copy(interf.rowidx2part, rowidx2part);
 
       { // Fill packptr.
-        local_ordinal_type npacks = 0;
-        for (local_ordinal_type ip=1;ip<=nparts;++ip)
+        IFPACK2_BLOCKHELPER_TIMER("Fill packptr");
+        local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
+        npacks = 0;
+        for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur
           if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
             ++npacks;
+
         interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
         const auto packptr = Kokkos::create_mirror_view(interf.packptr);
         packptr(0) = 0;
         for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip)
           if (part2packrowidx0(ip) != part2packrowidx0(ip-1))
             packptr(k++) = ip;
+        
         Kokkos::deep_copy(interf.packptr, packptr);
+
+        local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length);
+        npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1);
+
+        interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part);
+        interf.packindices_schur = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart*(n_subparts_per_part-1));
+
+        const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub);
+        const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur);
+
+
+        // Fill packindices_sub and packindices_schur
+        for (local_ordinal_type local_sub_ip=0; local_sub_ip<n_subparts_per_part-1;++local_sub_ip) {
+          for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
+            packindices_sub(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip;
+            packindices_schur(local_sub_ip * npacks_per_subpart + local_pack_ip) = 2 * local_sub_ip * npacks_per_subpart + local_pack_ip + npacks_per_subpart;
+          }
+        }
+
+        for (local_ordinal_type local_pack_ip=0; local_pack_ip<npacks_per_subpart;++local_pack_ip) {
+          packindices_sub((n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip) = 2 * (n_subparts_per_part-1) * npacks_per_subpart + local_pack_ip;
+        }
+
+        Kokkos::deep_copy(interf.packindices_sub, packindices_sub);
+        Kokkos::deep_copy(interf.packindices_schur, packindices_schur);
+
+        interf.packptr_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1);
+        const auto packptr_sub = Kokkos::create_mirror_view(interf.packptr_sub);
+        packptr_sub(0) = 0;
+        for (local_ordinal_type k=0;k<npacks + 1;++k)
+          packptr_sub(k) = packptr(k%npacks_per_subpart) + (k / npacks_per_subpart) * packptr(npacks_per_subpart);
+
+        Kokkos::deep_copy(interf.packptr_sub, packptr_sub);
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
       }
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
 
       return interf;
     }
@@ -1319,22 +1318,29 @@ namespace Ifpack2 {
     ///
     template <typename MatrixType>
     struct BlockTridiags {
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
       using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using size_type_2d_view = typename impl_type::size_type_2d_view;
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
+      using vector_type_4d_view = typename impl_type::vector_type_4d_view;
 
       // flat_td_ptr(i) is the index into flat-array values of the start of the
       // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length ==
       // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i %
       // vector_length is the position in the pack.
-      size_type_1d_view flat_td_ptr, pack_td_ptr;
+      size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur;
       // List of local column indices into A from which to grab
       // data. flat_td_ptr(i) points to the start of the i'th tridiag's data.
       local_ordinal_type_1d_view A_colindsub;
       // Tridiag block values. pack_td_ptr(i) points to the start of the i'th
       // tridiag's pack, and i % vector_length gives the position in the pack.
       vector_type_3d_view values;
+      // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th
+      // Schur's pack, and i % vector_length gives the position in the pack.
+      vector_type_3d_view values_schur;
+      // inv(A_00)*A_01 block values.
+      vector_type_4d_view e_values;
 
       bool is_diagonal_only;
 
@@ -1354,6 +1360,10 @@ namespace Ifpack2 {
       template <typename idx_type>
       static KOKKOS_FORCEINLINE_FUNCTION
       idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; }
+      // Number of blocks associated to a Schur complement having a given number of rows.
+      template <typename idx_type>
+      static KOKKOS_FORCEINLINE_FUNCTION
+      idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; }
     };
 
 
@@ -1362,35 +1372,43 @@ namespace Ifpack2 {
     ///
     template<typename MatrixType>
     BlockTridiags<MatrixType>
-    createBlockTridiags(const PartInterface<MatrixType> &interf) {
-      using impl_type = ImplType<MatrixType>;
+    createBlockTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf) {
+      IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags");
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using execution_space = typename impl_type::execution_space;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
       using size_type = typename impl_type::size_type;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using size_type_2d_view = typename impl_type::size_type_2d_view;
 
       constexpr int vector_length = impl_type::vector_length;
 
       BlockTridiags<MatrixType> btdm;
 
-      const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1;
+      const local_ordinal_type ntridiags = interf.partptr_sub.extent(0);
 
       { // construct the flat index pointers into the tridiag values array.
-        btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1);
-        const Kokkos::RangePolicy<execution_space> policy(0,ntridiags + 1);
+        btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
+        const Kokkos::RangePolicy<execution_space> policy(0, 2 * interf.nparts * interf.n_subparts_per_part );
         Kokkos::parallel_scan
           ("createBlockTridiags::RangePolicy::flat_td_ptr",
            policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
-            if (final)
-              btdm.flat_td_ptr(i) = update;
-            if (i < ntridiags) {
-              const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i);
-              update += btdm.NumBlocks(nrows);
+            const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part);
+            const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part);
+
+            if (final) {
+              btdm.flat_td_ptr(partidx, local_subpartidx) = update;
+            }
+            if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) {
+              const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0);
+              if (local_subpartidx % 2 == 0)
+                update += btdm.NumBlocks(nrows);
+              else
+                update += btdm.NumBlocksSchur(nrows);
             }
           });
 
         const auto nblocks = Kokkos::create_mirror_view_and_copy
-          (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags));
+          (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1));
         btdm.is_diagonal_only = (static_cast<local_ordinal_type>(nblocks()) == ntridiags);
       }
 
@@ -1398,57 +1416,114 @@ namespace Ifpack2 {
       if (vector_length == 1) {
         btdm.pack_td_ptr = btdm.flat_td_ptr;
       } else {
-        const local_ordinal_type npacks = interf.packptr.extent(0) - 1;
-        btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1);
-        const Kokkos::RangePolicy<execution_space> policy(0,npacks);
-        Kokkos::parallel_scan
+        //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1;
+
+        local_ordinal_type npacks_per_subpart = 0;
+        for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur
+            if (interf.part2packrowidx0(ip) != interf.part2packrowidx0(ip-1))
+              ++npacks_per_subpart;
+
+        btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part);
+        const Kokkos::RangePolicy<execution_space> policy(0,npacks_per_subpart);
+
+        Kokkos::parallel_for
           ("createBlockTridiags::RangePolicy::pack_td_ptr",
-           policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) {
-            const local_ordinal_type parti      = interf.packptr(i);
-            const local_ordinal_type parti_next = interf.packptr(i+1);
-            if (final) {
-              const size_type nblks = update;
-              for (local_ordinal_type pti=parti;pti<parti_next;++pti)
-                btdm.pack_td_ptr(pti) = nblks;
-              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
-              // last one
-              if (i == npacks-1)
-                btdm.pack_td_ptr(ntridiags) = nblks + btdm.NumBlocks(nrows);
-            }
-            {
-              const local_ordinal_type nrows = interf.partptr(parti+1) - interf.partptr(parti);
-              update += btdm.NumBlocks(nrows);
+           policy, KOKKOS_LAMBDA(const local_ordinal_type &i) {
+            for (local_ordinal_type j = 0; j < 2*interf.n_subparts_per_part; ++j) {
+              const local_ordinal_type pack_id = ( j == 2*interf.n_subparts_per_part-1 ) ? i+(j-1)*npacks_per_subpart : i+j*npacks_per_subpart;
+              const local_ordinal_type nparts_in_pack = interf.packptr_sub(pack_id+1) - interf.packptr_sub(pack_id);
+
+              const local_ordinal_type parti = interf.packptr_sub(pack_id);
+              const local_ordinal_type partidx = parti%interf.nparts;
+
+              for (local_ordinal_type pti=0;pti<nparts_in_pack;++pti) {
+                btdm.pack_td_ptr(partidx+pti, j) = btdm.flat_td_ptr(i, j);
+              }
             }
           });
       }
 
-      // values and A_colindsub are created in the symbolic phase
+      btdm.pack_td_ptr_schur = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr_schur"), interf.nparts, interf.n_subparts_per_part);
 
-      return btdm;
-    }
+      const auto host_pack_td_ptr_schur = Kokkos::create_mirror_view(btdm.pack_td_ptr_schur);
+      constexpr local_ordinal_type connection_length = 2;
 
-    // Set the tridiags to be I to the full pack block size. That way, if a
-    // tridiag within a pack is shorter than the longest one, the extra blocks are
-    // processed in a safe way. Similarly, in the solve phase, if the extra blocks
-    // in the packed multvector are 0, and the tridiag LU reflects the extra I
-    // blocks, then the solve proceeds as though the extra blocks aren't
-    // present. Since this extra work is part of the SIMD calls, it's not actually
-    // extra work. Instead, it means we don't have to put checks or masks in, or
-    // quiet NaNs. This functor has to be called just once, in the symbolic phase,
-    // since the numeric phase fills in only the used entries, leaving these I
+      host_pack_td_ptr_schur(0,0) = 0;
+      for (local_ordinal_type i = 0; i < interf.nparts; ++i) {
+        if (i % vector_length == 0) {
+          if (i != 0)
+            host_pack_td_ptr_schur(i,0) = host_pack_td_ptr_schur(i-1,host_pack_td_ptr_schur.extent(1)-1);
+          for (local_ordinal_type j = 0; j < interf.n_subparts_per_part-1; ++j) {
+            host_pack_td_ptr_schur(i,j+1) = host_pack_td_ptr_schur(i,j) + btdm.NumBlocks(connection_length) + (j != 0 ? 1 : 0) + (j != interf.n_subparts_per_part-2 ? 1 : 0);
+          }
+        }
+        else {
+          for (local_ordinal_type j = 0; j < interf.n_subparts_per_part; ++j) {
+            host_pack_td_ptr_schur(i,j) = host_pack_td_ptr_schur(i-1,j);
+          }
+        }
+      }
+
+      Kokkos::deep_copy(btdm.pack_td_ptr_schur, host_pack_td_ptr_schur);
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+      std::cout << "flat_td_ptr = " << std::endl;
+      for (size_type i = 0; i < btdm.flat_td_ptr.extent(0); ++i) {
+        for (size_type j = 0; j < btdm.flat_td_ptr.extent(1); ++j) {
+          std::cout << btdm.flat_td_ptr(i,j) << " ";
+        }
+        std::cout << std::endl;
+      }
+      std::cout << "flat_td_ptr end" << std::endl;
+
+      std::cout << "pack_td_ptr = " << std::endl;
+      for (size_type i = 0; i < btdm.pack_td_ptr.extent(0); ++i) {
+        for (size_type j = 0; j < btdm.pack_td_ptr.extent(1); ++j) {
+          std::cout << btdm.pack_td_ptr(i,j) << " ";
+        }
+        std::cout << std::endl;
+      }
+      std::cout << "pack_td_ptr end" << std::endl;
+
+
+      std::cout << "pack_td_ptr_schur = " << std::endl;
+      for (size_type i = 0; i < btdm.pack_td_ptr_schur.extent(0); ++i) {
+        for (size_type j = 0; j < btdm.pack_td_ptr_schur.extent(1); ++j) {
+          std::cout << btdm.pack_td_ptr_schur(i,j) << " ";
+        }
+        std::cout << std::endl;
+      }
+      std::cout << "pack_td_ptr_schur end" << std::endl;
+#endif
+
+      // values and A_colindsub are created in the symbolic phase
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
+
+      return btdm;
+    }
+
+    // Set the tridiags to be I to the full pack block size. That way, if a
+    // tridiag within a pack is shorter than the longest one, the extra blocks are
+    // processed in a safe way. Similarly, in the solve phase, if the extra blocks
+    // in the packed multvector are 0, and the tridiag LU reflects the extra I
+    // blocks, then the solve proceeds as though the extra blocks aren't
+    // present. Since this extra work is part of the SIMD calls, it's not actually
+    // extra work. Instead, it means we don't have to put checks or masks in, or
+    // quiet NaNs. This functor has to be called just once, in the symbolic phase,
+    // since the numeric phase fills in only the used entries, leaving these I
     // blocks intact.
     template<typename MatrixType>
     void
     setTridiagsToIdentity
       (const BlockTridiags<MatrixType>& btdm,
-       const typename ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
+       const typename BlockHelperDetails::ImplType<MatrixType>::local_ordinal_type_1d_view& packptr)
     {
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using execution_space = typename impl_type::execution_space;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using size_type_2d_view = typename impl_type::size_type_2d_view;
 
-      const ConstUnmanaged<size_type_1d_view> pack_td_ptr(btdm.pack_td_ptr);
+      const ConstUnmanaged<size_type_2d_view> pack_td_ptr(btdm.pack_td_ptr);
       const local_ordinal_type blocksize = btdm.values.extent(1);
 
       {
@@ -1511,64 +1586,38 @@ namespace Ifpack2 {
           ("setTridiagsToIdentity::TeamPolicy",
            policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) {
             const local_ordinal_type k = member.league_rank();
-            const local_ordinal_type ibeg = pack_td_ptr(packptr(k));
-            const local_ordinal_type iend = pack_td_ptr(packptr(k+1));
+            const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0);
+            const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1);
+
             const local_ordinal_type diff = iend - ibeg;
             const local_ordinal_type icount = diff/3 + (diff%3 > 0);
             const btdm_scalar_type one(1);
             Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
                 Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) {
                     const local_ordinal_type i = ibeg + ii*3;
-                    for (local_ordinal_type j=0;j<blocksize;++j)
+                    for (local_ordinal_type j=0;j<blocksize;++j) {
                       values(i,j,j,v) = one;
+                    }
                   });
               });
           });
       }
     }
 
-    ///
-    /// A - Tridiags(A), i.e., R in the splitting A = D + R.
-    ///
-    template <typename MatrixType>
-    struct AmD {
-      using impl_type = ImplType<MatrixType>;
-      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
-      using impl_scalar_type_1d_view_tpetra = Unmanaged<typename impl_type::impl_scalar_type_1d_view_tpetra>;
-      // rowptr points to the start of each row of A_colindsub.
-      size_type_1d_view rowptr, rowptr_remote;
-      // Indices into A's rows giving the blocks to extract. rowptr(i) points to
-      // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))),
-      // where g is A's graph, are the columns AmD uses. If seq_method_, then
-      // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If !
-      // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote
-      // contains the remote ones.
-      local_ordinal_type_1d_view A_colindsub, A_colindsub_remote;
-
-      // Currently always true.
-      bool is_tpetra_block_crs;
-
-      // If is_tpetra_block_crs, then this is a pointer to A_'s value data.
-      impl_scalar_type_1d_view_tpetra tpetra_values;
-
-      AmD() = default;
-      AmD(const AmD &b) = default;
-    };
-
     ///
     /// symbolic phase, on host : create R = A - D, pack D
     ///
     template<typename MatrixType>
     void
-    performSymbolicPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
-                         const PartInterface<MatrixType> &interf,
+    performSymbolicPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+                         const BlockHelperDetails::PartInterface<MatrixType> &interf,
                          BlockTridiags<MatrixType> &btdm,
-                         AmD<MatrixType> &amd,
+                         BlockHelperDetails::AmD<MatrixType> &amd,
                          const bool overlap_communication_and_computation) {
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase");
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase");
+
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
 
-      using impl_type = ImplType<MatrixType>;
       // using node_memory_space = typename impl_type::node_memory_space;
       using host_execution_space = typename impl_type::host_execution_space;
 
@@ -1578,12 +1627,15 @@ namespace Ifpack2 {
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
       using size_type_1d_view = typename impl_type::size_type_1d_view;
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
+      using vector_type_4d_view = typename impl_type::vector_type_4d_view;
       using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
 
       constexpr int vector_length = impl_type::vector_length;
 
       const auto comm = A->getRowMap()->getComm();
+
       const auto& g = A->getCrsGraph();
+
       const auto blocksize = A->getBlockSize();
 
       // mirroring to host
@@ -1595,8 +1647,10 @@ namespace Ifpack2 {
 
       const local_ordinal_type nrows = partptr(partptr.extent(0) - 1);
 
-      // find column to row map on host
       Kokkos::View<local_ordinal_type*,host_execution_space> col2row("col2row", A->getLocalNumCols());
+
+      // find column to row map on host
+      
       Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits<local_ordinal_type>::invalid());
       {
         const auto rowmap = g.getRowMap();
@@ -1615,7 +1669,7 @@ namespace Ifpack2 {
               const local_ordinal_type lc = colmap->getLocalElement(gid);
 #  if defined(BLOCKTRIDICONTAINER_DEBUG)
               TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits<local_ordinal_type>::invalid(),
-                                          get_msg_prefix(comm) << "GID " << gid
+                                          BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid
                                           << " gives an invalid local column.");
 #  endif
               col2row(lc) = lr;
@@ -1644,7 +1698,7 @@ namespace Ifpack2 {
         }
 
         // count (block) nnzs in D and R.
-        typedef SumReducer<size_type,3,host_execution_space> sum_reducer_type;
+        typedef BlockHelperDetails::SumReducer<size_type,3,host_execution_space> sum_reducer_type;
         typename sum_reducer_type::value_type sum_reducer_value;
         {
           const Kokkos::RangePolicy<host_execution_space> policy(0,nrows);
@@ -1694,7 +1748,7 @@ namespace Ifpack2 {
           R_nnz_remote = 0;
         }
 
-        // construct the D graph.
+        // construct the D_00 graph.
         {
           const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr);
 
@@ -1706,6 +1760,9 @@ namespace Ifpack2 {
 #endif
 
           const local_ordinal_type nparts = partptr.extent(0) - 1;
+
+          // Loop over the lines:
+          // - part2rowidx0 stores the 
           {
             const Kokkos::RangePolicy<host_execution_space> policy(0, nparts);
             Kokkos::parallel_for
@@ -1727,7 +1784,7 @@ namespace Ifpack2 {
                     if (pi != pi0) continue;
                     if (ri + 1 < ri0 || ri > ri0 + 1) continue;
                     const local_ordinal_type row_entry = j - j0;
-                    D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry;
+                    D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry;
                   }
                 }
               });
@@ -1740,9 +1797,14 @@ namespace Ifpack2 {
 
           // Allocate values.
           {
-            const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts);
+            const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1);
             const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last);
             btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize);
+
+            const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1);
+            const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last);
+            btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize);
+
             if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr);
           }
         }
@@ -1790,7 +1852,7 @@ namespace Ifpack2 {
           }
 
           // exclusive scan
-          typedef ArrayValueType<size_type,2> update_type;
+          typedef BlockHelperDetails::ArrayValueType<size_type,2> update_type;
           {
             Kokkos::RangePolicy<host_execution_space> policy(0,nrows+1);
             Kokkos::parallel_scan
@@ -1849,8 +1911,13 @@ namespace Ifpack2 {
           amd.tpetra_values = (const_cast<block_crs_matrix_type*>(A.get())->getValuesDeviceNonConst());
                                
         }
+
+        // Allocate view for E and initialize the values with B:
+        
+        //btdm.e_values = vector_type_4d_view("btdm.e_values", 2, (interf.n_subparts_per_part-1)*interf.max_subpartsz*interf.nparts, blocksize, blocksize);
+        btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize);
       }
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
 
 
@@ -1984,11 +2051,448 @@ namespace Ifpack2 {
     };
 #endif
 
-    
+    template<typename ArgActiveExecutionMemorySpace>
+    struct SolveTridiagsDefaultModeAndAlgo;
+
+    template<typename impl_type, typename WWViewType>
+    KOKKOS_INLINE_FUNCTION
+    void
+    solveMultiVector(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
+                      const typename impl_type::local_ordinal_type &/* blocksize */,
+                      const typename impl_type::local_ordinal_type &i0,
+                      const typename impl_type::local_ordinal_type &r0,
+                      const typename impl_type::local_ordinal_type &nrows,
+                      const typename impl_type::local_ordinal_type &v,
+                      const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
+                      const Unmanaged<typename impl_type::internal_vector_type_4d_view> X_internal_vector_values,
+                      const WWViewType &WW, 
+                      const bool skip_first_pass=false) {
+        using execution_space = typename impl_type::execution_space;
+        using team_policy_type = Kokkos::TeamPolicy<execution_space>;
+        using member_type = typename team_policy_type::member_type;
+        using local_ordinal_type = typename impl_type::local_ordinal_type;
+
+        typedef SolveTridiagsDefaultModeAndAlgo
+          <typename execution_space::memory_space> default_mode_and_algo_type;
+
+        typedef typename default_mode_and_algo_type::mode_type default_mode_type;
+        typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type;
+
+        using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
+
+        // constant
+        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
+        const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
+
+        // subview pattern
+        auto A  = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v);
+        auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v);
+        auto X2 = X1;
+
+        local_ordinal_type i = i0, r = r0;
+
+
+        if (nrows > 1) {
+          // solve Lx = x
+          if (skip_first_pass) {
+            i += (nrows-2) * 3;
+            r += (nrows-2);
+            A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
+            X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
+            A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
+            KB::Trsm<member_type,
+                    KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+                    default_mode_type,default_algo_type>
+              ::invoke(member, one, A, X2);
+            X1.assign_data( X2.data() );
+            i+=3;
+          }
+          else {
+            KB::Trsm<member_type,
+                    KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+                    default_mode_type,default_algo_type>
+              ::invoke(member, one, A, X1);
+            for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
+              A.assign_data( &D_internal_vector_values(i+2,0,0,v) );
+              X2.assign_data( &X_internal_vector_values(++r,0,0,v) );
+              member.team_barrier();
+              KB::Gemm<member_type,
+                      KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+                      default_mode_type,default_algo_type>
+                ::invoke(member, -one, A, X1, one, X2);
+              A.assign_data( &D_internal_vector_values(i+3,0,0,v) );
+              KB::Trsm<member_type,
+                      KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
+                      default_mode_type,default_algo_type>
+                ::invoke(member, one, A, X2);
+              X1.assign_data( X2.data() );
+            }
+          }
+
+          // solve Ux = x
+          KB::Trsm<member_type,
+                   KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
+                   default_mode_type,default_algo_type>
+            ::invoke(member, one, A, X1);
+          for (local_ordinal_type tr=nrows;tr>1;--tr) {
+            i -= 3;
+            A.assign_data( &D_internal_vector_values(i+1,0,0,v) );
+            X2.assign_data( &X_internal_vector_values(--r,0,0,v) );
+            member.team_barrier();
+            KB::Gemm<member_type,
+                     KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+                     default_mode_type,default_algo_type>
+              ::invoke(member, -one, A, X1, one, X2);
+
+            A.assign_data( &D_internal_vector_values(i,0,0,v) );
+            KB::Trsm<member_type,
+                     KB::Side::Left,KB::Uplo::Upper,KB::Trans::NoTranspose,KB::Diag::NonUnit,
+                     default_mode_type,default_algo_type>
+              ::invoke(member, one, A, X2);
+            X1.assign_data( X2.data() );
+          }
+        } else {
+          // matrix is already inverted
+          auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v);
+          KB::Copy<member_type,KB::Trans::NoTranspose,default_mode_type>
+            ::invoke(member, X1, W);
+          member.team_barrier();
+          KB::Gemm<member_type,
+                   KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+                   default_mode_type,default_algo_type>
+            ::invoke(member, one, A, W, zero, X1);
+        }
+
+    }
+
+    template<typename impl_type, typename WWViewType, typename XViewType>
+    KOKKOS_INLINE_FUNCTION
+    void
+    solveSingleVectorNew(const typename Kokkos::TeamPolicy<typename impl_type::execution_space>::member_type &member,
+                      const typename impl_type::local_ordinal_type &blocksize,
+                      const typename impl_type::local_ordinal_type &i0,
+                      const typename impl_type::local_ordinal_type &r0,
+                      const typename impl_type::local_ordinal_type &nrows,
+                      const typename impl_type::local_ordinal_type &v,
+                      const ConstUnmanaged<typename impl_type::internal_vector_type_4d_view> D_internal_vector_values,
+                      const XViewType &X_internal_vector_values, //Unmanaged<typename impl_type::internal_vector_type_4d_view>
+                      const WWViewType &WW) {
+      using execution_space = typename impl_type::execution_space;
+      //using team_policy_type = Kokkos::TeamPolicy<execution_space>;
+      //using member_type = typename team_policy_type::member_type;
+      using local_ordinal_type = typename impl_type::local_ordinal_type;
+
+      typedef SolveTridiagsDefaultModeAndAlgo
+        <typename execution_space::memory_space> default_mode_and_algo_type;
+
+      typedef typename default_mode_and_algo_type::mode_type default_mode_type;
+      typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
+
+      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
+
+      // base pointers
+      auto A = D_internal_vector_values.data();
+      auto X = X_internal_vector_values.data();
+
+      // constant
+      const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
+      const auto zero = Kokkos::ArithTraits<btdm_magnitude_type>::zero();
+      //const local_ordinal_type num_vectors = X_scalar_values.extent(2);
+
+      // const local_ordinal_type blocksize = D_scalar_values.extent(1);
+      const local_ordinal_type astep = D_internal_vector_values.stride_0();
+      const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length;
+      const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length;
+      const local_ordinal_type xstep = X_internal_vector_values.stride_0();
+      const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
+
+      // move to starting point
+      A += i0*astep + v;
+      X += r0*xstep + v;
+
+      //for (local_ordinal_type col=0;col<num_vectors;++col)
+      if (nrows > 1) {
+        // solve Lx = x
+        KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
+          (default_mode_type,default_algo_type,
+            member,
+            KB::Diag::Unit,
+            blocksize,blocksize,
+            one,
+            A, as0, as1,
+            X, xs0);
+
+        for (local_ordinal_type tr=1;tr<nrows;++tr) {
+          member.team_barrier();
+          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,default_algo_type,
+              member,
+              blocksize, blocksize,
+              -one,
+              A+2*astep, as0, as1,
+              X, xs0,
+              one,
+              X+1*xstep, xs0);
+          KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,default_algo_type,
+              member,
+              KB::Diag::Unit,
+              blocksize,blocksize,
+              one,
+              A+3*astep, as0, as1,
+              X+1*xstep, xs0);
+
+          A += 3*astep;
+          X += 1*xstep;
+        }
+
+        // solve Ux = x
+        KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
+          (default_mode_type,default_algo_type,
+            member,
+            KB::Diag::NonUnit,
+            blocksize, blocksize,
+            one,
+            A, as0, as1,
+            X, xs0);
+
+        for (local_ordinal_type tr=nrows;tr>1;--tr) {
+          A -= 3*astep;
+          member.team_barrier();
+          KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,default_algo_type,
+              member,
+              blocksize, blocksize,
+              -one,
+              A+1*astep, as0, as1,
+              X, xs0,
+              one,
+              X-1*xstep, xs0);
+          KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE
+            (default_mode_type,default_algo_type,
+              member,
+              KB::Diag::NonUnit,
+              blocksize, blocksize,
+              one,
+              A, as0, as1,
+              X-1*xstep,xs0);
+          X -= 1*xstep;
+        }
+        // for multiple rhs
+        //X += xs1;
+      } else {
+        const local_ordinal_type ws0 = WW.stride_0();
+        auto W = WW.data() + v;
+        KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE
+          (default_mode_type,
+            member, blocksize, X, xs0, W, ws0);
+        member.team_barrier();
+        KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+          (default_mode_type,default_algo_type,
+            member,
+            blocksize, blocksize,
+            one,
+            A, as0, as1,
+            W, xs0,
+            zero,
+            X, xs0);
+      }
+    }
+
+    template<typename local_ordinal_type, typename ViewType>
+    void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+      auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
+      std::ofstream myfile;
+      myfile.open (fileName);
+
+      const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
+      local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack;
+      const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
+      const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
+
+      const local_ordinal_type block_size = scalar_values.extent(1);
+
+      const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size;
+      const local_ordinal_type n_rows = n_rows_per_part*n_parts;
+
+      const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
+
+      myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl;
+      myfile << "%%nnz = " << nnz; 
+      myfile << " block size = " << block_size;
+      myfile << " number of blocks = " << n_blocks;
+      myfile << " number of parts = " << n_parts;
+      myfile << " number of blocks per part = " << n_blocks_per_part;
+      myfile << " number of rows = " << n_rows ;
+      myfile << " number of cols = " << n_rows;
+      myfile << " number of packs = " << n_packs << std::endl;
+
+      myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl;
+
+      local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col;
+      for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
+        for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
+          current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
+          for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
+            current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
+            if (i_block_in_part % 3 == 0) {
+              current_row_offset = i_block_in_part/3 * block_size;
+              current_col_offset = i_block_in_part/3 * block_size;
+            }
+            else if (i_block_in_part % 3 == 1) {
+              current_row_offset = (i_block_in_part-1)/3 * block_size;
+              current_col_offset = ((i_block_in_part-1)/3+1) * block_size;
+            }
+            else if (i_block_in_part % 3 == 2) {
+              current_row_offset = ((i_block_in_part-2)/3+1) * block_size;
+              current_col_offset = (i_block_in_part-2)/3 * block_size;
+            }
+            current_row_offset += current_part_idx * n_rows_per_part;
+            current_col_offset += current_part_idx * n_rows_per_part;
+            for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
+              for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {
+                current_row = current_row_offset + i_in_block + 1;
+                current_col = current_col_offset + j_in_block + 1;
+                myfile <<  current_row << " " << current_col << " " << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
+              }
+            }
+          }
+        }
+      }
+
+      myfile.close();
+#endif
+    }
+
+    template<typename local_ordinal_type, typename ViewType>
+    void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+      auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
+      std::ofstream myfile;
+      myfile.open (fileName);
+
+      const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(2) ? n_parts : scalar_values.extent(2);
+      const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack;
+      const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
+
+      const local_ordinal_type block_size = scalar_values.extent(1);
+      const local_ordinal_type n_cols = scalar_values.extent(2);
+
+      const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
+      const local_ordinal_type n_rows = n_rows_per_part*n_parts;
+
+      const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
+
+
+      myfile << "%%MatrixMarket matrix array real general"<< std::endl;
+      myfile << "%%block size = " << block_size;
+      myfile << " number of blocks = " << n_blocks;
+      myfile << " number of parts = " << n_parts;
+      myfile << " number of blocks per part = " << n_blocks_per_part;
+      myfile << " number of rows = " << n_rows ;
+      myfile << " number of cols = " << n_cols;
+      myfile << " number of packs = " << n_packs << std::endl;
+
+      myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;     
+
+      local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
+      (void) current_row_offset;
+      (void) current_part_idx;
+      for (local_ordinal_type j_in_block=0;j_in_block<n_cols;++j_in_block) {      
+        for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
+          for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
+            current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
+            for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
+              current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
+
+              for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
+                myfile << scalar_values(current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
+              }
+            }
+          }
+        }
+      }
+      myfile.close();
+#endif
+    }
+
+    template<typename local_ordinal_type, typename ViewType>
+    void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) {
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM
+      auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device);
+      std::ofstream myfile;
+      myfile.open (fileName);
+
+      const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3);
+      const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack;
+      const local_ordinal_type n_blocks_per_part = n_blocks/n_parts;
+
+      const local_ordinal_type block_size = scalar_values.extent(2);
+      const local_ordinal_type n_blocks_cols = scalar_values.extent(0);
+      const local_ordinal_type n_cols = n_blocks_cols * block_size;
+
+      const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size;
+      const local_ordinal_type n_rows = n_rows_per_part*n_parts;
+
+      const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack);
+
+      myfile << "%%MatrixMarket matrix array real general"<< std::endl;
+      myfile << "%%block size = " << block_size;
+      myfile << " number of blocks = " << n_blocks;
+      myfile << " number of parts = " << n_parts;
+      myfile << " number of blocks per part = " << n_blocks_per_part;
+      myfile << " number of rows = " << n_rows ;
+      myfile << " number of cols = " << n_cols;
+      myfile << " number of packs = " << n_packs << std::endl;
+
+      myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl;     
+
+      local_ordinal_type current_part_idx, current_block_idx, current_row_offset;
+      (void) current_row_offset;
+      (void) current_part_idx;
+      for (local_ordinal_type i_block_col=0;i_block_col<n_blocks_cols;++i_block_col) {
+        for (local_ordinal_type j_in_block=0;j_in_block<block_size;++j_in_block) {      
+          for (local_ordinal_type i_pack=0;i_pack<n_packs;++i_pack) {
+            for (local_ordinal_type i_part_in_pack=0;i_part_in_pack<n_parts_per_pack;++i_part_in_pack) {
+              current_part_idx = i_part_in_pack + i_pack * n_parts_per_pack;
+              for (local_ordinal_type i_block_in_part=0;i_block_in_part<n_blocks_per_part;++i_block_in_part) {
+                current_block_idx = i_block_in_part + i_pack * n_blocks_per_part;
+
+                for (local_ordinal_type i_in_block=0;i_in_block<block_size;++i_in_block) {
+                  myfile << scalar_values(i_block_col,current_block_idx,i_in_block,j_in_block,i_part_in_pack) << std::endl;
+                }
+              }
+            }
+          }
+        }
+      }
+      myfile.close();
+#endif
+    }
+
+    template<typename local_ordinal_type, typename member_type, typename ViewType1, typename ViewType2>
+    KOKKOS_INLINE_FUNCTION
+    void
+    copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) {
+/*
+      // Kokkos::Experimental::local_deep_copy
+      auto teamVectorRange =
+          Kokkos::TeamVectorMDRange<Kokkos::Rank<3>, member_type>(
+              member, view1.extent(0), view1.extent(1), view1.extent(2));
+
+      Kokkos::parallel_for
+        (teamVectorRange,
+      [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) {
+        view1(i,j,k) = view2(i,j,k);
+      });
+*/
+      Kokkos::Experimental::local_deep_copy(member, view1, view2);
+    }
     template<typename MatrixType>
     struct ExtractAndFactorizeTridiags {
     public:
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       // a functor cannot have both device_type and execution_space; specialization error in kokkos
       using execution_space = typename impl_type::execution_space;
       using memory_space = typename impl_type::memory_space;
@@ -2001,14 +2505,18 @@ namespace Ifpack2 {
       using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type;
       /// views
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
+      using size_type_2d_view = typename impl_type::size_type_2d_view;
       using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra;
       /// vectorization
       using btdm_scalar_type = typename impl_type::btdm_scalar_type;
       using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
+      using vector_type_4d_view = typename impl_type::vector_type_4d_view;
       using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
+      using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
       using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
+      using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view;
       using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
       using btdm_scalar_scratch_type_3d_view = Scratch<typename impl_type::btdm_scalar_type_3d_view>;
 
@@ -2022,17 +2530,20 @@ namespace Ifpack2 {
 
     private:
       // part interface
-      const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr;
+      const ConstUnmanaged<local_ordinal_type_1d_view> partptr, lclrow, packptr, packindices_sub, packindices_schur, packptr_sub;
+      const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub, part2packrowidx0_sub;
       const local_ordinal_type max_partsz;
       // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int)
       using size_type_1d_view_tpetra = Kokkos::View<size_t*,typename impl_type::node_device_type>;
       const ConstUnmanaged<size_type_1d_view_tpetra> A_rowptr;
       const ConstUnmanaged<impl_scalar_type_1d_view_tpetra> A_values;
       // block tridiags
-      const ConstUnmanaged<size_type_1d_view> pack_td_ptr, flat_td_ptr;
+      const ConstUnmanaged<size_type_2d_view> pack_td_ptr, flat_td_ptr, pack_td_ptr_schur;
       const ConstUnmanaged<local_ordinal_type_1d_view> A_colindsub;
-      const Unmanaged<internal_vector_type_4d_view> internal_vector_values;
-      const Unmanaged<btdm_scalar_type_4d_view> scalar_values;
+      const Unmanaged<internal_vector_type_4d_view> internal_vector_values, internal_vector_values_schur;
+      const Unmanaged<internal_vector_type_5d_view> e_internal_vector_values;
+      const Unmanaged<btdm_scalar_type_4d_view> scalar_values, scalar_values_schur;
+      const Unmanaged<btdm_scalar_type_5d_view> e_scalar_values;
       // shared information
       const local_ordinal_type blocksize, blocksize_square;
       // diagonal safety
@@ -2042,13 +2553,18 @@ namespace Ifpack2 {
 
     public:
       ExtractAndFactorizeTridiags(const BlockTridiags<MatrixType> &btdm_,
-                                  const PartInterface<MatrixType> &interf_,
+                                  const BlockHelperDetails::PartInterface<MatrixType> &interf_,
                                   const Teuchos::RCP<const block_crs_matrix_type> &A_,
                                   const magnitude_type& tiny_) :
         // interface
         partptr(interf_.partptr),
         lclrow(interf_.lclrow),
         packptr(interf_.packptr),
+        packindices_sub(interf_.packindices_sub),
+        packindices_schur(interf_.packindices_schur),
+        packptr_sub(interf_.packptr_sub),
+        partptr_sub(interf_.partptr_sub),
+        part2packrowidx0_sub(interf_.part2packrowidx0_sub),
         max_partsz(interf_.max_partsz),
         // block crs matrix
         A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map),
@@ -2056,17 +2572,40 @@ namespace Ifpack2 {
         // block tridiags
         pack_td_ptr(btdm_.pack_td_ptr),
         flat_td_ptr(btdm_.flat_td_ptr),
+        pack_td_ptr_schur(btdm_.pack_td_ptr_schur),
         A_colindsub(btdm_.A_colindsub),
         internal_vector_values((internal_vector_type*)btdm_.values.data(),
                                btdm_.values.extent(0),
                                btdm_.values.extent(1),
                                btdm_.values.extent(2),
                                vector_length/internal_vector_length),
+        internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(),
+                               btdm_.values_schur.extent(0),
+                               btdm_.values_schur.extent(1),
+                               btdm_.values_schur.extent(2),
+                               vector_length/internal_vector_length),
+        e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(),
+                                btdm_.e_values.extent(0),
+                                btdm_.e_values.extent(1),
+                                btdm_.e_values.extent(2),
+                                btdm_.e_values.extent(3),
+                                vector_length/internal_vector_length),
         scalar_values((btdm_scalar_type*)btdm_.values.data(),
                       btdm_.values.extent(0),
                       btdm_.values.extent(1),
                       btdm_.values.extent(2),
                       vector_length),
+        scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(),
+                      btdm_.values_schur.extent(0),
+                      btdm_.values_schur.extent(1),
+                      btdm_.values_schur.extent(2),
+                      vector_length),
+        e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(),
+                      btdm_.e_values.extent(0),
+                      btdm_.e_values.extent(1),
+                      btdm_.e_values.extent(2),
+                      btdm_.e_values.extent(3),
+                      vector_length),
         blocksize(btdm_.values.extent(1)),
         blocksize_square(blocksize*blocksize),
         // diagonal weight to avoid zero pivots
@@ -2079,43 +2618,93 @@ namespace Ifpack2 {
       KOKKOS_INLINE_FUNCTION
       void
       extract(local_ordinal_type partidx,
+              local_ordinal_type local_subpartidx,
               local_ordinal_type npacks) const {
-        using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
-        const size_type kps = pack_td_ptr(partidx);
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+        printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks);
+#endif
+        using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
+        const size_type kps = pack_td_ptr(partidx, local_subpartidx);
         local_ordinal_type kfs[vector_length] = {};
         local_ordinal_type ri0[vector_length] = {};
         local_ordinal_type nrows[vector_length] = {};
 
+        //TEUCHOS_TEST_FOR_EXCEPT_MSG(npacks > vector_length,
+        //          "npacks is too big.");
+
         for (local_ordinal_type vi=0;vi<npacks;++vi,++partidx) {
-          kfs[vi] = flat_td_ptr(partidx);
-          ri0[vi] = partptr(partidx);
-          nrows[vi] = partptr(partidx+1) - ri0[vi];
+          kfs[vi] = flat_td_ptr(partidx,local_subpartidx);
+          ri0[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,0);
+          nrows[vi] = partptr_sub(pack_td_ptr.extent(0)*local_subpartidx + partidx,1) - ri0[vi];
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+          printf("kfs[%d] = %d;\n", vi, kfs[vi]);
+          printf("ri0[%d] = %d;\n", vi, ri0[vi]);
+          printf("nrows[%d] = %d;\n", vi, nrows[vi]);
+#endif
         }
-        for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
-          for (local_ordinal_type e=0;e<3;++e) {
-            const impl_scalar_type* block[vector_length] = {};
-            for (local_ordinal_type vi=0;vi<npacks;++vi) {
-              const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
-              block[vi] = &A_values(Aj*blocksize_square);
-            }
-            const size_type pi = kps + j;
-            ++j;
-            for (local_ordinal_type ii=0;ii<blocksize;++ii) {
-              for (local_ordinal_type jj=0;jj<blocksize;++jj) {
-                //const auto idx = ii*blocksize + jj;
-                const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
-                auto& v = internal_vector_values(pi, ii, jj, 0);
-                for (local_ordinal_type vi=0;vi<npacks;++vi)
-                  v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
+        if (local_subpartidx % 2 == 0) {
+          for (local_ordinal_type tr=0,j=0;tr<nrows[0];++tr) {
+            for (local_ordinal_type e=0;e<3;++e) {
+              const impl_scalar_type* block[vector_length] = {};
+              for (local_ordinal_type vi=0;vi<npacks;++vi) {
+                const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
+                block[vi] = &A_values(Aj*blocksize_square);
+              }
+              const size_type pi = kps + j;
+              ++j;
+              for (local_ordinal_type ii=0;ii<blocksize;++ii) {
+                for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                  //const auto idx = ii*blocksize + jj;
+                  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
+                  auto& v = internal_vector_values(pi, ii, jj, 0);
+                  for (local_ordinal_type vi=0;vi<npacks;++vi)
+                    v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
+                }
+              }
+
+              if (nrows[0] == 1) break;
+              if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
+              for (local_ordinal_type vi=1;vi<npacks;++vi) {
+                if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
+                  npacks = vi;
+                  break;
+                }
               }
             }
+          }
+        }
+        else {
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+          printf("This is a Schur related extract for local_subpartidx = %d!\n", local_subpartidx);
+#endif
+          for (local_ordinal_type tr=-1,j=0;tr<nrows[0]+1;++tr) {
+            for (local_ordinal_type e=0;e<3;++e) {
+              const impl_scalar_type* block[vector_length] = {};
+              for (local_ordinal_type vi=0;vi<npacks;++vi) {
+                const size_type Aj = A_rowptr(lclrow(ri0[vi] + tr)) + A_colindsub(kfs[vi] + j);
+                block[vi] = &A_values(Aj*blocksize_square);
+              }
+              const size_type pi = kps + j;
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("extract pi = %ld;\n", pi);
+#endif
+              ++j;
+              for (local_ordinal_type ii=0;ii<blocksize;++ii) {
+                for (local_ordinal_type jj=0;jj<blocksize;++jj) {
+                  const auto idx = tlb::getFlatIndex(ii, jj, blocksize);
+                  auto& v = internal_vector_values(pi, ii, jj, 0);
+                  for (local_ordinal_type vi=0;vi<npacks;++vi)
+                    v[vi] = static_cast<btdm_scalar_type>(block[vi][idx]);
+                }
+              }
 
-            if (nrows[0] == 1) break;
-            if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break;
-            for (local_ordinal_type vi=1;vi<npacks;++vi) {
-              if ((e == 0 && nrows[vi] == 1) || (e == 1 && tr+1 == nrows[vi])) {
-                npacks = vi;
-                break;
+              if (nrows[0] == 1) break;
+              if (e == 0 && (tr == -1 || tr == nrows[0])) break;
+              for (local_ordinal_type vi=1;vi<npacks;++vi) {
+                if ((e == 0 && nrows[vi] == 1) || (e == 0 && tr == nrows[vi])) {
+                  npacks = vi;
+                  break;
+                }
               }
             }
           }
@@ -2128,14 +2717,14 @@ namespace Ifpack2 {
               const local_ordinal_type &partidxbeg,
               const local_ordinal_type &npacks,
               const local_ordinal_type &vbeg) const {
-        using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
+        using tlb = BlockHelperDetails::TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
         local_ordinal_type kfs_vals[internal_vector_length] = {};
         local_ordinal_type ri0_vals[internal_vector_length] = {};
         local_ordinal_type nrows_vals[internal_vector_length] = {};
 
-        const size_type kps = pack_td_ptr(partidxbeg);
+        const size_type kps = pack_td_ptr(partidxbeg,0);
         for (local_ordinal_type v=vbeg,vi=0;v<npacks && vi<internal_vector_length;++v,++vi) {
-          kfs_vals[vi] = flat_td_ptr(partidxbeg+vi);
+          kfs_vals[vi] = flat_td_ptr(partidxbeg+vi,0);
           ri0_vals[vi] = partptr(partidxbeg+vi);
           nrows_vals[vi] = partptr(partidxbeg+vi+1) - ri0_vals[vi];
         }
@@ -2157,8 +2746,9 @@ namespace Ifpack2 {
                 Kokkos::parallel_for
                   (Kokkos::TeamThreadRange(member,blocksize),
                    [&](const local_ordinal_type &ii) {
-                    for (local_ordinal_type jj=0;jj<blocksize;++jj) 
+                    for (local_ordinal_type jj=0;jj<blocksize;++jj) {
                       scalar_values(pi, ii, jj, v) = static_cast<btdm_scalar_type>(block[tlb::getFlatIndex(ii,jj,blocksize)]);
+                    }
                   });
               }
             }
@@ -2170,7 +2760,7 @@ namespace Ifpack2 {
                typename WWViewType>
       KOKKOS_INLINE_FUNCTION
       void
-      factorize(const member_type &member,
+      factorize_subline(const member_type &member,
                 const local_ordinal_type &i0,
                 const local_ordinal_type &nrows,
                 const local_ordinal_type &v,
@@ -2186,6 +2776,11 @@ namespace Ifpack2 {
         // constant
         const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
 
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+        printf("i0 = %d, nrows = %d, v = %d;\n", i0, nrows, v);
+        printf("AA.extent(0) = %ld\n", AA.extent(0));
+#endif
+
         // subview pattern
         auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v);
         KB::LU<member_type,
@@ -2197,6 +2792,9 @@ namespace Ifpack2 {
           auto C = A;
           local_ordinal_type i = i0;
           for (local_ordinal_type tr=1;tr<nrows;++tr,i+=3) {
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+            printf("tr = %d, i = %d;\n", tr, i);
+#endif
             B.assign_data( &AA(i+1,0,0,v) );
             KB::Trsm<member_type,
                      KB::Side::Left,KB::Uplo::Lower,KB::Trans::NoTranspose,KB::Diag::Unit,
@@ -2239,75 +2837,388 @@ namespace Ifpack2 {
 
     public:
 
-      struct ExtractAndFactorizeTag {};
+      struct ExtractAndFactorizeSubLineTag {};
+      struct ExtractBCDTag {};
+      struct ComputeETag {};
+      struct ComputeSchurTag {};
+      struct FactorizeSchurTag {};
 
       KOKKOS_INLINE_FUNCTION
       void
-      operator() (const ExtractAndFactorizeTag &, const member_type &member) const {
+      operator() (const ExtractAndFactorizeSubLineTag &, const member_type &member) const {
         // btdm is packed and sorted from largest one
-        const local_ordinal_type packidx = member.league_rank();
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
 
-        const local_ordinal_type partidx = packptr(packidx);
-        const local_ordinal_type npacks = packptr(packidx+1) - partidx;
-        const local_ordinal_type i0 = pack_td_ptr(partidx);
-        const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+
+        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
 
         internal_vector_scratch_type_3d_view
           WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+        printf("rank = %d, i0 = %d, npacks = %d, nrows = %d, packidx = %d, subpartidx = %d, partidx = %d, local_subpartidx = %d;\n", member.league_rank(), i0, npacks, nrows, packidx, subpartidx, partidx, local_subpartidx);
+        printf("vector_loop_size = %d\n", vector_loop_size);
+#endif
+
         if (vector_loop_size == 1) {
-          extract(partidx, npacks);
-          factorize(member, i0, nrows, 0, internal_vector_values, WW);
+          extract(partidx, local_subpartidx, npacks);
+          factorize_subline(member, i0, nrows, 0, internal_vector_values, WW);
         } else {
           Kokkos::parallel_for
             (Kokkos::ThreadVectorRange(member, vector_loop_size),
 	     [&](const local_ordinal_type &v) {
               const local_ordinal_type vbeg = v*internal_vector_length;
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
+#endif
               if (vbeg < npacks)
                 extract(member, partidx+vbeg, npacks, vbeg);
               // this is not safe if vector loop size is different from vector size of 
               // the team policy. we always make sure this when constructing the team policy
               member.team_barrier();
-              factorize(member, i0, nrows, v, internal_vector_values, WW);
+              factorize_subline(member, i0, nrows, v, internal_vector_values, WW);
             });
         }
       }
 
-      void run() {
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        const local_ordinal_type team_size =
-          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
-          recommended_team_size(blocksize, vector_length, internal_vector_length);
-        const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
-          shmem_size(blocksize, blocksize, vector_loop_size);
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const ExtractBCDTag &, const member_type &member) const {
+        // btdm is packed and sorted from largest one
+        const local_ordinal_type packidx = packindices_schur(member.league_rank());
 
-        Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeTag>
-          policy(packptr.extent(0)-1, team_size, vector_loop_size);
-#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
-        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
-                             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this);
-#else
-        policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
-        Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeTag>",
-                             policy, *this);
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+
+        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
+
+        if (vector_loop_size == 1) {
+          extract(partidx, local_subpartidx, npacks);
+        }
+        else {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),
+	     [&](const local_ordinal_type &v) {
+              const local_ordinal_type vbeg = v*internal_vector_length;
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+              printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg);
 #endif
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-      }
+              if (vbeg < npacks)
+                extract(member, partidx+vbeg, npacks, vbeg);
+            });
+        }
 
-    };
+        member.team_barrier();
 
-    ///
-    /// top level numeric interface
-    ///
-    template<typename MatrixType>
+        const size_type kps1 = pack_td_ptr(partidx, local_subpartidx);
+        const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1;
+
+        const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
+        const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
+
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+        printf("Copy for Schur complement part id = %d from kps1 = %d to r1 = %d and from kps2 = %d to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx);
+#endif
+
+        // Need to copy D to e_internal_vector_values.
+        copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), 
+                          Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
+
+        copy3DView<local_ordinal_type>(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), 
+                          Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
+
+      }
+      
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const ComputeETag &, const member_type &member) const {
+        // btdm is packed and sorted from largest one
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
+
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+
+        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
+        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
+        const local_ordinal_type num_vectors = blocksize;
+
+        (void) npacks;
+
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
+        if (local_subpartidx == 0) {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
+            });
+        }
+        else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW);
+            });
+        }
+        else {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true);
+              solveMultiVector<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW); 
+            });
+        }
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const ComputeSchurTag &, const member_type &member) const {
+        // btdm is packed and sorted from largest one
+        const local_ordinal_type packidx = packindices_schur(member.league_rank());
+
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+
+        //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
+        //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
+
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
+
+        // Compute S = D - C E
+
+        const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
+        const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
+        const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
+
+        for  (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur
+          copy3DView<local_ordinal_type>(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), 
+                            Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
+        }
+
+        member.team_barrier();
+
+        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
+
+        const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1;
+        const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2;
+
+        const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1;
+        const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2;
+
+        typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo
+          <typename execution_space::memory_space> default_mode_and_algo_type;
+
+        typedef typename default_mode_and_algo_type::mode_type default_mode_type;
+        typedef typename default_mode_and_algo_type::algo_type default_algo_type;
+
+        Kokkos::parallel_for
+          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+            for  (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) {
+              local_ordinal_type e_r, e_c, c_kps;
+
+              if ( local_subpartidx_schur == 0 ) {
+                if ( i == 0 ) {
+                  e_r = e_r1;
+                  e_c = 0;
+                  c_kps = c_kps1;
+                }
+                else if ( i == 3 ) {
+                  e_r = e_r2;
+                  e_c = 1;
+                  c_kps = c_kps2;
+                }
+                else if ( i == 4 ) {
+                  e_r = e_r2;
+                  e_c = 0;
+                  c_kps = c_kps2;
+                }
+                else {
+                  continue;
+                }
+              }
+              else {
+                if ( i == 0 ) {
+                  e_r = e_r1;
+                  e_c = 1;
+                  c_kps = c_kps1;
+                }
+                else if ( i == 1 ) {
+                  e_r = e_r1;
+                  e_c = 0;
+                  c_kps = c_kps1;
+                }
+                else if ( i == 4 ) {
+                  e_r = e_r2;
+                  e_c = 1;
+                  c_kps = c_kps2;
+                }
+                else if ( i == 5 ) {
+                  e_r = e_r2;
+                  e_c = 0;
+                  c_kps = c_kps2;
+                }
+                else {
+                  continue;
+                }
+              }
+
+              auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v);
+              auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v);
+              auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v);
+              KB::Gemm<member_type,
+                      KB::Trans::NoTranspose,KB::Trans::NoTranspose,
+                      default_mode_type,default_algo_type>
+                ::invoke(member, -one, C, E, one, S);
+            }
+          });
+      }
+
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const FactorizeSchurTag &, const member_type &member) const {
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
+
+        const local_ordinal_type partidx = packptr_sub(packidx);
+
+        const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0);
+        const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1);
+
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
+        
+#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF
+        printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d;\n", member.league_rank(), i0, nrows);
+        printf("vector_loop_size = %d\n", vector_loop_size);
+#endif
+
+        if (vector_loop_size == 1) {
+          factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW);
+        } else {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),
+	     [&](const local_ordinal_type &v) {
+              factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW);
+            });
+        }
+      }
+
+      void run() {
+        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
+        const local_ordinal_type team_size =
+          ExtractAndFactorizeTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
+          recommended_team_size(blocksize, vector_length, internal_vector_length);
+        const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view::
+          shmem_size(blocksize, blocksize, vector_loop_size);
+
+        {
+          IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag");
+          Kokkos::TeamPolicy<execution_space,ExtractAndFactorizeSubLineTag>
+            policy(packindices_sub.extent(0), team_size, vector_loop_size);
+
+
+          const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+          writeBTDValuesToFile(n_parts, scalar_values, "before.mm");
+
+          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
+          Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractAndFactorizeSubLineTag>",
+                              policy, *this);
+          execution_space().fence();
+
+          writeBTDValuesToFile(n_parts, scalar_values, "after.mm");
+        }
+
+        if (packindices_schur.extent(0) != 0)
+        {
+          {
+
+            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm");
+
+            {
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag");
+              Kokkos::TeamPolicy<execution_space,ExtractBCDTag>
+                policy(packindices_schur.extent(0), team_size, vector_loop_size);
+
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
+              Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ExtractBCDTag>",
+                                  policy, *this);
+              execution_space().fence();
+            }
+
+            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm");
+
+            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm");
+            {
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag");
+              Kokkos::TeamPolicy<execution_space,ComputeETag>
+                policy(packindices_sub.extent(0), team_size, vector_loop_size);
+
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
+              Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeETag>",
+                                  policy, *this);
+              execution_space().fence();
+            }
+            write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm");
+          }
+
+          {
+            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag");
+            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm");
+            Kokkos::TeamPolicy<execution_space,ComputeSchurTag>
+              policy(packindices_schur.extent(0), team_size, vector_loop_size);
+
+            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
+            Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<ComputeSchurTag>",
+                                policy, *this);
+            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm");
+            execution_space().fence();
+          }
+
+          {
+            IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag");
+            Kokkos::TeamPolicy<execution_space,FactorizeSchurTag>
+              policy(part2packrowidx0_sub.extent(0), team_size, vector_loop_size);
+            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch));
+            Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run<FactorizeSchurTag>",
+                                policy, *this);
+            execution_space().fence();
+            writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm");
+          }
+        }
+
+        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
+      }
+
+    };
+
+    ///
+    /// top level numeric interface
+    ///
+    template<typename MatrixType>
     void
-    performNumericPhase(const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
-                        const PartInterface<MatrixType> &interf,
+    performNumericPhase(const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+                        const BlockHelperDetails::PartInterface<MatrixType> &interf,
                         BlockTridiags<MatrixType> &btdm,
-                        const typename ImplType<MatrixType>::magnitude_type tiny) {
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase");
+                        const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tiny) {
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase");
       ExtractAndFactorizeTridiags<MatrixType> function(btdm, interf, A, tiny);
       function.run();
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
+      IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType<MatrixType>::execution_space)
     }
 
     ///
@@ -2316,7 +3227,7 @@ namespace Ifpack2 {
     template<typename MatrixType>
     struct MultiVectorConverter {
     public:
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using execution_space = typename impl_type::execution_space;
       using memory_space = typename impl_type::memory_space;
 
@@ -2359,7 +3270,7 @@ namespace Ifpack2 {
 
     public:
 
-      MultiVectorConverter(const PartInterface<MatrixType> &interf,
+      MultiVectorConverter(const BlockHelperDetails::PartInterface<MatrixType> &interf,
                            const vector_type_3d_view &pmv)
         : partptr(interf.partptr),
           packptr(interf.packptr),
@@ -2429,10 +3340,10 @@ namespace Ifpack2 {
 
       void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) {
         IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter");
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter");
 
         scalar_multivector = scalar_multivector_;
-        if constexpr (is_device<execution_space>::value) {
+        if constexpr (BlockHelperDetails::is_device<execution_space>::value) {
           const local_ordinal_type vl = vector_length;
           const Kokkos::TeamPolicy<execution_space> policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl);
           Kokkos::parallel_for
@@ -2443,15 +3354,13 @@ namespace Ifpack2 {
             ("MultiVectorConverter::RangePolicy", policy, *this);
         }
         IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
     };
 
     ///
     /// solve tridiags
     ///
-    template<typename ArgActiveExecutionMemorySpace>
-    struct SolveTridiagsDefaultModeAndAlgo;
 
     template<>
     struct SolveTridiagsDefaultModeAndAlgo<Kokkos::HostSpace> {
@@ -2592,7 +3501,7 @@ namespace Ifpack2 {
     template<typename MatrixType>
     struct SolveTridiags {
     public:
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using execution_space = typename impl_type::execution_space;
 
       using local_ordinal_type = typename impl_type::local_ordinal_type;
@@ -2603,11 +3512,13 @@ namespace Ifpack2 {
       using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
       /// views
       using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
+      using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view;
+      using size_type_2d_view = typename impl_type::size_type_2d_view;
       /// vectorization
       using vector_type_3d_view = typename impl_type::vector_type_3d_view;
       using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view;
-      //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
+      using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view;
+      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
 
       using internal_vector_scratch_type_3d_view = Scratch<typename impl_type::internal_vector_type_3d_view>;
 
@@ -2625,17 +3536,32 @@ namespace Ifpack2 {
 
     private:
       // part interface
+      local_ordinal_type n_subparts_per_part;
       const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
       const ConstUnmanaged<local_ordinal_type_1d_view> packptr;
+      const ConstUnmanaged<local_ordinal_type_1d_view> packindices_sub;
+      const ConstUnmanaged<local_ordinal_type_1d_view> packindices_schur;
       const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
+      const ConstUnmanaged<local_ordinal_type_2d_view> part2packrowidx0_sub;
       const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
+      const ConstUnmanaged<local_ordinal_type_1d_view> packptr_sub;
+
+      const ConstUnmanaged<local_ordinal_type_2d_view> partptr_sub;
+      const ConstUnmanaged<size_type_2d_view> pack_td_ptr_schur;
 
       // block tridiags
-      const ConstUnmanaged<size_type_1d_view> pack_td_ptr;
+      const ConstUnmanaged<size_type_2d_view> pack_td_ptr;
 
       // block tridiags values
       const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values;
       const Unmanaged<internal_vector_type_4d_view> X_internal_vector_values;
+      const Unmanaged<btdm_scalar_type_4d_view> X_internal_scalar_values;
+
+      internal_vector_type_4d_view X_internal_vector_values_schur;
+
+      const ConstUnmanaged<internal_vector_type_4d_view> D_internal_vector_values_schur;
+      const ConstUnmanaged<internal_vector_type_5d_view> e_internal_vector_values;
+
 
       const local_ordinal_type vector_loop_size;
 
@@ -2650,17 +3576,24 @@ namespace Ifpack2 {
       const bool compute_diff;
 
     public:
-      SolveTridiags(const PartInterface<MatrixType> &interf,
+      SolveTridiags(const BlockHelperDetails::PartInterface<MatrixType> &interf,
                     const BlockTridiags<MatrixType> &btdm,
                     const vector_type_3d_view &pmv,
                     const impl_scalar_type damping_factor,
                     const bool is_norm_manager_active)
         :
         // interface
+        n_subparts_per_part(interf.n_subparts_per_part),
         partptr(interf.partptr),
         packptr(interf.packptr),
+        packindices_sub(interf.packindices_sub),
+        packindices_schur(interf.packindices_schur),
         part2packrowidx0(interf.part2packrowidx0),
+        part2packrowidx0_sub(interf.part2packrowidx0_sub),
         lclrow(interf.lclrow),
+        packptr_sub(interf.packptr_sub),
+        partptr_sub(interf.partptr_sub),
+        pack_td_ptr_schur(btdm.pack_td_ptr_schur),
         // block tridiags and  multivector
         pack_td_ptr(btdm.pack_td_ptr),
         D_internal_vector_values((internal_vector_type*)btdm.values.data(),
@@ -2673,6 +3606,27 @@ namespace Ifpack2 {
                                  pmv.extent(1),
                                  pmv.extent(2),
                                  vector_length/internal_vector_length),
+        X_internal_scalar_values((btdm_scalar_type*)pmv.data(),
+                                 pmv.extent(0),
+                                 pmv.extent(1),
+                                 pmv.extent(2),
+                                 vector_length),
+        X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"),
+                                       2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0),
+                                       pmv.extent(1),
+                                       pmv.extent(2),
+                                       vector_length/internal_vector_length),
+        D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(),
+                               btdm.values_schur.extent(0),
+                               btdm.values_schur.extent(1),
+                               btdm.values_schur.extent(2),
+                               vector_length/internal_vector_length),
+        e_internal_vector_values((internal_vector_type*)btdm.e_values.data(),
+                                btdm.e_values.extent(0),
+                                btdm.e_values.extent(1),
+                                btdm.e_values.extent(2),
+                                btdm.e_values.extent(3),
+                                vector_length/internal_vector_length),
         vector_loop_size(vector_length/internal_vector_length),
         Y_scalar_multivector(),
         Z_scalar_vector(),
@@ -2795,10 +3749,6 @@ namespace Ifpack2 {
         const local_ordinal_type xstep = X_internal_vector_values.stride_0();
         const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length;
 
-        // for multiple rhs
-        //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1();
-        //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2();
-
         // move to starting point
         A += i0*astep + v;
         X += r0*xstep + v;
@@ -2981,6 +3931,15 @@ namespace Ifpack2 {
       template<int B> struct SingleVectorTag {};
       template<int B> struct MultiVectorTag {};
 
+      template<int B> struct SingleVectorSubLineTag {};
+      template<int B> struct MultiVectorSubLineTag {};
+      template<int B> struct SingleVectorApplyCTag {};
+      template<int B> struct MultiVectorApplyCTag {};
+      template<int B> struct SingleVectorSchurTag {};
+      template<int B> struct MultiVectorSchurTag {};
+      template<int B> struct SingleVectorApplyETag {};
+      template<int B> struct MultiVectorApplyETag {};
+
       template<int B>
       KOKKOS_INLINE_FUNCTION
       void
@@ -2989,7 +3948,7 @@ namespace Ifpack2 {
         const local_ordinal_type partidx = packptr(packidx);
         const local_ordinal_type npacks = packptr(packidx+1) - partidx;
         const local_ordinal_type pri0 = part2packrowidx0(partidx);
-        const local_ordinal_type i0 = pack_td_ptr(partidx);
+        const local_ordinal_type i0 = pack_td_ptr(partidx,0);
         const local_ordinal_type r0 = part2packrowidx0(partidx);
         const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
         const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
@@ -3014,7 +3973,7 @@ namespace Ifpack2 {
         const local_ordinal_type partidx = packptr(packidx);
         const local_ordinal_type npacks = packptr(packidx+1) - partidx;
         const local_ordinal_type pri0 = part2packrowidx0(partidx);
-        const local_ordinal_type i0 = pack_td_ptr(partidx);
+        const local_ordinal_type i0 = pack_td_ptr(partidx,0);
         const local_ordinal_type r0 = part2packrowidx0(partidx);
         const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx);
         const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B);
@@ -3032,939 +3991,429 @@ namespace Ifpack2 {
           });
       }
 
-      void run(const impl_scalar_type_2d_view_tpetra &Y,
-               const impl_scalar_type_1d_view &Z) {
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags");
+      template<int B>
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const SingleVectorSubLineTag<B> &, const member_type &member) const {
+        // btdm is packed and sorted from largest one
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
 
-        /// set vectors
-        this->Y_scalar_multivector = Y;
-        this->Z_scalar_vector = Z;
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
 
-        const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
-        const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
+        const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
+        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
+        const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
+        const local_ordinal_type num_vectors = blocksize;
 
-        const local_ordinal_type team_size =
-          SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
-          recommended_team_size(blocksize, vector_length, internal_vector_length);
-        const int per_team_scratch = internal_vector_scratch_type_3d_view
-          ::shmem_size(blocksize, num_vectors, vector_loop_size);
+        //(void) i0;
+        //(void) nrows;
+        (void) npacks;
 
-#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
-#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
-        if (num_vectors == 1) {                                         \
-          const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
-            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
-          Kokkos::parallel_for                                          \
-            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
-             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
-        } else {                                                        \
-          const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
-            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
-          Kokkos::parallel_for                                          \
-            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
-             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
-        } break
-#else
-#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
-        if (num_vectors == 1) {                                         \
-          Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >       \
-            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
-          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
-          Kokkos::parallel_for                                          \
-            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
-             policy, *this);                                            \
-        } else {                                                        \
-          Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >        \
-            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
-          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
-          Kokkos::parallel_for                                          \
-            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
-             policy, *this);                                            \
-        } break
-#endif
-        switch (blocksize) {
-        case   3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
-        case   5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
-        case   7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
-        case   9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
-        case  10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
-        case  11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
-        case  16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
-        case  17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
-        case  18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
-        default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
-        }
-#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size);
 
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
+        Kokkos::parallel_for
+          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+            solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW);
+          });
       }
-    };
 
-    ///
-    /// compute local residula vector y = b - R x
-    ///
-    static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize,
-                                                                     const int team_size) {
-      int total_team_size(0);
-      if      (blksize <=  5) total_team_size =  32;
-      else if (blksize <=  9) total_team_size =  32; // 64
-      else if (blksize <= 12) total_team_size =  96;
-      else if (blksize <= 16) total_team_size = 128;
-      else if (blksize <= 20) total_team_size = 160;
-      else                    total_team_size = 160;
-      return total_team_size/team_size;
-    }
+      template<int B>
+      KOKKOS_INLINE_FUNCTION
+      void
+      operator() (const SingleVectorApplyCTag<B> &, const member_type &member) const {
+        // btdm is packed and sorted from largest one
+        //const local_ordinal_type packidx = packindices_schur(member.league_rank());
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
 
-    static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize,
-								    const int team_size) {
-      int total_team_size(0);
-      if      (blksize <=  5) total_team_size =  32;
-      else if (blksize <=  9) total_team_size =  32; // 64
-      else if (blksize <= 12) total_team_size =  96;
-      else if (blksize <= 16) total_team_size = 128;
-      else if (blksize <= 20) total_team_size = 160;
-      else                    total_team_size = 160;
-      return total_team_size/team_size;
-    }
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+        const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
 
-    static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize,
-								     const int team_size) {
-      int total_team_size(0);
-      if      (blksize <=  5) total_team_size =  32;
-      else if (blksize <=  9) total_team_size =  32; // 64
-      else if (blksize <= 12) total_team_size =  96;
-      else if (blksize <= 16) total_team_size = 128;
-      else if (blksize <= 20) total_team_size = 160;
-      else                    total_team_size = 160;
-      return total_team_size/team_size;
-    }
+        //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx;
+        const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx);
+        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
+        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
 
-    template<typename T>
-    static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize,
-                                                                 const int team_size) {
-      if ( is_cuda<T>::value )
-        return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size);
-      if ( is_hip<T>::value )
-        return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size);
-      if ( is_sycl<T>::value )
-        return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size);
-      return -1;
-    }
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
 
-    
-    template<typename MatrixType>
-    struct ComputeResidualVector {
-    public:
-      using impl_type = ImplType<MatrixType>;
-      using node_device_type = typename impl_type::node_device_type;
-      using execution_space = typename impl_type::execution_space;
-      using memory_space = typename impl_type::memory_space;
+        // Compute v_2 = v_2 - C v_1
 
-      using local_ordinal_type = typename impl_type::local_ordinal_type;
-      using size_type = typename impl_type::size_type;
-      using impl_scalar_type = typename impl_type::impl_scalar_type;
-      using magnitude_type = typename impl_type::magnitude_type;
-      using btdm_scalar_type = typename impl_type::btdm_scalar_type;
-      using btdm_magnitude_type = typename impl_type::btdm_magnitude_type;
-      /// views
-      using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view;
-      using size_type_1d_view = typename impl_type::size_type_1d_view;
-      using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right)
-      using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view;
-      using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left)
-      using vector_type_3d_view = typename impl_type::vector_type_3d_view;
-      using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view;
-      static constexpr int vector_length = impl_type::vector_length;
+        const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2;
+        const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1;
+        const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2;
 
-      /// team policy member type (used in cuda)
-      using member_type = typename Kokkos::TeamPolicy<execution_space>::member_type;
+        (void) i0_schur;
+        (void) i0_offset;
 
-      // enum for max blocksize and vector length
-      enum : int { max_blocksize = 32 };
+        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
 
-    private:
-      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> b;
-      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x; // x_owned
-      ConstUnmanaged<impl_scalar_type_2d_view_tpetra> x_remote;
-      Unmanaged<impl_scalar_type_2d_view_tpetra> y;
-      Unmanaged<vector_type_3d_view> y_packed;
-      Unmanaged<btdm_scalar_type_4d_view> y_packed_scalar;
-
-      // AmD information
-      const ConstUnmanaged<size_type_1d_view> rowptr, rowptr_remote;
-      const ConstUnmanaged<local_ordinal_type_1d_view> colindsub, colindsub_remote;
-      const ConstUnmanaged<impl_scalar_type_1d_view> tpetra_values;
-
-      // block crs graph information
-      // for cuda (kokkos crs graph uses a different size_type from size_t)
-      const ConstUnmanaged<Kokkos::View<size_t*,node_device_type> > A_rowptr;
-      const ConstUnmanaged<Kokkos::View<local_ordinal_type*,node_device_type> > A_colind;
-
-      // blocksize
-      const local_ordinal_type blocksize_requested;
+        const size_type c_kps2 =  local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0;
+        const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1;
 
-      // part interface
-      const ConstUnmanaged<local_ordinal_type_1d_view> part2packrowidx0;
-      const ConstUnmanaged<local_ordinal_type_1d_view> part2rowidx0;
-      const ConstUnmanaged<local_ordinal_type_1d_view> rowidx2part;
-      const ConstUnmanaged<local_ordinal_type_1d_view> partptr;
-      const ConstUnmanaged<local_ordinal_type_1d_view> lclrow;
-      const ConstUnmanaged<local_ordinal_type_1d_view> dm2cm;
-      const bool is_dm2cm_active;
+        typedef SolveTridiagsDefaultModeAndAlgo
+          <typename execution_space::memory_space> default_mode_and_algo_type;
 
-    public:
-      template<typename LocalCrsGraphType>
-      ComputeResidualVector(const AmD<MatrixType> &amd,
-                            const LocalCrsGraphType &graph,
-                            const local_ordinal_type &blocksize_requested_,
-                            const PartInterface<MatrixType> &interf,
-                            const local_ordinal_type_1d_view &dm2cm_)
-        : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote),
-          colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote),
-          tpetra_values(amd.tpetra_values),
-          A_rowptr(graph.row_map),
-          A_colind(graph.entries),
-          blocksize_requested(blocksize_requested_),
-          part2packrowidx0(interf.part2packrowidx0),
-          part2rowidx0(interf.part2rowidx0),
-          rowidx2part(interf.rowidx2part),
-          partptr(interf.partptr),
-          lclrow(interf.lclrow),
-          dm2cm(dm2cm_),
-          is_dm2cm_active(dm2cm_.span() > 0)
-      {}
+        typedef typename default_mode_and_algo_type::mode_type default_mode_type;
+        typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
 
-      inline
-      void
-      SerialGemv(const local_ordinal_type &blocksize,
-                 const impl_scalar_type * const KOKKOS_RESTRICT AA,
-                 const impl_scalar_type * const KOKKOS_RESTRICT xx,
-                 /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const {
-        using tlb = TpetraLittleBlock<Tpetra::Impl::BlockCrsMatrixLittleBlockArrayLayout>;
-        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
-          impl_scalar_type val = 0;
-#if defined(KOKKOS_ENABLE_PRAGMA_IVDEP)
-#   pragma ivdep
-#endif
-#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL)
-#   pragma unroll
-#endif
-          for (local_ordinal_type k1=0;k1<blocksize;++k1)
-            val += AA[tlb::getFlatIndex(k0,k1,blocksize)]*xx[k1];
-          yy[k0] -= val;
+        if (local_subpartidx == 0) {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
+              auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
+              auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
+
+              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                (default_mode_type,default_algo_type,
+                  member,
+                  blocksize, blocksize,
+                  -one,
+                  C.data(), C.stride_0(), C.stride_1(),
+                  v_1.data(), v_1.stride_0(),
+                  one,
+                  v_2.data(), v_2.stride_0());
+            });
         }
-      }
-
-      template<typename bbViewType, typename yyViewType>
-      KOKKOS_INLINE_FUNCTION
-      void
-      VectorCopy(const member_type &member,
-                 const local_ordinal_type &blocksize,
-                 const bbViewType &bb,
-                 const yyViewType &yy) const {
-        Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0)  {
-            yy(k0) = static_cast<typename yyViewType::const_value_type>(bb(k0));
-          });
-      }
-
-      template<typename AAViewType, typename xxViewType, typename yyViewType>
-      KOKKOS_INLINE_FUNCTION
-      void
-      TeamVectorGemv(const member_type &member,
-               const local_ordinal_type &blocksize,
-               const AAViewType &AA,
-               const xxViewType &xx,
-               const yyViewType &yy) const {
-        Kokkos::parallel_for
-          (Kokkos::TeamThreadRange(member, blocksize),
-           [&](const local_ordinal_type &k0) {
-            impl_scalar_type val = 0;
-            Kokkos::parallel_for
-              (Kokkos::ThreadVectorRange(member, blocksize),
-               [&](const local_ordinal_type &k1) {
-                val += AA(k0,k1)*xx(k1);
-              });
-            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
-          });
-      }
-
-      template<typename AAViewType, typename xxViewType, typename yyViewType>
-      KOKKOS_INLINE_FUNCTION
-      void
-      VectorGemv(const member_type &member,
-                 const local_ordinal_type &blocksize,
-                 const AAViewType &AA,
-                 const xxViewType &xx,
-                 const yyViewType &yy) const {
-        Kokkos::parallel_for
-          (Kokkos::ThreadVectorRange(member, blocksize),
-           [&](const local_ordinal_type &k0) {
-            impl_scalar_type val(0);
-            for (local_ordinal_type k1=0;k1<blocksize;++k1) {
-              val += AA(k0,k1)*xx(k1);
-            }
-            Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val));
-          });
-      }
-
-      // template<typename AAViewType, typename xxViewType, typename yyViewType>
-      // KOKKOS_INLINE_FUNCTION
-      // void
-      // VectorGemv(const member_type &member,
-      //                 const local_ordinal_type &blocksize,
-      //                 const AAViewType &AA,
-      //                 const xxViewType &xx,
-      //                 const yyViewType &yy) const {
-      //        for (local_ordinal_type k0=0;k0<blocksize;++k0) {
-      //          impl_scalar_type val = 0;
-      //          Kokkos::parallel_for
-      //            (Kokkos::ThreadVectorRange(member, blocksize),
-      //             [&](const local_ordinal_type &k1) {
-      //              val += AA(k0,k1)*xx(k1);
-      //            });
-      //          Kokkos::atomic_fetch_add(&yy(k0), -val);
-      //        }
-      // }
-
-      struct SeqTag {};
-
-      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
-      KOKKOS_INLINE_FUNCTION
-      void
-      operator() (const SeqTag &, const local_ordinal_type& i) const {
-        const local_ordinal_type blocksize = blocksize_requested;
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
-
-        // constants
-        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
-        const local_ordinal_type num_vectors = y.extent(1);
-        const local_ordinal_type row = i*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          // y := b
-          impl_scalar_type *yy = &y(row, col);
-          const impl_scalar_type * const bb = &b(row, col);
-          memcpy(yy, bb, sizeof(impl_scalar_type)*blocksize);
-
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[i];
-          for (size_type k=rowptr[i];k<rowptr[i+1];++k) {
-            const size_type j = A_k0 + colindsub[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
-            const impl_scalar_type * const xx = &x(A_colind[j]*blocksize, col);
-            SerialGemv(blocksize,AA,xx,yy);
-          }
+        else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
+          Kokkos::parallel_for
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
+              auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
+              auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
+
+              KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                (default_mode_type,default_algo_type,
+                  member,
+                  blocksize, blocksize,
+                  -one,
+                  C.data(), C.stride_0(), C.stride_1(),
+                  v_1.data(), v_1.stride_0(),
+                  one,
+                  v_2.data(), v_2.stride_0());
+            });
         }
-      }
-
-      KOKKOS_INLINE_FUNCTION
-      void
-      operator() (const SeqTag &, const member_type &member) const {
-
-        // constants
-        const local_ordinal_type blocksize = blocksize_requested;
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
-
-        const local_ordinal_type lr = member.league_rank();
-        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
-        const local_ordinal_type num_vectors = y.extent(1);
-
-        // subview pattern
-        auto bb = Kokkos::subview(b, block_range, 0);
-        auto xx = bb;
-        auto yy = Kokkos::subview(y, block_range, 0);
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
-
-        const local_ordinal_type row = lr*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          // y := b
-          yy.assign_data(&y(row, col));
-          bb.assign_data(&b(row, col));
-          if (member.team_rank() == 0)
-            VectorCopy(member, blocksize, bb, yy);
-          member.team_barrier();
-
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
+        else {
           Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-              xx.assign_data( &x(A_colind[j]*blocksize, col) );
-              VectorGemv(member, blocksize, A_block, xx, yy);
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              {
+                auto v_1 = Kokkos::subview(X_internal_vector_values, r0+nrows-1, Kokkos::ALL(), 0, v);
+                auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
+                auto C = Kokkos::subview(D_internal_vector_values, c_kps1, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                  (default_mode_type,default_algo_type,
+                    member,
+                    blocksize, blocksize,
+                    -one,
+                    C.data(), C.stride_0(), C.stride_1(),
+                    v_1.data(), v_1.stride_0(),
+                    one,
+                    v_2.data(), v_2.stride_0());
+              }
+              {
+                auto v_1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), 0, v);
+                auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
+                auto C = Kokkos::subview(D_internal_vector_values, c_kps2, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                  (default_mode_type,default_algo_type,
+                    member,
+                    blocksize, blocksize,
+                    -one,
+                    C.data(), C.stride_0(), C.stride_1(),
+                    v_1.data(), v_1.stride_0(),
+                    one,
+                    v_2.data(), v_2.stride_0());
+              }
             });
         }
       }
 
       template<int B>
-      struct AsyncTag {};
-
-      template<int B>
-      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
       KOKKOS_INLINE_FUNCTION
       void
-      operator() (const AsyncTag<B> &, const local_ordinal_type &rowidx) const {
-        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
-
-        // constants
-        const local_ordinal_type partidx = rowidx2part(rowidx);
-        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
-        const local_ordinal_type v = partidx % vector_length;
-
-        const local_ordinal_type num_vectors = y_packed.extent(2);
-        const local_ordinal_type num_local_rows = lclrow.extent(0);
-
-        // temporary buffer for y flat
-        impl_scalar_type yy[B == 0 ? max_blocksize : B] = {};
-
-        const local_ordinal_type lr = lclrow(rowidx);
-        const local_ordinal_type row = lr*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          // y := b
-          memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
-
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          for (size_type k=rowptr[lr];k<rowptr[lr+1];++k) {
-            const size_type j = A_k0 + colindsub[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
-            const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (A_colind_at_j < num_local_rows) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              SerialGemv(blocksize, AA,xx,yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              SerialGemv(blocksize, AA,xx_remote,yy);
-            }
-          }
-          // move yy to y_packed
-          for (local_ordinal_type k=0;k<blocksize;++k)
-            y_packed(pri, k, col)[v] = yy[k];
-        }
-      }
+      operator() (const SingleVectorSchurTag<B> &, const member_type &member) const {
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
 
-      template<int B>
-      KOKKOS_INLINE_FUNCTION
-      void
-      operator() (const AsyncTag<B> &, const member_type &member) const {
-        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
+        const local_ordinal_type partidx = packptr_sub(packidx);
 
-        // constants
-        const local_ordinal_type rowidx = member.league_rank();
-        const local_ordinal_type partidx = rowidx2part(rowidx);
-        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
-        const local_ordinal_type v = partidx % vector_length;
+        const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
 
-        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
-        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
-        const local_ordinal_type num_local_rows = lclrow.extent(0);
+        const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0);
+        const local_ordinal_type nrows = 2*(n_subparts_per_part-1);
 
-        // subview pattern
-        auto bb = Kokkos::subview(b, block_range, 0);
-        auto xx = Kokkos::subview(x, block_range, 0);
-        auto xx_remote = Kokkos::subview(x_remote, block_range, 0);
-        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
-
-        const local_ordinal_type lr = lclrow(rowidx);
-        const local_ordinal_type row = lr*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          // y := b
-          bb.assign_data(&b(row, col));
-          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
-          if (member.team_rank() == 0)
-            VectorCopy(member, blocksize, bb, yy);
-          member.team_barrier();
+        const local_ordinal_type r0_schur = nrows * member.league_rank();
 
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr[lr], rowptr[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-
-              const local_ordinal_type A_colind_at_j = A_colind[j];
-              if (A_colind_at_j < num_local_rows) {
-                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-                xx.assign_data( &x(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx, yy);
-              } else {
-                const auto loc = A_colind_at_j - num_local_rows;
-                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx_remote, yy);
-              }
-            });
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
+        
+        for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
+          const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
+          for (local_ordinal_type i = 0; i < 2; ++i) {
+            copy3DView<local_ordinal_type>(member, 
+              Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), 
+              Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
+          }
         }
-      }
 
-      template <int P, int B> struct OverlapTag {};
-
-      template<int P, int B>
-      // inline  ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION
-      KOKKOS_INLINE_FUNCTION
-      void
-      operator() (const OverlapTag<P,B> &, const local_ordinal_type& rowidx) const {
-        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
-
-        // constants
-        const local_ordinal_type partidx = rowidx2part(rowidx);
-        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
-        const local_ordinal_type v = partidx % vector_length;
-
-        const local_ordinal_type num_vectors = y_packed.extent(2);
-        const local_ordinal_type num_local_rows = lclrow.extent(0);
-
-        // temporary buffer for y flat
-        impl_scalar_type yy[max_blocksize] = {};
-
-        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
-        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
-
-        const local_ordinal_type lr = lclrow(rowidx);
-        const local_ordinal_type row = lr*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          if (P == 0) {
-            // y := b
-            memcpy(yy, &b(row, col), sizeof(impl_scalar_type)*blocksize);
-          } else {
-            // y (temporary) := 0
-            memset(yy, 0, sizeof(impl_scalar_type)*blocksize);
-          }
+        Kokkos::parallel_for
+          (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+            solveSingleVectorNew<impl_type, internal_vector_scratch_type_3d_view> (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW);
+          });
 
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
-          for (size_type k=rowptr_used[lr];k<rowptr_used[lr+1];++k) {
-            const size_type j = A_k0 + colindsub_used[k];
-            const impl_scalar_type * const AA = &tpetra_values(j*blocksize_square);
-            const local_ordinal_type A_colind_at_j = A_colind[j];
-            if (P == 0) {
-              const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-              const impl_scalar_type * const xx = &x(loc*blocksize, col);
-              SerialGemv(blocksize,AA,xx,yy);
-            } else {
-              const auto loc = A_colind_at_j - num_local_rows;
-              const impl_scalar_type * const xx_remote = &x_remote(loc*blocksize, col);
-              SerialGemv(blocksize,AA,xx_remote,yy);
-            }
-          }
-          // move yy to y_packed
-          if (P == 0) {
-            for (local_ordinal_type k=0;k<blocksize;++k)
-              y_packed(pri, k, col)[v] = yy[k];
-          } else {
-            for (local_ordinal_type k=0;k<blocksize;++k)
-              y_packed(pri, k, col)[v] += yy[k];
+        for (local_ordinal_type schur_sub_part = 0; schur_sub_part < n_subparts_per_part-1; ++schur_sub_part) {
+          const local_ordinal_type r0 = part2packrowidx0_sub(partidx,2*schur_sub_part+1);
+          for (local_ordinal_type i = 0; i < 2; ++i) {
+            copy3DView<local_ordinal_type>(member, 
+              Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), 
+              Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()));
           }
         }
       }
 
-      template<int P, int B>
+      template<int B>
       KOKKOS_INLINE_FUNCTION
       void
-      operator() (const OverlapTag<P,B> &, const member_type &member) const {
-        const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B);
-        const local_ordinal_type blocksize_square = blocksize*blocksize;
+      operator() (const SingleVectorApplyETag<B> &, const member_type &member) const {
+        const local_ordinal_type packidx = packindices_sub(member.league_rank());
 
-        // constants
-        const local_ordinal_type rowidx = member.league_rank();
-        const local_ordinal_type partidx = rowidx2part(rowidx);
-        const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx));
-        const local_ordinal_type v = partidx % vector_length;
+        const local_ordinal_type subpartidx = packptr_sub(packidx);
+        const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0);
+        const local_ordinal_type local_subpartidx = subpartidx/n_parts;
+        const local_ordinal_type partidx = subpartidx%n_parts;
+        const local_ordinal_type blocksize = e_internal_vector_values.extent(2);
 
-        const Kokkos::pair<local_ordinal_type,local_ordinal_type> block_range(0, blocksize);
-        const local_ordinal_type num_vectors = y_packed_scalar.extent(2);
-        const local_ordinal_type num_local_rows = lclrow.extent(0);
+        const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx);
+        const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0);
 
-        // subview pattern
-        auto bb = Kokkos::subview(b, block_range, 0);
-        auto xx = bb; //Kokkos::subview(x, block_range, 0);
-        auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0);
-        auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0);
-        auto A_block = ConstUnmanaged<tpetra_block_access_view_type>(NULL, blocksize, blocksize);
-        auto colindsub_used = (P == 0 ? colindsub : colindsub_remote);
-        auto rowptr_used = (P == 0 ? rowptr : rowptr_remote);
-
-        const local_ordinal_type lr = lclrow(rowidx);
-        const local_ordinal_type row = lr*blocksize;
-        for (local_ordinal_type col=0;col<num_vectors;++col) {
-          yy.assign_data(&y_packed_scalar(pri, 0, col, v));
-          if (P == 0) {
-            // y := b
-            bb.assign_data(&b(row, col));
-            if (member.team_rank() == 0)
-              VectorCopy(member, blocksize, bb, yy);
-            member.team_barrier();
-          }
+        internal_vector_scratch_type_3d_view
+          WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size);
 
-          // y -= Rx
-          const size_type A_k0 = A_rowptr[lr];
+        // Compute v_2 = v_2 - C v_1
+
+        const auto one = Kokkos::ArithTraits<btdm_magnitude_type>::one();
+
+        typedef SolveTridiagsDefaultModeAndAlgo
+          <typename execution_space::memory_space> default_mode_and_algo_type;
+
+        typedef typename default_mode_and_algo_type::mode_type default_mode_type;
+        typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type;
+
+        if (local_subpartidx == 0) {
           Kokkos::parallel_for
-            (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]),
-             [&](const local_ordinal_type &k) {
-              const size_type j = A_k0 + colindsub_used[k];
-              A_block.assign_data( &tpetra_values(j*blocksize_square) );
-
-              const local_ordinal_type A_colind_at_j = A_colind[j];
-              if (P == 0) {
-                const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j;
-                xx.assign_data( &x(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx, yy);
-              } else {
-                const auto loc = A_colind_at_j - num_local_rows;
-                xx_remote.assign_data( &x_remote(loc*blocksize, col) );
-                VectorGemv(member, blocksize, A_block, xx_remote, yy);
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+
+              auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
+
+              for (local_ordinal_type row = 0; row < nrows; ++row) {
+                auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
+                auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                  (default_mode_type,default_algo_type,
+                    member,
+                    blocksize, blocksize,
+                    -one,
+                    E.data(), E.stride_0(), E.stride_1(),
+                    v_2.data(), v_2.stride_0(),
+                    one,
+                    v_1.data(), v_1.stride_0());
               }
             });
         }
-      }
-
-      // y = b - Rx; seq method
-      template<typename MultiVectorLocalViewTypeY,
-               typename MultiVectorLocalViewTypeB,
-               typename MultiVectorLocalViewTypeX>
-      void run(const MultiVectorLocalViewTypeY &y_,
-               const MultiVectorLocalViewTypeB &b_,
-               const MultiVectorLocalViewTypeX &x_) {
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<SeqTag>");
-
-        y = y_; b = b_; x = x_;
-        if constexpr (is_device<execution_space>::value) {
-          const local_ordinal_type blocksize = blocksize_requested;
-          const local_ordinal_type team_size = 8;
-          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
-          const Kokkos::TeamPolicy<execution_space,SeqTag> policy(rowptr.extent(0) - 1, team_size, vector_size);
+        else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) {
           Kokkos::parallel_for
-            ("ComputeResidual::TeamPolicy::run<SeqTag>", policy, *this);
-        } else {
-          const Kokkos::RangePolicy<execution_space,SeqTag> policy(0, rowptr.extent(0) - 1);
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
+
+              for (local_ordinal_type row = 0; row < nrows; ++row) {
+                auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
+                auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                  (default_mode_type,default_algo_type,
+                    member,
+                    blocksize, blocksize,
+                    -one,
+                    E.data(), E.stride_0(), E.stride_1(),
+                    v_2.data(), v_2.stride_0(),
+                    one,
+                    v_1.data(), v_1.stride_0());
+              }
+            });
+        }
+        else {
           Kokkos::parallel_for
-            ("ComputeResidual::RangePolicy::run<SeqTag>", policy, *this);
+            (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) {
+              {
+                auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v);
+
+                for (local_ordinal_type row = 0; row < nrows; ++row) {
+                  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
+                  auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                    (default_mode_type,default_algo_type,
+                      member,
+                      blocksize, blocksize,
+                      -one,
+                      E.data(), E.stride_0(), E.stride_1(),
+                      v_2.data(), v_2.stride_0(),
+                      one,
+                      v_1.data(), v_1.stride_0());
+                }
+              }
+              {
+                auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v);
+
+                for (local_ordinal_type row = 0; row < nrows; ++row) {
+                  auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v);
+                  auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v);
+
+                  KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE
+                    (default_mode_type,default_algo_type,
+                      member,
+                      blocksize, blocksize,
+                      -one,
+                      E.data(), E.stride_0(), E.stride_1(),
+                      v_2.data(), v_2.stride_0(),
+                      one,
+                      v_1.data(), v_1.stride_0());
+                }
+              }
+            });
         }
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
       }
 
-      // y = b - R (x , x_remote)
-      template<typename MultiVectorLocalViewTypeB,
-               typename MultiVectorLocalViewTypeX,
-               typename MultiVectorLocalViewTypeX_Remote>
-      void run(const vector_type_3d_view &y_packed_,
-               const MultiVectorLocalViewTypeB &b_,
-               const MultiVectorLocalViewTypeX &x_,
-               const MultiVectorLocalViewTypeX_Remote &x_remote_) {
+      void run(const impl_scalar_type_2d_view_tpetra &Y,
+               const impl_scalar_type_1d_view &Z) {
         IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<AsyncTag>");
-
-        b = b_; x = x_; x_remote = x_remote_;
-        if constexpr (is_device<execution_space>::value) {
-          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
-                                                     y_packed_.extent(0),
-                                                     y_packed_.extent(1),
-                                                     y_packed_.extent(2),
-                                                     vector_length);
-        } else {
-          y_packed = y_packed_;
-        }
+        IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags");
 
-        if constexpr(is_device<execution_space>::value) {
-          const local_ordinal_type blocksize = blocksize_requested;
-          const local_ordinal_type team_size = 8;
-          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
-          // local_ordinal_type vl_power_of_two = 1;
-          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
-          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
-          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
-#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
-            const Kokkos::TeamPolicy<execution_space,AsyncTag<B> >      \
-              policy(rowidx2part.extent(0), team_size, vector_size);    \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::TeamPolicy::run<AsyncTag>",            \
-               policy, *this); } break
-          switch (blocksize_requested) {
-          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
-          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
-          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
-          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
-          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
-          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
-          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
-          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
-          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
-          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
-          }
-#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
-	} else {
-#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) {                \
-            const Kokkos::RangePolicy<execution_space,AsyncTag<B> > policy(0, rowidx2part.extent(0)); \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::RangePolicy::run<AsyncTag>",           \
-               policy, *this); } break
-          switch (blocksize_requested) {
-          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
-          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
-          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
-          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
-          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
-          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
-          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
-          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
-          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
-          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
-          }
-#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
-        }
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
-      }
-
-      // y = b - R (y , y_remote)
-      template<typename MultiVectorLocalViewTypeB,
-               typename MultiVectorLocalViewTypeX,
-               typename MultiVectorLocalViewTypeX_Remote>
-      void run(const vector_type_3d_view &y_packed_,
-               const MultiVectorLocalViewTypeB &b_,
-               const MultiVectorLocalViewTypeX &x_,
-               const MultiVectorLocalViewTypeX_Remote &x_remote_,
-               const bool compute_owned) {
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::<OverlapTag>");
-
-        b = b_; x = x_; x_remote = x_remote_;
-        if constexpr (is_device<execution_space>::value) {
-          y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(),
-                                                     y_packed_.extent(0),
-                                                     y_packed_.extent(1),
-                                                     y_packed_.extent(2),
-                                                     vector_length);
-        } else {
-          y_packed = y_packed_;
-        }
+        /// set vectors
+        this->Y_scalar_multivector = Y;
+        this->Z_scalar_vector = Z;
 
-        if constexpr (is_device<execution_space>::value) {
-          const local_ordinal_type blocksize = blocksize_requested;
-          const local_ordinal_type team_size = 8;
-          const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize<execution_space>(blocksize, team_size);
-          // local_ordinal_type vl_power_of_two = 1;
-          // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2);
-          // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1);
-          // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two;
-#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
-          if (compute_owned) {                                          \
-            const Kokkos::TeamPolicy<execution_space,OverlapTag<0,B> > \
-              policy(rowidx2part.extent(0), team_size, vector_size);    \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::TeamPolicy::run<OverlapTag<0> >", policy, *this); \
-          } else {                                                      \
-            const Kokkos::TeamPolicy<execution_space,OverlapTag<1,B> > \
-              policy(rowidx2part.extent(0), team_size, vector_size);    \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::TeamPolicy::run<OverlapTag<1> >", policy, *this); \
-          } break
-          switch (blocksize_requested) {
-          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
-          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
-          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
-          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
-          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
-          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
-          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
-          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
-          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
-          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
-          }
-#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
-        } else {
-#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B)  \
-          if (compute_owned) {                                          \
-            const Kokkos::RangePolicy<execution_space,OverlapTag<0,B> > \
-              policy(0, rowidx2part.extent(0));                         \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::RangePolicy::run<OverlapTag<0> >", policy, *this); \
-          } else {                                                      \
-            const Kokkos::RangePolicy<execution_space,OverlapTag<1,B> > \
-              policy(0, rowidx2part.extent(0));                         \
-            Kokkos::parallel_for                                        \
-              ("ComputeResidual::RangePolicy::run<OverlapTag<1> >", policy, *this); \
-          } break
-
-          switch (blocksize_requested) {
-          case   3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3);
-          case   5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5);
-          case   7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7);
-          case   9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9);
-          case  10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10);
-          case  11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11);
-          case  16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16);
-          case  17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17);
-          case  18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18);
-          default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0);
-          }
-#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL
-        }
-        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space)
-      }
-    };
+        const local_ordinal_type num_vectors = X_internal_vector_values.extent(2);
+        const local_ordinal_type blocksize = D_internal_vector_values.extent(1);
 
-    template<typename MatrixType>
-    void reduceVector(const ConstUnmanaged<typename ImplType<MatrixType>::impl_scalar_type_1d_view> zz,
-                      /* */ typename ImplType<MatrixType>::magnitude_type *vals) {
-      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN;
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector");
+        const local_ordinal_type team_size =
+          SolveTridiagsDefaultModeAndAlgo<typename execution_space::memory_space>::
+          recommended_team_size(blocksize, vector_length, internal_vector_length);
+        const int per_team_scratch = internal_vector_scratch_type_3d_view
+          ::shmem_size(blocksize, num_vectors, vector_loop_size);
 
-      using impl_type = ImplType<MatrixType>;
-      using local_ordinal_type = typename impl_type::local_ordinal_type;
-      using impl_scalar_type = typename impl_type::impl_scalar_type;
-#if 0
-      const auto norm2 = KokkosBlas::nrm1(zz);
+#if defined(KOKKOS_ENABLE_DEPRECATED_CODE)
+#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
+        if (num_vectors == 1) {                                         \
+          const Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> > \
+            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
+          Kokkos::parallel_for                                          \
+            ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
+        } else {                                                        \
+          const Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> > \
+            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
+          Kokkos::parallel_for                                          \
+            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
+             policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \
+        } break
 #else
-      impl_scalar_type norm2(0);
-      Kokkos::parallel_reduce
-        ("ReduceMultiVector::Device",
-         Kokkos::RangePolicy<typename impl_type::execution_space>(0,zz.extent(0)),
-         KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) {
-          update += zz(i);
-        }, norm2);
-#endif
-      vals[0] = Kokkos::ArithTraits<impl_scalar_type>::abs(norm2);
-
-      IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType<MatrixType>::execution_space)
-    }
-
-    ///
-    /// Manage the distributed part of the computation of residual norms.
-    ///
-    template<typename MatrixType>
-    struct NormManager {
-    public:
-      using impl_type = ImplType<MatrixType>;
-      using host_execution_space = typename impl_type::host_execution_space;
-      using magnitude_type = typename impl_type::magnitude_type;
-
-    private:
-      bool collective_;
-      int sweep_step_, sweep_step_upper_bound_;
-#ifdef HAVE_IFPACK2_MPI
-      MPI_Request mpi_request_;
-      MPI_Comm comm_;
-#endif
-      magnitude_type work_[3];
-
-    public:
-      NormManager() = default;
-      NormManager(const NormManager &b) = default;
-      NormManager(const Teuchos::RCP<const Teuchos::Comm<int> >& comm) {
-        sweep_step_ = 1;
-        sweep_step_upper_bound_ = 1;
-        collective_ = comm->getSize() > 1;
-        if (collective_) {
-#ifdef HAVE_IFPACK2_MPI
-          const auto mpi_comm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
-          TEUCHOS_ASSERT( ! mpi_comm.is_null());
-          comm_ = *mpi_comm->getRawMpiComm();
-#endif
-        }
-        const magnitude_type zero(0), minus_one(-1);
-        work_[0] = zero;
-        work_[1] = zero;
-        work_[2] = minus_one;
-      }
-
-      // Check the norm every sweep_step sweeps.
-      void setCheckFrequency(const int sweep_step) {
-        TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1");
-        sweep_step_upper_bound_ = sweep_step;
-        sweep_step_ = 1;
-      }
-
-      // Get the buffer into which to store rank-local squared norms.
-      magnitude_type* getBuffer() { return &work_[0]; }
-
-      // Call MPI_Iallreduce to find the global squared norms.
-      void ireduce(const int sweep, const bool force = false) {
-        if ( ! force && sweep % sweep_step_) return;
-
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce");
-
-        work_[1] = work_[0];
-#ifdef HAVE_IFPACK2_MPI
-        auto send_data = &work_[1];
-        auto recv_data = &work_[0];
-        if (collective_) {
-# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
-          MPI_Iallreduce(send_data, recv_data, 1,
-                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
-                         MPI_SUM, comm_, &mpi_request_);
-# else
-          MPI_Allreduce (send_data, recv_data, 1,
-                         Teuchos::Details::MpiTypeTraits<magnitude_type>::getType(),
-                         MPI_SUM, comm_);
-# endif
-        }
-#endif
-      }
-
-      // Check if the norm-based termination criterion is met. tol2 is the
-      // tolerance squared. Sweep is the sweep index. If not every iteration is
-      // being checked, this function immediately returns false. If a check must
-      // be done at this iteration, it waits for the reduction triggered by
-      // ireduce to complete, then checks the global norm against the tolerance.
-      bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) {
-        // early return
-        if (sweep <= 0) return false;
-
-        IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone");
-
-        TEUCHOS_ASSERT(sweep >= 1);
-        if ( ! force && (sweep - 1) % sweep_step_) return false;
-        if (collective_) {
-#ifdef HAVE_IFPACK2_MPI
-# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3)
-          MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE);
-# else
-          // Do nothing.
-# endif
+#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B)                    \
+        if (num_vectors == 1) {                                         \
+          if (packindices_schur.extent(0) == 0) { \
+            Kokkos::TeamPolicy<execution_space,SingleVectorTag<B> >       \
+              policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
+            policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+            Kokkos::parallel_for                                          \
+              ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+              policy, *this);                                            \
+          } \
+          else { \
+            { \
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag"); \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \
+              Kokkos::TeamPolicy<execution_space,SingleVectorSubLineTag<B> >       \
+                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+              Kokkos::parallel_for                                          \
+                ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+                policy, *this);                                            \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \
+              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
+            } \
+            { \
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag"); \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \
+              Kokkos::TeamPolicy<execution_space,SingleVectorApplyCTag<B> >       \
+                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+              Kokkos::parallel_for                                          \
+                ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+                policy, *this);                                            \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \
+              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
+            } \
+            { \
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag"); \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \
+              Kokkos::TeamPolicy<execution_space,SingleVectorSchurTag<B> >       \
+                policy(part2packrowidx0_sub.extent(0), team_size, vector_loop_size); \
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+              Kokkos::parallel_for                                          \
+                ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+                policy, *this);                                            \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \
+              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
+            } \
+            { \
+              IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag"); \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \
+              Kokkos::TeamPolicy<execution_space,SingleVectorApplyETag<B> >       \
+                policy(packindices_sub.extent(0), team_size, vector_loop_size); \
+              policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+              Kokkos::parallel_for                                          \
+                ("SolveTridiags::TeamPolicy::run<SingleVector>",            \
+                policy, *this);                                            \
+              write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \
+              IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \
+            } \
+          } \
+        } else {                                                        \
+          Kokkos::TeamPolicy<execution_space,MultiVectorTag<B> >        \
+            policy(packptr.extent(0) - 1, team_size, vector_loop_size); \
+          policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \
+          Kokkos::parallel_for                                          \
+            ("SolveTridiags::TeamPolicy::run<MultiVector>",             \
+             policy, *this);                                            \
+        } break
 #endif
+        switch (blocksize) {
+        case   3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3);
+        case   5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5);
+        case   7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7);
+        case   9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9);
+        case  10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10);
+        case  11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11);
+        case  16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16);
+        case  17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17);
+        case  18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18);
+        default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0);
         }
-        bool r_val = false;
-        if (sweep == 1) {
-          work_[2] = work_[0];
-        } else {
-          r_val = (work_[0] < tol2*work_[2]);
-        }
-
-        // adjust sweep step
-        const auto adjusted_sweep_step = 2*sweep_step_;
-        if (adjusted_sweep_step < sweep_step_upper_bound_) {
-          sweep_step_ = adjusted_sweep_step;
-        } else {
-          sweep_step_ = sweep_step_upper_bound_;
-        }
-        return r_val;
-      }
+#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS
 
-      // After termination has occurred, finalize the norms for use in
-      // get_norms{0,final}.
-      void finalize () {
-        work_[0] = std::sqrt(work_[0]); // after converged
-        if (work_[2] >= 0)
-          work_[2] = std::sqrt(work_[2]); // first norm
-        // if work_[2] is minus one, then norm is not requested.
+        IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END;
+        IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space)
       }
-
-      // Report norms to the caller.
-      const magnitude_type getNorms0 () const { return work_[2]; }
-      const magnitude_type getNormsFinal () const { return work_[0]; }
     };
 
     ///
@@ -3973,30 +4422,30 @@ namespace Ifpack2 {
     template<typename MatrixType>
     int
     applyInverseJacobi(// importer
-                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
-                       const Teuchos::RCP<const typename ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
+                       const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_block_crs_matrix_type> &A,
+                       const Teuchos::RCP<const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_import_type> &tpetra_importer,
                        const Teuchos::RCP<AsyncableImport<MatrixType> > &async_importer,
                        const bool overlap_communication_and_computation,
                        // tpetra interface
-                       const typename ImplType<MatrixType>::tpetra_multivector_type &X,  // tpetra interface
-                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Y,  // tpetra interface
-                       /* */ typename ImplType<MatrixType>::tpetra_multivector_type &Z,  // temporary tpetra interface (seq_method)
-                       /* */ typename ImplType<MatrixType>::impl_scalar_type_1d_view &W,  // temporary tpetra interface (diff)
+                       const typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &X,  // tpetra interface
+                       /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Y,  // tpetra interface
+                       /* */ typename BlockHelperDetails::ImplType<MatrixType>::tpetra_multivector_type &Z,  // temporary tpetra interface (seq_method)
+                       /* */ typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type_1d_view &W,  // temporary tpetra interface (diff)
                        // local object interface
-                       const PartInterface<MatrixType> &interf, // mesh interface
+                       const BlockHelperDetails::PartInterface<MatrixType> &interf, // mesh interface
                        const BlockTridiags<MatrixType> &btdm, // packed block tridiagonal matrices
-                       const AmD<MatrixType> &amd, // R = A - D
-                       /* */ typename ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
-                       /* */ NormManager<MatrixType> &norm_manager,
+                       const BlockHelperDetails::AmD<MatrixType> &amd, // R = A - D
+                       /* */ typename BlockHelperDetails::ImplType<MatrixType>::vector_type_1d_view &work, // workspace for packed multivector of right hand side
+                       /* */ BlockHelperDetails::NormManager<MatrixType> &norm_manager,
                        // preconditioner parameters
-                       const typename ImplType<MatrixType>::impl_scalar_type &damping_factor,
+                       const typename BlockHelperDetails::ImplType<MatrixType>::impl_scalar_type &damping_factor,
                        /* */ bool is_y_zero,
                        const int max_num_sweeps,
-                       const typename ImplType<MatrixType>::magnitude_type tol,
+                       const typename BlockHelperDetails::ImplType<MatrixType>::magnitude_type tol,
                        const int check_tol_every) {
-      IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi");
+      IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi");
 
-      using impl_type = ImplType<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
       using node_memory_space = typename impl_type::node_memory_space;
       using local_ordinal_type = typename impl_type::local_ordinal_type;
       using size_type = typename impl_type::size_type;
@@ -4075,7 +4524,7 @@ namespace Ifpack2 {
                                                damping_factor, is_norm_manager_active);
 
       const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view;
-      ComputeResidualVector<MatrixType>
+      BlockHelperDetails::ComputeResidualVector<MatrixType>
         compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf,
                                 is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view);
 
@@ -4131,7 +4580,7 @@ namespace Ifpack2 {
         {
           if (is_norm_manager_active) {
             // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always.
-            reduceVector<MatrixType>(W, norm_manager.getBuffer());
+            BlockHelperDetails::reduceVector<MatrixType>(W, norm_manager.getBuffer());
             if (sweep + 1 == max_num_sweeps) {
               norm_manager.ireduce(sweep, true);
               norm_manager.checkDone(sweep + 1, tolerance, true);
@@ -4152,11 +4601,11 @@ namespace Ifpack2 {
 
     template<typename MatrixType>
     struct ImplObject {
-      using impl_type = ImplType<MatrixType>;
-      using part_interface_type = PartInterface<MatrixType>;
+      using impl_type = BlockHelperDetails::ImplType<MatrixType>;
+      using part_interface_type = BlockHelperDetails::PartInterface<MatrixType>;
       using block_tridiags_type = BlockTridiags<MatrixType>;
-      using amd_type = AmD<MatrixType>;
-      using norm_manager_type = NormManager<MatrixType>;
+      using amd_type = BlockHelperDetails::AmD<MatrixType>;
+      using norm_manager_type = BlockHelperDetails::NormManager<MatrixType>;
       using async_import_type = AsyncableImport<MatrixType>;
 
       // distructed objects
diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
index 912e8e5e5baa..441da4c2757f 100644
--- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
+++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp
@@ -186,7 +186,7 @@ struct BlockTriDiContainerTester {
     Teuchos::Array<Teuchos::Array<LO> > parts;
     make_parts(sb, sbp, *A, nonuniform_lines, jacobi, parts);
     return Teuchos::rcp(new Ifpack2::BlockTriDiContainer<Tpetra_RowMatrix>(
-                          A, parts, overlap_comm, seq_method));
+                          A, parts, 1, overlap_comm, seq_method));
   }
 
   static Int