From 1d16296713ef4beaae45c74c06cb44868aae57a7 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 18 Feb 2025 19:01:50 -0700 Subject: [PATCH] Fused block jacobi More performant path for block Jacobi case inside BTDS (GPU only, BlockCrs only). Fuses residual and solve into one kernel and doesn't convert vectors to SIMD-packed format. Also inverts diag blocks fully in shared to speed up numeric. Signed-off-by: Brian Kelley --- .../Ifpack2_BlockComputeResidualAndSolve.hpp | 684 ++++++++++++++++++ packages/ifpack2/src/Ifpack2_BlockHelper.hpp | 1 + .../src/Ifpack2_BlockTriDiContainer_def.hpp | 121 +++- .../src/Ifpack2_BlockTriDiContainer_impl.hpp | 471 ++++++++++-- 4 files changed, 1173 insertions(+), 104 deletions(-) create mode 100644 packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp new file mode 100644 index 000000000000..5fe27cf9785a --- /dev/null +++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp @@ -0,0 +1,684 @@ +// @HEADER +// ***************************************************************************** +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// +// Copyright 2009 NTESS and the Ifpack2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP +#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP + +#include "Ifpack2_BlockHelper.hpp" +#include "Ifpack2_BlockComputeResidualVector.hpp" + +namespace Ifpack2::BlockHelperDetails { + +template +DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries, + size_t nrows, int blocksize) { + DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows); + int err = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + KOKKOS_LAMBDA(size_t row, int& err) { + auto rowBegin = rowptrs(row); + auto rowEnd = rowptrs(row + 1); + for (size_t j = rowBegin; j < rowEnd; j++) { + if (size_t(entries(j)) == row) { + diag_offsets(row) = j * blocksize * blocksize; + return; + } + } + err++; + }, + err); + TEUCHOS_TEST_FOR_EXCEPT_MSG( + err, "Ifpack2 BTD: at least one row has no diagonal entry"); + return diag_offsets; +} + +template +struct ComputeResidualAndSolve_1Pass { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename 
impl_type::btdm_scalar_type_4d_view; + using i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_1Pass(const AmD& amd, + const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + const local_ordinal_type num_local_rows = d_inv.extent(0); + + const impl_scalar_type* xx; + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_DinvAx = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_DinvAx[i] = 0; + local_Ax[i] = b(row + i, col); + }); + member.team_barrier(); + + int numEntries = A_x_offsets.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets(rowidx, 0, k); + int64_t x_offset = A_x_offsets(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + int64_t remote_cutoff = blocksize * num_local_rows; + if (x_offset >= remote_cutoff) + xx = &x_remote(x_offset - remote_cutoff, col); + else + xx = &x(x_offset, col); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { local_x[i] = xx[i]; }); + + // matvec on block: local_Ax -= A_block_cst * local_x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const int k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Compute 
local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, blocksize), + [&](const local_ordinal_type& k0) { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k1, impl_scalar_type& update) { + update += d_inv(rowidx, k0, k1) * local_Ax[k1]; + }, + local_DinvAx[k0]); + }); + member.team_barrier(); + // local_DinvAx is fully computed. Now compute the + // squared y update norm and update y (using damping factor). + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type old_y = x(row + k, col); + impl_scalar_type y_update = local_DinvAx[k] - old_y; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = old_y + damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; }); + } + + // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single + // kernel) + void run(const ConstUnmanaged& b_, + const ConstUnmanaged& x_, + const ConstUnmanaged& x_remote_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunSinglePass", + ComputeResidualAndSolve0, execution_space); + + y = y_; + b = b_; + x = x_; + x_remote = x_remote_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + // team: local_Ax, local_DinvAx + const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type); + // thread: local_x + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass", + policy, *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +template +struct ComputeResidualAndSolve_2Pass { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using 
i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + // Tag for computing residual with owned columns only (pass 1) + struct OwnedTag {}; + + // Tag for finishing the residual with nonowned columns, and solving/norming + // (pass 2) + struct NonownedTag {}; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_2Pass(const AmD& amd, + const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const OwnedTag, const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { local_Ax[i] = b(row + i, col); }); + member.team_barrier(); + + int numEntries = A_x_offsets.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets(rowidx, 0, k); + int64_t x_offset = A_x_offsets(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_x[i] = x(x_offset + i, col); + }); + + // MatVec op Ax += A*x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Write back the partial residual to y + if (member.team_rank() == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k) { + y(row + k, col) = local_Ax[k]; + }); + } + } + } + + KOKKOS_INLINE_FUNCTION + void 
operator()(const NonownedTag, const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_DinvAx = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays. + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_DinvAx[i] = 0; + local_Ax[i] = y(row + i, col); + }); + member.team_barrier(); + + int numEntries = A_x_offsets_remote.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets_remote(rowidx, 0, k); + int64_t x_offset = A_x_offsets_remote(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_x[i] = x_remote(x_offset + i, col); + }); + + // matvec on block: local_Ax -= A_block_cst * local_x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const int k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Compute local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, blocksize), + [&](const local_ordinal_type& k0) { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k1, impl_scalar_type& update) { + update += d_inv(rowidx, k0, k1) * local_Ax[k1]; + }, + local_DinvAx[k0]); + }); + member.team_barrier(); + // local_DinvAx is fully computed. Now compute the + // squared y update norm and update y (using damping factor). + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type old_y = x(row + k, col); + impl_scalar_type y_update = local_DinvAx[k] - old_y; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = old_y + damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; }); + } + + // Launch pass 1 of the 2-pass version. + // This computes just the owned part of residual and writes that back to y. 
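+  // In formula form, with R = A - D, pass 1 leaves the owned part of the
+  // residual in y,
+  //   y_i = b_i - sum_{j owned} R_ij * x_j,
+  // and pass 2 (NonownedTag) finishes it with the remote columns of R before
+  // applying the damped update
+  //   y_i = x_i + omega * (D_ii^{-1} * r_i - x_i),   r = b - R*x,
+  // where omega is the damping factor. Splitting the sweep this way lets the
+  // import of the remote x entries overlap with the owned part of the residual.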
+ void run_pass1(const ConstUnmanaged& b_, + const ConstUnmanaged& x_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunPass1", + ComputeResidualAndSolve0, execution_space); + + b = b_; + x = x_; + y = y_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type); + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, + vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } + + // Launch pass 2 of the 2-pass version. + // This finishes computing residual with x_remote, + // and then applies Dinv and computes norm. + void run_pass2( + const ConstUnmanaged& x_, + const ConstUnmanaged& x_remote_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunPass2", + ComputeResidualAndSolve0, execution_space); + + x = x_; + x_remote = x_remote_; + y = y_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type); + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, + vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +template +struct ComputeResidualAndSolve_SolveOnly { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename 
Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_SolveOnly( + const AmD& amd, const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = + member.league_rank() * member.team_size() + member.team_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_DinvAx = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return; + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + // Compute local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k0) { + impl_scalar_type val = 0; + for (local_ordinal_type k1 = 0; k1 < blocksize; + k1++) { + val += d_inv(rowidx, k0, k1) * b(row + k1, col); + } + local_DinvAx[k0] = val; + }); + + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type y_update = local_DinvAx[k]; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; }); + } + + // ComputeResidualAndSolve_SolveOnly::run does the solve for the first + // iteration, when the initial guess for y is zero. This means the residual + // vector is just b. The kernel applies the inverse diags to b to find y, and + // also puts the partial squared update norms (1 per row) into W. 
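+  // Per block row i this reduces to (with the initial y == 0)
+  //   y_i = omega * D_ii^{-1} * b_i,   W(i) = sum over columns of ||D_ii^{-1} * b_i||^2,
+  // where omega is the damping factor.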
+ void run(const ConstUnmanaged& b_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero", + ComputeResidualAndSolve0, execution_space); + + y = y_; + b = b_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy( + (nrows + team_size - 1) / team_size, team_size, vector_size); + policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +} // namespace Ifpack2::BlockHelperDetails + +#endif diff --git a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp index f67cebeb7e30..af735ddf7c33 100644 --- a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp @@ -329,6 +329,7 @@ namespace Ifpack2 { // tpetra multivector values (layout left): may need to change the typename more explicitly typedef Kokkos::View impl_scalar_type_2d_view; typedef Kokkos::View impl_scalar_type_2d_view_tpetra; + typedef Kokkos::View const_impl_scalar_type_2d_view_tpetra; // packed data always use layout right typedef Kokkos::View vector_type_1d_view; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index d2e39255ba04..362970e13941 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -124,6 +124,7 @@ namespace Ifpack2 { impl_->W = typename impl_type::impl_scalar_type_1d_view(); IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } + // Decide whether to use fused block Jacobi path IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -165,10 +166,18 @@ namespace Ifpack2 { bool pointIndexed) : Container(matrix, partitions, pointIndexed), partitions_(partitions) { + using Helpers = BlockHelperDetails::ImplType; IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer", BlockTriDiContainer); const bool useSeqMethod = false; const bool overlapCommAndComp = false; initInternal(matrix, importer, overlapCommAndComp, useSeqMethod); + auto matrixBCRS = Teuchos::rcp_dynamic_cast(matrix); + bool hasBlockCrs = !matrixBCRS.is_null(); + bool onePartitionPerRow = hasBlockCrs && size_t(partitions.size()) == matrixBCRS->getLocalNumRows(); + // Decide upfront whether the fused block Jacobi path can be used. 
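+    // The fused path requires a GPU (device) execution space, a BlockCrsMatrix
+    // input, and a pure block Jacobi splitting: either no partitions are given,
+    // or there is exactly one local row per partition (every line has length 1).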
+ impl_->use_fused_jacobi = + BlockHelperDetails::is_device::value \ + && hasBlockCrs && (!partitions.size() || onePartitionPerRow); n_subparts_per_part_ = -1; block_size_ = -1; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -185,8 +194,18 @@ namespace Ifpack2 { const bool explicitConversion) : Container(matrix, partitions, false), partitions_(partitions) { + using Helpers = BlockHelperDetails::ImplType; IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer", BlockTriDiContainer); initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size, explicitConversion); + auto matrixBCRS = Teuchos::rcp_dynamic_cast(matrix); + bool hasBlockCrs = !matrixBCRS.is_null(); + bool onePartitionPerRow = hasBlockCrs && size_t(partitions.size()) == matrixBCRS->getLocalNumRows(); + // Jacobi case can be represented in two ways: + // - partitions is empty + // - partitions has length equal to local number of rows (meaning all line lengths must be 1) + impl_->use_fused_jacobi = + BlockHelperDetails::is_device::value \ + && !useSeqMethod && hasBlockCrs && (!partitions.size() || onePartitionPerRow); n_subparts_per_part_ = n_subparts_per_part; block_size_ = block_size; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -254,7 +273,7 @@ namespace Ifpack2 { impl_->block_tridiags, impl_->a_minus_d, impl_->overlap_communication_and_computation, - impl_->async_importer, useSeqMethod); + impl_->async_importer, useSeqMethod, impl_->use_fused_jacobi); } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -273,7 +292,8 @@ namespace Ifpack2 { (impl_->A, impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, - Kokkos::ArithTraits::zero()); + Kokkos::ArithTraits::zero(), + impl_->use_fused_jacobi); } this->IsComputed_ = true; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -302,21 +322,38 @@ namespace Ifpack2 { const magnitude_type tol = Kokkos::ArithTraits::zero(); const int check_tol_every = 1; - BlockTriDiContainerDetails::applyInverseJacobi - (impl_->A, - impl_->blockGraph, - impl_->tpetra_importer, - impl_->async_importer, - impl_->overlap_communication_and_computation, - X, Y, impl_->Z, impl_->W, - impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, - impl_->work, - impl_->norm_manager, - dampingFactor, - zeroStartingSolution, - numSweeps, - tol, - check_tol_every); + if(!impl_->use_fused_jacobi) { + BlockTriDiContainerDetails::applyInverseJacobi + (impl_->A, + impl_->blockGraph, + impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->Z, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work, + impl_->norm_manager, + dampingFactor, + zeroStartingSolution, + numSweeps, + tol, + check_tol_every); + } + else { + BlockTriDiContainerDetails::applyFusedBlockJacobi + (impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work_flat, + impl_->norm_manager, + dampingFactor, + zeroStartingSolution, + numSweeps, + tol, + check_tol_every); + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -342,7 +379,8 @@ namespace Ifpack2 { (impl_->A, impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, - in.addRadiallyToDiagonal); + 
in.addRadiallyToDiagonal, + impl_->use_fused_jacobi); } this->IsComputed_ = true; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -367,21 +405,38 @@ namespace Ifpack2 { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi", applyInverseJacobi); int r_val = 0; { - r_val = BlockTriDiContainerDetails::applyInverseJacobi - (impl_->A, - impl_->blockGraph, - impl_->tpetra_importer, - impl_->async_importer, - impl_->overlap_communication_and_computation, - X, Y, impl_->Z, impl_->W, - impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, - impl_->work, - impl_->norm_manager, - in.dampingFactor, - in.zeroStartingSolution, - in.maxNumSweeps, - in.tolerance, - in.checkToleranceEvery); + if(!impl_->use_fused_jacobi) { + r_val = BlockTriDiContainerDetails::applyInverseJacobi + (impl_->A, + impl_->blockGraph, + impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->Z, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work, + impl_->norm_manager, + in.dampingFactor, + in.zeroStartingSolution, + in.maxNumSweeps, + in.tolerance, + in.checkToleranceEvery); + } + else { + r_val = BlockTriDiContainerDetails::applyFusedBlockJacobi + (impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work_flat, + impl_->norm_manager, + in.dampingFactor, + in.zeroStartingSolution, + in.maxNumSweeps, + in.tolerance, + in.checkToleranceEvery); + } } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) return r_val; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 88bb7a09b9db..5c24f35a183a 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -50,6 +50,7 @@ #include "Ifpack2_BlockHelper.hpp" #include "Ifpack2_BlockComputeResidualVector.hpp" +#include "Ifpack2_BlockComputeResidualAndSolve.hpp" //#include @@ -1180,29 +1181,21 @@ namespace Ifpack2 { local_ordinal_type pack_nrows_sub = 0; if (jacobi) { IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi); + // Jacobi (all lines have length 1) means that A_n_lclrows == nparts, + // so the mapping between parts and rows is trivial. 
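+        // For example, with nparts == 3 this fills
+        //   part2rowidx0 = partptr = {0, 1, 2, 3} and lclrow = rowidx2part = {0, 1, 2}.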
+ // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i + for (local_ordinal_type i=0; i <= nparts; ++i) { + part2rowidx0(i) = i; + partptr(i) = i; + } + for (local_ordinal_type i=0; i < nparts; ++i) { + rowidx2part(i) = i; + lclrow(i) = i; + } for (local_ordinal_type ip=0;ip= A_n_lclrows, - BlockHelperDetails::get_msg_prefix(comm) - << "partitions[" << p[ip] << "][" - << i << "] = " << lcl_row - << " but input matrix implies limits of [0, " << A_n_lclrows-1 - << "]."); - lclrow(offset+i) = lcl_row; - rowidx2part(offset+i) = ip; - if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row) - interf.row_contiguous = false; - } - partptr(ip+1) = offset + ipnrows; } part2rowidx0_sub(0) = 0; partptr_sub(0, 0) = 0; @@ -1569,6 +1562,7 @@ namespace Ifpack2 { using size_type_2d_view = typename impl_type::size_type_2d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; using vector_type_4d_view = typename impl_type::vector_type_4d_view; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; // flat_td_ptr(i) is the index into flat-array values of the start of the // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length == @@ -1587,6 +1581,15 @@ namespace Ifpack2 { // inv(A_00)*A_01 block values. vector_type_4d_view e_values; + // The following are for fused block Jacobi only. + // For block row i, diag_offset(i)...diag_offset(i + bs^2) + // is the range of scalars for the diagonal block. + size_type_1d_view diag_offsets; + // For fused residual+solve block Jacobi case, + // this contains the diagonal block inverses in flat, local row indexing: + // d_inv(row, :, :) gives the row-major block for row. + btdm_scalar_type_3d_view d_inv; + bool is_diagonal_only; BlockTridiags() = default; @@ -1866,7 +1869,8 @@ namespace Ifpack2 { BlockHelperDetails::AmD &amd, const bool overlap_communication_and_computation, const Teuchos::RCP > &async_importer, - bool useSeqMethod) { + bool useSeqMethod, + bool use_fused_jacobi) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase); using impl_type = BlockHelperDetails::ImplType; @@ -1883,6 +1887,7 @@ namespace Ifpack2 { using vector_type_4d_view = typename impl_type::vector_type_4d_view; using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; constexpr int vector_length = impl_type::vector_length; @@ -2193,6 +2198,13 @@ namespace Ifpack2 { BlockHelperDetails::precompute_A_x_offsets(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate); } + // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets. 
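+      // d_inv(i, :, :) will hold the row-major inverse of block row i's diagonal
+      // block, indexed by flat local row (no packing). diag_offsets(i) is the
+      // scalar offset of that diagonal block inside the BlockCrs values array;
+      // it is computed once here and reused by the numeric phase.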
+ if(use_fused_jacobi) { + btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize); + auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice(); + auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice(); + btdm.diag_offsets = BlockHelperDetails::findDiagOffsets(rowptrs, entries, interf.nparts, blocksize); + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -2786,6 +2798,7 @@ namespace Ifpack2 { /// views using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; using size_type_2d_view = typename impl_type::size_type_2d_view; using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra; /// vectorization @@ -2795,10 +2808,15 @@ namespace Ifpack2 { using vector_type_4d_view = typename impl_type::vector_type_4d_view; using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view; + using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view; using internal_vector_scratch_type_3d_view = Scratch; using btdm_scalar_scratch_type_3d_view = Scratch; + using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right) + using local_crs_graph_type = typename impl_type::local_crs_graph_type; + using colinds_view = typename local_crs_graph_type::entries_type; using internal_vector_type = typename impl_type::internal_vector_type; static constexpr int vector_length = impl_type::vector_length; @@ -2818,6 +2836,7 @@ namespace Ifpack2 { // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int) using size_type_1d_view_tpetra = Kokkos::View; ConstUnmanaged A_block_rowptr; + ConstUnmanaged A_colinds; ConstUnmanaged A_point_rowptr; ConstUnmanaged A_values; // block tridiags @@ -2827,6 +2846,8 @@ namespace Ifpack2 { const Unmanaged e_internal_vector_values; const Unmanaged scalar_values, scalar_values_schur; const Unmanaged e_scalar_values; + const Unmanaged d_inv; + const Unmanaged diag_offsets; // shared information const local_ordinal_type blocksize, blocksize_square; // diagonal safety @@ -2888,6 +2909,8 @@ namespace Ifpack2 { btdm_.e_values.extent(2), btdm_.e_values.extent(3), vector_length), + d_inv(btdm_.d_inv), + diag_offsets(btdm_.diag_offsets), blocksize(btdm_.values.extent(1)), blocksize_square(blocksize*blocksize), // diagonal weight to avoid zero pivots @@ -2904,6 +2927,8 @@ namespace Ifpack2 { A_block_rowptr = G_->getLocalGraphDevice().row_map; if (hasBlockCrsMatrix) { A_values = const_cast(A_bcrs.get())->getValuesDeviceNonConst(); + // A_colinds only used in fused Jacobi path + A_colinds = A_bcrs->getCrsGraph().getLocalIndicesDevice(); } else { A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map; @@ -3188,6 +3213,7 @@ namespace Ifpack2 { public: struct ExtractAndFactorizeSubLineTag {}; + struct ExtractAndFactorizeFusedJacobiTag {}; struct ExtractBCDTag {}; struct ComputeETag {}; struct ComputeSchurTag {}; @@ -3237,6 +3263,68 @@ namespace Ifpack2 
{ } } + KOKKOS_INLINE_FUNCTION + void + operator() (const ExtractAndFactorizeFusedJacobiTag&, const member_type &member) const { + using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo; + using default_mode_type = typename default_mode_and_algo_type::mode_type; + using default_algo_type = typename default_mode_and_algo_type::algo_type; + // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i) + // We can simply pull the diagonal entry from A into d_inv + btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), vector_length / 2, blocksize, blocksize); + btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), vector_length / 2, blocksize, blocksize); + const auto one = Kokkos::ArithTraits::one(); + const local_ordinal_type nrows = lclrow.extent(0); + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_length / 2), + [&](const local_ordinal_type &v) { + local_ordinal_type row = member.league_rank() * vector_length / 2 + v; + // diagEntry has index of diagonal within row + auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL()); + auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL()); + if(row < nrows) { + // View the diagonal block of A in row as 2D row-major + const impl_scalar_type* A_diag = A_values.data() + diag_offsets(row); + // Copy the diag into scratch slice W1 + // (copying elements directly is better than KokkosBatched copy) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize), + [&](int i) + { + W1.data()[i] = A_diag[i]; + }); + // and set W2 to identity in preparation to invert with 2 x Trsm + KB::SetIdentity + ::invoke(member, W2); + } + member.team_barrier(); + if(row < nrows) { + // LU factorize in-place + KB::LU + ::invoke(member, W1, tiny); + } + member.team_barrier(); + if(row < nrows) { + KB::Trsm + ::invoke(member, one, W1, W2); + KB::Trsm + ::invoke(member, one, W1, W2); + } + member.team_barrier(); + if(row < nrows) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize), + [&](int i) + { + auto d_inv_block = &d_inv(row, 0, 0); + d_inv_block[i] = W2.data()[i]; + }); + } + }); + } + KOKKOS_INLINE_FUNCTION void operator() (const ExtractBCDTag &, const member_type &member) const { @@ -3589,6 +3677,25 @@ namespace Ifpack2 { IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; } + void run_fused_jacobi() { + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; + constexpr int half_vector_length = vector_length / 2; + const local_ordinal_type team_size = + ExtractAndFactorizeTridiagsDefaultModeAndAlgo:: + recommended_team_size(blocksize, half_vector_length, 1); + const local_ordinal_type per_team_scratch = + btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, vector_length); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag); + Kokkos::TeamPolicy + policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length); + + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + } + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; + } }; /// @@ -3600,7 +3707,8 @@ namespace Ifpack2 { const Teuchos::RCP::tpetra_crs_graph_type> &G, const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, - const typename BlockHelperDetails::ImplType::magnitude_type tiny) { + const 
typename BlockHelperDetails::ImplType::magnitude_type tiny, + bool use_fused_jacobi) { using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using team_policy_type = Kokkos::TeamPolicy; @@ -3617,12 +3725,18 @@ namespace Ifpack2 { if(scratch_required < max_scratch) { // Can use level 0 scratch ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); - function.run(); + if(!use_fused_jacobi) + function.run(); + else + function.run_fused_jacobi(); } else { // Not enough level 0 scratch, so fall back to level 1 ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); - function.run(); + if(!use_fused_jacobi) + function.run(); + else + function.run_fused_jacobi(); } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -4998,68 +5112,276 @@ namespace Ifpack2 { // iterate int sweep = 0; for (;sweepasyncSendRecv(YY); - // OverlapTag, compute_owned = true - compute_residual_vector.run(pmv, XX, YY, remote_multivector, true); - if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { - if (is_async_importer_active) async_importer->cancel(); - break; - } - if (is_async_importer_active) { - async_importer->syncRecv(); - // OverlapTag, compute_owned = false - compute_residual_vector.run(pmv, XX, YY, remote_multivector, false); - } - } else { - if (is_async_importer_active) - async_importer->syncExchange(YY); - if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; - // AsyncTag - compute_residual_vector.run(pmv, XX, YY, remote_multivector); + // pmv := y(lclrow). + multivector_converter.run(YY); + } else { + // fused y := x - R y and pmv := y(lclrow); + // real use case does not use overlap comp and comm + if (overlap_communication_and_computation || !is_async_importer_active) { + if (is_async_importer_active) async_importer->asyncSendRecv(YY); + // OverlapTag, compute_owned = true + compute_residual_vector.run(pmv, XX, YY, remote_multivector, true); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { + if (is_async_importer_active) async_importer->cancel(); + break; + } + if (is_async_importer_active) { + async_importer->syncRecv(); + // OverlapTag, compute_owned = false + compute_residual_vector.run(pmv, XX, YY, remote_multivector, false); } + } else { + if (is_async_importer_active) + async_importer->syncExchange(YY); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; + // AsyncTag + compute_residual_vector.run(pmv, XX, YY, remote_multivector); } } } - // pmv := inv(D) pmv. - { - solve_tridiags.run(YY, W); + solve_tridiags.run(YY, W); + // Compute norm. + if (is_norm_manager_active) { + // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always. + BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); + if (sweep + 1 == max_num_sweeps) { + norm_manager.ireduce(sweep, true); + norm_manager.checkDone(sweep + 1, tolerance, true); + } else { + norm_manager.ireduce(sweep); + } } - { - if (is_norm_manager_active) { - // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always. - BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); - if (sweep + 1 == max_num_sweeps) { - norm_manager.ireduce(sweep, true); - norm_manager.checkDone(sweep + 1, tolerance, true); - } else { - norm_manager.ireduce(sweep); + is_y_zero = false; + } + + //sqrt the norms for the caller's use. 
+ if (is_norm_manager_active) norm_manager.finalize(); + + return sweep; + } + + // fused block Jacobi apply for a specific block size B + // (or B = 0 for general case). + template + int + applyFusedBlockJacobi_Impl( + const Teuchos::RCP::tpetra_import_type> &tpetra_importer, + const Teuchos::RCP > &async_importer, + const bool overlap_communication_and_computation, + // tpetra interface + const typename BlockHelperDetails::ImplType::tpetra_multivector_type &X, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Y, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) + // local object interface + const BlockHelperDetails::PartInterface &interf, // mesh interface + const BlockTridiags &btdm, // packed block tridiagonal matrices + const BlockHelperDetails::AmD &amd, // R = A - D + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &work, // workspace + /* */ BlockHelperDetails::NormManager &norm_manager, + // preconditioner parameters + const typename BlockHelperDetails::ImplType::impl_scalar_type &damping_factor, + /* */ bool is_y_zero, + const int max_num_sweeps, + const typename BlockHelperDetails::ImplType::magnitude_type tol, + const int check_tol_every) { + using impl_type = BlockHelperDetails::ImplType; + using node_memory_space = typename impl_type::node_memory_space; + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using tpetra_multivector_type = typename impl_type::tpetra_multivector_type; + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; + + // the tpetra importer and async importer can't both be active + TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(), + "Neither Tpetra importer nor Async importer is null."); + // max number of sweeps should be positive number + TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0, + "Maximum number of sweeps must be >= 1."); + + // const parameters + const bool is_async_importer_active = !async_importer.is_null(); + const bool is_norm_manager_active = tol > Kokkos::ArithTraits::zero(); + const magnitude_type tolerance = tol*tol; + const local_ordinal_type blocksize = btdm.d_inv.extent(1); + const local_ordinal_type num_vectors = Y.getNumVectors(); + const local_ordinal_type num_blockrows = interf.nparts; + + typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector; + { + if (is_async_importer_active) { + // create comm data buffer and keep it here + async_importer->createDataBuffer(num_vectors); + remote_multivector = async_importer->getRemoteMultiVectorLocalView(); + } + } + + const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly); + const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite); + + const bool two_pass_residual = + overlap_communication_and_computation && is_async_importer_active; + + // Calculate the required work size and reallocate it if not already big enough. + // Check that our assumptions about YY dimension are correct. 
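+    // The workspace provides a second copy of y (y_doublebuf below). The two
+    // buffers are used in double-buffered fashion: each sweep reads the current
+    // iterate from one buffer and writes the new iterate to the other, and if
+    // the final sweep ends in the scratch buffer it is copied back into Y.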
+ TEUCHOS_TEST_FOR_EXCEPT_MSG( + size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1), + "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << + " = " << YY.extent(0) * YY.extent(1) << ",\n" << + "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << + " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n'); + size_type work_required = size_type(num_blockrows) * blocksize * num_vectors; + if (work.extent(0) < work_required) { + work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required); + } + + Unmanaged y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors); + + // construct W + if (W.extent(0) != size_t(num_blockrows)) + W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows); + + // Create the required functors upfront (this is inexpensive - all shallow copies) + BlockHelperDetails::ComputeResidualAndSolve_SolveOnly + functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor); + BlockHelperDetails::ComputeResidualAndSolve_1Pass + functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor); + BlockHelperDetails::ComputeResidualAndSolve_2Pass + functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor); + + // norm manager workspace resize + if (is_norm_manager_active) + norm_manager.setCheckFrequency(check_tol_every); + + // For double-buffering. + // yy_buffers[current_y] has the current iterate of y. + // yy_buffers[1-current_y] has the next iterate of y. + Unmanaged y_buffers[2] = {YY, y_doublebuf}; + int current_y = 0; + + // iterate + int sweep = 0; + for (;sweep < max_num_sweeps; ++sweep) { + if (is_y_zero) { + // If y is initially zero, then we are just computing y := damping_factor * Dinv * x + functor_solve_only.run(XX, y_buffers[1-current_y]); + } else { + // real use case does not use overlap comp and comm + if (overlap_communication_and_computation || !is_async_importer_active) { + if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]); + if(two_pass_residual) { + // Pass 1 computes owned residual and stores into new y buffer, + // but doesn't apply Dinv or produce a norm yet + functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1-current_y]); + } + else { + // This case happens if running with single rank. + // There are no remote columns, so residual and solve can happen in one step. + functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { + if (is_async_importer_active) async_importer->cancel(); + break; } + if (is_async_importer_active) { + async_importer->syncRecv(); + // Stage 2 finishes computing the residual, then applies Dinv and computes norm. + functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + } else { + if (is_async_importer_active) + async_importer->syncExchange(y_buffers[current_y]); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; + // Full residual, Dinv apply, and norm in one kernel + functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + } + + // Compute global norm. 
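+      // W(i) holds the squared update norm for block row i. reduceVector sums
+      // these locally and ireduce finishes the reduction across MPI ranks;
+      // checkDone compares against tol^2 (tolerance = tol*tol above), so square
+      // roots are only taken in finalize() after the sweep loop.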
+ if (is_norm_manager_active) { + BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); + if (sweep + 1 == max_num_sweeps) { + norm_manager.ireduce(sweep, true); + norm_manager.checkDone(sweep + 1, tolerance, true); + } else { + norm_manager.ireduce(sweep); } } is_y_zero = false; + // flip y buffers for next iteration, or termination if we reached max_num_sweeps. + current_y = 1 - current_y; + } + if(current_y == 1) { + // We finished iterating with y in the double buffer, so copy it to the user's vector. + Kokkos::deep_copy(YY, y_doublebuf); } //sqrt the norms for the caller's use. if (is_norm_manager_active) norm_manager.finalize(); + return sweep; + } + + template + int + applyFusedBlockJacobi( + const Teuchos::RCP::tpetra_import_type> &tpetra_importer, + const Teuchos::RCP > &async_importer, + const bool overlap_communication_and_computation, + // tpetra interface + const typename BlockHelperDetails::ImplType::tpetra_multivector_type &X, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Y, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) + // local object interface + const BlockHelperDetails::PartInterface &interf, // mesh interface + const BlockTridiags &btdm, // packed block tridiagonal matrices + const BlockHelperDetails::AmD &amd, // R = A - D + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &work, // workspace + /* */ BlockHelperDetails::NormManager &norm_manager, + // preconditioner parameters + const typename BlockHelperDetails::ImplType::impl_scalar_type &damping_factor, + /* */ bool is_y_zero, + const int max_num_sweeps, + const typename BlockHelperDetails::ImplType::magnitude_type tol, + const int check_tol_every) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi); + int blocksize = btdm.d_inv.extent(1); + int sweep = 0; +#define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) { \ + sweep = applyFusedBlockJacobi_Impl( \ + tpetra_importer, async_importer, overlap_communication_and_computation, \ + X, Y, W, interf, btdm, amd, work, \ + norm_manager, damping_factor, is_y_zero, \ + max_num_sweeps, tol, check_tol_every); \ + } break + switch (blocksize) { + case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 3); + case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 5); + case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 7); + case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 9); + case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10); + case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11); + case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16); + case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17); + case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18); + default : BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 0); + } +#undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI return sweep; } @@ -5089,7 +5411,14 @@ namespace Ifpack2 { part_interface_type part_interface; block_tridiags_type block_tridiags; // D amd_type a_minus_d; // R = A - D - mutable typename impl_type::vector_type_1d_view work; // right hand side workspace + + // whether to use fused block Jacobi path + bool use_fused_jacobi; + + // vector workspace is used for general block tridi case + mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector) + // scalar workspace is used for fused block jacobi case + mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar) 
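+    // work_flat provides the second y buffer used for double buffering in applyFusedBlockJacobi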
mutable norm_manager_type norm_manager; };