From 1d16296713ef4beaae45c74c06cb44868aae57a7 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 18 Feb 2025 19:01:50 -0700 Subject: [PATCH] Fused block jacobi More performant path for block Jacobi case inside BTDS (GPU only, BlockCrs only). Fuses residual and solve into one kernel and doesn't convert vectors to SIMD-packed format. Also inverts diag blocks fully in shared to speed up numeric. Signed-off-by: Brian Kelley --- .../Ifpack2_BlockComputeResidualAndSolve.hpp | 684 ++++++++++++++++++ packages/ifpack2/src/Ifpack2_BlockHelper.hpp | 1 + .../src/Ifpack2_BlockTriDiContainer_def.hpp | 121 +++- .../src/Ifpack2_BlockTriDiContainer_impl.hpp | 471 ++++++++++-- 4 files changed, 1173 insertions(+), 104 deletions(-) create mode 100644 packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp new file mode 100644 index 000000000000..5fe27cf9785a --- /dev/null +++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualAndSolve.hpp @@ -0,0 +1,684 @@ +// @HEADER +// ***************************************************************************** +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// +// Copyright 2009 NTESS and the Ifpack2 contributors. +// SPDX-License-Identifier: BSD-3-Clause +// ***************************************************************************** +// @HEADER + +#ifndef IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP +#define IFPACK2_BLOCKCOMPUTERESAND_SOLVE_IMPL_HPP + +#include "Ifpack2_BlockHelper.hpp" +#include "Ifpack2_BlockComputeResidualVector.hpp" + +namespace Ifpack2::BlockHelperDetails { + +template +DiagOffsets findDiagOffsets(const Rowptrs& rowptrs, const Entries& entries, + size_t nrows, int blocksize) { + DiagOffsets diag_offsets(do_not_initialize_tag("btdm.diag_offsets"), nrows); + int err = 0; + Kokkos::parallel_reduce( + Kokkos::RangePolicy(0, nrows), + KOKKOS_LAMBDA(size_t row, int& err) { + auto rowBegin = rowptrs(row); + auto rowEnd = rowptrs(row + 1); + for (size_t j = rowBegin; j < rowEnd; j++) { + if (size_t(entries(j)) == row) { + diag_offsets(row) = j * blocksize * blocksize; + return; + } + } + err++; + }, + err); + TEUCHOS_TEST_FOR_EXCEPT_MSG( + err, "Ifpack2 BTD: at least one row has no diagonal entry"); + return diag_offsets; +} + +template +struct ComputeResidualAndSolve_1Pass { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename 
impl_type::btdm_scalar_type_4d_view; + using i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_1Pass(const AmD& amd, + const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + const local_ordinal_type num_local_rows = d_inv.extent(0); + + const impl_scalar_type* xx; + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_DinvAx = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_DinvAx[i] = 0; + local_Ax[i] = b(row + i, col); + }); + member.team_barrier(); + + int numEntries = A_x_offsets.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets(rowidx, 0, k); + int64_t x_offset = A_x_offsets(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + int64_t remote_cutoff = blocksize * num_local_rows; + if (x_offset >= remote_cutoff) + xx = &x_remote(x_offset - remote_cutoff, col); + else + xx = &x(x_offset, col); + + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { local_x[i] = xx[i]; }); + + // matvec on block: local_Ax -= A_block_cst * local_x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const int k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Compute 
local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, blocksize), + [&](const local_ordinal_type& k0) { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k1, impl_scalar_type& update) { + update += d_inv(rowidx, k0, k1) * local_Ax[k1]; + }, + local_DinvAx[k0]); + }); + member.team_barrier(); + // local_DinvAx is fully computed. Now compute the + // squared y update norm and update y (using damping factor). + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type old_y = x(row + k, col); + impl_scalar_type y_update = local_DinvAx[k] - old_y; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = old_y + damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; }); + } + + // Launch SinglePass version (owned + nonowned residual, plus Dinv in a single + // kernel) + void run(const ConstUnmanaged& b_, + const ConstUnmanaged& x_, + const ConstUnmanaged& x_remote_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunSinglePass", + ComputeResidualAndSolve0, execution_space); + + y = y_; + b = b_; + x = x_; + x_remote = x_remote_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + // team: local_Ax, local_DinvAx + const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type); + // thread: local_x + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::SinglePass", + policy, *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +template +struct ComputeResidualAndSolve_2Pass { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using 
i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + // Tag for computing residual with owned columns only (pass 1) + struct OwnedTag {}; + + // Tag for finishing the residual with nonowned columns, and solving/norming + // (pass 2) + struct NonownedTag {}; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_2Pass(const AmD& amd, + const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const OwnedTag, const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays + Kokkos::parallel_for( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { local_Ax[i] = b(row + i, col); }); + member.team_barrier(); + + int numEntries = A_x_offsets.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets(rowidx, 0, k); + int64_t x_offset = A_x_offsets(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_x[i] = x(x_offset + i, col); + }); + + // MatVec op Ax += A*x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Write back the partial residual to y + if (member.team_rank() == 0) { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k) { + y(row + k, col) = local_Ax[k]; + }); + } + } + } + + KOKKOS_INLINE_FUNCTION + void 
operator()(const NonownedTag, const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + auto A_block_cst = ConstUnmanaged( + NULL, blocksize, blocksize); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_Ax = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_DinvAx = reinterpret_cast( + member.team_scratch(0).get_shmem(blocksize * sizeof(impl_scalar_type))); + impl_scalar_type* local_x = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + if (col) member.team_barrier(); + // y -= Rx + // Initialize accumulation arrays. + Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_DinvAx[i] = 0; + local_Ax[i] = y(row + i, col); + }); + member.team_barrier(); + + int numEntries = A_x_offsets_remote.extent(2); + + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, numEntries), [&](const int k) { + int64_t A_offset = A_x_offsets_remote(rowidx, 0, k); + int64_t x_offset = A_x_offsets_remote(rowidx, 1, k); + if (A_offset != Kokkos::ArithTraits::min()) { + A_block_cst.assign_data(tpetra_values.data() + A_offset); + // Pull x into local memory + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& i) { + local_x[i] = x_remote(x_offset + i, col); + }); + + // matvec on block: local_Ax -= A_block_cst * local_x + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const int k0) { + impl_scalar_type val = 0; + for (int k1 = 0; k1 < blocksize; k1++) + val += A_block_cst(k0, k1) * local_x[k1]; + Kokkos::atomic_add(local_Ax + k0, -val); + }); + } + }); + member.team_barrier(); + // Compute local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, blocksize), + [&](const local_ordinal_type& k0) { + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k1, impl_scalar_type& update) { + update += d_inv(rowidx, k0, k1) * local_Ax[k1]; + }, + local_DinvAx[k0]); + }); + member.team_barrier(); + // local_DinvAx is fully computed. Now compute the + // squared y update norm and update y (using damping factor). + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type old_y = x(row + k, col); + impl_scalar_type y_update = local_DinvAx[k] - old_y; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = old_y + damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerTeam(member), [&]() { W(rowidx) = norm; }); + } + + // Launch pass 1 of the 2-pass version. + // This computes just the owned part of residual and writes that back to y. 
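+  // In formula form, with R = A - D, pass 1 leaves the owned part of the
+  // residual in y,
+  //   y_i = b_i - sum_{j owned} R_ij * x_j,
+  // and pass 2 (NonownedTag) finishes it with the remote columns of R before
+  // applying the damped update
+  //   y_i = x_i + omega * (D_ii^{-1} * r_i - x_i),   r = b - R*x,
+  // where omega is the damping factor. Splitting the sweep this way lets the
+  // import of the remote x entries overlap with the owned part of the residual.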
+ void run_pass1(const ConstUnmanaged& b_, + const ConstUnmanaged& x_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunPass1", + ComputeResidualAndSolve0, execution_space); + + b = b_; + x = x_; + y = y_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_team_size = blocksize * sizeof(impl_scalar_type); + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, + vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass1", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } + + // Launch pass 2 of the 2-pass version. + // This finishes computing residual with x_remote, + // and then applies Dinv and computes norm. + void run_pass2( + const ConstUnmanaged& x_, + const ConstUnmanaged& x_remote_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::RunPass2", + ComputeResidualAndSolve0, execution_space); + + x = x_; + x_remote = x_remote_; + y = y_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_team_size = 2 * blocksize * sizeof(impl_scalar_type); + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy(nrows, team_size, + vector_size); + policy.set_scratch_size(0, Kokkos::PerTeam(shmem_team_size), + Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::Pass2", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +template +struct ComputeResidualAndSolve_SolveOnly { + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + /// views + using local_ordinal_type_1d_view = + typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = + typename impl_type::tpetra_block_access_view_type; // block crs (layout + // right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = + typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector + // (layout left) + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using i64_3d_view = typename impl_type::i64_3d_view; + + /// team policy member type (used in cuda) + using member_type = typename 
Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + + // AmD information + const ConstUnmanaged tpetra_values; + + // blocksize + const local_ordinal_type blocksize_requested; + + // block offsets + const ConstUnmanaged A_x_offsets; + const ConstUnmanaged A_x_offsets_remote; + + // diagonal block inverses + const ConstUnmanaged d_inv; + + // squared update norms + const Unmanaged W; + + impl_scalar_type damping_factor; + + public: + ComputeResidualAndSolve_SolveOnly( + const AmD& amd, const btdm_scalar_type_3d_view& d_inv_, + const impl_scalar_type_1d_view& W_, + const local_ordinal_type& blocksize_requested_, + const impl_scalar_type& damping_factor_) + : tpetra_values(amd.tpetra_values), + blocksize_requested(blocksize_requested_), + A_x_offsets(amd.A_x_offsets), + A_x_offsets_remote(amd.A_x_offsets_remote), + d_inv(d_inv_), + W(W_), + damping_factor(damping_factor_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type& member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type rowidx = + member.league_rank() * member.team_size() + member.team_rank(); + const local_ordinal_type row = rowidx * blocksize; + const local_ordinal_type num_vectors = b.extent(1); + + // Get shared allocation for a local copy of x, Ax, and A + impl_scalar_type* local_DinvAx = + reinterpret_cast(member.thread_scratch(0).get_shmem( + blocksize * sizeof(impl_scalar_type))); + + if (rowidx >= (local_ordinal_type)d_inv.extent(0)) return; + + magnitude_type norm = 0; + for (local_ordinal_type col = 0; col < num_vectors; ++col) { + // Compute local_DinvAx = D^-1 * local_Ax + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k0) { + impl_scalar_type val = 0; + for (local_ordinal_type k1 = 0; k1 < blocksize; + k1++) { + val += d_inv(rowidx, k0, k1) * b(row + k1, col); + } + local_DinvAx[k0] = val; + }); + + magnitude_type colNorm; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type& k, magnitude_type& update) { + // Compute the change in y (assuming damping_factor == 1) for this + // entry. + impl_scalar_type y_update = local_DinvAx[k]; + magnitude_type ydiff = + Kokkos::ArithTraits::abs(y_update); + update += ydiff * ydiff; + y(row + k, col) = damping_factor * y_update; + }, + colNorm); + norm += colNorm; + } + Kokkos::single(Kokkos::PerThread(member), [&]() { W(rowidx) = norm; }); + } + + // ComputeResidualAndSolve_SolveOnly::run does the solve for the first + // iteration, when the initial guess for y is zero. This means the residual + // vector is just b. The kernel applies the inverse diags to b to find y, and + // also puts the partial squared update norms (1 per row) into W. 
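+  // Per block row i this reduces to (with the initial y == 0)
+  //   y_i = omega * D_ii^{-1} * b_i,   W(i) = sum over columns of ||D_ii^{-1} * b_i||^2,
+  // where omega is the damping factor.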
+ void run(const ConstUnmanaged& b_, + const Unmanaged& y_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER_WITH_FENCE( + "BlockTriDi::ComputeResidualAndSolve::Run_Y_Zero", + ComputeResidualAndSolve0, execution_space); + + y = y_; + b = b_; + + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type nrows = d_inv.extent(0); + + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = 8; + const size_t shmem_thread_size = blocksize * sizeof(impl_scalar_type); + Kokkos::TeamPolicy policy( + (nrows + team_size - 1) / team_size, team_size, vector_size); + policy.set_scratch_size(0, Kokkos::PerThread(shmem_thread_size)); + Kokkos::parallel_for("ComputeResidualAndSolve::TeamPolicy::y_zero", policy, + *this); + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) + } +}; + +} // namespace Ifpack2::BlockHelperDetails + +#endif diff --git a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp index f67cebeb7e30..af735ddf7c33 100644 --- a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp @@ -329,6 +329,7 @@ namespace Ifpack2 { // tpetra multivector values (layout left): may need to change the typename more explicitly typedef Kokkos::View impl_scalar_type_2d_view; typedef Kokkos::View impl_scalar_type_2d_view_tpetra; + typedef Kokkos::View const_impl_scalar_type_2d_view_tpetra; // packed data always use layout right typedef Kokkos::View vector_type_1d_view; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index d2e39255ba04..362970e13941 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -124,6 +124,7 @@ namespace Ifpack2 { impl_->W = typename impl_type::impl_scalar_type_1d_view(); IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } + // Decide whether to use fused block Jacobi path IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -165,10 +166,18 @@ namespace Ifpack2 { bool pointIndexed) : Container(matrix, partitions, pointIndexed), partitions_(partitions) { + using Helpers = BlockHelperDetails::ImplType; IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer", BlockTriDiContainer); const bool useSeqMethod = false; const bool overlapCommAndComp = false; initInternal(matrix, importer, overlapCommAndComp, useSeqMethod); + auto matrixBCRS = Teuchos::rcp_dynamic_cast(matrix); + bool hasBlockCrs = !matrixBCRS.is_null(); + bool onePartitionPerRow = hasBlockCrs && size_t(partitions.size()) == matrixBCRS->getLocalNumRows(); + // Decide upfront whether the fused block Jacobi path can be used. 
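+    // The fused path requires a GPU (device) execution space, a BlockCrsMatrix
+    // input, and a pure block Jacobi splitting: either no partitions are given,
+    // or there is exactly one local row per partition (every line has length 1).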
+ impl_->use_fused_jacobi = + BlockHelperDetails::is_device::value \ + && hasBlockCrs && (!partitions.size() || onePartitionPerRow); n_subparts_per_part_ = -1; block_size_ = -1; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -185,8 +194,18 @@ namespace Ifpack2 { const bool explicitConversion) : Container(matrix, partitions, false), partitions_(partitions) { + using Helpers = BlockHelperDetails::ImplType; IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer", BlockTriDiContainer); initInternal(matrix, Teuchos::null, overlapCommAndComp, useSeqMethod, block_size, explicitConversion); + auto matrixBCRS = Teuchos::rcp_dynamic_cast(matrix); + bool hasBlockCrs = !matrixBCRS.is_null(); + bool onePartitionPerRow = hasBlockCrs && size_t(partitions.size()) == matrixBCRS->getLocalNumRows(); + // Jacobi case can be represented in two ways: + // - partitions is empty + // - partitions has length equal to local number of rows (meaning all line lengths must be 1) + impl_->use_fused_jacobi = + BlockHelperDetails::is_device::value \ + && !useSeqMethod && hasBlockCrs && (!partitions.size() || onePartitionPerRow); n_subparts_per_part_ = n_subparts_per_part; block_size_ = block_size; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -254,7 +273,7 @@ namespace Ifpack2 { impl_->block_tridiags, impl_->a_minus_d, impl_->overlap_communication_and_computation, - impl_->async_importer, useSeqMethod); + impl_->async_importer, useSeqMethod, impl_->use_fused_jacobi); } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -273,7 +292,8 @@ namespace Ifpack2 { (impl_->A, impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, - Kokkos::ArithTraits::zero()); + Kokkos::ArithTraits::zero(), + impl_->use_fused_jacobi); } this->IsComputed_ = true; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -302,21 +322,38 @@ namespace Ifpack2 { const magnitude_type tol = Kokkos::ArithTraits::zero(); const int check_tol_every = 1; - BlockTriDiContainerDetails::applyInverseJacobi - (impl_->A, - impl_->blockGraph, - impl_->tpetra_importer, - impl_->async_importer, - impl_->overlap_communication_and_computation, - X, Y, impl_->Z, impl_->W, - impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, - impl_->work, - impl_->norm_manager, - dampingFactor, - zeroStartingSolution, - numSweeps, - tol, - check_tol_every); + if(!impl_->use_fused_jacobi) { + BlockTriDiContainerDetails::applyInverseJacobi + (impl_->A, + impl_->blockGraph, + impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->Z, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work, + impl_->norm_manager, + dampingFactor, + zeroStartingSolution, + numSweeps, + tol, + check_tol_every); + } + else { + BlockTriDiContainerDetails::applyFusedBlockJacobi + (impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work_flat, + impl_->norm_manager, + dampingFactor, + zeroStartingSolution, + numSweeps, + tol, + check_tol_every); + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -342,7 +379,8 @@ namespace Ifpack2 { (impl_->A, impl_->blockGraph, impl_->part_interface, impl_->block_tridiags, - in.addRadiallyToDiagonal); + 
in.addRadiallyToDiagonal, + impl_->use_fused_jacobi); } this->IsComputed_ = true; IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) @@ -367,21 +405,38 @@ namespace Ifpack2 { IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi", applyInverseJacobi); int r_val = 0; { - r_val = BlockTriDiContainerDetails::applyInverseJacobi - (impl_->A, - impl_->blockGraph, - impl_->tpetra_importer, - impl_->async_importer, - impl_->overlap_communication_and_computation, - X, Y, impl_->Z, impl_->W, - impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, - impl_->work, - impl_->norm_manager, - in.dampingFactor, - in.zeroStartingSolution, - in.maxNumSweeps, - in.tolerance, - in.checkToleranceEvery); + if(!impl_->use_fused_jacobi) { + r_val = BlockTriDiContainerDetails::applyInverseJacobi + (impl_->A, + impl_->blockGraph, + impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->Z, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work, + impl_->norm_manager, + in.dampingFactor, + in.zeroStartingSolution, + in.maxNumSweeps, + in.tolerance, + in.checkToleranceEvery); + } + else { + r_val = BlockTriDiContainerDetails::applyFusedBlockJacobi + (impl_->tpetra_importer, + impl_->async_importer, + impl_->overlap_communication_and_computation, + X, Y, impl_->W, + impl_->part_interface, impl_->block_tridiags, impl_->a_minus_d, + impl_->work_flat, + impl_->norm_manager, + in.dampingFactor, + in.zeroStartingSolution, + in.maxNumSweeps, + in.tolerance, + in.checkToleranceEvery); + } } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) return r_val; diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 88bb7a09b9db..5c24f35a183a 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -50,6 +50,7 @@ #include "Ifpack2_BlockHelper.hpp" #include "Ifpack2_BlockComputeResidualVector.hpp" +#include "Ifpack2_BlockComputeResidualAndSolve.hpp" //#include @@ -1180,29 +1181,21 @@ namespace Ifpack2 { local_ordinal_type pack_nrows_sub = 0; if (jacobi) { IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)", Jacobi); + // Jacobi (all lines have length 1) means that A_n_lclrows == nparts, + // so the mapping between parts and rows is trivial. 
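+        // For example, with nparts == 3 this fills
+        //   part2rowidx0 = partptr = {0, 1, 2, 3} and lclrow = rowidx2part = {0, 1, 2}.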
+ // Note: we can leave interf.row_contiguous = true, since for all i: lclrow(i) == i + for (local_ordinal_type i=0; i <= nparts; ++i) { + part2rowidx0(i) = i; + partptr(i) = i; + } + for (local_ordinal_type i=0; i < nparts; ++i) { + rowidx2part(i) = i; + lclrow(i) = i; + } for (local_ordinal_type ip=0;ip= A_n_lclrows, - BlockHelperDetails::get_msg_prefix(comm) - << "partitions[" << p[ip] << "][" - << i << "] = " << lcl_row - << " but input matrix implies limits of [0, " << A_n_lclrows-1 - << "]."); - lclrow(offset+i) = lcl_row; - rowidx2part(offset+i) = ip; - if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row) - interf.row_contiguous = false; - } - partptr(ip+1) = offset + ipnrows; } part2rowidx0_sub(0) = 0; partptr_sub(0, 0) = 0; @@ -1569,6 +1562,7 @@ namespace Ifpack2 { using size_type_2d_view = typename impl_type::size_type_2d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; using vector_type_4d_view = typename impl_type::vector_type_4d_view; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; // flat_td_ptr(i) is the index into flat-array values of the start of the // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length == @@ -1587,6 +1581,15 @@ namespace Ifpack2 { // inv(A_00)*A_01 block values. vector_type_4d_view e_values; + // The following are for fused block Jacobi only. + // For block row i, diag_offset(i)...diag_offset(i + bs^2) + // is the range of scalars for the diagonal block. + size_type_1d_view diag_offsets; + // For fused residual+solve block Jacobi case, + // this contains the diagonal block inverses in flat, local row indexing: + // d_inv(row, :, :) gives the row-major block for row. + btdm_scalar_type_3d_view d_inv; + bool is_diagonal_only; BlockTridiags() = default; @@ -1866,7 +1869,8 @@ namespace Ifpack2 { BlockHelperDetails::AmD &amd, const bool overlap_communication_and_computation, const Teuchos::RCP > &async_importer, - bool useSeqMethod) { + bool useSeqMethod, + bool use_fused_jacobi) { IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase", SymbolicPhase); using impl_type = BlockHelperDetails::ImplType; @@ -1883,6 +1887,7 @@ namespace Ifpack2 { using vector_type_4d_view = typename impl_type::vector_type_4d_view; using crs_matrix_type = typename impl_type::tpetra_crs_matrix_type; using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; constexpr int vector_length = impl_type::vector_length; @@ -2193,6 +2198,13 @@ namespace Ifpack2 { BlockHelperDetails::precompute_A_x_offsets(amd, interf, g, dm2cm, blocksize, ownedRemoteSeparate); } + // If using fused block Jacobi path, allocate diagonal inverses here (d_inv) and find diagonal offsets. 
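+      // d_inv(i, :, :) will hold the row-major inverse of block row i's diagonal
+      // block, indexed by flat local row (no packing). diag_offsets(i) is the
+      // scalar offset of that diagonal block inside the BlockCrs values array;
+      // it is computed once here and reused by the numeric phase.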
+ if(use_fused_jacobi) { + btdm.d_inv = btdm_scalar_type_3d_view(do_not_initialize_tag("btdm.d_inv"), interf.nparts, blocksize, blocksize); + auto rowptrs = A_bcrs->getCrsGraph().getLocalRowPtrsDevice(); + auto entries = A_bcrs->getCrsGraph().getLocalIndicesDevice(); + btdm.diag_offsets = BlockHelperDetails::findDiagOffsets(rowptrs, entries, interf.nparts, blocksize); + } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -2786,6 +2798,7 @@ namespace Ifpack2 { /// views using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; using size_type_2d_view = typename impl_type::size_type_2d_view; using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra; /// vectorization @@ -2795,10 +2808,15 @@ namespace Ifpack2 { using vector_type_4d_view = typename impl_type::vector_type_4d_view; using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view; + using btdm_scalar_type_2d_view = typename impl_type::btdm_scalar_type_2d_view; + using btdm_scalar_type_3d_view = typename impl_type::btdm_scalar_type_3d_view; using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view; using internal_vector_scratch_type_3d_view = Scratch; using btdm_scalar_scratch_type_3d_view = Scratch; + using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right) + using local_crs_graph_type = typename impl_type::local_crs_graph_type; + using colinds_view = typename local_crs_graph_type::entries_type; using internal_vector_type = typename impl_type::internal_vector_type; static constexpr int vector_length = impl_type::vector_length; @@ -2818,6 +2836,7 @@ namespace Ifpack2 { // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int) using size_type_1d_view_tpetra = Kokkos::View; ConstUnmanaged A_block_rowptr; + ConstUnmanaged A_colinds; ConstUnmanaged A_point_rowptr; ConstUnmanaged A_values; // block tridiags @@ -2827,6 +2846,8 @@ namespace Ifpack2 { const Unmanaged e_internal_vector_values; const Unmanaged scalar_values, scalar_values_schur; const Unmanaged e_scalar_values; + const Unmanaged d_inv; + const Unmanaged diag_offsets; // shared information const local_ordinal_type blocksize, blocksize_square; // diagonal safety @@ -2888,6 +2909,8 @@ namespace Ifpack2 { btdm_.e_values.extent(2), btdm_.e_values.extent(3), vector_length), + d_inv(btdm_.d_inv), + diag_offsets(btdm_.diag_offsets), blocksize(btdm_.values.extent(1)), blocksize_square(blocksize*blocksize), // diagonal weight to avoid zero pivots @@ -2904,6 +2927,8 @@ namespace Ifpack2 { A_block_rowptr = G_->getLocalGraphDevice().row_map; if (hasBlockCrsMatrix) { A_values = const_cast(A_bcrs.get())->getValuesDeviceNonConst(); + // A_colinds only used in fused Jacobi path + A_colinds = A_bcrs->getCrsGraph().getLocalIndicesDevice(); } else { A_point_rowptr = A_crs->getCrsGraph()->getLocalGraphDevice().row_map; @@ -3188,6 +3213,7 @@ namespace Ifpack2 { public: struct ExtractAndFactorizeSubLineTag {}; + struct ExtractAndFactorizeFusedJacobiTag {}; struct ExtractBCDTag {}; struct ComputeETag {}; struct ComputeSchurTag {}; @@ -3237,6 +3263,68 @@ namespace Ifpack2 
{ } } + KOKKOS_INLINE_FUNCTION + void + operator() (const ExtractAndFactorizeFusedJacobiTag&, const member_type &member) const { + using default_mode_and_algo_type = ExtractAndFactorizeTridiagsDefaultModeAndAlgo; + using default_mode_type = typename default_mode_and_algo_type::mode_type; + using default_algo_type = typename default_mode_and_algo_type::algo_type; + // When fused block Jacobi can be used, the mapping between local rows and parts is trivial (i <-> i) + // We can simply pull the diagonal entry from A into d_inv + btdm_scalar_scratch_type_3d_view WW1(member.team_scratch(ScratchLevel), vector_length / 2, blocksize, blocksize); + btdm_scalar_scratch_type_3d_view WW2(member.team_scratch(ScratchLevel), vector_length / 2, blocksize, blocksize); + const auto one = Kokkos::ArithTraits::one(); + const local_ordinal_type nrows = lclrow.extent(0); + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_length / 2), + [&](const local_ordinal_type &v) { + local_ordinal_type row = member.league_rank() * vector_length / 2 + v; + // diagEntry has index of diagonal within row + auto W1 = Kokkos::subview(WW1, v, Kokkos::ALL(), Kokkos::ALL()); + auto W2 = Kokkos::subview(WW2, v, Kokkos::ALL(), Kokkos::ALL()); + if(row < nrows) { + // View the diagonal block of A in row as 2D row-major + const impl_scalar_type* A_diag = A_values.data() + diag_offsets(row); + // Copy the diag into scratch slice W1 + // (copying elements directly is better than KokkosBatched copy) + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize), + [&](int i) + { + W1.data()[i] = A_diag[i]; + }); + // and set W2 to identity in preparation to invert with 2 x Trsm + KB::SetIdentity + ::invoke(member, W2); + } + member.team_barrier(); + if(row < nrows) { + // LU factorize in-place + KB::LU + ::invoke(member, W1, tiny); + } + member.team_barrier(); + if(row < nrows) { + KB::Trsm + ::invoke(member, one, W1, W2); + KB::Trsm + ::invoke(member, one, W1, W2); + } + member.team_barrier(); + if(row < nrows) { + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, blocksize * blocksize), + [&](int i) + { + auto d_inv_block = &d_inv(row, 0, 0); + d_inv_block[i] = W2.data()[i]; + }); + } + }); + } + KOKKOS_INLINE_FUNCTION void operator() (const ExtractBCDTag &, const member_type &member) const { @@ -3589,6 +3677,25 @@ namespace Ifpack2 { IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; } + void run_fused_jacobi() { + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; + constexpr int half_vector_length = vector_length / 2; + const local_ordinal_type team_size = + ExtractAndFactorizeTridiagsDefaultModeAndAlgo:: + recommended_team_size(blocksize, half_vector_length, 1); + const local_ordinal_type per_team_scratch = + btdm_scalar_scratch_type_3d_view::shmem_size(blocksize, blocksize, vector_length); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeFusedJacobi", ExtractAndFactorizeFusedJacobiTag); + Kokkos::TeamPolicy + policy((lclrow.extent(0) + half_vector_length - 1) / half_vector_length, team_size, half_vector_length); + + policy.set_scratch_size(ScratchLevel, Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + } + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; + } }; /// @@ -3600,7 +3707,8 @@ namespace Ifpack2 { const Teuchos::RCP::tpetra_crs_graph_type> &G, const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, - const typename BlockHelperDetails::ImplType::magnitude_type tiny) { + const 
typename BlockHelperDetails::ImplType::magnitude_type tiny, + bool use_fused_jacobi) { using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using team_policy_type = Kokkos::TeamPolicy; @@ -3617,12 +3725,18 @@ namespace Ifpack2 { if(scratch_required < max_scratch) { // Can use level 0 scratch ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); - function.run(); + if(!use_fused_jacobi) + function.run(); + else + function.run_fused_jacobi(); } else { // Not enough level 0 scratch, so fall back to level 1 ExtractAndFactorizeTridiags function(btdm, interf, A, G, tiny); - function.run(); + if(!use_fused_jacobi) + function.run(); + else + function.run_fused_jacobi(); } IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -4998,68 +5112,276 @@ namespace Ifpack2 { // iterate int sweep = 0; for (;sweepasyncSendRecv(YY); - // OverlapTag, compute_owned = true - compute_residual_vector.run(pmv, XX, YY, remote_multivector, true); - if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { - if (is_async_importer_active) async_importer->cancel(); - break; - } - if (is_async_importer_active) { - async_importer->syncRecv(); - // OverlapTag, compute_owned = false - compute_residual_vector.run(pmv, XX, YY, remote_multivector, false); - } - } else { - if (is_async_importer_active) - async_importer->syncExchange(YY); - if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; - // AsyncTag - compute_residual_vector.run(pmv, XX, YY, remote_multivector); + // pmv := y(lclrow). + multivector_converter.run(YY); + } else { + // fused y := x - R y and pmv := y(lclrow); + // real use case does not use overlap comp and comm + if (overlap_communication_and_computation || !is_async_importer_active) { + if (is_async_importer_active) async_importer->asyncSendRecv(YY); + // OverlapTag, compute_owned = true + compute_residual_vector.run(pmv, XX, YY, remote_multivector, true); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { + if (is_async_importer_active) async_importer->cancel(); + break; + } + if (is_async_importer_active) { + async_importer->syncRecv(); + // OverlapTag, compute_owned = false + compute_residual_vector.run(pmv, XX, YY, remote_multivector, false); } + } else { + if (is_async_importer_active) + async_importer->syncExchange(YY); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; + // AsyncTag + compute_residual_vector.run(pmv, XX, YY, remote_multivector); } } } - // pmv := inv(D) pmv. - { - solve_tridiags.run(YY, W); + solve_tridiags.run(YY, W); + // Compute norm. + if (is_norm_manager_active) { + // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always. + BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); + if (sweep + 1 == max_num_sweeps) { + norm_manager.ireduce(sweep, true); + norm_manager.checkDone(sweep + 1, tolerance, true); + } else { + norm_manager.ireduce(sweep); + } } - { - if (is_norm_manager_active) { - // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always. - BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); - if (sweep + 1 == max_num_sweeps) { - norm_manager.ireduce(sweep, true); - norm_manager.checkDone(sweep + 1, tolerance, true); - } else { - norm_manager.ireduce(sweep); + is_y_zero = false; + } + + //sqrt the norms for the caller's use. 
+ if (is_norm_manager_active) norm_manager.finalize(); + + return sweep; + } + + // fused block Jacobi apply for a specific block size B + // (or B = 0 for general case). + template + int + applyFusedBlockJacobi_Impl( + const Teuchos::RCP::tpetra_import_type> &tpetra_importer, + const Teuchos::RCP > &async_importer, + const bool overlap_communication_and_computation, + // tpetra interface + const typename BlockHelperDetails::ImplType::tpetra_multivector_type &X, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Y, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) + // local object interface + const BlockHelperDetails::PartInterface &interf, // mesh interface + const BlockTridiags &btdm, // packed block tridiagonal matrices + const BlockHelperDetails::AmD &amd, // R = A - D + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &work, // workspace + /* */ BlockHelperDetails::NormManager &norm_manager, + // preconditioner parameters + const typename BlockHelperDetails::ImplType::impl_scalar_type &damping_factor, + /* */ bool is_y_zero, + const int max_num_sweeps, + const typename BlockHelperDetails::ImplType::magnitude_type tol, + const int check_tol_every) { + using impl_type = BlockHelperDetails::ImplType; + using node_memory_space = typename impl_type::node_memory_space; + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using tpetra_multivector_type = typename impl_type::tpetra_multivector_type; + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; + + // the tpetra importer and async importer can't both be active + TEUCHOS_TEST_FOR_EXCEPT_MSG(!tpetra_importer.is_null() && !async_importer.is_null(), + "Neither Tpetra importer nor Async importer is null."); + // max number of sweeps should be positive number + TEUCHOS_TEST_FOR_EXCEPT_MSG(max_num_sweeps <= 0, + "Maximum number of sweeps must be >= 1."); + + // const parameters + const bool is_async_importer_active = !async_importer.is_null(); + const bool is_norm_manager_active = tol > Kokkos::ArithTraits::zero(); + const magnitude_type tolerance = tol*tol; + const local_ordinal_type blocksize = btdm.d_inv.extent(1); + const local_ordinal_type num_vectors = Y.getNumVectors(); + const local_ordinal_type num_blockrows = interf.nparts; + + typename impl_type::impl_scalar_type_2d_view_tpetra remote_multivector; + { + if (is_async_importer_active) { + // create comm data buffer and keep it here + async_importer->createDataBuffer(num_vectors); + remote_multivector = async_importer->getRemoteMultiVectorLocalView(); + } + } + + const auto XX = X.getLocalViewDevice(Tpetra::Access::ReadOnly); + const auto YY = Y.getLocalViewDevice(Tpetra::Access::ReadWrite); + + const bool two_pass_residual = + overlap_communication_and_computation && is_async_importer_active; + + // Calculate the required work size and reallocate it if not already big enough. + // Check that our assumptions about YY dimension are correct. 
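+    // The workspace provides a second copy of y (y_doublebuf below). The two
+    // buffers are used in double-buffered fashion: each sweep reads the current
+    // iterate from one buffer and writes the new iterate to the other, and if
+    // the final sweep ends in the scratch buffer it is copied back into Y.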
+ TEUCHOS_TEST_FOR_EXCEPT_MSG( + size_t(num_blockrows) * blocksize * num_vectors != YY.extent(0) * YY.extent(1), + "Local LHS vector (YY) has total size " << YY.extent(0) << "x" << YY.extent(1) << + " = " << YY.extent(0) * YY.extent(1) << ",\n" << + "but expected " << num_blockrows << "x" << blocksize << "x" << num_vectors << + " = " << size_t(num_blockrows) * blocksize * num_vectors << '\n'); + size_type work_required = size_type(num_blockrows) * blocksize * num_vectors; + if (work.extent(0) < work_required) { + work = impl_scalar_type_1d_view(do_not_initialize_tag("flat workspace 1d view"), work_required); + } + + Unmanaged y_doublebuf(work.data(), num_blockrows * blocksize, num_vectors); + + // construct W + if (W.extent(0) != size_t(num_blockrows)) + W = impl_scalar_type_1d_view(do_not_initialize_tag("W"), num_blockrows); + + // Create the required functors upfront (this is inexpensive - all shallow copies) + BlockHelperDetails::ComputeResidualAndSolve_SolveOnly + functor_solve_only(amd, btdm.d_inv, W, blocksize, damping_factor); + BlockHelperDetails::ComputeResidualAndSolve_1Pass + functor_1pass(amd, btdm.d_inv, W, blocksize, damping_factor); + BlockHelperDetails::ComputeResidualAndSolve_2Pass + functor_2pass(amd, btdm.d_inv, W, blocksize, damping_factor); + + // norm manager workspace resize + if (is_norm_manager_active) + norm_manager.setCheckFrequency(check_tol_every); + + // For double-buffering. + // yy_buffers[current_y] has the current iterate of y. + // yy_buffers[1-current_y] has the next iterate of y. + Unmanaged y_buffers[2] = {YY, y_doublebuf}; + int current_y = 0; + + // iterate + int sweep = 0; + for (;sweep < max_num_sweeps; ++sweep) { + if (is_y_zero) { + // If y is initially zero, then we are just computing y := damping_factor * Dinv * x + functor_solve_only.run(XX, y_buffers[1-current_y]); + } else { + // real use case does not use overlap comp and comm + if (overlap_communication_and_computation || !is_async_importer_active) { + if (is_async_importer_active) async_importer->asyncSendRecv(y_buffers[current_y]); + if(two_pass_residual) { + // Pass 1 computes owned residual and stores into new y buffer, + // but doesn't apply Dinv or produce a norm yet + functor_2pass.run_pass1(XX, y_buffers[current_y], y_buffers[1-current_y]); + } + else { + // This case happens if running with single rank. + // There are no remote columns, so residual and solve can happen in one step. + functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) { + if (is_async_importer_active) async_importer->cancel(); + break; } + if (is_async_importer_active) { + async_importer->syncRecv(); + // Stage 2 finishes computing the residual, then applies Dinv and computes norm. + functor_2pass.run_pass2(y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + } else { + if (is_async_importer_active) + async_importer->syncExchange(y_buffers[current_y]); + if (is_norm_manager_active && norm_manager.checkDone(sweep, tolerance)) break; + // Full residual, Dinv apply, and norm in one kernel + functor_1pass.run(XX, y_buffers[current_y], remote_multivector, y_buffers[1-current_y]); + } + } + + // Compute global norm. 
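+      // W(i) holds the squared update norm for block row i. reduceVector sums
+      // these locally and ireduce finishes the reduction across MPI ranks;
+      // checkDone compares against tol^2 (tolerance = tol*tol above), so square
+      // roots are only taken in finalize() after the sweep loop.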
+ if (is_norm_manager_active) { + BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); + if (sweep + 1 == max_num_sweeps) { + norm_manager.ireduce(sweep, true); + norm_manager.checkDone(sweep + 1, tolerance, true); + } else { + norm_manager.ireduce(sweep); } } is_y_zero = false; + // flip y buffers for next iteration, or termination if we reached max_num_sweeps. + current_y = 1 - current_y; + } + if(current_y == 1) { + // We finished iterating with y in the double buffer, so copy it to the user's vector. + Kokkos::deep_copy(YY, y_doublebuf); } //sqrt the norms for the caller's use. if (is_norm_manager_active) norm_manager.finalize(); + return sweep; + } + + template + int + applyFusedBlockJacobi( + const Teuchos::RCP::tpetra_import_type> &tpetra_importer, + const Teuchos::RCP > &async_importer, + const bool overlap_communication_and_computation, + // tpetra interface + const typename BlockHelperDetails::ImplType::tpetra_multivector_type &X, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Y, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) + // local object interface + const BlockHelperDetails::PartInterface &interf, // mesh interface + const BlockTridiags &btdm, // packed block tridiagonal matrices + const BlockHelperDetails::AmD &amd, // R = A - D + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &work, // workspace + /* */ BlockHelperDetails::NormManager &norm_manager, + // preconditioner parameters + const typename BlockHelperDetails::ImplType::impl_scalar_type &damping_factor, + /* */ bool is_y_zero, + const int max_num_sweeps, + const typename BlockHelperDetails::ImplType::magnitude_type tol, + const int check_tol_every) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyFusedBlockJacobi", ApplyFusedBlockJacobi); + int blocksize = btdm.d_inv.extent(1); + int sweep = 0; +#define BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(B) { \ + sweep = applyFusedBlockJacobi_Impl( \ + tpetra_importer, async_importer, overlap_communication_and_computation, \ + X, Y, W, interf, btdm, amd, work, \ + norm_manager, damping_factor, is_y_zero, \ + max_num_sweeps, tol, check_tol_every); \ + } break + switch (blocksize) { + case 3: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 3); + case 5: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 5); + case 7: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 7); + case 9: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 9); + case 10: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(10); + case 11: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(11); + case 16: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(16); + case 17: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(17); + case 18: BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI(18); + default : BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI( 0); + } +#undef BLOCKTRIDICONTAINER_APPLY_FUSED_JACOBI return sweep; } @@ -5089,7 +5411,14 @@ namespace Ifpack2 { part_interface_type part_interface; block_tridiags_type block_tridiags; // D amd_type a_minus_d; // R = A - D - mutable typename impl_type::vector_type_1d_view work; // right hand side workspace + + // whether to use fused block Jacobi path + bool use_fused_jacobi; + + // vector workspace is used for general block tridi case + mutable typename impl_type::vector_type_1d_view work; // right hand side workspace (1D view of vector) + // scalar workspace is used for fused block jacobi case + mutable typename impl_type::impl_scalar_type_1d_view work_flat; // right hand side workspace (1D view of scalar) 
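+    // work_flat provides the second y buffer used for double buffering in applyFusedBlockJacobi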
mutable norm_manager_type norm_manager; };