diff --git a/packages/ifpack2/example/BlockTriDi.cpp b/packages/ifpack2/example/BlockTriDi.cpp index 1c7bd3012570..3277179656c1 100644 --- a/packages/ifpack2/example/BlockTriDi.cpp +++ b/packages/ifpack2/example/BlockTriDi.cpp @@ -18,7 +18,7 @@ namespace { // (anonymous) // Values of command-line arguments. struct CmdLineArgs { - CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),useStackedTimer(false),overlapCommAndComp(false){} + CmdLineArgs ():blockSize(-1),numIters(10),numRepeats(1),tol(1e-12),nx(172),ny(-1),nz(-1),mx(1),my(1),mz(1),sublinesPerLine(1),sublinesPerLineSchur(1),useStackedTimer(false),overlapCommAndComp(false){} std::string mapFilename; std::string matrixFilename; @@ -35,6 +35,7 @@ struct CmdLineArgs { int my; int mz; int sublinesPerLine; + int sublinesPerLineSchur; bool useStackedTimer; bool overlapCommAndComp; std::string problemName; @@ -71,6 +72,7 @@ getCmdLineArgs (CmdLineArgs& args, int argc, char* argv[]) "Whether to run with overlapCommAndComp)"); cmdp.setOption("problemName", &args.problemName, "Human-readable problem name for Watchr plot"); cmdp.setOption("matrixType", &args.matrixType, "matrixType"); + cmdp.setOption("sublinesPerLineSchur", &args.sublinesPerLineSchur, "sublinesPerLineSchur"); auto result = cmdp.parse (argc, argv); return result == Teuchos::CommandLineProcessor::PARSE_SUCCESSFUL; } @@ -573,7 +575,7 @@ main (int argc, char* argv[]) { Teuchos::TimeMonitor precSetupTimeMon (*precSetupTime); - precond = rcp(new BTDC(Ablock,parts,args.overlapCommAndComp)); + precond = rcp(new BTDC(Ablock,parts,args.sublinesPerLineSchur,args.overlapCommAndComp)); if(rank0) std::cout<<"Initializing preconditioner..."<initialize (); diff --git a/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp new file mode 100644 index 000000000000..a50f054a626a --- /dev/null +++ b/packages/ifpack2/src/Ifpack2_BlockComputeResidualVector.hpp @@ -0,0 +1,859 @@ +/*@HEADER +// *********************************************************************** +// +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// Copyright (2009) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// *********************************************************************** +//@HEADER +*/ + +#ifndef IFPACK2_BLOCKCOMPUTERES_IMPL_HPP +#define IFPACK2_BLOCKCOMPUTERES_IMPL_HPP + +#include "Ifpack2_BlockHelper.hpp" + +namespace Ifpack2 { + + namespace BlockHelperDetails { + + /// + /// A - Tridiags(A), i.e., R in the splitting A = D + R. + /// + template + struct AmD { + using impl_type = BlockHelperDetails::ImplType; + using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using impl_scalar_type_1d_view_tpetra = Unmanaged; + // rowptr points to the start of each row of A_colindsub. + size_type_1d_view rowptr, rowptr_remote; + // Indices into A's rows giving the blocks to extract. rowptr(i) points to + // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))), + // where g is A's graph, are the columns AmD uses. If seq_method_, then + // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If ! + // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote + // contains the remote ones. + local_ordinal_type_1d_view A_colindsub, A_colindsub_remote; + + // Currently always true. + bool is_tpetra_block_crs; + + // If is_tpetra_block_crs, then this is a pointer to A_'s value data. + impl_scalar_type_1d_view_tpetra tpetra_values; + + AmD() = default; + AmD(const AmD &b) = default; + }; + + template + struct PartInterface { + using local_ordinal_type = typename BlockHelperDetails::ImplType::local_ordinal_type; + using local_ordinal_type_1d_view = typename BlockHelperDetails::ImplType::local_ordinal_type_1d_view; + using local_ordinal_type_2d_view = typename BlockHelperDetails::ImplType::local_ordinal_type_2d_view; + + PartInterface() = default; + PartInterface(const PartInterface &b) = default; + + // Some terms: + // The matrix A is split as A = D + R, where D is the matrix of tridiag + // blocks and R is the remainder. + // A part is roughly a synonym for a tridiag. The distinction is that a part + // is the set of rows belonging to one tridiag and, equivalently, the off-diag + // rows in R associated with that tridiag. In contrast, the term tridiag is + // used to refer specifically to tridiag data, such as the pointer into the + // tridiag data array. + // Local (lcl) row are the LIDs. lclrow lists the LIDs belonging to each + // tridiag, and partptr points to the beginning of each tridiag. This is the + // LID space. + // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed + // by this ordinal. This is the 'index' space. + // A flat index is the mathematical index into an array. A pack index + // accounts for SIMD packing. + + // Local row LIDs. Permutation from caller's index space to tridiag index + // space. + local_ordinal_type_1d_view lclrow; + // partptr_ is the pointer array into lclrow_. + local_ordinal_type_1d_view partptr; // np+1 + local_ordinal_type_2d_view partptr_sub; + local_ordinal_type_1d_view partptr_schur; + // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i)) + // is the start of the i'th pack. + local_ordinal_type_1d_view packptr; // npack+1 + local_ordinal_type_1d_view packptr_sub; + local_ordinal_type_1d_view packindices_sub; + local_ordinal_type_1d_view packindices_schur; + // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's + // an alias of partptr_ in the case of no overlap. + local_ordinal_type_1d_view part2rowidx0; // np+1 + local_ordinal_type_1d_view part2rowidx0_sub; + // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then + // it's the same as part2rowidx0_; if it's > 1, then the value is combined + // with i % vector_length to get the location in the packed data. + local_ordinal_type_1d_view part2packrowidx0; // np+1 + local_ordinal_type_2d_view part2packrowidx0_sub; + local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU. + // rowidx2part_ maps the row index to the part index. + local_ordinal_type_1d_view rowidx2part; // nr + local_ordinal_type_1d_view rowidx2part_sub; + // True if lcl{row|col} is at most a constant away from row{idx|col}. In + // practice, this knowledge is not particularly useful, as packing for batched + // processing is done at the same time as the permutation from LID to index + // space. But it's easy to detect, so it's recorded in case an optimization + // can be made based on it. + bool row_contiguous; + + local_ordinal_type max_partsz; + local_ordinal_type max_subpartsz; + local_ordinal_type n_subparts_per_part; + local_ordinal_type nparts; + }; + + /// + /// compute local residula vector y = b - R x + /// + static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, + const int team_size) { + int total_team_size(0); + if (blksize <= 5) total_team_size = 32; + else if (blksize <= 9) total_team_size = 32; // 64 + else if (blksize <= 12) total_team_size = 96; + else if (blksize <= 16) total_team_size = 128; + else if (blksize <= 20) total_team_size = 160; + else total_team_size = 160; + return total_team_size/team_size; + } + + static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize, + const int team_size) { + int total_team_size(0); + if (blksize <= 5) total_team_size = 32; + else if (blksize <= 9) total_team_size = 32; // 64 + else if (blksize <= 12) total_team_size = 96; + else if (blksize <= 16) total_team_size = 128; + else if (blksize <= 20) total_team_size = 160; + else total_team_size = 160; + return total_team_size/team_size; + } + + static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize, + const int team_size) { + int total_team_size(0); + if (blksize <= 5) total_team_size = 32; + else if (blksize <= 9) total_team_size = 32; // 64 + else if (blksize <= 12) total_team_size = 96; + else if (blksize <= 16) total_team_size = 128; + else if (blksize <= 20) total_team_size = 160; + else total_team_size = 160; + return total_team_size/team_size; + } + + template + static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize, + const int team_size) { + if ( is_cuda::value ) + return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size); + if ( is_hip::value ) + return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size); + if ( is_sycl::value ) + return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size); + return -1; + } + + template + struct ComputeResidualVector { + public: + using impl_type = BlockHelperDetails::ImplType; + using node_device_type = typename impl_type::node_device_type; + using execution_space = typename impl_type::execution_space; + using memory_space = typename impl_type::memory_space; + + using local_ordinal_type = typename impl_type::local_ordinal_type; + using size_type = typename impl_type::size_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; + using magnitude_type = typename impl_type::magnitude_type; + using btdm_scalar_type = typename impl_type::btdm_scalar_type; + using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; + /// views + using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using size_type_1d_view = typename impl_type::size_type_1d_view; + using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right) + using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; + using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left) + using vector_type_3d_view = typename impl_type::vector_type_3d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + static constexpr int vector_length = impl_type::vector_length; + + /// team policy member type (used in cuda) + using member_type = typename Kokkos::TeamPolicy::member_type; + + // enum for max blocksize and vector length + enum : int { max_blocksize = 32 }; + + private: + ConstUnmanaged b; + ConstUnmanaged x; // x_owned + ConstUnmanaged x_remote; + Unmanaged y; + Unmanaged y_packed; + Unmanaged y_packed_scalar; + + // AmD information + const ConstUnmanaged rowptr, rowptr_remote; + const ConstUnmanaged colindsub, colindsub_remote; + const ConstUnmanaged tpetra_values; + + // block crs graph information + // for cuda (kokkos crs graph uses a different size_type from size_t) + const ConstUnmanaged > A_rowptr; + const ConstUnmanaged > A_colind; + + // blocksize + const local_ordinal_type blocksize_requested; + + // part interface + const ConstUnmanaged part2packrowidx0; + const ConstUnmanaged part2rowidx0; + const ConstUnmanaged rowidx2part; + const ConstUnmanaged partptr; + const ConstUnmanaged lclrow; + const ConstUnmanaged dm2cm; + const bool is_dm2cm_active; + + public: + template + ComputeResidualVector(const AmD &amd, + const LocalCrsGraphType &graph, + const local_ordinal_type &blocksize_requested_, + const PartInterface &interf, + const local_ordinal_type_1d_view &dm2cm_) + : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote), + colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote), + tpetra_values(amd.tpetra_values), + A_rowptr(graph.row_map), + A_colind(graph.entries), + blocksize_requested(blocksize_requested_), + part2packrowidx0(interf.part2packrowidx0), + part2rowidx0(interf.part2rowidx0), + rowidx2part(interf.rowidx2part), + partptr(interf.partptr), + lclrow(interf.lclrow), + dm2cm(dm2cm_), + is_dm2cm_active(dm2cm_.span() > 0) + {} + + inline + void + SerialGemv(const local_ordinal_type &blocksize, + const impl_scalar_type * const KOKKOS_RESTRICT AA, + const impl_scalar_type * const KOKKOS_RESTRICT xx, + /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const { + using tlb = BlockHelperDetails::TpetraLittleBlock; + for (local_ordinal_type k0=0;k0 + KOKKOS_INLINE_FUNCTION + void + VectorCopy(const member_type &member, + const local_ordinal_type &blocksize, + const bbViewType &bb, + const yyViewType &yy) const { + Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) { + yy(k0) = static_cast(bb(k0)); + }); + } + + template + KOKKOS_INLINE_FUNCTION + void + TeamVectorGemv(const member_type &member, + const local_ordinal_type &blocksize, + const AAViewType &AA, + const xxViewType &xx, + const yyViewType &yy) const { + Kokkos::parallel_for + (Kokkos::TeamThreadRange(member, blocksize), + [&](const local_ordinal_type &k0) { + impl_scalar_type val = 0; + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type &k1) { + val += AA(k0,k1)*xx(k1); + }); + Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val)); + }); + } + + template + KOKKOS_INLINE_FUNCTION + void + VectorGemv(const member_type &member, + const local_ordinal_type &blocksize, + const AAViewType &AA, + const xxViewType &xx, + const yyViewType &yy) const { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, blocksize), + [&](const local_ordinal_type &k0) { + impl_scalar_type val(0); + for (local_ordinal_type k1=0;k1 + // KOKKOS_INLINE_FUNCTION + // void + // VectorGemv(const member_type &member, + // const local_ordinal_type &blocksize, + // const AAViewType &AA, + // const xxViewType &xx, + // const yyViewType &yy) const { + // for (local_ordinal_type k0=0;k0 FIXME HIP: should not need KOKKOS_INLINE_FUNCTION + KOKKOS_INLINE_FUNCTION + void + operator() (const SeqTag &, const local_ordinal_type& i) const { + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type blocksize_square = blocksize*blocksize; + + // constants + const Kokkos::pair block_range(0, blocksize); + const local_ordinal_type num_vectors = y.extent(1); + const local_ordinal_type row = i*blocksize; + for (local_ordinal_type col=0;col block_range(0, blocksize); + const local_ordinal_type num_vectors = y.extent(1); + + // subview pattern + auto bb = Kokkos::subview(b, block_range, 0); + auto xx = bb; + auto yy = Kokkos::subview(y, block_range, 0); + auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); + + const local_ordinal_type row = lr*blocksize; + for (local_ordinal_type col=0;col + struct AsyncTag {}; + + template + // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION + KOKKOS_INLINE_FUNCTION + void + operator() (const AsyncTag &, const local_ordinal_type &rowidx) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type blocksize_square = blocksize*blocksize; + + // constants + const local_ordinal_type partidx = rowidx2part(rowidx); + const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); + const local_ordinal_type v = partidx % vector_length; + + const local_ordinal_type num_vectors = y_packed.extent(2); + const local_ordinal_type num_local_rows = lclrow.extent(0); + + // temporary buffer for y flat + impl_scalar_type yy[B == 0 ? max_blocksize : B] = {}; + + const local_ordinal_type lr = lclrow(rowidx); + const local_ordinal_type row = lr*blocksize; + for (local_ordinal_type col=0;col + KOKKOS_INLINE_FUNCTION + void + operator() (const AsyncTag &, const member_type &member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type blocksize_square = blocksize*blocksize; + + // constants + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type partidx = rowidx2part(rowidx); + const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); + const local_ordinal_type v = partidx % vector_length; + + const Kokkos::pair block_range(0, blocksize); + const local_ordinal_type num_vectors = y_packed_scalar.extent(2); + const local_ordinal_type num_local_rows = lclrow.extent(0); + + // subview pattern + auto bb = Kokkos::subview(b, block_range, 0); + auto xx = Kokkos::subview(x, block_range, 0); + auto xx_remote = Kokkos::subview(x_remote, block_range, 0); + auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0); + auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); + + const local_ordinal_type lr = lclrow(rowidx); + const local_ordinal_type row = lr*blocksize; + for (local_ordinal_type col=0;col struct OverlapTag {}; + + template + // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION + KOKKOS_INLINE_FUNCTION + void + operator() (const OverlapTag &, const local_ordinal_type& rowidx) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type blocksize_square = blocksize*blocksize; + + // constants + const local_ordinal_type partidx = rowidx2part(rowidx); + const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); + const local_ordinal_type v = partidx % vector_length; + + const local_ordinal_type num_vectors = y_packed.extent(2); + const local_ordinal_type num_local_rows = lclrow.extent(0); + + // temporary buffer for y flat + impl_scalar_type yy[max_blocksize] = {}; + + auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); + auto rowptr_used = (P == 0 ? rowptr : rowptr_remote); + + const local_ordinal_type lr = lclrow(rowidx); + const local_ordinal_type row = lr*blocksize; + for (local_ordinal_type col=0;col + KOKKOS_INLINE_FUNCTION + void + operator() (const OverlapTag &, const member_type &member) const { + const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); + const local_ordinal_type blocksize_square = blocksize*blocksize; + + // constants + const local_ordinal_type rowidx = member.league_rank(); + const local_ordinal_type partidx = rowidx2part(rowidx); + const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); + const local_ordinal_type v = partidx % vector_length; + + const Kokkos::pair block_range(0, blocksize); + const local_ordinal_type num_vectors = y_packed_scalar.extent(2); + const local_ordinal_type num_local_rows = lclrow.extent(0); + + // subview pattern + auto bb = Kokkos::subview(b, block_range, 0); + auto xx = bb; //Kokkos::subview(x, block_range, 0); + auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0); + auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0); + auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); + auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); + auto rowptr_used = (P == 0 ? rowptr : rowptr_remote); + + const local_ordinal_type lr = lclrow(rowidx); + const local_ordinal_type row = lr*blocksize; + for (local_ordinal_type col=0;col + void run(const MultiVectorLocalViewTypeY &y_, + const MultiVectorLocalViewTypeB &b_, + const MultiVectorLocalViewTypeX &x_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::"); + + y = y_; b = b_; x = x_; + if constexpr (is_device::value) { + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); + const Kokkos::TeamPolicy policy(rowptr.extent(0) - 1, team_size, vector_size); + Kokkos::parallel_for + ("ComputeResidual::TeamPolicy::run", policy, *this); + } else { + const Kokkos::RangePolicy policy(0, rowptr.extent(0) - 1); + Kokkos::parallel_for + ("ComputeResidual::RangePolicy::run", policy, *this); + } + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + } + + // y = b - R (x , x_remote) + template + void run(const vector_type_3d_view &y_packed_, + const MultiVectorLocalViewTypeB &b_, + const MultiVectorLocalViewTypeX &x_, + const MultiVectorLocalViewTypeX_Remote &x_remote_) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::"); + + b = b_; x = x_; x_remote = x_remote_; + if constexpr (is_device::value) { + y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(), + y_packed_.extent(0), + y_packed_.extent(1), + y_packed_.extent(2), + vector_length); + } else { + y_packed = y_packed_; + } + + if constexpr(is_device::value) { + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); + // local_ordinal_type vl_power_of_two = 1; + // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); + // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); + // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; +#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ + Kokkos::parallel_for \ + ("ComputeResidual::TeamPolicy::run", \ + policy, *this); } break + switch (blocksize_requested) { + case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); + case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); + case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); + case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); + case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); + case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); + case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); + case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); + case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); + default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); + } +#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL + } else { +#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \ + const Kokkos::RangePolicy > policy(0, rowidx2part.extent(0)); \ + Kokkos::parallel_for \ + ("ComputeResidual::RangePolicy::run", \ + policy, *this); } break + switch (blocksize_requested) { + case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); + case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); + case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); + case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); + case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); + case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); + case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); + case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); + case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); + default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); + } +#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL + } + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + } + + // y = b - R (y , y_remote) + template + void run(const vector_type_3d_view &y_packed_, + const MultiVectorLocalViewTypeB &b_, + const MultiVectorLocalViewTypeX &x_, + const MultiVectorLocalViewTypeX_Remote &x_remote_, + const bool compute_owned) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ComputeResidual::"); + + b = b_; x = x_; x_remote = x_remote_; + if constexpr (is_device::value) { + y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(), + y_packed_.extent(0), + y_packed_.extent(1), + y_packed_.extent(2), + vector_length); + } else { + y_packed = y_packed_; + } + + if constexpr (is_device::value) { + const local_ordinal_type blocksize = blocksize_requested; + const local_ordinal_type team_size = 8; + const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); + // local_ordinal_type vl_power_of_two = 1; + // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); + // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); + // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; +#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ + if (compute_owned) { \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ + Kokkos::parallel_for \ + ("ComputeResidual::TeamPolicy::run >", policy, *this); \ + } else { \ + const Kokkos::TeamPolicy > \ + policy(rowidx2part.extent(0), team_size, vector_size); \ + Kokkos::parallel_for \ + ("ComputeResidual::TeamPolicy::run >", policy, *this); \ + } break + switch (blocksize_requested) { + case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); + case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); + case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); + case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); + case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); + case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); + case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); + case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); + case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); + default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); + } +#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL + } else { +#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ + if (compute_owned) { \ + const Kokkos::RangePolicy > \ + policy(0, rowidx2part.extent(0)); \ + Kokkos::parallel_for \ + ("ComputeResidual::RangePolicy::run >", policy, *this); \ + } else { \ + const Kokkos::RangePolicy > \ + policy(0, rowidx2part.extent(0)); \ + Kokkos::parallel_for \ + ("ComputeResidual::RangePolicy::run >", policy, *this); \ + } break + + switch (blocksize_requested) { + case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); + case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); + case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); + case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); + case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); + case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); + case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); + case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); + case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); + default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); + } +#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL + } + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + } + }; + + + } // namespace BlockHelperDetails + +} // namespace Ifpack2 + +#endif diff --git a/packages/ifpack2/src/Ifpack2_BlockHelper.hpp b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp new file mode 100644 index 000000000000..18d6da71be83 --- /dev/null +++ b/packages/ifpack2/src/Ifpack2_BlockHelper.hpp @@ -0,0 +1,539 @@ +/*@HEADER +// *********************************************************************** +// +// Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package +// Copyright (2009) Sandia Corporation +// +// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive +// license for use of this work by or on behalf of the U.S. Government. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// *********************************************************************** +//@HEADER +*/ + +#ifndef IFPACK2_BLOCKHELPER_IMPL_HPP +#define IFPACK2_BLOCKHELPER_IMPL_HPP + + +namespace Ifpack2 { + + namespace BlockHelperDetails { + + namespace KB = KokkosBatched; + + /// + /// view decorators for unmanaged and const memory + /// + using do_not_initialize_tag = Kokkos::ViewAllocateWithoutInitializing; + + template + using MemoryTraits = Kokkos::MemoryTraits; + + template + using Unmanaged = Kokkos::View >; + template + using Atomic = Kokkos::View >; + template + using Const = Kokkos::View; + template + using ConstUnmanaged = Const >; + + template + using AtomicUnmanaged = Atomic >; + + template + using Unmanaged = Kokkos::View >; + + + template + using Scratch = Kokkos::View >; + + /// + /// tpetra little block index + /// + template struct TpetraLittleBlock; + template<> struct TpetraLittleBlock { + template KOKKOS_INLINE_FUNCTION + static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; } + }; + template<> struct TpetraLittleBlock { + template KOKKOS_INLINE_FUNCTION + static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; } + }; + + /// + /// block tridiag scalar type + /// + template struct BlockTridiagScalarType { typedef T type; }; +#if defined(IFPACK2_BLOCKHELPER_USE_SMALL_SCALAR_FOR_BLOCKTRIDIAG) + template<> struct BlockTridiagScalarType { typedef float type; }; + //template<> struct SmallScalarType > { typedef Kokkos::complex type; }; +#endif + + /// + /// cuda specialization + /// + template struct is_cuda { enum : bool { value = false }; }; +#if defined(KOKKOS_ENABLE_CUDA) + template<> struct is_cuda { enum : bool { value = true }; }; +#endif + + /// + /// hip specialization + /// + template struct is_hip { enum : bool { value = false }; }; +#if defined(KOKKOS_ENABLE_HIP) + template<> struct is_hip { enum : bool { value = true }; }; +#endif + + /// + /// sycl specialization + /// + template struct is_sycl { enum : bool { value = false }; }; +#if defined(KOKKOS_ENABLE_SYCL) + template<> struct is_sycl { enum : bool { value = true }; }; +#endif + + template struct is_device { enum : bool { value = is_cuda::value || is_hip::value || is_sycl::value }; }; + + + /// + /// execution space instance + /// + template + struct ExecutionSpaceFactory { + static void createInstance(T &exec_instance) { + exec_instance = T(); + } +#if defined(KOKKOS_ENABLE_CUDA) + static void createInstance(const cudaStream_t &s, T &exec_instance) { + exec_instance = T(); + } +#endif + }; + +#if defined(KOKKOS_ENABLE_CUDA) + template<> + struct ExecutionSpaceFactory { + static void createInstance(Kokkos::Cuda &exec_instance) { + exec_instance = Kokkos::Cuda(); + } + static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) { + exec_instance = Kokkos::Cuda(s); + } + }; +#endif + +#if defined(KOKKOS_ENABLE_HIP) + template<> + struct ExecutionSpaceFactory { + static void createInstance(Kokkos::HIP &exec_instance) { + exec_instance = Kokkos::HIP(); + } + }; +#endif + +#if defined(KOKKOS_ENABLE_SYCL) + template<> + struct ExecutionSpaceFactory { + static void createInstance(Kokkos::Experimental::SYCL &exec_instance) { + exec_instance = Kokkos::Experimental::SYCL(); + } + }; +#endif + + +#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS) +#define IFPACK2_BLOCKHELPER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label); +#define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) execution_space().fence(); +#else +#define IFPACK2_BLOCKHELPER_TIMER(label) +#define IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) +#endif + +#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_ENABLE_PROFILE) +#define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN \ + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaProfilerStart()); + +#define IFPACK2_BLOCKHELPER_PROFILER_REGION_END \ + { KOKKOS_IMPL_CUDA_SAFE_CALL( cudaProfilerStop() ); } +#else + /// later put vtune profiler region +#define IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN +#define IFPACK2_BLOCKHELPER_PROFILER_REGION_END +#endif + + + /// + /// utility functions + /// + template + std::string get_msg_prefix (const CommPtrType &comm) { + const auto rank = comm->getRank(); + const auto nranks = comm->getSize(); + std::stringstream ss; + ss << "Rank " << rank << " of " << nranks << ": "; + return ss.str(); + } + + /// + /// custom multiple varilable reduce and scan + /// + template + struct ArrayValueType { + T v[N]; + KOKKOS_INLINE_FUNCTION + ArrayValueType() { + for (int i=0;iv[i] = 0; + } + KOKKOS_INLINE_FUNCTION + ArrayValueType(const ArrayValueType &b) { + for (int i=0;iv[i] = b.v[i]; + } + }; + template + static + KOKKOS_INLINE_FUNCTION + void + operator+=(ArrayValueType &a, + const ArrayValueType &b) { + for (int i=0;i + struct SumReducer { + typedef SumReducer reducer; + typedef ArrayValueType value_type; + typedef Kokkos::View > result_view_type; + value_type *value; + + KOKKOS_INLINE_FUNCTION + SumReducer(value_type &val) : value(&val) {} + + KOKKOS_INLINE_FUNCTION + void join(value_type &dst, value_type const &src) const { + for (int i=0;i::sum(); + } + KOKKOS_INLINE_FUNCTION + value_type& reference() { + return *value; + } + KOKKOS_INLINE_FUNCTION + result_view_type view() const { + return result_view_type(value); + } + }; + + + /// + /// implementation typedefs + /// + template + struct ImplType { + /// + /// matrix type derived types + /// + typedef size_t size_type; + typedef typename MatrixType::scalar_type scalar_type; + typedef typename MatrixType::local_ordinal_type local_ordinal_type; + typedef typename MatrixType::global_ordinal_type global_ordinal_type; + typedef typename MatrixType::node_type node_type; + + /// + /// kokkos arithmetic traits of scalar_type + /// + typedef typename Kokkos::Details::ArithTraits::val_type impl_scalar_type; + typedef typename Kokkos::ArithTraits::mag_type magnitude_type; + + typedef typename BlockTridiagScalarType::type btdm_scalar_type; + typedef typename Kokkos::ArithTraits::mag_type btdm_magnitude_type; + + /// + /// default host execution space + /// + typedef Kokkos::DefaultHostExecutionSpace host_execution_space; + + /// + /// tpetra types + /// + typedef typename node_type::device_type node_device_type; + typedef typename node_device_type::execution_space node_execution_space; + typedef typename node_device_type::memory_space node_memory_space; + +#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKHELPER_USE_CUDA_SPACE) + /// force to use cuda space instead uvm space + typedef node_execution_space execution_space; + typedef typename std::conditional::value, + Kokkos::CudaSpace, + node_memory_space>::type memory_space; + typedef Kokkos::Device device_type; +#else + typedef node_device_type device_type; + typedef node_execution_space execution_space; + typedef node_memory_space memory_space; +#endif + + typedef Tpetra::MultiVector tpetra_multivector_type; + typedef Tpetra::Map tpetra_map_type; + typedef Tpetra::Import tpetra_import_type; + typedef Tpetra::RowMatrix tpetra_row_matrix_type; + typedef Tpetra::BlockCrsMatrix tpetra_block_crs_matrix_type; + typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type; + typedef Tpetra::BlockMultiVector tpetra_block_multivector_type; + typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type; + + /// + /// simd vectorization + /// + template using Vector = KB::Vector; + template using SIMD = KB::SIMD; + template using DefaultVectorLength = KB::DefaultVectorLength; + template using DefaultInternalVectorLength = KB::DefaultInternalVectorLength; + + static constexpr int vector_length = 1; //DefaultVectorLength::value; + static constexpr int internal_vector_length = 1; //DefaultInternalVectorLength::value; + typedef Vector,vector_length> vector_type; + typedef Vector,internal_vector_length> internal_vector_type; + + /// + /// commonly used view types + /// + typedef Kokkos::View size_type_1d_view; + typedef Kokkos::View size_type_2d_view; + typedef Kokkos::View local_ordinal_type_1d_view; + typedef Kokkos::View local_ordinal_type_2d_view; + // tpetra block crs values + typedef Kokkos::View impl_scalar_type_1d_view; + typedef Kokkos::View impl_scalar_type_1d_view_tpetra; + + // tpetra multivector values (layout left): may need to change the typename more explicitly + typedef Kokkos::View impl_scalar_type_2d_view; + typedef Kokkos::View impl_scalar_type_2d_view_tpetra; + + // packed data always use layout right + typedef Kokkos::View vector_type_1d_view; + typedef Kokkos::View vector_type_3d_view; + typedef Kokkos::View vector_type_4d_view; + typedef Kokkos::View internal_vector_type_3d_view; + typedef Kokkos::View internal_vector_type_4d_view; + typedef Kokkos::View internal_vector_type_5d_view; + typedef Kokkos::View btdm_scalar_type_3d_view; + typedef Kokkos::View btdm_scalar_type_4d_view; + typedef Kokkos::View btdm_scalar_type_5d_view; + }; + + + /// + /// Manage the distributed part of the computation of residual norms. + /// + template + struct NormManager { + public: + using impl_type = ImplType; + using host_execution_space = typename impl_type::host_execution_space; + using magnitude_type = typename impl_type::magnitude_type; + + private: + bool collective_; + int sweep_step_, sweep_step_upper_bound_; +#ifdef HAVE_IFPACK2_MPI + MPI_Request mpi_request_; + MPI_Comm comm_; +#endif + magnitude_type work_[3]; + + public: + NormManager() = default; + NormManager(const NormManager &b) = default; + NormManager(const Teuchos::RCP >& comm) { + sweep_step_ = 1; + sweep_step_upper_bound_ = 1; + collective_ = comm->getSize() > 1; + if (collective_) { +#ifdef HAVE_IFPACK2_MPI + const auto mpi_comm = Teuchos::rcp_dynamic_cast >(comm); + TEUCHOS_ASSERT( ! mpi_comm.is_null()); + comm_ = *mpi_comm->getRawMpiComm(); +#endif + } + const magnitude_type zero(0), minus_one(-1); + work_[0] = zero; + work_[1] = zero; + work_[2] = minus_one; + } + + // Check the norm every sweep_step sweeps. + void setCheckFrequency(const int sweep_step) { + TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1"); + sweep_step_upper_bound_ = sweep_step; + sweep_step_ = 1; + } + + // Get the buffer into which to store rank-local squared norms. + magnitude_type* getBuffer() { return &work_[0]; } + + // Call MPI_Iallreduce to find the global squared norms. + void ireduce(const int sweep, const bool force = false) { + if ( ! force && sweep % sweep_step_) return; + + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::Ireduce"); + + work_[1] = work_[0]; +#ifdef HAVE_IFPACK2_MPI + auto send_data = &work_[1]; + auto recv_data = &work_[0]; + if (collective_) { +# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3) + MPI_Iallreduce(send_data, recv_data, 1, + Teuchos::Details::MpiTypeTraits::getType(), + MPI_SUM, comm_, &mpi_request_); +# else + MPI_Allreduce (send_data, recv_data, 1, + Teuchos::Details::MpiTypeTraits::getType(), + MPI_SUM, comm_); +# endif + } +#endif + } + + // Check if the norm-based termination criterion is met. tol2 is the + // tolerance squared. Sweep is the sweep index. If not every iteration is + // being checked, this function immediately returns false. If a check must + // be done at this iteration, it waits for the reduction triggered by + // ireduce to complete, then checks the global norm against the tolerance. + bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) { + // early return + if (sweep <= 0) return false; + + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NormManager::CheckDone"); + + TEUCHOS_ASSERT(sweep >= 1); + if ( ! force && (sweep - 1) % sweep_step_) return false; + if (collective_) { +#ifdef HAVE_IFPACK2_MPI +# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3) + MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE); +# else + // Do nothing. +# endif +#endif + } + bool r_val = false; + if (sweep == 1) { + work_[2] = work_[0]; + } else { + r_val = (work_[0] < tol2*work_[2]); + } + + // adjust sweep step + const auto adjusted_sweep_step = 2*sweep_step_; + if (adjusted_sweep_step < sweep_step_upper_bound_) { + sweep_step_ = adjusted_sweep_step; + } else { + sweep_step_ = sweep_step_upper_bound_; + } + return r_val; + } + + // After termination has occurred, finalize the norms for use in + // get_norms{0,final}. + void finalize () { + work_[0] = std::sqrt(work_[0]); // after converged + if (work_[2] >= 0) + work_[2] = std::sqrt(work_[2]); // first norm + // if work_[2] is minus one, then norm is not requested. + } + + // Report norms to the caller. + const magnitude_type getNorms0 () const { return work_[2]; } + const magnitude_type getNormsFinal () const { return work_[0]; } + }; + + template + void reduceVector(const ConstUnmanaged::impl_scalar_type_1d_view> zz, + /* */ typename BlockHelperDetails::ImplType::magnitude_type *vals) { + IFPACK2_BLOCKHELPER_PROFILER_REGION_BEGIN; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ReduceVector"); + + using impl_type = BlockHelperDetails::ImplType; + using local_ordinal_type = typename impl_type::local_ordinal_type; + using impl_scalar_type = typename impl_type::impl_scalar_type; +#if 0 + const auto norm2 = KokkosBlas::nrm1(zz); +#else + impl_scalar_type norm2(0); + Kokkos::parallel_reduce + ("ReduceMultiVector::Device", + Kokkos::RangePolicy(0,zz.extent(0)), + KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) { + update += zz(i); + }, norm2); +#endif + vals[0] = Kokkos::ArithTraits::abs(norm2); + + IFPACK2_BLOCKHELPER_PROFILER_REGION_END; + } + + } // namespace BlockHelperDetails + +} // namespace Ifpack2 + +#endif diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp index 6cbcc0f839d7..b085eb8e8741 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_decl.hpp @@ -228,6 +228,7 @@ namespace Ifpack2 { /// purposes only. BlockTriDiContainer (const Teuchos::RCP& matrix, const Teuchos::Array >& partitions, + const int n_subparts_per_part = 1, bool overlapCommAndComp = false, bool useSequentialMethod = false); //! Destructor (declared virtual for memory safety of derived classes). @@ -389,11 +390,13 @@ namespace Ifpack2 { // hide details of impl using ImplObj; finally I understand why AMB did that way. Teuchos::RCP > impl_; + int n_subparts_per_part_; // initialize distributed and local objects void initInternal (const Teuchos::RCP& matrix, const Teuchos::Array >& partitions, const Teuchos::RCP &importer, + const int n_subparts_per_part, const bool overlapCommAndComp, const bool useSeqMethod); diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp index 49b4eecaa547..65bc2964405a 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_def.hpp @@ -81,34 +81,50 @@ namespace Ifpack2 { ::initInternal (const Teuchos::RCP& matrix, const Teuchos::Array >& partitions, const Teuchos::RCP& importer, + const int n_subparts_per_part, const bool overlapCommAndComp, const bool useSeqMethod) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initInternal"); + n_subparts_per_part_ = n_subparts_per_part; + // create pointer of impl - impl_ = Teuchos::rcp(new BlockTriDiContainerDetails::ImplObject()); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createImpl"); + impl_ = Teuchos::rcp(new BlockTriDiContainerDetails::ImplObject()); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } - using impl_type = BlockTriDiContainerDetails::ImplType; + using impl_type = BlockHelperDetails::ImplType; // using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; - impl_->A = Teuchos::rcp_dynamic_cast(matrix); - TEUCHOS_TEST_FOR_EXCEPT_MSG - (impl_->A.is_null(), "BlockTriDiContainer currently supports Tpetra::BlockCrsMatrix only."); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::setA"); + impl_->A = Teuchos::rcp_dynamic_cast(matrix); + TEUCHOS_TEST_FOR_EXCEPT_MSG + (impl_->A.is_null(), "BlockTriDiContainer currently supports Tpetra::BlockCrsMatrix only."); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } impl_->tpetra_importer = Teuchos::null; impl_->async_importer = Teuchos::null; if (useSeqMethod) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createBlockCrsTpetraImporter useSeqMethod"); if (importer.is_null()) // there is no given importer, then create one impl_->tpetra_importer = BlockTriDiContainerDetails::createBlockCrsTpetraImporter(impl_->A); else impl_->tpetra_importer = importer; // if there is a given importer, use it + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } else { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createBlockCrsTpetraImporter"); //Leave tpetra_importer null even if user provided an importer. //It is not used in the performant codepath (!useSeqMethod) impl_->async_importer = BlockTriDiContainerDetails::createBlockCrsAsyncImporter(impl_->A); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } // as a result, there are @@ -118,13 +134,26 @@ namespace Ifpack2 { // temporary disabling impl_->overlap_communication_and_computation = overlapCommAndComp; - - impl_->Z = typename impl_type::tpetra_multivector_type(); - impl_->W = typename impl_type::impl_scalar_type_1d_view(); - impl_->part_interface = BlockTriDiContainerDetails::createPartInterface(impl_->A, partitions); - impl_->block_tridiags = BlockTriDiContainerDetails::createBlockTridiags(impl_->part_interface); - impl_->norm_manager = BlockTriDiContainerDetails::NormManager(impl_->A->getComm()); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createZ"); + impl_->Z = typename impl_type::tpetra_multivector_type(); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createW"); + impl_->W = typename impl_type::impl_scalar_type_1d_view(); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } + + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::createPartInterfaceBlockTridiagsNormManager"); + impl_->part_interface = BlockTriDiContainerDetails::createPartInterface(impl_->A, partitions, n_subparts_per_part_); + impl_->block_tridiags = BlockTriDiContainerDetails::createBlockTridiags(impl_->part_interface); + impl_->norm_manager = BlockHelperDetails::NormManager(impl_->A->getComm()); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + } + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -132,11 +161,12 @@ namespace Ifpack2 { BlockTriDiContainer ::clearInternal () { - using impl_type = BlockTriDiContainerDetails::ImplType; - using part_interface_type = BlockTriDiContainerDetails::PartInterface; + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::clearInternal"); + using impl_type = BlockHelperDetails::ImplType; + using part_interface_type = BlockHelperDetails::PartInterface; using block_tridiags_type = BlockTriDiContainerDetails::BlockTridiags; - using amd_type = BlockTriDiContainerDetails::AmD; - using norm_manager_type = BlockTriDiContainerDetails::NormManager; + using amd_type = BlockHelperDetails::AmD; + using norm_manager_type = BlockHelperDetails::NormManager; impl_->A = Teuchos::null; impl_->tpetra_importer = Teuchos::null; @@ -152,6 +182,7 @@ namespace Ifpack2 { impl_->norm_manager = norm_manager_type(); impl_ = Teuchos::null; + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -162,19 +193,25 @@ namespace Ifpack2 { bool pointIndexed) : Container(matrix, partitions, pointIndexed) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer"); const bool useSeqMethod = false; const bool overlapCommAndComp = false; - initInternal(matrix, partitions, importer, overlapCommAndComp, useSeqMethod); + initInternal(matrix, partitions, importer, 1, overlapCommAndComp, useSeqMethod); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template BlockTriDiContainer ::BlockTriDiContainer (const Teuchos::RCP& matrix, const Teuchos::Array >& partitions, - const bool overlapCommAndComp, const bool useSeqMethod) + const int n_subparts_per_part, + const bool overlapCommAndComp, + const bool useSeqMethod) : Container(matrix, partitions, false) { - initInternal(matrix, partitions, Teuchos::null, overlapCommAndComp, useSeqMethod); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::BlockTriDiContainer"); + initInternal(matrix, partitions, Teuchos::null, n_subparts_per_part, overlapCommAndComp, useSeqMethod); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -196,6 +233,7 @@ namespace Ifpack2 { BlockTriDiContainer ::initialize () { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::initialize"); this->IsInitialized_ = true; // We assume that if you called this method, you intend to recompute // everything. @@ -208,6 +246,7 @@ namespace Ifpack2 { impl_->a_minus_d, impl_->overlap_communication_and_computation); } + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -215,6 +254,7 @@ namespace Ifpack2 { BlockTriDiContainer ::compute () { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::compute"); this->IsComputed_ = false; if (!this->isInitialized()) this->initialize(); @@ -225,6 +265,7 @@ namespace Ifpack2 { Kokkos::ArithTraits::zero()); } this->IsComputed_ = true; + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -232,10 +273,12 @@ namespace Ifpack2 { BlockTriDiContainer ::clearBlocks () { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::clearBlocks"); clearInternal(); this->IsInitialized_ = false; this->IsComputed_ = false; Container::clearBlocks(); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -244,6 +287,7 @@ namespace Ifpack2 { ::applyInverseJacobi (const mv_type& X, mv_type& Y, scalar_type dampingFactor, bool zeroStartingSolution, int numSweeps) const { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi"); const magnitude_type tol = Kokkos::ArithTraits::zero(); const int check_tol_every = 1; @@ -261,6 +305,7 @@ namespace Ifpack2 { numSweeps, tol, check_tol_every); + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -276,6 +321,7 @@ namespace Ifpack2 { BlockTriDiContainer ::compute (const ComputeParameters& in) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::compute"); this->IsComputed_ = false; if (!this->isInitialized()) this->initialize(); @@ -286,6 +332,7 @@ namespace Ifpack2 { in.addRadiallyToDiagonal); } this->IsComputed_ = true; + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } template @@ -304,6 +351,7 @@ namespace Ifpack2 { ::applyInverseJacobi (const mv_type& X, mv_type& Y, const ApplyParameters& in) const { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDiContainer::applyInverseJacobi"); int r_val = 0; { r_val = BlockTriDiContainerDetails::applyInverseJacobi @@ -321,6 +369,7 @@ namespace Ifpack2 { in.tolerance, in.checkToleranceEvery); } + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) return r_val; } diff --git a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp index 2b2563567eca..7d992cb51d3c 100644 --- a/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp +++ b/packages/ifpack2/src/Ifpack2_BlockTriDiContainer_impl.hpp @@ -43,6 +43,9 @@ #ifndef IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP #define IFPACK2_BLOCKTRIDICONTAINER_IMPL_HPP +//#define IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM +//#define IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + #include #include @@ -78,6 +81,11 @@ #include +#include "Ifpack2_BlockHelper.hpp" +#include "Ifpack2_BlockComputeResidualVector.hpp" + +//#include + // need to interface this into cmake variable (or only use this flag when it is necessary) //#define IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE //#undef IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE @@ -159,19 +167,6 @@ namespace Ifpack2 { typename ViewType::execution_space::scratch_memory_space, MemoryTraits >; - /// - /// tpetra little block index - /// - template struct TpetraLittleBlock; - template<> struct TpetraLittleBlock { - template KOKKOS_INLINE_FUNCTION - static T getFlatIndex(const T i, const T j, const T blksize) { return i+j*blksize; } - }; - template<> struct TpetraLittleBlock { - template KOKKOS_INLINE_FUNCTION - static T getFlatIndex(const T i, const T j, const T blksize) { return i*blksize+j; } - }; - /// /// block tridiag scalar type /// @@ -181,159 +176,6 @@ namespace Ifpack2 { //template<> struct SmallScalarType > { typedef Kokkos::complex type; }; #endif - /// - /// cuda specialization - /// - template struct is_cuda { enum : bool { value = false }; }; -#if defined(KOKKOS_ENABLE_CUDA) - template<> struct is_cuda { enum : bool { value = true }; }; -#endif - - /// - /// hip specialization - /// - template struct is_hip { enum : bool { value = false }; }; -#if defined(KOKKOS_ENABLE_HIP) - template<> struct is_hip { enum : bool { value = true }; }; -#endif - - /// - /// sycl specialization - /// - template struct is_sycl { enum : bool { value = false }; }; -#if defined(KOKKOS_ENABLE_SYCL) - template<> struct is_sycl { enum : bool { value = true }; }; -#endif - - template struct is_device { enum : bool { value = is_cuda::value || is_hip::value || is_sycl::value }; }; - - - /// - /// execution space instance - /// - template - struct ExecutionSpaceFactory { - static void createInstance(T &exec_instance) { - exec_instance = T(); - } -#if defined(KOKKOS_ENABLE_CUDA) - static void createInstance(const cudaStream_t &s, T &exec_instance) { - exec_instance = T(); - } -#endif - }; - -#if defined(KOKKOS_ENABLE_CUDA) - template<> - struct ExecutionSpaceFactory { - static void createInstance(Kokkos::Cuda &exec_instance) { - exec_instance = Kokkos::Cuda(); - } - static void createInstance(const cudaStream_t &s, Kokkos::Cuda &exec_instance) { - exec_instance = Kokkos::Cuda(s); - } - }; -#endif - -#if defined(KOKKOS_ENABLE_HIP) - template<> - struct ExecutionSpaceFactory { - static void createInstance(Kokkos::HIP &exec_instance) { - exec_instance = Kokkos::HIP(); - } - }; -#endif - -#if defined(KOKKOS_ENABLE_SYCL) - template<> - struct ExecutionSpaceFactory { - static void createInstance(Kokkos::Experimental::SYCL &exec_instance) { - exec_instance = Kokkos::Experimental::SYCL(); - } - }; -#endif - - - - /// - /// utility functions - /// - template - std::string get_msg_prefix (const CommPtrType &comm) { - const auto rank = comm->getRank(); - const auto nranks = comm->getSize(); - std::stringstream ss; - ss << "Rank " << rank << " of " << nranks << ": "; - return ss.str(); - } - - /// - /// custom multiple varilable reduce and scan - /// - template - struct ArrayValueType { - T v[N]; - KOKKOS_INLINE_FUNCTION - ArrayValueType() { - for (int i=0;iv[i] = 0; - } - KOKKOS_INLINE_FUNCTION - ArrayValueType(const ArrayValueType &b) { - for (int i=0;iv[i] = b.v[i]; - } - }; - template - static - KOKKOS_INLINE_FUNCTION - void - operator+=(ArrayValueType &a, - const ArrayValueType &b) { - for (int i=0;i - struct SumReducer { - typedef SumReducer reducer; - typedef ArrayValueType value_type; - typedef Kokkos::View > result_view_type; - value_type *value; - - KOKKOS_INLINE_FUNCTION - SumReducer(value_type &val) : value(&val) {} - - KOKKOS_INLINE_FUNCTION - void join(value_type &dst, value_type const &src) const { - for (int i=0;i::sum(); - } - KOKKOS_INLINE_FUNCTION - value_type& reference() { - return *value; - } - KOKKOS_INLINE_FUNCTION - result_view_type view() const { - return result_view_type(value); - } - }; - -#if defined(HAVE_IFPACK2_BLOCKTRIDICONTAINER_TIMERS) -#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) TEUCHOS_FUNC_TIME_MONITOR(label); -#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) execution_space().fence(); -#else -#define IFPACK2_BLOCKTRIDICONTAINER_TIMER(label) -#define IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) -#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_ENABLE_PROFILE) #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN \ @@ -347,106 +189,14 @@ namespace Ifpack2 { #define IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END #endif - /// - /// implementation typedefs - /// - template - struct ImplType { - /// - /// matrix type derived types - /// - typedef size_t size_type; - typedef typename MatrixType::scalar_type scalar_type; - typedef typename MatrixType::local_ordinal_type local_ordinal_type; - typedef typename MatrixType::global_ordinal_type global_ordinal_type; - typedef typename MatrixType::node_type node_type; - - /// - /// kokkos arithmetic traits of scalar_type - /// - typedef typename Kokkos::ArithTraits::val_type impl_scalar_type; - typedef typename Kokkos::ArithTraits::mag_type magnitude_type; - - typedef typename BlockTridiagScalarType::type btdm_scalar_type; - typedef typename Kokkos::ArithTraits::mag_type btdm_magnitude_type; - - /// - /// default host execution space - /// - typedef Kokkos::DefaultHostExecutionSpace host_execution_space; - - /// - /// tpetra types - /// - typedef typename node_type::device_type node_device_type; - typedef typename node_device_type::execution_space node_execution_space; - typedef typename node_device_type::memory_space node_memory_space; - -#if defined(KOKKOS_ENABLE_CUDA) && defined(IFPACK2_BLOCKTRIDICONTAINER_USE_CUDA_SPACE) - /// force to use cuda space instead uvm space - typedef node_execution_space execution_space; - typedef typename std::conditional::value, - Kokkos::CudaSpace, - node_memory_space>::type memory_space; - typedef Kokkos::Device device_type; -#else - typedef node_device_type device_type; - typedef node_execution_space execution_space; - typedef node_memory_space memory_space; -#endif - - typedef Tpetra::MultiVector tpetra_multivector_type; - typedef Tpetra::Map tpetra_map_type; - typedef Tpetra::Import tpetra_import_type; - typedef Tpetra::RowMatrix tpetra_row_matrix_type; - typedef Tpetra::BlockCrsMatrix tpetra_block_crs_matrix_type; - typedef typename tpetra_block_crs_matrix_type::little_block_type tpetra_block_access_view_type; - typedef Tpetra::BlockMultiVector tpetra_block_multivector_type; - typedef typename tpetra_block_crs_matrix_type::crs_graph_type::local_graph_device_type local_crs_graph_type; - - /// - /// simd vectorization - /// - template using Vector = KB::Vector; - template using SIMD = KB::SIMD; - template using DefaultVectorLength = KB::DefaultVectorLength; - template using DefaultInternalVectorLength = KB::DefaultInternalVectorLength; - - static constexpr int vector_length = DefaultVectorLength::value; - static constexpr int internal_vector_length = DefaultInternalVectorLength::value; - typedef Vector,vector_length> vector_type; - typedef Vector,internal_vector_length> internal_vector_type; - - /// - /// commonly used view types - /// - typedef Kokkos::View size_type_1d_view; - typedef Kokkos::View local_ordinal_type_1d_view; - // tpetra block crs values - typedef Kokkos::View impl_scalar_type_1d_view; - typedef Kokkos::View impl_scalar_type_1d_view_tpetra; - - // tpetra multivector values (layout left): may need to change the typename more explicitly - typedef Kokkos::View impl_scalar_type_2d_view; - typedef Kokkos::View impl_scalar_type_2d_view_tpetra; - - // packed data always use layout right - typedef Kokkos::View vector_type_1d_view; - typedef Kokkos::View vector_type_3d_view; - typedef Kokkos::View internal_vector_type_3d_view; - typedef Kokkos::View internal_vector_type_4d_view; - typedef Kokkos::View btdm_scalar_type_3d_view; - typedef Kokkos::View btdm_scalar_type_4d_view; - }; - /// /// setup sequential importer /// template - typename Teuchos::RCP::tpetra_import_type> - createBlockCrsTpetraImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter"); - using impl_type = ImplType; + typename Teuchos::RCP::tpetra_import_type> + createBlockCrsTpetraImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::CreateBlockCrsTpetraImporter"); + using impl_type = BlockHelperDetails::ImplType; using tpetra_map_type = typename impl_type::tpetra_map_type; using tpetra_mv_type = typename impl_type::tpetra_block_multivector_type; using tpetra_import_type = typename impl_type::tpetra_import_type; @@ -455,11 +205,8 @@ namespace Ifpack2 { const auto blocksize = A->getBlockSize(); const auto src = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getDomainMap(), blocksize))); const auto tgt = Teuchos::rcp(new tpetra_map_type(tpetra_mv_type::makePointMap(*g.getColMap() , blocksize))); - - auto blockCrsTpetraImporter = Teuchos::rcp(new tpetra_import_type(src, tgt)); - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType::execution_space) - - return blockCrsTpetraImporter; + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) + return Teuchos::rcp(new tpetra_import_type(src, tgt)); } // Partial replacement for forward-mode MultiVector::doImport. @@ -470,7 +217,7 @@ namespace Ifpack2 { template struct AsyncableImport { public: - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; private: /// @@ -794,7 +541,7 @@ namespace Ifpack2 { } void asyncSendRecvVar1(const impl_scalar_type_2d_view_tpetra &mv) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv"); #ifdef HAVE_IFPACK2_MPI // constants and reallocate data buffers if necessary @@ -845,11 +592,11 @@ namespace Ifpack2 { MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat); } #endif // HAVE_IFPACK2_MPI - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) } void syncRecvVar1() { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv"); #ifdef HAVE_IFPACK2_MPI // 0. wait for receive async. for (local_ordinal_type i=0;i(pids.recv.extent(0));++i) { @@ -871,6 +618,7 @@ namespace Ifpack2 { // 2. cleanup all open comm waitall(reqs.send.size(), reqs.send.data()); #endif // HAVE_IFPACK2_MPI + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) } #endif //defined(KOKKOS_ENABLE_CUDA|HIP|SYCL) @@ -891,7 +639,7 @@ namespace Ifpack2 { const local_ordinal_type mv_blocksize = blocksize_*num_vectors; const local_ordinal_type idiff = iend_ - ibeg_; const auto abase = buffer_.data() + mv_blocksize*ibeg_; - if constexpr (is_device::value) { + if constexpr (BlockHelperDetails::is_device::value) { using team_policy_type = Kokkos::TeamPolicy; local_ordinal_type vector_size(0); if (blocksize_ <= 4) vector_size = 4; @@ -940,7 +688,7 @@ namespace Ifpack2 { /// standard comm /// void asyncSendRecvVar0(const impl_scalar_type_2d_view_tpetra &mv) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::AsyncSendRecv"); #ifdef HAVE_IFPACK2_MPI // constants and reallocate data buffers if necessary @@ -978,11 +726,11 @@ namespace Ifpack2 { MPI_Iprobe(pids.recv[i], 42, comm, &flag, &stat); } #endif - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) } void syncRecvVar0() { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::AsyncableImport::SyncRecv"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::AsyncableImport::SyncRecv"); #ifdef HAVE_IFPACK2_MPI // receive async. for (local_ordinal_type i=0,iend=pids.recv.extent(0);i Teuchos::RCP > - createBlockCrsAsyncImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { - using impl_type = ImplType; + createBlockCrsAsyncImporter(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A) { + using impl_type = BlockHelperDetails::ImplType; using tpetra_map_type = typename impl_type::tpetra_map_type; using local_ordinal_type = typename impl_type::local_ordinal_type; using global_ordinal_type = typename impl_type::global_ordinal_type; @@ -1094,79 +843,37 @@ namespace Ifpack2 { return Teuchos::null; } - template - struct PartInterface { - using local_ordinal_type = typename ImplType::local_ordinal_type; - using local_ordinal_type_1d_view = typename ImplType::local_ordinal_type_1d_view; - - PartInterface() = default; - PartInterface(const PartInterface &b) = default; - - // Some terms: - // The matrix A is split as A = D + R, where D is the matrix of tridiag - // blocks and R is the remainder. - // A part is roughly a synonym for a tridiag. The distinction is that a part - // is the set of rows belonging to one tridiag and, equivalently, the off-diag - // rows in R associated with that tridiag. In contrast, the term tridiag is - // used to refer specifically to tridiag data, such as the pointer into the - // tridiag data array. - // Local (lcl) row arge the LIDs. lclrow lists the LIDs belonging to each - // tridiag, and partptr points to the beginning of each tridiag. This is the - // LID space. - // Row index (idx) is the ordinal in the tridiag ordering. lclrow is indexed - // by this ordinal. This is the 'index' space. - // A flat index is the mathematical index into an array. A pack index - // accounts for SIMD packing. - - // Local row LIDs. Permutation from caller's index space to tridiag index - // space. - local_ordinal_type_1d_view lclrow; - // partptr_ is the pointer array into lclrow_. - local_ordinal_type_1d_view partptr; // np+1 - // packptr_(i), for i the pack index, indexes partptr_. partptr_(packptr_(i)) - // is the start of the i'th pack. - local_ordinal_type_1d_view packptr; // npack+1 - // part2rowidx0_(i) is the flat row index of the start of the i'th part. It's - // an alias of partptr_ in the case of no overlap. - local_ordinal_type_1d_view part2rowidx0; // np+1 - // part2packrowidx0_(i) is the packed row index. If vector_length is 1, then - // it's the same as part2rowidx0_; if it's > 1, then the value is combined - // with i % vector_length to get the location in the packed data. - local_ordinal_type_1d_view part2packrowidx0; // np+1 - local_ordinal_type part2packrowidx0_back; // So we don't need to grab the array from the GPU. - // rowidx2part_ maps the row index to the part index. - local_ordinal_type_1d_view rowidx2part; // nr - // True if lcl{row|col} is at most a constant away from row{idx|col}. In - // practice, this knowledge is not particularly useful, as packing for batched - // processing is done at the same time as the permutation from LID to index - // space. But it's easy to detect, so it's recorded in case an optimization - // can be made based on it. - bool row_contiguous; - - local_ordinal_type max_partsz; - }; - /// /// setup part interface using the container partitions array /// template - PartInterface - createPartInterface(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, - const Teuchos::Array::local_ordinal_type> > &partitions) { - using impl_type = ImplType; + BlockHelperDetails::PartInterface + createPartInterface(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + const Teuchos::Array::local_ordinal_type> > &partitions, + const typename BlockHelperDetails::ImplType::local_ordinal_type n_subparts_per_part) { + IFPACK2_BLOCKHELPER_TIMER("createPartInterface"); + using impl_type = BlockHelperDetails::ImplType; using local_ordinal_type = typename impl_type::local_ordinal_type; using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; + using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; + using size_type = typename impl_type::size_type; constexpr int vector_length = impl_type::vector_length; const auto comm = A->getRowMap()->getComm(); - PartInterface interf; + BlockHelperDetails::PartInterface interf; const bool jacobi = partitions.size() == 0; const local_ordinal_type A_n_lclrows = A->getLocalNumRows(); const local_ordinal_type nparts = jacobi ? A_n_lclrows : partitions.size(); + // Total number of sub lines: + const local_ordinal_type n_sub_parts = nparts * n_subparts_per_part; + // Total number of sub lines + the Schur complement blocks. + // For a given live 2 sub lines implies one Schur complement, 3 sub lines implies two Schur complements etc. + const local_ordinal_type n_sub_parts_and_schur = n_sub_parts + nparts * (n_subparts_per_part-1); + #if defined(BLOCKTRIDICONTAINER_DEBUG) local_ordinal_type nrows = 0; if (jacobi) @@ -1175,7 +882,7 @@ namespace Ifpack2 { for (local_ordinal_type i=0;i p; if (jacobi) { interf.max_partsz = 1; + interf.max_subpartsz = 0; + interf.n_subparts_per_part = 1; + interf.nparts = nparts; } else { // reorder parts to maximize simd packing efficiency p.resize(nparts); @@ -1199,6 +909,14 @@ namespace Ifpack2 { p[i] = partsz[i].second; interf.max_partsz = partsz[0].first; + + constexpr local_ordinal_type connection_length = 2; + const local_ordinal_type sub_line_length = (interf.max_partsz - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part; + const local_ordinal_type last_sub_line_length = interf.max_partsz - (n_subparts_per_part - 1) * (connection_length + sub_line_length); + + interf.max_subpartsz = (sub_line_length > last_sub_line_length) ? sub_line_length : last_sub_line_length; + interf.n_subparts_per_part = n_subparts_per_part; + interf.nparts = nparts; } // allocate parts @@ -1208,79 +926,319 @@ namespace Ifpack2 { interf.part2packrowidx0 = local_ordinal_type_1d_view(do_not_initialize_tag("part2packrowidx0"), nparts + 1); interf.rowidx2part = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows); + interf.part2rowidx0_sub = local_ordinal_type_1d_view(do_not_initialize_tag("part2rowidx0_sub"), n_sub_parts_and_schur + 1); + interf.part2packrowidx0_sub = local_ordinal_type_2d_view(do_not_initialize_tag("part2packrowidx0_sub"), nparts, 2 * n_subparts_per_part); + interf.rowidx2part_sub = local_ordinal_type_1d_view(do_not_initialize_tag("rowidx2part"), A_n_lclrows); + + interf.partptr_sub = local_ordinal_type_2d_view(do_not_initialize_tag("partptr_sub"), n_sub_parts_and_schur, 2); + // mirror to host and compute on host execution space const auto partptr = Kokkos::create_mirror_view(interf.partptr); + const auto partptr_sub = Kokkos::create_mirror_view(interf.partptr_sub); + const auto lclrow = Kokkos::create_mirror_view(interf.lclrow); const auto part2rowidx0 = Kokkos::create_mirror_view(interf.part2rowidx0); const auto part2packrowidx0 = Kokkos::create_mirror_view(interf.part2packrowidx0); const auto rowidx2part = Kokkos::create_mirror_view(interf.rowidx2part); + const auto part2rowidx0_sub = Kokkos::create_mirror_view(interf.part2rowidx0_sub); + const auto part2packrowidx0_sub = Kokkos::create_mirror_view(Kokkos::HostSpace(), interf.part2packrowidx0_sub); + const auto rowidx2part_sub = Kokkos::create_mirror_view(interf.rowidx2part_sub); + // Determine parts. interf.row_contiguous = true; partptr(0) = 0; part2rowidx0(0) = 0; part2packrowidx0(0) = 0; local_ordinal_type pack_nrows = 0; + local_ordinal_type pack_nrows_sub = 0; if (jacobi) { - for (local_ordinal_type ip=0;ip= A_n_lclrows, - get_msg_prefix(comm) - << "partitions[" << p[ip] << "][" - << i << "] = " << lcl_row - << " but input matrix implies limits of [0, " << A_n_lclrows-1 - << "]."); - lclrow(os+i) = lcl_row; - rowidx2part(os+i) = ip; - if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row) - interf.row_contiguous = false; - } - partptr(ip+1) = os + ipnrows; - } + IFPACK2_BLOCKHELPER_TIMER("compute part indices (Jacobi)"); + for (local_ordinal_type ip=0;ip= A_n_lclrows, + BlockHelperDetails::get_msg_prefix(comm) + << "partitions[" << p[ip] << "][" + << i << "] = " << lcl_row + << " but input matrix implies limits of [0, " << A_n_lclrows-1 + << "]."); + lclrow(offset+i) = lcl_row; + rowidx2part(offset+i) = ip; + if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row) + interf.row_contiguous = false; + } + partptr(ip+1) = offset + ipnrows; + } + part2rowidx0_sub(0) = 0; + partptr_sub(0, 0) = 0; + + for (local_ordinal_type ip=0;ip vector_length ? vector_length : nparts; + for (local_ordinal_type ip=0;ip (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts; + for (local_ordinal_type ip=ip_min;ip (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts; + + const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min); + + constexpr local_ordinal_type connection_length = 2; + + const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part; + const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length); + + if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length; + if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length; + if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length; + + part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub; + + for (local_ordinal_type ip=ip_min+1;ip::execution_space) } else { - for (local_ordinal_type ip=0;ipsize(); - TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast(partitions[p[ip-1]].size()))); - TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0, - get_msg_prefix(comm) - << "partition " << p[ip] - << " is empty, which is not allowed."); - //assume No overlap. - part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows; - // Since parts are ordered in nonincreasing size, the size of the first - // part in a pack is the size for all parts in the pack. - if (ip % vector_length == 0) pack_nrows = ipnrows; - part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0); - const local_ordinal_type os = partptr(ip); - for (local_ordinal_type i=0;i= A_n_lclrows, - get_msg_prefix(comm) - << "partitions[" << p[ip] << "][" - << i << "] = " << lcl_row - << " but input matrix implies limits of [0, " << A_n_lclrows-1 - << "]."); - lclrow(os+i) = lcl_row; - rowidx2part(os+i) = ip; - if (interf.row_contiguous && os+i > 0 && lclrow((os+i)-1) + 1 != lcl_row) - interf.row_contiguous = false; - } - partptr(ip+1) = os + ipnrows; - } + IFPACK2_BLOCKHELPER_TIMER("compute part indices"); + for (local_ordinal_type ip=0;ipsize(); + TEUCHOS_ASSERT(ip == 0 || (ipnrows <= static_cast(partitions[p[ip-1]].size()))); + TEUCHOS_TEST_FOR_EXCEPT_MSG(ipnrows == 0, + BlockHelperDetails::get_msg_prefix(comm) + << "partition " << p[ip] + << " is empty, which is not allowed."); + //assume No overlap. + part2rowidx0(ip+1) = part2rowidx0(ip) + ipnrows; + // Since parts are ordered in decreasing size, the size of the first + // part in a pack is the size for all parts in the pack. + if (ip % vector_length == 0) pack_nrows = ipnrows; + part2packrowidx0(ip+1) = part2packrowidx0(ip) + ((ip+1) % vector_length == 0 || ip+1 == nparts ? pack_nrows : 0); + const local_ordinal_type offset = partptr(ip); + for (local_ordinal_type i=0;i= A_n_lclrows, + BlockHelperDetails::get_msg_prefix(comm) + << "partitions[" << p[ip] << "][" + << i << "] = " << lcl_row + << " but input matrix implies limits of [0, " << A_n_lclrows-1 + << "]."); + lclrow(offset+i) = lcl_row; + rowidx2part(offset+i) = ip; + if (interf.row_contiguous && offset+i > 0 && lclrow((offset+i)-1) + 1 != lcl_row) + interf.row_contiguous = false; + } + partptr(ip+1) = offset + ipnrows; + +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("Part index = ip = %d, first LID associated to the part = partptr(ip) = os = %d, part->size() = ipnrows = %d;\n", ip, os, ipnrows); + printf("partptr(%d+1) = %d\n", ip, partptr(ip+1)); +#endif + } + + part2rowidx0_sub(0) = 0; + partptr_sub(0, 0) = 0; + //const local_ordinal_type number_pack_per_sub_part = ceil(float(nparts)/vector_length); + + for (local_ordinal_type ip=0;ipsize(); + const local_ordinal_type full_line_length = partptr(ip+1) - partptr(ip); + + TEUCHOS_TEST_FOR_EXCEPTION + (full_line_length != ipnrows, std::logic_error, + "In the part " << ip ); + + constexpr local_ordinal_type connection_length = 2; + + if (full_line_length < n_subparts_per_part + (n_subparts_per_part - 1) * connection_length ) + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::logic_error, + "The part " << ip << " is too short to use " << n_subparts_per_part << " sub parts."); + + const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part; + const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length); + + if (ip % vector_length == 0) pack_nrows_sub = ipnrows; + + for (local_ordinal_type local_sub_ip=0; local_sub_ip vector_length ? vector_length : nparts; + for (local_ordinal_type ip=0;ip (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts; + for (local_ordinal_type ip=ip_min;ip (ipack+1)*vector_length ? (ipack+1)*vector_length : nparts; + + const local_ordinal_type full_line_length = partptr(ip_min+1) - partptr(ip_min); + + constexpr local_ordinal_type connection_length = 2; + + const local_ordinal_type sub_line_length = (full_line_length - (n_subparts_per_part - 1) * connection_length) / n_subparts_per_part; + const local_ordinal_type last_sub_line_length = full_line_length - (n_subparts_per_part - 1) * (connection_length + sub_line_length); + + if (local_sub_ip % 2 == 0) pack_nrows_sub = sub_line_length; + if (local_sub_ip % 2 == 1) pack_nrows_sub = connection_length; + if (local_sub_ip == part2packrowidx0_sub.extent(1)-2) pack_nrows_sub = last_sub_line_length; + + part2packrowidx0_sub(ip_min, local_sub_ip + 1) = part2packrowidx0_sub(ip_min, local_sub_ip) + pack_nrows_sub; + + for (local_ordinal_type ip=ip_min+1;ip::execution_space) } #if defined(BLOCKTRIDICONTAINER_DEBUG) TEUCHOS_ASSERT(partptr(nparts) == nrows); @@ -1290,26 +1248,67 @@ namespace Ifpack2 { Kokkos::deep_copy(interf.partptr, partptr); Kokkos::deep_copy(interf.lclrow, lclrow); + Kokkos::deep_copy(interf.partptr_sub, partptr_sub); + //assume No overlap. Thus: interf.part2rowidx0 = interf.partptr; Kokkos::deep_copy(interf.part2packrowidx0, part2packrowidx0); - interf.part2packrowidx0_back = part2packrowidx0(part2packrowidx0.extent(0) - 1); + interf.part2packrowidx0_back = part2packrowidx0_sub(part2packrowidx0_sub.extent(0) - 1, part2packrowidx0_sub.extent(1) - 1); Kokkos::deep_copy(interf.rowidx2part, rowidx2part); { // Fill packptr. - local_ordinal_type npacks = 0; - for (local_ordinal_type ip=1;ip<=nparts;++ip) + IFPACK2_BLOCKHELPER_TIMER("Fill packptr"); + local_ordinal_type npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1); + npacks = 0; + for (local_ordinal_type ip=1;ip<=nparts;++ip) //n_sub_parts_and_schur if (part2packrowidx0(ip) != part2packrowidx0(ip-1)) ++npacks; + interf.packptr = local_ordinal_type_1d_view(do_not_initialize_tag("packptr"), npacks + 1); const auto packptr = Kokkos::create_mirror_view(interf.packptr); packptr(0) = 0; for (local_ordinal_type ip=1,k=1;ip<=nparts;++ip) if (part2packrowidx0(ip) != part2packrowidx0(ip-1)) packptr(k++) = ip; + Kokkos::deep_copy(interf.packptr, packptr); + + local_ordinal_type npacks_per_subpart = ceil(float(nparts)/vector_length); + npacks = ceil(float(nparts)/vector_length) * (part2packrowidx0_sub.extent(1)-1); + + interf.packindices_sub = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_sub"), npacks_per_subpart*n_subparts_per_part); + interf.packindices_schur = local_ordinal_type_1d_view(do_not_initialize_tag("packindices_schur"), npacks_per_subpart*(n_subparts_per_part-1)); + + const auto packindices_sub = Kokkos::create_mirror_view(interf.packindices_sub); + const auto packindices_schur = Kokkos::create_mirror_view(interf.packindices_schur); + + + // Fill packindices_sub and packindices_schur + for (local_ordinal_type local_sub_ip=0; local_sub_ip::execution_space) } + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) return interf; } @@ -1319,22 +1318,29 @@ namespace Ifpack2 { /// template struct BlockTridiags { - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using size_type_1d_view = typename impl_type::size_type_1d_view; + using size_type_2d_view = typename impl_type::size_type_2d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; + using vector_type_4d_view = typename impl_type::vector_type_4d_view; // flat_td_ptr(i) is the index into flat-array values of the start of the // i'th tridiag. pack_td_ptr is the same, but for packs. If vector_length == // 1, pack_td_ptr is the same as flat_td_ptr; if vector_length > 1, then i % // vector_length is the position in the pack. - size_type_1d_view flat_td_ptr, pack_td_ptr; + size_type_2d_view flat_td_ptr, pack_td_ptr, pack_td_ptr_schur; // List of local column indices into A from which to grab // data. flat_td_ptr(i) points to the start of the i'th tridiag's data. local_ordinal_type_1d_view A_colindsub; // Tridiag block values. pack_td_ptr(i) points to the start of the i'th // tridiag's pack, and i % vector_length gives the position in the pack. vector_type_3d_view values; + // Schur block values. pack_td_ptr_schur(i) points to the start of the i'th + // Schur's pack, and i % vector_length gives the position in the pack. + vector_type_3d_view values_schur; + // inv(A_00)*A_01 block values. + vector_type_4d_view e_values; bool is_diagonal_only; @@ -1354,6 +1360,10 @@ namespace Ifpack2 { template static KOKKOS_FORCEINLINE_FUNCTION idx_type NumBlocks (const idx_type& nrows) { return nrows > 0 ? 3*nrows - 2 : 0; } + // Number of blocks associated to a Schur complement having a given number of rows. + template + static KOKKOS_FORCEINLINE_FUNCTION + idx_type NumBlocksSchur (const idx_type& nrows) { return nrows > 0 ? 3*nrows + 2 : 0; } }; @@ -1362,35 +1372,43 @@ namespace Ifpack2 { /// template BlockTridiags - createBlockTridiags(const PartInterface &interf) { - using impl_type = ImplType; + createBlockTridiags(const BlockHelperDetails::PartInterface &interf) { + IFPACK2_BLOCKHELPER_TIMER("createBlockTridiags"); + using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using local_ordinal_type = typename impl_type::local_ordinal_type; using size_type = typename impl_type::size_type; - using size_type_1d_view = typename impl_type::size_type_1d_view; + using size_type_2d_view = typename impl_type::size_type_2d_view; constexpr int vector_length = impl_type::vector_length; BlockTridiags btdm; - const local_ordinal_type ntridiags = interf.partptr.extent(0) - 1; + const local_ordinal_type ntridiags = interf.partptr_sub.extent(0); { // construct the flat index pointers into the tridiag values array. - btdm.flat_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.flat_td_ptr"), ntridiags + 1); - const Kokkos::RangePolicy policy(0,ntridiags + 1); + btdm.flat_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.flat_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part); + const Kokkos::RangePolicy policy(0, 2 * interf.nparts * interf.n_subparts_per_part ); Kokkos::parallel_scan ("createBlockTridiags::RangePolicy::flat_td_ptr", policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) { - if (final) - btdm.flat_td_ptr(i) = update; - if (i < ntridiags) { - const local_ordinal_type nrows = interf.partptr(i+1) - interf.partptr(i); - update += btdm.NumBlocks(nrows); + const local_ordinal_type partidx = i/(2 * interf.n_subparts_per_part); + const local_ordinal_type local_subpartidx = i % (2 * interf.n_subparts_per_part); + + if (final) { + btdm.flat_td_ptr(partidx, local_subpartidx) = update; + } + if (local_subpartidx != (2 * interf.n_subparts_per_part -1)) { + const local_ordinal_type nrows = interf.partptr_sub(interf.nparts*local_subpartidx + partidx,1) - interf.partptr_sub(interf.nparts*local_subpartidx + partidx,0); + if (local_subpartidx % 2 == 0) + update += btdm.NumBlocks(nrows); + else + update += btdm.NumBlocksSchur(nrows); } }); const auto nblocks = Kokkos::create_mirror_view_and_copy - (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, ntridiags)); + (Kokkos::HostSpace(), Kokkos::subview(btdm.flat_td_ptr, interf.nparts-1, 2*interf.n_subparts_per_part-1)); btdm.is_diagonal_only = (static_cast(nblocks()) == ntridiags); } @@ -1398,57 +1416,114 @@ namespace Ifpack2 { if (vector_length == 1) { btdm.pack_td_ptr = btdm.flat_td_ptr; } else { - const local_ordinal_type npacks = interf.packptr.extent(0) - 1; - btdm.pack_td_ptr = size_type_1d_view(do_not_initialize_tag("btdm.pack_td_ptr"), ntridiags + 1); - const Kokkos::RangePolicy policy(0,npacks); - Kokkos::parallel_scan + //const local_ordinal_type npacks = interf.packptr_sub.extent(0) - 1; + + local_ordinal_type npacks_per_subpart = 0; + for (local_ordinal_type ip=1;ip<=interf.nparts;++ip) //n_sub_parts_and_schur + if (interf.part2packrowidx0(ip) != interf.part2packrowidx0(ip-1)) + ++npacks_per_subpart; + + btdm.pack_td_ptr = size_type_2d_view(do_not_initialize_tag("btdm.pack_td_ptr"), interf.nparts, 2*interf.n_subparts_per_part); + const Kokkos::RangePolicy policy(0,npacks_per_subpart); + + Kokkos::parallel_for ("createBlockTridiags::RangePolicy::pack_td_ptr", - policy, KOKKOS_LAMBDA(const local_ordinal_type &i, size_type &update, const bool &final) { - const local_ordinal_type parti = interf.packptr(i); - const local_ordinal_type parti_next = interf.packptr(i+1); - if (final) { - const size_type nblks = update; - for (local_ordinal_type pti=parti;pti::execution_space) + + return btdm; + } + + // Set the tridiags to be I to the full pack block size. That way, if a + // tridiag within a pack is shorter than the longest one, the extra blocks are + // processed in a safe way. Similarly, in the solve phase, if the extra blocks + // in the packed multvector are 0, and the tridiag LU reflects the extra I + // blocks, then the solve proceeds as though the extra blocks aren't + // present. Since this extra work is part of the SIMD calls, it's not actually + // extra work. Instead, it means we don't have to put checks or masks in, or + // quiet NaNs. This functor has to be called just once, in the symbolic phase, + // since the numeric phase fills in only the used entries, leaving these I // blocks intact. template void setTridiagsToIdentity (const BlockTridiags& btdm, - const typename ImplType::local_ordinal_type_1d_view& packptr) + const typename BlockHelperDetails::ImplType::local_ordinal_type_1d_view& packptr) { - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using local_ordinal_type = typename impl_type::local_ordinal_type; - using size_type_1d_view = typename impl_type::size_type_1d_view; + using size_type_2d_view = typename impl_type::size_type_2d_view; - const ConstUnmanaged pack_td_ptr(btdm.pack_td_ptr); + const ConstUnmanaged pack_td_ptr(btdm.pack_td_ptr); const local_ordinal_type blocksize = btdm.values.extent(1); { @@ -1511,64 +1586,38 @@ namespace Ifpack2 { ("setTridiagsToIdentity::TeamPolicy", policy, KOKKOS_LAMBDA(const typename team_policy_type::member_type &member) { const local_ordinal_type k = member.league_rank(); - const local_ordinal_type ibeg = pack_td_ptr(packptr(k)); - const local_ordinal_type iend = pack_td_ptr(packptr(k+1)); + const local_ordinal_type ibeg = pack_td_ptr(packptr(k),0); + const local_ordinal_type iend = pack_td_ptr(packptr(k),pack_td_ptr.extent(1)-1); + const local_ordinal_type diff = iend - ibeg; const local_ordinal_type icount = diff/3 + (diff%3 > 0); const btdm_scalar_type one(1); Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { Kokkos::parallel_for(Kokkos::TeamThreadRange(member,icount),[&](const local_ordinal_type &ii) { const local_ordinal_type i = ibeg + ii*3; - for (local_ordinal_type j=0;j - struct AmD { - using impl_type = ImplType; - using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; - using size_type_1d_view = typename impl_type::size_type_1d_view; - using impl_scalar_type_1d_view_tpetra = Unmanaged; - // rowptr points to the start of each row of A_colindsub. - size_type_1d_view rowptr, rowptr_remote; - // Indices into A's rows giving the blocks to extract. rowptr(i) points to - // the i'th row. Thus, g.entries(A_colindsub(rowptr(row) : rowptr(row+1))), - // where g is A's graph, are the columns AmD uses. If seq_method_, then - // A_colindsub contains all the LIDs and A_colindsub_remote is empty. If ! - // seq_method_, then A_colindsub contains owned LIDs and A_colindsub_remote - // contains the remote ones. - local_ordinal_type_1d_view A_colindsub, A_colindsub_remote; - - // Currently always true. - bool is_tpetra_block_crs; - - // If is_tpetra_block_crs, then this is a pointer to A_'s value data. - impl_scalar_type_1d_view_tpetra tpetra_values; - - AmD() = default; - AmD(const AmD &b) = default; - }; - /// /// symbolic phase, on host : create R = A - D, pack D /// template void - performSymbolicPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, - const PartInterface &interf, + performSymbolicPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, - AmD &amd, + BlockHelperDetails::AmD &amd, const bool overlap_communication_and_computation) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SymbolicPhase"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SymbolicPhase"); + + using impl_type = BlockHelperDetails::ImplType; - using impl_type = ImplType; // using node_memory_space = typename impl_type::node_memory_space; using host_execution_space = typename impl_type::host_execution_space; @@ -1578,12 +1627,15 @@ namespace Ifpack2 { using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; using size_type_1d_view = typename impl_type::size_type_1d_view; using vector_type_3d_view = typename impl_type::vector_type_3d_view; + using vector_type_4d_view = typename impl_type::vector_type_4d_view; using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; constexpr int vector_length = impl_type::vector_length; const auto comm = A->getRowMap()->getComm(); + const auto& g = A->getCrsGraph(); + const auto blocksize = A->getBlockSize(); // mirroring to host @@ -1595,8 +1647,10 @@ namespace Ifpack2 { const local_ordinal_type nrows = partptr(partptr.extent(0) - 1); - // find column to row map on host Kokkos::View col2row("col2row", A->getLocalNumCols()); + + // find column to row map on host + Kokkos::deep_copy(col2row, Teuchos::OrdinalTraits::invalid()); { const auto rowmap = g.getRowMap(); @@ -1615,7 +1669,7 @@ namespace Ifpack2 { const local_ordinal_type lc = colmap->getLocalElement(gid); # if defined(BLOCKTRIDICONTAINER_DEBUG) TEUCHOS_TEST_FOR_EXCEPT_MSG(lc == Teuchos::OrdinalTraits::invalid(), - get_msg_prefix(comm) << "GID " << gid + BlockHelperDetails::get_msg_prefix(comm) << "GID " << gid << " gives an invalid local column."); # endif col2row(lc) = lr; @@ -1644,7 +1698,7 @@ namespace Ifpack2 { } // count (block) nnzs in D and R. - typedef SumReducer sum_reducer_type; + typedef BlockHelperDetails::SumReducer sum_reducer_type; typename sum_reducer_type::value_type sum_reducer_value; { const Kokkos::RangePolicy policy(0,nrows); @@ -1694,7 +1748,7 @@ namespace Ifpack2 { R_nnz_remote = 0; } - // construct the D graph. + // construct the D_00 graph. { const auto flat_td_ptr = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), btdm.flat_td_ptr); @@ -1706,6 +1760,9 @@ namespace Ifpack2 { #endif const local_ordinal_type nparts = partptr.extent(0) - 1; + + // Loop over the lines: + // - part2rowidx0 stores the { const Kokkos::RangePolicy policy(0, nparts); Kokkos::parallel_for @@ -1727,7 +1784,7 @@ namespace Ifpack2 { if (pi != pi0) continue; if (ri + 1 < ri0 || ri > ri0 + 1) continue; const local_ordinal_type row_entry = j - j0; - D_A_colindsub(flat_td_ptr(pi0) + ((td_row_os + ri) - ri0)) = row_entry; + D_A_colindsub(flat_td_ptr(pi0,0) + ((td_row_os + ri) - ri0)) = row_entry; } } }); @@ -1740,9 +1797,14 @@ namespace Ifpack2 { // Allocate values. { - const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, nparts); + const auto pack_td_ptr_last = Kokkos::subview(btdm.pack_td_ptr, btdm.pack_td_ptr.extent(0)-1, btdm.pack_td_ptr.extent(1)-1); const auto num_packed_blocks = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_last); btdm.values = vector_type_3d_view("btdm.values", num_packed_blocks(), blocksize, blocksize); + + const auto pack_td_ptr_schur_last = Kokkos::subview(btdm.pack_td_ptr_schur, btdm.pack_td_ptr_schur.extent(0)-1, btdm.pack_td_ptr_schur.extent(1)-1); + const auto num_packed_blocks_schur = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), pack_td_ptr_schur_last); + btdm.values_schur = vector_type_3d_view("btdm.values_schur", num_packed_blocks_schur(), blocksize, blocksize); + if (vector_length > 1) setTridiagsToIdentity(btdm, interf.packptr); } } @@ -1790,7 +1852,7 @@ namespace Ifpack2 { } // exclusive scan - typedef ArrayValueType update_type; + typedef BlockHelperDetails::ArrayValueType update_type; { Kokkos::RangePolicy policy(0,nrows+1); Kokkos::parallel_scan @@ -1849,8 +1911,13 @@ namespace Ifpack2 { amd.tpetra_values = (const_cast(A.get())->getValuesDeviceNonConst()); } + + // Allocate view for E and initialize the values with B: + + //btdm.e_values = vector_type_4d_view("btdm.e_values", 2, (interf.n_subparts_per_part-1)*interf.max_subpartsz*interf.nparts, blocksize, blocksize); + btdm.e_values = vector_type_4d_view("btdm.e_values", 2, interf.part2packrowidx0_back, blocksize, blocksize); } - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType::execution_space) + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } @@ -1984,11 +2051,448 @@ namespace Ifpack2 { }; #endif - + template + struct SolveTridiagsDefaultModeAndAlgo; + + template + KOKKOS_INLINE_FUNCTION + void + solveMultiVector(const typename Kokkos::TeamPolicy::member_type &member, + const typename impl_type::local_ordinal_type &/* blocksize */, + const typename impl_type::local_ordinal_type &i0, + const typename impl_type::local_ordinal_type &r0, + const typename impl_type::local_ordinal_type &nrows, + const typename impl_type::local_ordinal_type &v, + const ConstUnmanaged D_internal_vector_values, + const Unmanaged X_internal_vector_values, + const WWViewType &WW, + const bool skip_first_pass=false) { + using execution_space = typename impl_type::execution_space; + using team_policy_type = Kokkos::TeamPolicy; + using member_type = typename team_policy_type::member_type; + using local_ordinal_type = typename impl_type::local_ordinal_type; + + typedef SolveTridiagsDefaultModeAndAlgo + default_mode_and_algo_type; + + typedef typename default_mode_and_algo_type::mode_type default_mode_type; + typedef typename default_mode_and_algo_type::multi_vector_algo_type default_algo_type; + + using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; + + // constant + const auto one = Kokkos::ArithTraits::one(); + const auto zero = Kokkos::ArithTraits::zero(); + + // subview pattern + auto A = Kokkos::subview(D_internal_vector_values, i0, Kokkos::ALL(), Kokkos::ALL(), v); + auto X1 = Kokkos::subview(X_internal_vector_values, r0, Kokkos::ALL(), Kokkos::ALL(), v); + auto X2 = X1; + + local_ordinal_type i = i0, r = r0; + + + if (nrows > 1) { + // solve Lx = x + if (skip_first_pass) { + i += (nrows-2) * 3; + r += (nrows-2); + A.assign_data( &D_internal_vector_values(i+2,0,0,v) ); + X2.assign_data( &X_internal_vector_values(++r,0,0,v) ); + A.assign_data( &D_internal_vector_values(i+3,0,0,v) ); + KB::Trsm + ::invoke(member, one, A, X2); + X1.assign_data( X2.data() ); + i+=3; + } + else { + KB::Trsm + ::invoke(member, one, A, X1); + for (local_ordinal_type tr=1;tr + ::invoke(member, -one, A, X1, one, X2); + A.assign_data( &D_internal_vector_values(i+3,0,0,v) ); + KB::Trsm + ::invoke(member, one, A, X2); + X1.assign_data( X2.data() ); + } + } + + // solve Ux = x + KB::Trsm + ::invoke(member, one, A, X1); + for (local_ordinal_type tr=nrows;tr>1;--tr) { + i -= 3; + A.assign_data( &D_internal_vector_values(i+1,0,0,v) ); + X2.assign_data( &X_internal_vector_values(--r,0,0,v) ); + member.team_barrier(); + KB::Gemm + ::invoke(member, -one, A, X1, one, X2); + + A.assign_data( &D_internal_vector_values(i,0,0,v) ); + KB::Trsm + ::invoke(member, one, A, X2); + X1.assign_data( X2.data() ); + } + } else { + // matrix is already inverted + auto W = Kokkos::subview(WW, Kokkos::ALL(), Kokkos::ALL(), v); + KB::Copy + ::invoke(member, X1, W); + member.team_barrier(); + KB::Gemm + ::invoke(member, one, A, W, zero, X1); + } + + } + + template + KOKKOS_INLINE_FUNCTION + void + solveSingleVectorNew(const typename Kokkos::TeamPolicy::member_type &member, + const typename impl_type::local_ordinal_type &blocksize, + const typename impl_type::local_ordinal_type &i0, + const typename impl_type::local_ordinal_type &r0, + const typename impl_type::local_ordinal_type &nrows, + const typename impl_type::local_ordinal_type &v, + const ConstUnmanaged D_internal_vector_values, + const XViewType &X_internal_vector_values, //Unmanaged + const WWViewType &WW) { + using execution_space = typename impl_type::execution_space; + //using team_policy_type = Kokkos::TeamPolicy; + //using member_type = typename team_policy_type::member_type; + using local_ordinal_type = typename impl_type::local_ordinal_type; + + typedef SolveTridiagsDefaultModeAndAlgo + default_mode_and_algo_type; + + typedef typename default_mode_and_algo_type::mode_type default_mode_type; + typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type; + + using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; + + // base pointers + auto A = D_internal_vector_values.data(); + auto X = X_internal_vector_values.data(); + + // constant + const auto one = Kokkos::ArithTraits::one(); + const auto zero = Kokkos::ArithTraits::zero(); + //const local_ordinal_type num_vectors = X_scalar_values.extent(2); + + // const local_ordinal_type blocksize = D_scalar_values.extent(1); + const local_ordinal_type astep = D_internal_vector_values.stride_0(); + const local_ordinal_type as0 = D_internal_vector_values.stride_1(); //blocksize*vector_length; + const local_ordinal_type as1 = D_internal_vector_values.stride_2(); //vector_length; + const local_ordinal_type xstep = X_internal_vector_values.stride_0(); + const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length; + + // move to starting point + A += i0*astep + v; + X += r0*xstep + v; + + //for (local_ordinal_type col=0;col 1) { + // solve Lx = x + KOKKOSBATCHED_TRSV_LOWER_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + KB::Diag::Unit, + blocksize,blocksize, + one, + A, as0, as1, + X, xs0); + + for (local_ordinal_type tr=1;tr1;--tr) { + A -= 3*astep; + member.team_barrier(); + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + -one, + A+1*astep, as0, as1, + X, xs0, + one, + X-1*xstep, xs0); + KOKKOSBATCHED_TRSV_UPPER_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + KB::Diag::NonUnit, + blocksize, blocksize, + one, + A, as0, as1, + X-1*xstep,xs0); + X -= 1*xstep; + } + // for multiple rhs + //X += xs1; + } else { + const local_ordinal_type ws0 = WW.stride_0(); + auto W = WW.data() + v; + KOKKOSBATCHED_COPY_VECTOR_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type, + member, blocksize, X, xs0, W, ws0); + member.team_barrier(); + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + one, + A, as0, as1, + W, xs0, + zero, + X, xs0); + } + } + + template + void writeBTDValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) { +#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM + auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device); + std::ofstream myfile; + myfile.open (fileName); + + const local_ordinal_type n_parts_per_pack = n_parts < (local_ordinal_type) scalar_values.extent(3) ? n_parts : scalar_values.extent(3); + local_ordinal_type nnz = scalar_values.extent(0) * scalar_values.extent(1) * scalar_values.extent(2) * n_parts_per_pack; + const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack; + const local_ordinal_type n_blocks_per_part = n_blocks/n_parts; + + const local_ordinal_type block_size = scalar_values.extent(1); + + const local_ordinal_type n_rows_per_part = (n_blocks_per_part+2)/3 * block_size; + const local_ordinal_type n_rows = n_rows_per_part*n_parts; + + const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack); + + myfile << "%%MatrixMarket matrix coordinate real general"<< std::endl; + myfile << "%%nnz = " << nnz; + myfile << " block size = " << block_size; + myfile << " number of blocks = " << n_blocks; + myfile << " number of parts = " << n_parts; + myfile << " number of blocks per part = " << n_blocks_per_part; + myfile << " number of rows = " << n_rows ; + myfile << " number of cols = " << n_rows; + myfile << " number of packs = " << n_packs << std::endl; + + myfile << n_rows << " " << n_rows << " " << nnz << std::setprecision(9) << std::endl; + + local_ordinal_type current_part_idx, current_block_idx, current_row_offset, current_col_offset, current_row, current_col; + for (local_ordinal_type i_pack=0;i_pack + void write4DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) { +#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM + auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device); + std::ofstream myfile; + myfile.open (fileName); + + const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(2) ? n_parts : scalar_values.extent(2); + const local_ordinal_type n_blocks = scalar_values.extent(0)*n_parts_per_pack; + const local_ordinal_type n_blocks_per_part = n_blocks/n_parts; + + const local_ordinal_type block_size = scalar_values.extent(1); + const local_ordinal_type n_cols = scalar_values.extent(2); + + const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size; + const local_ordinal_type n_rows = n_rows_per_part*n_parts; + + const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack); + + + myfile << "%%MatrixMarket matrix array real general"<< std::endl; + myfile << "%%block size = " << block_size; + myfile << " number of blocks = " << n_blocks; + myfile << " number of parts = " << n_parts; + myfile << " number of blocks per part = " << n_blocks_per_part; + myfile << " number of rows = " << n_rows ; + myfile << " number of cols = " << n_cols; + myfile << " number of packs = " << n_packs << std::endl; + + myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl; + + local_ordinal_type current_part_idx, current_block_idx, current_row_offset; + (void) current_row_offset; + (void) current_part_idx; + for (local_ordinal_type j_in_block=0;j_in_block + void write5DMultiVectorValuesToFile (const local_ordinal_type &n_parts, const ViewType &scalar_values_device, std::string fileName) { +#ifdef IFPACK2_BLOCKTRIDICONTAINER_WRITE_MM + auto scalar_values = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), scalar_values_device); + std::ofstream myfile; + myfile.open (fileName); + + const local_ordinal_type n_parts_per_pack = n_parts < scalar_values.extent(3) ? n_parts : scalar_values.extent(3); + const local_ordinal_type n_blocks = scalar_values.extent(1)*n_parts_per_pack; + const local_ordinal_type n_blocks_per_part = n_blocks/n_parts; + + const local_ordinal_type block_size = scalar_values.extent(2); + const local_ordinal_type n_blocks_cols = scalar_values.extent(0); + const local_ordinal_type n_cols = n_blocks_cols * block_size; + + const local_ordinal_type n_rows_per_part = n_blocks_per_part * block_size; + const local_ordinal_type n_rows = n_rows_per_part*n_parts; + + const local_ordinal_type n_packs = ceil(float(n_parts)/n_parts_per_pack); + + myfile << "%%MatrixMarket matrix array real general"<< std::endl; + myfile << "%%block size = " << block_size; + myfile << " number of blocks = " << n_blocks; + myfile << " number of parts = " << n_parts; + myfile << " number of blocks per part = " << n_blocks_per_part; + myfile << " number of rows = " << n_rows ; + myfile << " number of cols = " << n_cols; + myfile << " number of packs = " << n_packs << std::endl; + + myfile << n_rows << " " << n_cols << std::setprecision(9) << std::endl; + + local_ordinal_type current_part_idx, current_block_idx, current_row_offset; + (void) current_row_offset; + (void) current_part_idx; + for (local_ordinal_type i_block_col=0;i_block_col + KOKKOS_INLINE_FUNCTION + void + copy3DView(const member_type &member, const ViewType1 &view1, const ViewType2 &view2) { +/* + // Kokkos::Experimental::local_deep_copy + auto teamVectorRange = + Kokkos::TeamVectorMDRange, member_type>( + member, view1.extent(0), view1.extent(1), view1.extent(2)); + + Kokkos::parallel_for + (teamVectorRange, + [&](const local_ordinal_type &i, const local_ordinal_type &j, const local_ordinal_type &k) { + view1(i,j,k) = view2(i,j,k); + }); +*/ + Kokkos::Experimental::local_deep_copy(member, view1, view2); + } template struct ExtractAndFactorizeTridiags { public: - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; // a functor cannot have both device_type and execution_space; specialization error in kokkos using execution_space = typename impl_type::execution_space; using memory_space = typename impl_type::memory_space; @@ -2001,14 +2505,18 @@ namespace Ifpack2 { using block_crs_matrix_type = typename impl_type::tpetra_block_crs_matrix_type; /// views using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; - using size_type_1d_view = typename impl_type::size_type_1d_view; + using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; + using size_type_2d_view = typename impl_type::size_type_2d_view; using impl_scalar_type_1d_view_tpetra = typename impl_type::impl_scalar_type_1d_view_tpetra; /// vectorization using btdm_scalar_type = typename impl_type::btdm_scalar_type; using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; using vector_type_3d_view = typename impl_type::vector_type_3d_view; + using vector_type_4d_view = typename impl_type::vector_type_4d_view; using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; + using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view; using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using btdm_scalar_type_5d_view = typename impl_type::btdm_scalar_type_5d_view; using internal_vector_scratch_type_3d_view = Scratch; using btdm_scalar_scratch_type_3d_view = Scratch; @@ -2022,17 +2530,20 @@ namespace Ifpack2 { private: // part interface - const ConstUnmanaged partptr, lclrow, packptr; + const ConstUnmanaged partptr, lclrow, packptr, packindices_sub, packindices_schur, packptr_sub; + const ConstUnmanaged partptr_sub, part2packrowidx0_sub; const local_ordinal_type max_partsz; // block crs matrix (it could be Kokkos::UVMSpace::size_type, which is int) using size_type_1d_view_tpetra = Kokkos::View; const ConstUnmanaged A_rowptr; const ConstUnmanaged A_values; // block tridiags - const ConstUnmanaged pack_td_ptr, flat_td_ptr; + const ConstUnmanaged pack_td_ptr, flat_td_ptr, pack_td_ptr_schur; const ConstUnmanaged A_colindsub; - const Unmanaged internal_vector_values; - const Unmanaged scalar_values; + const Unmanaged internal_vector_values, internal_vector_values_schur; + const Unmanaged e_internal_vector_values; + const Unmanaged scalar_values, scalar_values_schur; + const Unmanaged e_scalar_values; // shared information const local_ordinal_type blocksize, blocksize_square; // diagonal safety @@ -2042,13 +2553,18 @@ namespace Ifpack2 { public: ExtractAndFactorizeTridiags(const BlockTridiags &btdm_, - const PartInterface &interf_, + const BlockHelperDetails::PartInterface &interf_, const Teuchos::RCP &A_, const magnitude_type& tiny_) : // interface partptr(interf_.partptr), lclrow(interf_.lclrow), packptr(interf_.packptr), + packindices_sub(interf_.packindices_sub), + packindices_schur(interf_.packindices_schur), + packptr_sub(interf_.packptr_sub), + partptr_sub(interf_.partptr_sub), + part2packrowidx0_sub(interf_.part2packrowidx0_sub), max_partsz(interf_.max_partsz), // block crs matrix A_rowptr(A_->getCrsGraph().getLocalGraphDevice().row_map), @@ -2056,17 +2572,40 @@ namespace Ifpack2 { // block tridiags pack_td_ptr(btdm_.pack_td_ptr), flat_td_ptr(btdm_.flat_td_ptr), + pack_td_ptr_schur(btdm_.pack_td_ptr_schur), A_colindsub(btdm_.A_colindsub), internal_vector_values((internal_vector_type*)btdm_.values.data(), btdm_.values.extent(0), btdm_.values.extent(1), btdm_.values.extent(2), vector_length/internal_vector_length), + internal_vector_values_schur((internal_vector_type*)btdm_.values_schur.data(), + btdm_.values_schur.extent(0), + btdm_.values_schur.extent(1), + btdm_.values_schur.extent(2), + vector_length/internal_vector_length), + e_internal_vector_values((internal_vector_type*)btdm_.e_values.data(), + btdm_.e_values.extent(0), + btdm_.e_values.extent(1), + btdm_.e_values.extent(2), + btdm_.e_values.extent(3), + vector_length/internal_vector_length), scalar_values((btdm_scalar_type*)btdm_.values.data(), btdm_.values.extent(0), btdm_.values.extent(1), btdm_.values.extent(2), vector_length), + scalar_values_schur((btdm_scalar_type*)btdm_.values_schur.data(), + btdm_.values_schur.extent(0), + btdm_.values_schur.extent(1), + btdm_.values_schur.extent(2), + vector_length), + e_scalar_values((btdm_scalar_type*)btdm_.e_values.data(), + btdm_.e_values.extent(0), + btdm_.e_values.extent(1), + btdm_.e_values.extent(2), + btdm_.e_values.extent(3), + vector_length), blocksize(btdm_.values.extent(1)), blocksize_square(blocksize*blocksize), // diagonal weight to avoid zero pivots @@ -2079,43 +2618,93 @@ namespace Ifpack2 { KOKKOS_INLINE_FUNCTION void extract(local_ordinal_type partidx, + local_ordinal_type local_subpartidx, local_ordinal_type npacks) const { - using tlb = TpetraLittleBlock; - const size_type kps = pack_td_ptr(partidx); +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("extract partidx = %d, local_subpartidx = %d, npacks = %d;\n", partidx, local_subpartidx, npacks); +#endif + using tlb = BlockHelperDetails::TpetraLittleBlock; + const size_type kps = pack_td_ptr(partidx, local_subpartidx); local_ordinal_type kfs[vector_length] = {}; local_ordinal_type ri0[vector_length] = {}; local_ordinal_type nrows[vector_length] = {}; + //TEUCHOS_TEST_FOR_EXCEPT_MSG(npacks > vector_length, + // "npacks is too big."); + for (local_ordinal_type vi=0;vi(block[vi][idx]); + if (local_subpartidx % 2 == 0) { + for (local_ordinal_type tr=0,j=0;tr(block[vi][idx]); + } + } + + if (nrows[0] == 1) break; + if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break; + for (local_ordinal_type vi=1;vi(block[vi][idx]); + } + } - if (nrows[0] == 1) break; - if (e == 1 && (tr == 0 || tr+1 == nrows[0])) break; - for (local_ordinal_type vi=1;vi; + using tlb = BlockHelperDetails::TpetraLittleBlock; local_ordinal_type kfs_vals[internal_vector_length] = {}; local_ordinal_type ri0_vals[internal_vector_length] = {}; local_ordinal_type nrows_vals[internal_vector_length] = {}; - const size_type kps = pack_td_ptr(partidxbeg); + const size_type kps = pack_td_ptr(partidxbeg,0); for (local_ordinal_type v=vbeg,vi=0;v(block[tlb::getFlatIndex(ii,jj,blocksize)]); + } }); } } @@ -2170,7 +2760,7 @@ namespace Ifpack2 { typename WWViewType> KOKKOS_INLINE_FUNCTION void - factorize(const member_type &member, + factorize_subline(const member_type &member, const local_ordinal_type &i0, const local_ordinal_type &nrows, const local_ordinal_type &v, @@ -2186,6 +2776,11 @@ namespace Ifpack2 { // constant const auto one = Kokkos::ArithTraits::one(); +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("i0 = %d, nrows = %d, v = %d;\n", i0, nrows, v); + printf("AA.extent(0) = %ld\n", AA.extent(0)); +#endif + // subview pattern auto A = Kokkos::subview(AA, i0, Kokkos::ALL(), Kokkos::ALL(), v); KB::LU:: - recommended_team_size(blocksize, vector_length, internal_vector_length); - const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view:: - shmem_size(blocksize, blocksize, vector_loop_size); + KOKKOS_INLINE_FUNCTION + void + operator() (const ExtractBCDTag &, const member_type &member) const { + // btdm is packed and sorted from largest one + const local_ordinal_type packidx = packindices_schur(member.league_rank()); - Kokkos::TeamPolicy - policy(packptr.extent(0)-1, team_size, vector_loop_size); -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE) - Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); -#else - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); - Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", - policy, *this); + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; + + const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx; + //const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx); + //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); + + if (vector_loop_size == 1) { + extract(partidx, local_subpartidx, npacks); + } + else { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size), + [&](const local_ordinal_type &v) { + const local_ordinal_type vbeg = v*internal_vector_length; +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("i0 = %d, npacks = %d, vbeg = %d;\n", i0, npacks, vbeg); #endif - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - } + if (vbeg < npacks) + extract(member, partidx+vbeg, npacks, vbeg); + }); + } - }; + member.team_barrier(); - /// - /// top level numeric interface - /// - template + const size_type kps1 = pack_td_ptr(partidx, local_subpartidx); + const size_type kps2 = pack_td_ptr(partidx, local_subpartidx+1)-1; + + const local_ordinal_type r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1; + const local_ordinal_type r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2; + +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("Copy for Schur complement part id = %d from kps1 = %d to r1 = %d and from kps2 = %d to r2 = %d partidx = %d local_subpartidx = %d;\n", packidx, kps1, r1, kps2, r2, partidx, local_subpartidx); +#endif + + // Need to copy D to e_internal_vector_values. + copy3DView(member, Kokkos::subview(e_internal_vector_values, 0, r1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), + Kokkos::subview(internal_vector_values, kps1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL())); + + copy3DView(member, Kokkos::subview(e_internal_vector_values, 1, r2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), + Kokkos::subview(internal_vector_values, kps2, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL())); + + } + + KOKKOS_INLINE_FUNCTION + void + operator() (const ComputeETag &, const member_type &member) const { + // btdm is packed and sorted from largest one + const local_ordinal_type packidx = packindices_sub(member.league_rank()); + + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; + + const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx; + const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx); + const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); + const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); + const local_ordinal_type num_vectors = blocksize; + + (void) npacks; + + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size); + if (local_subpartidx == 0) { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + solveMultiVector (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true); + }); + } + else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + solveMultiVector (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW); + }); + } + else { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + solveMultiVector (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW, true); + solveMultiVector (member, blocksize, i0, r0, nrows, v, internal_vector_values, Kokkos::subview(e_internal_vector_values, 1, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), WW); + }); + } + } + + KOKKOS_INLINE_FUNCTION + void + operator() (const ComputeSchurTag &, const member_type &member) const { + // btdm is packed and sorted from largest one + const local_ordinal_type packidx = packindices_schur(member.league_rank()); + + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; + + //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx; + const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx); + //const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); + //const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); + + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); + + // Compute S = D - C E + + const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2; + const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1; + const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2; + + for (local_ordinal_type i = 0; i < 4; ++i) { //pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-i0_schur + copy3DView(member, Kokkos::subview(internal_vector_values_schur, i0_schur+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), + Kokkos::subview(internal_vector_values, i0_offset+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL())); + } + + member.team_barrier(); + + const auto one = Kokkos::ArithTraits::one(); + + const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx)+1; + const size_type c_kps2 = pack_td_ptr(partidx, local_subpartidx+1)-2; + + const local_ordinal_type e_r1 = part2packrowidx0_sub(partidx,local_subpartidx)-1; + const local_ordinal_type e_r2 = part2packrowidx0_sub(partidx,local_subpartidx)+2; + + typedef ExtractAndFactorizeTridiagsDefaultModeAndAlgo + default_mode_and_algo_type; + + typedef typename default_mode_and_algo_type::mode_type default_mode_type; + typedef typename default_mode_and_algo_type::algo_type default_algo_type; + + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + for (size_type i = 0; i < pack_td_ptr_schur(partidx,local_subpartidx_schur+1)-pack_td_ptr_schur(partidx,local_subpartidx_schur); ++i) { + local_ordinal_type e_r, e_c, c_kps; + + if ( local_subpartidx_schur == 0 ) { + if ( i == 0 ) { + e_r = e_r1; + e_c = 0; + c_kps = c_kps1; + } + else if ( i == 3 ) { + e_r = e_r2; + e_c = 1; + c_kps = c_kps2; + } + else if ( i == 4 ) { + e_r = e_r2; + e_c = 0; + c_kps = c_kps2; + } + else { + continue; + } + } + else { + if ( i == 0 ) { + e_r = e_r1; + e_c = 1; + c_kps = c_kps1; + } + else if ( i == 1 ) { + e_r = e_r1; + e_c = 0; + c_kps = c_kps1; + } + else if ( i == 4 ) { + e_r = e_r2; + e_c = 1; + c_kps = c_kps2; + } + else if ( i == 5 ) { + e_r = e_r2; + e_c = 0; + c_kps = c_kps2; + } + else { + continue; + } + } + + auto S = Kokkos::subview(internal_vector_values_schur, pack_td_ptr_schur(partidx,local_subpartidx_schur)+i, Kokkos::ALL(), Kokkos::ALL(), v); + auto C = Kokkos::subview(internal_vector_values, c_kps, Kokkos::ALL(), Kokkos::ALL(), v); + auto E = Kokkos::subview(e_internal_vector_values, e_c, e_r, Kokkos::ALL(), Kokkos::ALL(), v); + KB::Gemm + ::invoke(member, -one, C, E, one, S); + } + }); + } + + KOKKOS_INLINE_FUNCTION + void + operator() (const FactorizeSchurTag &, const member_type &member) const { + const local_ordinal_type packidx = packindices_sub(member.league_rank()); + + const local_ordinal_type partidx = packptr_sub(packidx); + + const local_ordinal_type i0 = pack_td_ptr_schur(partidx,0); + const local_ordinal_type nrows = 2*(pack_td_ptr_schur.extent(1)-1); + + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); + +#ifdef IFPACK2_BLOCKTRIDICONTAINER_USE_PRINTF + printf("FactorizeSchurTag rank = %d, i0 = %d, nrows = %d;\n", member.league_rank(), i0, nrows); + printf("vector_loop_size = %d\n", vector_loop_size); +#endif + + if (vector_loop_size == 1) { + factorize_subline(member, i0, nrows, 0, internal_vector_values_schur, WW); + } else { + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size), + [&](const local_ordinal_type &v) { + factorize_subline(member, i0, nrows, v, internal_vector_values_schur, WW); + }); + } + } + + void run() { + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; + const local_ordinal_type team_size = + ExtractAndFactorizeTridiagsDefaultModeAndAlgo:: + recommended_team_size(blocksize, vector_length, internal_vector_length); + const local_ordinal_type per_team_scratch = internal_vector_scratch_type_3d_view:: + shmem_size(blocksize, blocksize, vector_loop_size); + + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractAndFactorizeSubLineTag"); + Kokkos::TeamPolicy + policy(packindices_sub.extent(0), team_size, vector_loop_size); + + + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + writeBTDValuesToFile(n_parts, scalar_values, "before.mm"); + + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + execution_space().fence(); + + writeBTDValuesToFile(n_parts, scalar_values, "after.mm"); + } + + if (packindices_schur.extent(0) != 0) + { + { + + write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_before_extract.mm"); + + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ExtractBCDTag"); + Kokkos::TeamPolicy + policy(packindices_schur.extent(0), team_size, vector_loop_size); + + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + execution_space().fence(); + } + + writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values, "after_extraction_of_BCD.mm"); + + write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_extract.mm"); + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeETag"); + Kokkos::TeamPolicy + policy(packindices_sub.extent(0), team_size, vector_loop_size); + + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + execution_space().fence(); + } + write5DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), e_scalar_values, "e_scalar_values_after_compute.mm"); + } + + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::ComputeSchurTag"); + writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "before_schur.mm"); + Kokkos::TeamPolicy + policy(packindices_schur.extent(0), team_size, vector_loop_size); + + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_schur.mm"); + execution_space().fence(); + } + + { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase::FactorizeSchurTag"); + Kokkos::TeamPolicy + policy(part2packrowidx0_sub.extent(0), team_size, vector_loop_size); + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); + Kokkos::parallel_for("ExtractAndFactorize::TeamPolicy::run", + policy, *this); + execution_space().fence(); + writeBTDValuesToFile(part2packrowidx0_sub.extent(0), scalar_values_schur, "after_factor_schur.mm"); + } + } + + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; + } + + }; + + /// + /// top level numeric interface + /// + template void - performNumericPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, - const PartInterface &interf, + performNumericPhase(const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + const BlockHelperDetails::PartInterface &interf, BlockTridiags &btdm, - const typename ImplType::magnitude_type tiny) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NumericPhase"); + const typename BlockHelperDetails::ImplType::magnitude_type tiny) { + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::NumericPhase"); ExtractAndFactorizeTridiags function(btdm, interf, A, tiny); function.run(); - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType::execution_space) + IFPACK2_BLOCKHELPER_TIMER_FENCE(typename BlockHelperDetails::ImplType::execution_space) } /// @@ -2316,7 +3227,7 @@ namespace Ifpack2 { template struct MultiVectorConverter { public: - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using memory_space = typename impl_type::memory_space; @@ -2359,7 +3270,7 @@ namespace Ifpack2 { public: - MultiVectorConverter(const PartInterface &interf, + MultiVectorConverter(const BlockHelperDetails::PartInterface &interf, const vector_type_3d_view &pmv) : partptr(interf.partptr), packptr(interf.packptr), @@ -2429,10 +3340,10 @@ namespace Ifpack2 { void run(const const_impl_scalar_type_2d_view_tpetra &scalar_multivector_) { IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::MultiVectorConverter"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::MultiVectorConverter"); scalar_multivector = scalar_multivector_; - if constexpr (is_device::value) { + if constexpr (BlockHelperDetails::is_device::value) { const local_ordinal_type vl = vector_length; const Kokkos::TeamPolicy policy(packptr.extent(0) - 1, Kokkos::AUTO(), vl); Kokkos::parallel_for @@ -2443,15 +3354,13 @@ namespace Ifpack2 { ("MultiVectorConverter::RangePolicy", policy, *this); } IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) } }; /// /// solve tridiags /// - template - struct SolveTridiagsDefaultModeAndAlgo; template<> struct SolveTridiagsDefaultModeAndAlgo { @@ -2592,7 +3501,7 @@ namespace Ifpack2 { template struct SolveTridiags { public: - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; using execution_space = typename impl_type::execution_space; using local_ordinal_type = typename impl_type::local_ordinal_type; @@ -2603,11 +3512,13 @@ namespace Ifpack2 { using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; /// views using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; - using size_type_1d_view = typename impl_type::size_type_1d_view; + using local_ordinal_type_2d_view = typename impl_type::local_ordinal_type_2d_view; + using size_type_2d_view = typename impl_type::size_type_2d_view; /// vectorization using vector_type_3d_view = typename impl_type::vector_type_3d_view; using internal_vector_type_4d_view = typename impl_type::internal_vector_type_4d_view; - //using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; + using internal_vector_type_5d_view = typename impl_type::internal_vector_type_5d_view; + using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; using internal_vector_scratch_type_3d_view = Scratch; @@ -2625,17 +3536,32 @@ namespace Ifpack2 { private: // part interface + local_ordinal_type n_subparts_per_part; const ConstUnmanaged partptr; const ConstUnmanaged packptr; + const ConstUnmanaged packindices_sub; + const ConstUnmanaged packindices_schur; const ConstUnmanaged part2packrowidx0; + const ConstUnmanaged part2packrowidx0_sub; const ConstUnmanaged lclrow; + const ConstUnmanaged packptr_sub; + + const ConstUnmanaged partptr_sub; + const ConstUnmanaged pack_td_ptr_schur; // block tridiags - const ConstUnmanaged pack_td_ptr; + const ConstUnmanaged pack_td_ptr; // block tridiags values const ConstUnmanaged D_internal_vector_values; const Unmanaged X_internal_vector_values; + const Unmanaged X_internal_scalar_values; + + internal_vector_type_4d_view X_internal_vector_values_schur; + + const ConstUnmanaged D_internal_vector_values_schur; + const ConstUnmanaged e_internal_vector_values; + const local_ordinal_type vector_loop_size; @@ -2650,17 +3576,24 @@ namespace Ifpack2 { const bool compute_diff; public: - SolveTridiags(const PartInterface &interf, + SolveTridiags(const BlockHelperDetails::PartInterface &interf, const BlockTridiags &btdm, const vector_type_3d_view &pmv, const impl_scalar_type damping_factor, const bool is_norm_manager_active) : // interface + n_subparts_per_part(interf.n_subparts_per_part), partptr(interf.partptr), packptr(interf.packptr), + packindices_sub(interf.packindices_sub), + packindices_schur(interf.packindices_schur), part2packrowidx0(interf.part2packrowidx0), + part2packrowidx0_sub(interf.part2packrowidx0_sub), lclrow(interf.lclrow), + packptr_sub(interf.packptr_sub), + partptr_sub(interf.partptr_sub), + pack_td_ptr_schur(btdm.pack_td_ptr_schur), // block tridiags and multivector pack_td_ptr(btdm.pack_td_ptr), D_internal_vector_values((internal_vector_type*)btdm.values.data(), @@ -2673,6 +3606,27 @@ namespace Ifpack2 { pmv.extent(1), pmv.extent(2), vector_length/internal_vector_length), + X_internal_scalar_values((btdm_scalar_type*)pmv.data(), + pmv.extent(0), + pmv.extent(1), + pmv.extent(2), + vector_length), + X_internal_vector_values_schur(do_not_initialize_tag("X_internal_vector_values_schur"), + 2*(n_subparts_per_part-1) * part2packrowidx0_sub.extent(0), + pmv.extent(1), + pmv.extent(2), + vector_length/internal_vector_length), + D_internal_vector_values_schur((internal_vector_type*)btdm.values_schur.data(), + btdm.values_schur.extent(0), + btdm.values_schur.extent(1), + btdm.values_schur.extent(2), + vector_length/internal_vector_length), + e_internal_vector_values((internal_vector_type*)btdm.e_values.data(), + btdm.e_values.extent(0), + btdm.e_values.extent(1), + btdm.e_values.extent(2), + btdm.e_values.extent(3), + vector_length/internal_vector_length), vector_loop_size(vector_length/internal_vector_length), Y_scalar_multivector(), Z_scalar_vector(), @@ -2795,10 +3749,6 @@ namespace Ifpack2 { const local_ordinal_type xstep = X_internal_vector_values.stride_0(); const local_ordinal_type xs0 = X_internal_vector_values.stride_1(); //vector_length; - // for multiple rhs - //const local_ordinal_type xs0 = num_vectors*vector_length; //X_scalar_values.stride_1(); - //const local_ordinal_type xs1 = vector_length; //X_scalar_values.stride_2(); - // move to starting point A += i0*astep + v; X += r0*xstep + v; @@ -2981,6 +3931,15 @@ namespace Ifpack2 { template struct SingleVectorTag {}; template struct MultiVectorTag {}; + template struct SingleVectorSubLineTag {}; + template struct MultiVectorSubLineTag {}; + template struct SingleVectorApplyCTag {}; + template struct MultiVectorApplyCTag {}; + template struct SingleVectorSchurTag {}; + template struct MultiVectorSchurTag {}; + template struct SingleVectorApplyETag {}; + template struct MultiVectorApplyETag {}; + template KOKKOS_INLINE_FUNCTION void @@ -2989,7 +3948,7 @@ namespace Ifpack2 { const local_ordinal_type partidx = packptr(packidx); const local_ordinal_type npacks = packptr(packidx+1) - partidx; const local_ordinal_type pri0 = part2packrowidx0(partidx); - const local_ordinal_type i0 = pack_td_ptr(partidx); + const local_ordinal_type i0 = pack_td_ptr(partidx,0); const local_ordinal_type r0 = part2packrowidx0(partidx); const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B); @@ -3014,7 +3973,7 @@ namespace Ifpack2 { const local_ordinal_type partidx = packptr(packidx); const local_ordinal_type npacks = packptr(packidx+1) - partidx; const local_ordinal_type pri0 = part2packrowidx0(partidx); - const local_ordinal_type i0 = pack_td_ptr(partidx); + const local_ordinal_type i0 = pack_td_ptr(partidx,0); const local_ordinal_type r0 = part2packrowidx0(partidx); const local_ordinal_type nrows = partptr(partidx+1) - partptr(partidx); const local_ordinal_type blocksize = (B == 0 ? D_internal_vector_values.extent(1) : B); @@ -3032,939 +3991,429 @@ namespace Ifpack2 { }); } - void run(const impl_scalar_type_2d_view_tpetra &Y, - const impl_scalar_type_1d_view &Z) { - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::SolveTridiags"); + template + KOKKOS_INLINE_FUNCTION + void + operator() (const SingleVectorSubLineTag &, const member_type &member) const { + // btdm is packed and sorted from largest one + const local_ordinal_type packidx = packindices_sub(member.league_rank()); - /// set vectors - this->Y_scalar_multivector = Y; - this->Z_scalar_vector = Z; + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; - const local_ordinal_type num_vectors = X_internal_vector_values.extent(2); - const local_ordinal_type blocksize = D_internal_vector_values.extent(1); + const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx; + const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx); + const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); + const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); + const local_ordinal_type blocksize = e_internal_vector_values.extent(2); + const local_ordinal_type num_vectors = blocksize; - const local_ordinal_type team_size = - SolveTridiagsDefaultModeAndAlgo:: - recommended_team_size(blocksize, vector_length, internal_vector_length); - const int per_team_scratch = internal_vector_scratch_type_3d_view - ::shmem_size(blocksize, num_vectors, vector_loop_size); + //(void) i0; + //(void) nrows; + (void) npacks; -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE) -#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \ - if (num_vectors == 1) { \ - const Kokkos::TeamPolicy > \ - policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ - Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", \ - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ - } else { \ - const Kokkos::TeamPolicy > \ - policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ - Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", \ - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ - } break -#else -#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \ - if (num_vectors == 1) { \ - Kokkos::TeamPolicy > \ - policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ - Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", \ - policy, *this); \ - } else { \ - Kokkos::TeamPolicy > \ - policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ - policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ - Kokkos::parallel_for \ - ("SolveTridiags::TeamPolicy::run", \ - policy, *this); \ - } break -#endif - switch (blocksize) { - case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3); - case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5); - case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7); - case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9); - case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10); - case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11); - case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16); - case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17); - case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18); - default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0); - } -#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, num_vectors, vector_loop_size); - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) + Kokkos::parallel_for + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + solveSingleVectorNew (member, blocksize, i0, r0, nrows, v, D_internal_vector_values, X_internal_vector_values, WW); + }); } - }; - /// - /// compute local residula vector y = b - R x - /// - static inline int ComputeResidualVectorRecommendedCudaVectorSize(const int blksize, - const int team_size) { - int total_team_size(0); - if (blksize <= 5) total_team_size = 32; - else if (blksize <= 9) total_team_size = 32; // 64 - else if (blksize <= 12) total_team_size = 96; - else if (blksize <= 16) total_team_size = 128; - else if (blksize <= 20) total_team_size = 160; - else total_team_size = 160; - return total_team_size/team_size; - } + template + KOKKOS_INLINE_FUNCTION + void + operator() (const SingleVectorApplyCTag &, const member_type &member) const { + // btdm is packed and sorted from largest one + //const local_ordinal_type packidx = packindices_schur(member.league_rank()); + const local_ordinal_type packidx = packindices_sub(member.league_rank()); - static inline int ComputeResidualVectorRecommendedHIPVectorSize(const int blksize, - const int team_size) { - int total_team_size(0); - if (blksize <= 5) total_team_size = 32; - else if (blksize <= 9) total_team_size = 32; // 64 - else if (blksize <= 12) total_team_size = 96; - else if (blksize <= 16) total_team_size = 128; - else if (blksize <= 20) total_team_size = 160; - else total_team_size = 160; - return total_team_size/team_size; - } + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; + const local_ordinal_type blocksize = e_internal_vector_values.extent(2); - static inline int ComputeResidualVectorRecommendedSYCLVectorSize(const int blksize, - const int team_size) { - int total_team_size(0); - if (blksize <= 5) total_team_size = 32; - else if (blksize <= 9) total_team_size = 32; // 64 - else if (blksize <= 12) total_team_size = 96; - else if (blksize <= 16) total_team_size = 128; - else if (blksize <= 20) total_team_size = 160; - else total_team_size = 160; - return total_team_size/team_size; - } + //const local_ordinal_type npacks = packptr_sub(packidx+1) - subpartidx; + const local_ordinal_type i0 = pack_td_ptr(partidx,local_subpartidx); + const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); + const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); - template - static inline int ComputeResidualVectorRecommendedVectorSize(const int blksize, - const int team_size) { - if ( is_cuda::value ) - return ComputeResidualVectorRecommendedCudaVectorSize(blksize, team_size); - if ( is_hip::value ) - return ComputeResidualVectorRecommendedHIPVectorSize(blksize, team_size); - if ( is_sycl::value ) - return ComputeResidualVectorRecommendedSYCLVectorSize(blksize, team_size); - return -1; - } + internal_vector_scratch_type_3d_view + WW(member.team_scratch(0), blocksize, blocksize, vector_loop_size); - - template - struct ComputeResidualVector { - public: - using impl_type = ImplType; - using node_device_type = typename impl_type::node_device_type; - using execution_space = typename impl_type::execution_space; - using memory_space = typename impl_type::memory_space; + // Compute v_2 = v_2 - C v_1 - using local_ordinal_type = typename impl_type::local_ordinal_type; - using size_type = typename impl_type::size_type; - using impl_scalar_type = typename impl_type::impl_scalar_type; - using magnitude_type = typename impl_type::magnitude_type; - using btdm_scalar_type = typename impl_type::btdm_scalar_type; - using btdm_magnitude_type = typename impl_type::btdm_magnitude_type; - /// views - using local_ordinal_type_1d_view = typename impl_type::local_ordinal_type_1d_view; - using size_type_1d_view = typename impl_type::size_type_1d_view; - using tpetra_block_access_view_type = typename impl_type::tpetra_block_access_view_type; // block crs (layout right) - using impl_scalar_type_1d_view = typename impl_type::impl_scalar_type_1d_view; - using impl_scalar_type_2d_view_tpetra = typename impl_type::impl_scalar_type_2d_view_tpetra; // block multivector (layout left) - using vector_type_3d_view = typename impl_type::vector_type_3d_view; - using btdm_scalar_type_4d_view = typename impl_type::btdm_scalar_type_4d_view; - static constexpr int vector_length = impl_type::vector_length; + const local_ordinal_type local_subpartidx_schur = (local_subpartidx-1)/2; + const local_ordinal_type i0_schur = local_subpartidx_schur == 0 ? pack_td_ptr_schur(partidx,local_subpartidx_schur) : pack_td_ptr_schur(partidx,local_subpartidx_schur) + 1; + const local_ordinal_type i0_offset = local_subpartidx_schur == 0 ? i0+2 : i0+2; - /// team policy member type (used in cuda) - using member_type = typename Kokkos::TeamPolicy::member_type; + (void) i0_schur; + (void) i0_offset; - // enum for max blocksize and vector length - enum : int { max_blocksize = 32 }; + const auto one = Kokkos::ArithTraits::one(); - private: - ConstUnmanaged b; - ConstUnmanaged x; // x_owned - ConstUnmanaged x_remote; - Unmanaged y; - Unmanaged y_packed; - Unmanaged y_packed_scalar; - - // AmD information - const ConstUnmanaged rowptr, rowptr_remote; - const ConstUnmanaged colindsub, colindsub_remote; - const ConstUnmanaged tpetra_values; - - // block crs graph information - // for cuda (kokkos crs graph uses a different size_type from size_t) - const ConstUnmanaged > A_rowptr; - const ConstUnmanaged > A_colind; - - // blocksize - const local_ordinal_type blocksize_requested; + const size_type c_kps2 = local_subpartidx > 0 ? pack_td_ptr(partidx, local_subpartidx)-2 : 0; + const size_type c_kps1 = pack_td_ptr(partidx, local_subpartidx+1)+1; - // part interface - const ConstUnmanaged part2packrowidx0; - const ConstUnmanaged part2rowidx0; - const ConstUnmanaged rowidx2part; - const ConstUnmanaged partptr; - const ConstUnmanaged lclrow; - const ConstUnmanaged dm2cm; - const bool is_dm2cm_active; + typedef SolveTridiagsDefaultModeAndAlgo + default_mode_and_algo_type; - public: - template - ComputeResidualVector(const AmD &amd, - const LocalCrsGraphType &graph, - const local_ordinal_type &blocksize_requested_, - const PartInterface &interf, - const local_ordinal_type_1d_view &dm2cm_) - : rowptr(amd.rowptr), rowptr_remote(amd.rowptr_remote), - colindsub(amd.A_colindsub), colindsub_remote(amd.A_colindsub_remote), - tpetra_values(amd.tpetra_values), - A_rowptr(graph.row_map), - A_colind(graph.entries), - blocksize_requested(blocksize_requested_), - part2packrowidx0(interf.part2packrowidx0), - part2rowidx0(interf.part2rowidx0), - rowidx2part(interf.rowidx2part), - partptr(interf.partptr), - lclrow(interf.lclrow), - dm2cm(dm2cm_), - is_dm2cm_active(dm2cm_.span() > 0) - {} + typedef typename default_mode_and_algo_type::mode_type default_mode_type; + typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type; - inline - void - SerialGemv(const local_ordinal_type &blocksize, - const impl_scalar_type * const KOKKOS_RESTRICT AA, - const impl_scalar_type * const KOKKOS_RESTRICT xx, - /* */ impl_scalar_type * KOKKOS_RESTRICT yy) const { - using tlb = TpetraLittleBlock; - for (local_ordinal_type k0=0;k0 - KOKKOS_INLINE_FUNCTION - void - VectorCopy(const member_type &member, - const local_ordinal_type &blocksize, - const bbViewType &bb, - const yyViewType &yy) const { - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, blocksize), [&](const local_ordinal_type &k0) { - yy(k0) = static_cast(bb(k0)); - }); - } - - template - KOKKOS_INLINE_FUNCTION - void - TeamVectorGemv(const member_type &member, - const local_ordinal_type &blocksize, - const AAViewType &AA, - const xxViewType &xx, - const yyViewType &yy) const { - Kokkos::parallel_for - (Kokkos::TeamThreadRange(member, blocksize), - [&](const local_ordinal_type &k0) { - impl_scalar_type val = 0; - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, blocksize), - [&](const local_ordinal_type &k1) { - val += AA(k0,k1)*xx(k1); - }); - Kokkos::atomic_fetch_add(&yy(k0), typename yyViewType::const_value_type(-val)); - }); - } - - template - KOKKOS_INLINE_FUNCTION - void - VectorGemv(const member_type &member, - const local_ordinal_type &blocksize, - const AAViewType &AA, - const xxViewType &xx, - const yyViewType &yy) const { - Kokkos::parallel_for - (Kokkos::ThreadVectorRange(member, blocksize), - [&](const local_ordinal_type &k0) { - impl_scalar_type val(0); - for (local_ordinal_type k1=0;k1 - // KOKKOS_INLINE_FUNCTION - // void - // VectorGemv(const member_type &member, - // const local_ordinal_type &blocksize, - // const AAViewType &AA, - // const xxViewType &xx, - // const yyViewType &yy) const { - // for (local_ordinal_type k0=0;k0 FIXME HIP: should not need KOKKOS_INLINE_FUNCTION - KOKKOS_INLINE_FUNCTION - void - operator() (const SeqTag &, const local_ordinal_type& i) const { - const local_ordinal_type blocksize = blocksize_requested; - const local_ordinal_type blocksize_square = blocksize*blocksize; - - // constants - const Kokkos::pair block_range(0, blocksize); - const local_ordinal_type num_vectors = y.extent(1); - const local_ordinal_type row = i*blocksize; - for (local_ordinal_type col=0;col block_range(0, blocksize); - const local_ordinal_type num_vectors = y.extent(1); - - // subview pattern - auto bb = Kokkos::subview(b, block_range, 0); - auto xx = bb; - auto yy = Kokkos::subview(y, block_range, 0); - auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); - - const local_ordinal_type row = lr*blocksize; - for (local_ordinal_type col=0;col - struct AsyncTag {}; - - template - // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION void - operator() (const AsyncTag &, const local_ordinal_type &rowidx) const { - const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); - const local_ordinal_type blocksize_square = blocksize*blocksize; - - // constants - const local_ordinal_type partidx = rowidx2part(rowidx); - const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); - const local_ordinal_type v = partidx % vector_length; - - const local_ordinal_type num_vectors = y_packed.extent(2); - const local_ordinal_type num_local_rows = lclrow.extent(0); - - // temporary buffer for y flat - impl_scalar_type yy[B == 0 ? max_blocksize : B] = {}; - - const local_ordinal_type lr = lclrow(rowidx); - const local_ordinal_type row = lr*blocksize; - for (local_ordinal_type col=0;col &, const member_type &member) const { + const local_ordinal_type packidx = packindices_sub(member.league_rank()); - template - KOKKOS_INLINE_FUNCTION - void - operator() (const AsyncTag &, const member_type &member) const { - const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); - const local_ordinal_type blocksize_square = blocksize*blocksize; + const local_ordinal_type partidx = packptr_sub(packidx); - // constants - const local_ordinal_type rowidx = member.league_rank(); - const local_ordinal_type partidx = rowidx2part(rowidx); - const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); - const local_ordinal_type v = partidx % vector_length; + const local_ordinal_type blocksize = e_internal_vector_values.extent(2); - const Kokkos::pair block_range(0, blocksize); - const local_ordinal_type num_vectors = y_packed_scalar.extent(2); - const local_ordinal_type num_local_rows = lclrow.extent(0); + const local_ordinal_type i0_schur = pack_td_ptr_schur(partidx,0); + const local_ordinal_type nrows = 2*(n_subparts_per_part-1); - // subview pattern - auto bb = Kokkos::subview(b, block_range, 0); - auto xx = Kokkos::subview(x, block_range, 0); - auto xx_remote = Kokkos::subview(x_remote, block_range, 0); - auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0); - auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); - - const local_ordinal_type lr = lclrow(rowidx); - const local_ordinal_type row = lr*blocksize; - for (local_ordinal_type col=0;col(member, + Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), + Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL())); + } } - } - template struct OverlapTag {}; - - template - // inline ---> FIXME HIP: should not need KOKKOS_INLINE_FUNCTION - KOKKOS_INLINE_FUNCTION - void - operator() (const OverlapTag &, const local_ordinal_type& rowidx) const { - const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); - const local_ordinal_type blocksize_square = blocksize*blocksize; - - // constants - const local_ordinal_type partidx = rowidx2part(rowidx); - const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); - const local_ordinal_type v = partidx % vector_length; - - const local_ordinal_type num_vectors = y_packed.extent(2); - const local_ordinal_type num_local_rows = lclrow.extent(0); - - // temporary buffer for y flat - impl_scalar_type yy[max_blocksize] = {}; - - auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); - auto rowptr_used = (P == 0 ? rowptr : rowptr_remote); - - const local_ordinal_type lr = lclrow(rowidx); - const local_ordinal_type row = lr*blocksize; - for (local_ordinal_type col=0;col (member, blocksize, i0_schur, r0_schur, nrows, v, D_internal_vector_values_schur, X_internal_vector_values_schur, WW); + }); - // y -= Rx - const size_type A_k0 = A_rowptr[lr]; - for (size_type k=rowptr_used[lr];k(member, + Kokkos::subview(X_internal_vector_values, r0+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()), + Kokkos::subview(X_internal_vector_values_schur, r0_schur+2*schur_sub_part+i, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL())); } } } - template + template KOKKOS_INLINE_FUNCTION void - operator() (const OverlapTag &, const member_type &member) const { - const local_ordinal_type blocksize = (B == 0 ? blocksize_requested : B); - const local_ordinal_type blocksize_square = blocksize*blocksize; + operator() (const SingleVectorApplyETag &, const member_type &member) const { + const local_ordinal_type packidx = packindices_sub(member.league_rank()); - // constants - const local_ordinal_type rowidx = member.league_rank(); - const local_ordinal_type partidx = rowidx2part(rowidx); - const local_ordinal_type pri = part2packrowidx0(partidx) + (rowidx - partptr(partidx)); - const local_ordinal_type v = partidx % vector_length; + const local_ordinal_type subpartidx = packptr_sub(packidx); + const local_ordinal_type n_parts = part2packrowidx0_sub.extent(0); + const local_ordinal_type local_subpartidx = subpartidx/n_parts; + const local_ordinal_type partidx = subpartidx%n_parts; + const local_ordinal_type blocksize = e_internal_vector_values.extent(2); - const Kokkos::pair block_range(0, blocksize); - const local_ordinal_type num_vectors = y_packed_scalar.extent(2); - const local_ordinal_type num_local_rows = lclrow.extent(0); + const local_ordinal_type r0 = part2packrowidx0_sub(partidx,local_subpartidx); + const local_ordinal_type nrows = partptr_sub(subpartidx,1) - partptr_sub(subpartidx,0); - // subview pattern - auto bb = Kokkos::subview(b, block_range, 0); - auto xx = bb; //Kokkos::subview(x, block_range, 0); - auto xx_remote = bb; //Kokkos::subview(x_remote, block_range, 0); - auto yy = Kokkos::subview(y_packed_scalar, 0, block_range, 0, 0); - auto A_block = ConstUnmanaged(NULL, blocksize, blocksize); - auto colindsub_used = (P == 0 ? colindsub : colindsub_remote); - auto rowptr_used = (P == 0 ? rowptr : rowptr_remote); - - const local_ordinal_type lr = lclrow(rowidx); - const local_ordinal_type row = lr*blocksize; - for (local_ordinal_type col=0;col::one(); + + typedef SolveTridiagsDefaultModeAndAlgo + default_mode_and_algo_type; + + typedef typename default_mode_and_algo_type::mode_type default_mode_type; + typedef typename default_mode_and_algo_type::single_vector_algo_type default_algo_type; + + if (local_subpartidx == 0) { Kokkos::parallel_for - (Kokkos::TeamThreadRange(member, rowptr_used[lr], rowptr_used[lr+1]), - [&](const local_ordinal_type &k) { - const size_type j = A_k0 + colindsub_used[k]; - A_block.assign_data( &tpetra_values(j*blocksize_square) ); - - const local_ordinal_type A_colind_at_j = A_colind[j]; - if (P == 0) { - const auto loc = is_dm2cm_active ? dm2cm[A_colind_at_j] : A_colind_at_j; - xx.assign_data( &x(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx, yy); - } else { - const auto loc = A_colind_at_j - num_local_rows; - xx_remote.assign_data( &x_remote(loc*blocksize, col) ); - VectorGemv(member, blocksize, A_block, xx_remote, yy); + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + + auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v); + + for (local_ordinal_type row = 0; row < nrows; ++row) { + auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v); + auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v); + + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + -one, + E.data(), E.stride_0(), E.stride_1(), + v_2.data(), v_2.stride_0(), + one, + v_1.data(), v_1.stride_0()); } }); } - } - - // y = b - Rx; seq method - template - void run(const MultiVectorLocalViewTypeY &y_, - const MultiVectorLocalViewTypeB &b_, - const MultiVectorLocalViewTypeX &x_) { - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::"); - - y = y_; b = b_; x = x_; - if constexpr (is_device::value) { - const local_ordinal_type blocksize = blocksize_requested; - const local_ordinal_type team_size = 8; - const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); - const Kokkos::TeamPolicy policy(rowptr.extent(0) - 1, team_size, vector_size); + else if (local_subpartidx == (local_ordinal_type) part2packrowidx0_sub.extent(1) - 2) { Kokkos::parallel_for - ("ComputeResidual::TeamPolicy::run", policy, *this); - } else { - const Kokkos::RangePolicy policy(0, rowptr.extent(0) - 1); + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v); + + for (local_ordinal_type row = 0; row < nrows; ++row) { + auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v); + auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v); + + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + -one, + E.data(), E.stride_0(), E.stride_1(), + v_2.data(), v_2.stride_0(), + one, + v_1.data(), v_1.stride_0()); + } + }); + } + else { Kokkos::parallel_for - ("ComputeResidual::RangePolicy::run", policy, *this); + (Kokkos::ThreadVectorRange(member, vector_loop_size),[&](const int &v) { + { + auto v_2 = Kokkos::subview(X_internal_vector_values, r0+nrows, Kokkos::ALL(), 0, v); + + for (local_ordinal_type row = 0; row < nrows; ++row) { + auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v); + auto E = Kokkos::subview(e_internal_vector_values, 0, r0+row, Kokkos::ALL(), Kokkos::ALL(), v); + + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + -one, + E.data(), E.stride_0(), E.stride_1(), + v_2.data(), v_2.stride_0(), + one, + v_1.data(), v_1.stride_0()); + } + } + { + auto v_2 = Kokkos::subview(X_internal_vector_values, r0-1, Kokkos::ALL(), 0, v); + + for (local_ordinal_type row = 0; row < nrows; ++row) { + auto v_1 = Kokkos::subview(X_internal_vector_values, r0+row, Kokkos::ALL(), 0, v); + auto E = Kokkos::subview(e_internal_vector_values, 1, r0+row, Kokkos::ALL(), Kokkos::ALL(), v); + + KOKKOSBATCHED_GEMV_NO_TRANSPOSE_INTERNAL_INVOKE + (default_mode_type,default_algo_type, + member, + blocksize, blocksize, + -one, + E.data(), E.stride_0(), E.stride_1(), + v_2.data(), v_2.stride_0(), + one, + v_1.data(), v_1.stride_0()); + } + } + }); } - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) } - // y = b - R (x , x_remote) - template - void run(const vector_type_3d_view &y_packed_, - const MultiVectorLocalViewTypeB &b_, - const MultiVectorLocalViewTypeX &x_, - const MultiVectorLocalViewTypeX_Remote &x_remote_) { + void run(const impl_scalar_type_2d_view_tpetra &Y, + const impl_scalar_type_1d_view &Z) { IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::"); - - b = b_; x = x_; x_remote = x_remote_; - if constexpr (is_device::value) { - y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(), - y_packed_.extent(0), - y_packed_.extent(1), - y_packed_.extent(2), - vector_length); - } else { - y_packed = y_packed_; - } + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::SolveTridiags"); - if constexpr(is_device::value) { - const local_ordinal_type blocksize = blocksize_requested; - const local_ordinal_type team_size = 8; - const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); - // local_ordinal_type vl_power_of_two = 1; - // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); - // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); - // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; -#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \ - const Kokkos::TeamPolicy > \ - policy(rowidx2part.extent(0), team_size, vector_size); \ - Kokkos::parallel_for \ - ("ComputeResidual::TeamPolicy::run", \ - policy, *this); } break - switch (blocksize_requested) { - case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); - case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); - case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); - case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); - case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); - case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); - case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); - case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); - case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); - default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); - } -#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL - } else { -#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) { \ - const Kokkos::RangePolicy > policy(0, rowidx2part.extent(0)); \ - Kokkos::parallel_for \ - ("ComputeResidual::RangePolicy::run", \ - policy, *this); } break - switch (blocksize_requested) { - case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); - case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); - case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); - case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); - case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); - case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); - case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); - case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); - case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); - default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); - } -#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL - } - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) - } - - // y = b - R (y , y_remote) - template - void run(const vector_type_3d_view &y_packed_, - const MultiVectorLocalViewTypeB &b_, - const MultiVectorLocalViewTypeX &x_, - const MultiVectorLocalViewTypeX_Remote &x_remote_, - const bool compute_owned) { - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ComputeResidual::"); - - b = b_; x = x_; x_remote = x_remote_; - if constexpr (is_device::value) { - y_packed_scalar = btdm_scalar_type_4d_view((btdm_scalar_type*)y_packed_.data(), - y_packed_.extent(0), - y_packed_.extent(1), - y_packed_.extent(2), - vector_length); - } else { - y_packed = y_packed_; - } + /// set vectors + this->Y_scalar_multivector = Y; + this->Z_scalar_vector = Z; - if constexpr (is_device::value) { - const local_ordinal_type blocksize = blocksize_requested; - const local_ordinal_type team_size = 8; - const local_ordinal_type vector_size = ComputeResidualVectorRecommendedVectorSize(blocksize, team_size); - // local_ordinal_type vl_power_of_two = 1; - // for (;vl_power_of_two<=blocksize_requested;vl_power_of_two*=2); - // vl_power_of_two *= (vl_power_of_two < blocksize_requested ? 2 : 1); - // const local_ordinal_type vl = vl_power_of_two > vector_length ? vector_length : vl_power_of_two; -#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ - if (compute_owned) { \ - const Kokkos::TeamPolicy > \ - policy(rowidx2part.extent(0), team_size, vector_size); \ - Kokkos::parallel_for \ - ("ComputeResidual::TeamPolicy::run >", policy, *this); \ - } else { \ - const Kokkos::TeamPolicy > \ - policy(rowidx2part.extent(0), team_size, vector_size); \ - Kokkos::parallel_for \ - ("ComputeResidual::TeamPolicy::run >", policy, *this); \ - } break - switch (blocksize_requested) { - case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); - case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); - case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); - case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); - case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); - case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); - case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); - case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); - case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); - default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); - } -#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL - } else { -#define BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(B) \ - if (compute_owned) { \ - const Kokkos::RangePolicy > \ - policy(0, rowidx2part.extent(0)); \ - Kokkos::parallel_for \ - ("ComputeResidual::RangePolicy::run >", policy, *this); \ - } else { \ - const Kokkos::RangePolicy > \ - policy(0, rowidx2part.extent(0)); \ - Kokkos::parallel_for \ - ("ComputeResidual::RangePolicy::run >", policy, *this); \ - } break - - switch (blocksize_requested) { - case 3: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 3); - case 5: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 5); - case 7: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 7); - case 9: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 9); - case 10: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(10); - case 11: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(11); - case 16: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(16); - case 17: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(17); - case 18: BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL(18); - default : BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL( 0); - } -#undef BLOCKTRIDICONTAINER_DETAILS_COMPUTERESIDUAL - } - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(execution_space) - } - }; + const local_ordinal_type num_vectors = X_internal_vector_values.extent(2); + const local_ordinal_type blocksize = D_internal_vector_values.extent(1); - template - void reduceVector(const ConstUnmanaged::impl_scalar_type_1d_view> zz, - /* */ typename ImplType::magnitude_type *vals) { - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_BEGIN; - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ReduceVector"); + const local_ordinal_type team_size = + SolveTridiagsDefaultModeAndAlgo:: + recommended_team_size(blocksize, vector_length, internal_vector_length); + const int per_team_scratch = internal_vector_scratch_type_3d_view + ::shmem_size(blocksize, num_vectors, vector_loop_size); - using impl_type = ImplType; - using local_ordinal_type = typename impl_type::local_ordinal_type; - using impl_scalar_type = typename impl_type::impl_scalar_type; -#if 0 - const auto norm2 = KokkosBlas::nrm1(zz); +#if defined(KOKKOS_ENABLE_DEPRECATED_CODE) +#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \ + if (num_vectors == 1) { \ + const Kokkos::TeamPolicy > \ + policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ + } else { \ + const Kokkos::TeamPolicy > \ + policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)), *this); \ + } break #else - impl_scalar_type norm2(0); - Kokkos::parallel_reduce - ("ReduceMultiVector::Device", - Kokkos::RangePolicy(0,zz.extent(0)), - KOKKOS_LAMBDA(const local_ordinal_type &i, impl_scalar_type &update) { - update += zz(i); - }, norm2); -#endif - vals[0] = Kokkos::ArithTraits::abs(norm2); - - IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; - IFPACK2_BLOCKTRIDICONTAINER_TIMER_FENCE(typename ImplType::execution_space) - } - - /// - /// Manage the distributed part of the computation of residual norms. - /// - template - struct NormManager { - public: - using impl_type = ImplType; - using host_execution_space = typename impl_type::host_execution_space; - using magnitude_type = typename impl_type::magnitude_type; - - private: - bool collective_; - int sweep_step_, sweep_step_upper_bound_; -#ifdef HAVE_IFPACK2_MPI - MPI_Request mpi_request_; - MPI_Comm comm_; -#endif - magnitude_type work_[3]; - - public: - NormManager() = default; - NormManager(const NormManager &b) = default; - NormManager(const Teuchos::RCP >& comm) { - sweep_step_ = 1; - sweep_step_upper_bound_ = 1; - collective_ = comm->getSize() > 1; - if (collective_) { -#ifdef HAVE_IFPACK2_MPI - const auto mpi_comm = Teuchos::rcp_dynamic_cast >(comm); - TEUCHOS_ASSERT( ! mpi_comm.is_null()); - comm_ = *mpi_comm->getRawMpiComm(); -#endif - } - const magnitude_type zero(0), minus_one(-1); - work_[0] = zero; - work_[1] = zero; - work_[2] = minus_one; - } - - // Check the norm every sweep_step sweeps. - void setCheckFrequency(const int sweep_step) { - TEUCHOS_TEST_FOR_EXCEPT_MSG(sweep_step < 1, "sweep step must be >= 1"); - sweep_step_upper_bound_ = sweep_step; - sweep_step_ = 1; - } - - // Get the buffer into which to store rank-local squared norms. - magnitude_type* getBuffer() { return &work_[0]; } - - // Call MPI_Iallreduce to find the global squared norms. - void ireduce(const int sweep, const bool force = false) { - if ( ! force && sweep % sweep_step_) return; - - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::Ireduce"); - - work_[1] = work_[0]; -#ifdef HAVE_IFPACK2_MPI - auto send_data = &work_[1]; - auto recv_data = &work_[0]; - if (collective_) { -# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3) - MPI_Iallreduce(send_data, recv_data, 1, - Teuchos::Details::MpiTypeTraits::getType(), - MPI_SUM, comm_, &mpi_request_); -# else - MPI_Allreduce (send_data, recv_data, 1, - Teuchos::Details::MpiTypeTraits::getType(), - MPI_SUM, comm_); -# endif - } -#endif - } - - // Check if the norm-based termination criterion is met. tol2 is the - // tolerance squared. Sweep is the sweep index. If not every iteration is - // being checked, this function immediately returns false. If a check must - // be done at this iteration, it waits for the reduction triggered by - // ireduce to complete, then checks the global norm against the tolerance. - bool checkDone (const int sweep, const magnitude_type tol2, const bool force = false) { - // early return - if (sweep <= 0) return false; - - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::NormManager::CheckDone"); - - TEUCHOS_ASSERT(sweep >= 1); - if ( ! force && (sweep - 1) % sweep_step_) return false; - if (collective_) { -#ifdef HAVE_IFPACK2_MPI -# if defined(IFPACK2_BLOCKTRIDICONTAINER_USE_MPI_3) - MPI_Wait(&mpi_request_, MPI_STATUS_IGNORE); -# else - // Do nothing. -# endif +#define BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(B) \ + if (num_vectors == 1) { \ + if (packindices_schur.extent(0) == 0) { \ + Kokkos::TeamPolicy > \ + policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + } \ + else { \ + { \ + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSubLineTag"); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSubLineTag.mm"); \ + Kokkos::TeamPolicy > \ + policy(packindices_sub.extent(0), team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSubLineTag.mm"); \ + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \ + } \ + { \ + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyCTag"); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyCTag.mm"); \ + Kokkos::TeamPolicy > \ + policy(packindices_sub.extent(0), team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyCTag.mm"); \ + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \ + } \ + { \ + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorSchurTag"); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorSchurTag.mm"); \ + Kokkos::TeamPolicy > \ + policy(part2packrowidx0_sub.extent(0), team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorSchurTag.mm"); \ + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \ + } \ + { \ + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi::SingleVectorApplyETag"); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_before_SingleVectorApplyETag.mm"); \ + Kokkos::TeamPolicy > \ + policy(packindices_sub.extent(0), team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + write4DMultiVectorValuesToFile(part2packrowidx0_sub.extent(0), X_internal_scalar_values, "x_scalar_values_after_SingleVectorApplyETag.mm"); \ + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) \ + } \ + } \ + } else { \ + Kokkos::TeamPolicy > \ + policy(packptr.extent(0) - 1, team_size, vector_loop_size); \ + policy.set_scratch_size(0,Kokkos::PerTeam(per_team_scratch)); \ + Kokkos::parallel_for \ + ("SolveTridiags::TeamPolicy::run", \ + policy, *this); \ + } break #endif + switch (blocksize) { + case 3: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 3); + case 5: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 5); + case 7: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 7); + case 9: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 9); + case 10: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(10); + case 11: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(11); + case 16: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(16); + case 17: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(17); + case 18: BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS(18); + default : BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS( 0); } - bool r_val = false; - if (sweep == 1) { - work_[2] = work_[0]; - } else { - r_val = (work_[0] < tol2*work_[2]); - } - - // adjust sweep step - const auto adjusted_sweep_step = 2*sweep_step_; - if (adjusted_sweep_step < sweep_step_upper_bound_) { - sweep_step_ = adjusted_sweep_step; - } else { - sweep_step_ = sweep_step_upper_bound_; - } - return r_val; - } +#undef BLOCKTRIDICONTAINER_DETAILS_SOLVETRIDIAGS - // After termination has occurred, finalize the norms for use in - // get_norms{0,final}. - void finalize () { - work_[0] = std::sqrt(work_[0]); // after converged - if (work_[2] >= 0) - work_[2] = std::sqrt(work_[2]); // first norm - // if work_[2] is minus one, then norm is not requested. + IFPACK2_BLOCKTRIDICONTAINER_PROFILER_REGION_END; + IFPACK2_BLOCKHELPER_TIMER_FENCE(execution_space) } - - // Report norms to the caller. - const magnitude_type getNorms0 () const { return work_[2]; } - const magnitude_type getNormsFinal () const { return work_[0]; } }; /// @@ -3973,30 +4422,30 @@ namespace Ifpack2 { template int applyInverseJacobi(// importer - const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, - const Teuchos::RCP::tpetra_import_type> &tpetra_importer, + const Teuchos::RCP::tpetra_block_crs_matrix_type> &A, + const Teuchos::RCP::tpetra_import_type> &tpetra_importer, const Teuchos::RCP > &async_importer, const bool overlap_communication_and_computation, // tpetra interface - const typename ImplType::tpetra_multivector_type &X, // tpetra interface - /* */ typename ImplType::tpetra_multivector_type &Y, // tpetra interface - /* */ typename ImplType::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method) - /* */ typename ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) + const typename BlockHelperDetails::ImplType::tpetra_multivector_type &X, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Y, // tpetra interface + /* */ typename BlockHelperDetails::ImplType::tpetra_multivector_type &Z, // temporary tpetra interface (seq_method) + /* */ typename BlockHelperDetails::ImplType::impl_scalar_type_1d_view &W, // temporary tpetra interface (diff) // local object interface - const PartInterface &interf, // mesh interface + const BlockHelperDetails::PartInterface &interf, // mesh interface const BlockTridiags &btdm, // packed block tridiagonal matrices - const AmD &amd, // R = A - D - /* */ typename ImplType::vector_type_1d_view &work, // workspace for packed multivector of right hand side - /* */ NormManager &norm_manager, + const BlockHelperDetails::AmD &amd, // R = A - D + /* */ typename BlockHelperDetails::ImplType::vector_type_1d_view &work, // workspace for packed multivector of right hand side + /* */ BlockHelperDetails::NormManager &norm_manager, // preconditioner parameters - const typename ImplType::impl_scalar_type &damping_factor, + const typename BlockHelperDetails::ImplType::impl_scalar_type &damping_factor, /* */ bool is_y_zero, const int max_num_sweeps, - const typename ImplType::magnitude_type tol, + const typename BlockHelperDetails::ImplType::magnitude_type tol, const int check_tol_every) { - IFPACK2_BLOCKTRIDICONTAINER_TIMER("BlockTriDi::ApplyInverseJacobi"); + IFPACK2_BLOCKHELPER_TIMER("BlockTriDi::ApplyInverseJacobi"); - using impl_type = ImplType; + using impl_type = BlockHelperDetails::ImplType; using node_memory_space = typename impl_type::node_memory_space; using local_ordinal_type = typename impl_type::local_ordinal_type; using size_type = typename impl_type::size_type; @@ -4075,7 +4524,7 @@ namespace Ifpack2 { damping_factor, is_norm_manager_active); const local_ordinal_type_1d_view dummy_local_ordinal_type_1d_view; - ComputeResidualVector + BlockHelperDetails::ComputeResidualVector compute_residual_vector(amd, A->getCrsGraph().getLocalGraphDevice(), blocksize, interf, is_async_importer_active ? async_importer->dm2cm : dummy_local_ordinal_type_1d_view); @@ -4131,7 +4580,7 @@ namespace Ifpack2 { { if (is_norm_manager_active) { // y(lclrow) = (b - a) y(lclrow) + a pmv, with b = 1 always. - reduceVector(W, norm_manager.getBuffer()); + BlockHelperDetails::reduceVector(W, norm_manager.getBuffer()); if (sweep + 1 == max_num_sweeps) { norm_manager.ireduce(sweep, true); norm_manager.checkDone(sweep + 1, tolerance, true); @@ -4152,11 +4601,11 @@ namespace Ifpack2 { template struct ImplObject { - using impl_type = ImplType; - using part_interface_type = PartInterface; + using impl_type = BlockHelperDetails::ImplType; + using part_interface_type = BlockHelperDetails::PartInterface; using block_tridiags_type = BlockTridiags; - using amd_type = AmD; - using norm_manager_type = NormManager; + using amd_type = BlockHelperDetails::AmD; + using norm_manager_type = BlockHelperDetails::NormManager; using async_import_type = AsyncableImport; // distructed objects diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp index 912e8e5e5baa..441da4c2757f 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestBlockTriDiContainerUtil.hpp @@ -186,7 +186,7 @@ struct BlockTriDiContainerTester { Teuchos::Array > parts; make_parts(sb, sbp, *A, nonuniform_lines, jacobi, parts); return Teuchos::rcp(new Ifpack2::BlockTriDiContainer( - A, parts, overlap_comm, seq_method)); + A, parts, 1, overlap_comm, seq_method)); } static Int