From df5636ce56e3ca2309906138699cfa0ea4e17f03 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 20 May 2021 12:13:37 -0600 Subject: [PATCH 1/8] Tpetra Transfer: Add isLocallyFitted Determines if source and target maps are locally fitted. Can be more efficient than calling isLocallyFitted() on these maps. --- packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp | 8 ++++++++ packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp | 8 ++++++++ .../core/test/ImportExport/ImportExport_UnitTests.cpp | 8 ++++++++ 3 files changed, 24 insertions(+) diff --git a/packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp index 30aabde1c071..3e3d9aef2cb8 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp @@ -233,6 +233,14 @@ class Transfer : public Teuchos::Describable { void expertSetExportLIDsContiguous(Transfer transfer, bool contig); #endif // DOXYGEN_SHOULD_SKIP_THIS + /// \brief Are source and target map locally fitted? + /// + /// Returns whether source and target map are locally fitted on the + /// calling rank. This can be more efficient than calling + /// isLocallyFitted() on the maps directly, since no indices need to + /// be compared. + bool isLocallyFitted () const; + /// \brief Describe this object in a human-readable way to the given /// output stream. 
/// diff --git a/packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp b/packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp index 3e089b8842f6..a9759fa67e77 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp @@ -299,6 +299,14 @@ isLocallyComplete () const { return TransferData_->isLocallyComplete_; } +template +bool +Transfer:: +isLocallyFitted () const { + return (getNumSameIDs() == std::min(getSourceMap()->getNodeNumElements(), + getTargetMap()->getNodeNumElements())); +} + template void Transfer:: diff --git a/packages/tpetra/core/test/ImportExport/ImportExport_UnitTests.cpp b/packages/tpetra/core/test/ImportExport/ImportExport_UnitTests.cpp index fea5092ed910..83f9c8adc214 100644 --- a/packages/tpetra/core/test/ImportExport/ImportExport_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport/ImportExport_UnitTests.cpp @@ -128,6 +128,7 @@ namespace { bool isvalid=Tpetra::Import_Util::checkImportValidity(*importer); TEST_EQUALITY(isvalid,true); + TEST_EQUALITY(importer->isLocallyFitted(), source->isLocallyFitted(*target)); } TEUCHOS_UNIT_TEST_TEMPLATE_4_DECL( ImportExport, GetNeighborsForward, Scalar, LO, GO, Node ) @@ -185,11 +186,13 @@ namespace { TEST_EQUALITY( importer->getNumPermuteIDs(), static_cast(myImageID == 0 ? 0 : 1) ); TEST_EQUALITY( importer->getNumExportIDs(), (myImageID == 0 || myImageID == numImages - 1 ? 1 : 2) ); TEST_EQUALITY( importer->getNumRemoteIDs(), (myImageID == 0 || myImageID == numImages - 1 ? 1 : 2) ); + TEST_EQUALITY( importer->isLocallyFitted(), tmap->isLocallyFitted(*smap)); // exporter testing TEST_EQUALITY_CONST( exporter->getSourceMap() == tmap, true ); TEST_EQUALITY_CONST( exporter->getTargetMap() == smap, true ); TEST_EQUALITY( importer->getNumSameIDs(), (myImageID == 0 ? 1 : 0) ); TEST_EQUALITY( exporter->getNumPermuteIDs(), static_cast(myImageID == 0 ? 
0 : 1) ); + TEST_EQUALITY( exporter->isLocallyFitted(), tmap->isLocallyFitted(*smap)); // import neighbors, test their proper arrival // [ 0 n 2n 3n 4n ] // mvWithNeighbors = [... .... .... .... ....] @@ -290,11 +293,13 @@ namespace { TEST_EQUALITY( importer->getNumPermuteIDs(), static_cast(myImageID == 0 ? 0 : 1) ); TEST_EQUALITY( importer->getNumExportIDs(), (myImageID == 0 || myImageID == numImages - 1 ? 1 : 2) ); TEST_EQUALITY( importer->getNumRemoteIDs(), (myImageID == 0 || myImageID == numImages - 1 ? 1 : 2) ); + TEST_EQUALITY( importer->isLocallyFitted(), tmap->isLocallyFitted(*smap)); // exporter testing TEST_EQUALITY_CONST( exporter->getSourceMap() == tmap, true ); TEST_EQUALITY_CONST( exporter->getTargetMap() == smap, true ); TEST_EQUALITY( importer->getNumSameIDs(), (myImageID == 0 ? 1 : 0) ); TEST_EQUALITY( exporter->getNumPermuteIDs(), static_cast(myImageID == 0 ? 0 : 1) ); + TEST_EQUALITY( exporter->isLocallyFitted(), tmap->isLocallyFitted(*smap)); // import neighbors, test their proper arrival // [ 0 n 2n 3n 4n ] // mvWithNeighbors = [... .... .... .... ....] 
@@ -371,6 +376,7 @@ namespace { // - the first will be over-written (by 1.0) from the source, while // - the second will be "combined", i.e., abs(max(1.0,3.0)) = 3.0 from the dest auto importer = Tpetra::createImport (smap, dmap); + TEST_EQUALITY( importer->isLocallyFitted(), dmap->isLocallyFitted(*smap)); dstVec->doImport (*srcVec,*importer,Tpetra::ABSMAX); TEST_COMPARE_ARRAYS( tuple(-1.0,3.0), dstVec->get1dView() ) // All procs fail if any proc fails @@ -443,6 +449,8 @@ namespace { // Importer Tpetra_Import Importer(FromMap,ToMap); + TEST_EQUALITY( Importer.isLocallyFitted(), ToMap->isLocallyFitted(*FromMap)); + // Duplicating what Zoltan2/Tpetra Does IntVector FromVector(FromMap); IntVector ToVector(ToMap); From 15ac304c0c95cc73308fbcf7ab85a05c739fd454 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 20 May 2021 12:14:57 -0600 Subject: [PATCH 2/8] Tpetra residual: Skip copyAndPermute for locally fitted domain and column map --- .../core/src/Tpetra_Details_residual.hpp | 221 +++++++++++++----- 1 file changed, 159 insertions(+), 62 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp index e95b90bcaead..c65d64431aba 100644 --- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp @@ -134,34 +134,35 @@ residual_launch_parameters (int64_t numRows, template void localResidual(const CrsMatrix & A, - const MultiVector & X, + const MultiVector & X_colmap, const MultiVector & B, - MultiVector & R) { + MultiVector & R, + const MultiVector * X_domainmap=nullptr) { using Tpetra::Details::ProfilingRegion; using Teuchos::NO_TRANS; ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual"); - auto A_lcl = A.getLocalMatrixDevice (); - auto X_lcl = X.getLocalViewDevice(Access::ReadOnly); + auto A_lcl = A.getLocalMatrixDevice (); + auto X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly); auto B_lcl = 
B.getLocalViewDevice(Access::ReadOnly); auto R_lcl = R.getLocalViewDevice(Access::OverwriteAll); const bool debug = ::Tpetra::Details::Behavior::debug (); if (debug) { TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != R.getNumVectors (), std::runtime_error, - "X.getNumVectors() = " << X.getNumVectors () << " != " + (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error, + "X.getNumVectors() = " << X_colmap.getNumVectors () << " != " "R.getNumVectors() = " << R.getNumVectors () << "."); TEUCHOS_TEST_FOR_EXCEPTION - (X.getNumVectors () != R.getNumVectors (), std::runtime_error, - "X.getNumVectors() = " << X.getNumVectors () << " != " + (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error, + "X.getNumVectors() = " << X_colmap.getNumVectors () << " != " "R.getNumVectors() = " << R.getNumVectors () << "."); TEUCHOS_TEST_FOR_EXCEPTION - (X.getLocalLength () != + (X_colmap.getLocalLength () != A.getColMap ()->getNodeNumElements (), std::runtime_error, "X has the wrong number of local rows. " - "X.getLocalLength() = " << X.getLocalLength () << " != " + "X.getLocalLength() = " << X_colmap.getLocalLength () << " != " "A.getColMap()->getNodeNumElements() = " << A.getColMap ()->getNodeNumElements () << "."); TEUCHOS_TEST_FOR_EXCEPTION @@ -185,13 +186,13 @@ void localResidual(const CrsMatrix & A, "domain and range Map arguments) without an intervening " "resumeFill() call before you may call this method."); TEUCHOS_TEST_FOR_EXCEPTION - (! X.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (), + (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (), std::runtime_error, "X, Y and B must be constant stride."); // If the two pointers are NULL, then they don't alias one // another, even though they are equal. 
TEUCHOS_TEST_FOR_EXCEPTION - ((X_lcl.data () == R_lcl.data () && X_lcl.data () != nullptr) || - (X_lcl.data () == B_lcl.data () && X_lcl.data () != nullptr), + ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) || + (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr), std::runtime_error, "X, Y and R may not alias one another."); } @@ -199,7 +200,7 @@ void localResidual(const CrsMatrix & A, #ifdef TPETRA_DETAILS_USE_REFERENCE_RESIDUAL // This is currently a "reference implementation" waiting until Kokkos Kernels provides // a residual kernel. - A.localApply(X,R,Teuchos::NO_TRANS, one, zero); + A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero); R.update(one,B,negone); #else using execution_space = typename CrsMatrix::execution_space; @@ -221,7 +222,7 @@ void localResidual(const CrsMatrix & A, //note: lclOp will be wrapped in shared_ptr auto lclOp = A.getLocalMultiplyOperator(); //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator - lclOp->applyImbalancedRows (X_lcl, R_lcl, Teuchos::NO_TRANS, one, zero); + lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero); R.update(one,B,negone); return; } @@ -248,63 +249,135 @@ void localResidual(const CrsMatrix & A, policy = policy_type (worksets, team_size, vector_length); } - bool is_vector = (X_lcl.extent(1) == 1); + bool is_vector = (X_colmap_lcl.extent(1) == 1); if(is_vector) { - // Vector case - // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp - Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (X_domainmap == nullptr) { + + // Vector case + // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp + 
Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; - if (lclRow >= A_lcl.numRows ()) { - return; - } + if (lclRow >= A_lcl.numRows ()) { + return; + } - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); - residual_value_type A_x = KAT::zero (); + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + residual_value_type A_x = KAT::zero (); - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { - const auto A_val = A_row.value(iEntry); - lsum += A_val * X_lcl(A_row.colidx(iEntry),0); - }, A_x); + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { + const auto A_val = A_row.value(iEntry); + lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0); + }, A_x); - Kokkos::single(Kokkos::PerThread(dev),[&] () { - R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; - }); - });//end parallel_for TeamThreadRange - });//end parallel_for "residual-vector" - } else { - // MultiVector case - // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp - Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { - // NOTE: It looks like I should be able to get this data up above, but if I try to - // we get internal compiler errors. Who knew that gcc tried to "gimplify"? 
- const LO numVectors = static_cast(X_lcl.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + Kokkos::single(Kokkos::PerThread(dev),[&] () { + R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; + }); + });//end parallel_for TeamThreadRange + });//end parallel_for "residual-vector" + } else { + auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly); + + Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + const LO numRows = A_lcl.numRows (); - if (lclRow >= A_lcl.numRows ()) { - return; - } - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); - for(LO v=0; v= numRows) { + return; + } + + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); residual_value_type A_x = KAT::zero (); - + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { const auto A_val = A_row.value(iEntry); - lsum += A_val * X_lcl(A_row.colidx(iEntry),v); + const auto lclCol = A_row.colidx(iEntry); + if (lclCol < numRows) + lsum += A_val * X_domainmap_lcl(lclCol,0); + else + lsum += A_val * X_colmap_lcl(lclCol,0); }, A_x); - + Kokkos::single(Kokkos::PerThread(dev),[&] () { - R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x; - }); + R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; + }); + });//end parallel_for TeamThreadRange + });//end parallel_for "residual-vector" + + } + } else { + // MultiVector case + if (X_domainmap == nullptr) { + // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp + Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { + // NOTE: 
It looks like I should be able to get this data up above, but if I try to + // we get internal compiler errors. Who knew that gcc tried to "gimplify"? + const LO numVectors = static_cast(X_colmap_lcl.extent(1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (lclRow >= A_lcl.numRows ()) { + return; + } + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + for(LO v=0; vgetLocalViewDevice(Access::ReadOnly); + + Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { + // NOTE: It looks like I should be able to get this data up above, but if I try to + // we get internal compiler errors. Who knew that gcc tried to "gimplify"? + const LO numVectors = static_cast(X_colmap_lcl.extent(1)); + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + const LO numRows = A_lcl.numRows (); - }//end for numVectors - });//end parallel_for TeamThreadRange - });//end parallel_for "residual-multivector" + if (lclRow >= numRows) { + return; + } + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + for(LO v=0; v & Aop, using Teuchos::rcp; using Teuchos::rcp_const_cast; using Teuchos::rcpFromRef; + + const bool debug = ::Tpetra::Details::Behavior::debug (); + + // Whether we are using restrictedMode in the import from domain to + // column map. Restricted mode skips the copy and permutation of the + // local part of X. We are using restrictedMode only when domain and + // column map are locally fitted, i.e. when the local indices of + // domain and column map match. + bool restrictedMode = false; const CrsMatrix * Apt = dynamic_cast*>(&Aop); if(!Apt) { @@ -379,8 +461,17 @@ void residual(const Operator & Aop, // stride. 
RCP X_colMapNonConst = A.getColumnMapMultiVector (X_in); + // Do we want to use restrictedMode? + restrictedMode = importer->isLocallyFitted(); + + if (debug && restrictedMode) { + TEUCHOS_TEST_FOR_EXCEPTION + (!importer->getTargetMap()->isLocallyFitted(*importer->getSourceMap()), std::runtime_error, + "Source map and target map are not locally fitted, but Tpetra::residual thinks they are."); + } + // Import from the domain Map MV to the column Map MV. - X_colMapNonConst->doImport (X_in, *importer, INSERT); + X_colMapNonConst->doImport (X_in, *importer, INSERT, restrictedMode); X_colMap = rcp_const_cast (X_colMapNonConst); } @@ -431,7 +522,10 @@ void residual(const Operator & Aop, // make a constant stride R_rowMap MV and do an Export anyway. if (! exporter.is_null ()) { - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); + if (restrictedMode && !importer.is_null ()) + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, &X_in); + else + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); { ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export"); @@ -453,7 +547,10 @@ void residual(const Operator & Aop, Tpetra::deep_copy (R_in, *R_rowMap); } else { - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); + if (restrictedMode && !importer.is_null ()) + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, &X_in); + else + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); } } From 4d4e5252ebe07876f71e0bd1f629c8b91ee07346 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 20 May 2021 12:31:10 -0600 Subject: [PATCH 3/8] Tpetra residual: Use same launch parameters as Kokkos SpMV --- .../core/src/Tpetra_Details_residual.hpp | 76 +------------------ 1 file changed, 2 insertions(+), 74 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp index c65d64431aba..b6f41847aa4b 100644 --- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp +++ 
b/packages/tpetra/core/src/Tpetra_Details_residual.hpp @@ -49,6 +49,7 @@ #include "Teuchos_ScalarTraits.hpp" #include "Tpetra_Details_Behavior.hpp" #include "Tpetra_Details_Profiling.hpp" +#include "KokkosSparse_spmv_impl.hpp" /// \file Tpetra_Details_residual.hpp /// \brief Functions that allow for fused residual calculation. @@ -60,78 +61,6 @@ namespace Tpetra { namespace Details { -template -int64_t -residual_launch_parameters (int64_t numRows, - int64_t nnz, - int64_t rows_per_thread, - int& team_size, - int& vector_length) -{ - using execution_space = typename ExecutionSpace::execution_space; - - int64_t rows_per_team; - int64_t nnz_per_row = nnz/numRows; - - if (nnz_per_row < 1) { - nnz_per_row = 1; - } - - if (vector_length < 1) { - vector_length = 1; - while (vector_length<32 && vector_length*6 < nnz_per_row) { - vector_length *= 2; - } - } - - // Determine rows per thread - if (rows_per_thread < 1) { -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - rows_per_thread = 1; - } - else -#endif - { - if (nnz_per_row < 20 && nnz > 5000000) { - rows_per_thread = 256; - } - else { - rows_per_thread = 64; - } - } - } - -#ifdef KOKKOS_ENABLE_CUDA - if (team_size < 1) { - if (std::is_same::value) { - team_size = 256/vector_length; - } - else { - team_size = 1; - } - } -#endif - - rows_per_team = rows_per_thread * team_size; - - if (rows_per_team < 0) { - int64_t nnz_per_team = 4096; - int64_t conc = execution_space::concurrency (); - while ((conc * nnz_per_team * 4 > nnz) && - (nnz_per_team > 256)) { - nnz_per_team /= 2; - } - rows_per_team = (nnz_per_team + nnz_per_row - 1) / nnz_per_row; - } - - return rows_per_team; -} - - - - - template void localResidual(const CrsMatrix & A, const MultiVector & X_colmap, @@ -231,8 +160,7 @@ void localResidual(const CrsMatrix & A, int vector_length = -1; int64_t rows_per_thread = -1; - int64_t rows_per_team = - residual_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); + int64_t 
rows_per_team = KokkosSparse::Impl::spmv_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team; using policy_type = typename Kokkos::TeamPolicy; From d316fd8f2da2c3ce820af7f69462a1e91ba2b8d8 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Thu, 20 May 2021 13:28:51 -0600 Subject: [PATCH 4/8] Ifpack2 OverlappingRowMatrix unit test: use launch params from KK SpMV --- .../test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp index 954d5acc0bc0..86a91941e21f 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestOverlappingRowMatrix.cpp @@ -67,6 +67,7 @@ #endif #include "Tpetra_Details_residual.hpp" +#include "KokkosSparse_spmv_impl.hpp" #include #include @@ -133,8 +134,7 @@ void localReducedMatvec(const MatrixClass & A_lcl, int64_t numLocalRows = userNumRows; int64_t myNnz = A_lcl.nnz(); - int64_t rows_per_team = - Tpetra::Details::residual_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); int64_t worksets = (X_lcl.extent (0) + rows_per_team - 1) / rows_per_team; using policy_type = typename Kokkos::TeamPolicy; From a3113e52659f11509d9011d03e651110a08fd0e0 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 1 Jun 2021 15:17:36 -0600 Subject: [PATCH 5/8] Tpetra CrsGraph: Add method "getLocalOffRankOffsets" --- packages/tpetra/core/src/CMakeLists.txt | 8 +- .../tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 18 +++ .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 58 +++++++ 
.../Tpetra_Details_getGraphOffRankOffsets.cpp | 67 ++++++++ ...ra_Details_getGraphOffRankOffsets_decl.hpp | 146 ++++++++++++++++++ ...tra_Details_getGraphOffRankOffsets_def.hpp | 134 ++++++++++++++++ .../core/src/Tpetra_Details_residual.hpp | 67 ++++---- .../test/CrsGraph/CrsGraph_UnitTests1.cpp | 59 ++++++- 8 files changed, 523 insertions(+), 34 deletions(-) create mode 100644 packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets.cpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_decl.hpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_def.hpp diff --git a/packages/tpetra/core/src/CMakeLists.txt b/packages/tpetra/core/src/CMakeLists.txt index f72f5dd50099..1195b5634ce3 100644 --- a/packages/tpetra/core/src/CMakeLists.txt +++ b/packages/tpetra/core/src/CMakeLists.txt @@ -317,7 +317,7 @@ FUNCTION(TPETRA_PROCESS_ALL_LGN_TEMPLATES OUTPUT_FILES TEMPLATE_FILE FOREACH(LO ${LOCALORDINAL_TYPES}) TPETRA_MANGLE_TEMPLATE_PARAMETER(LO_MANGLED "${LO}") TPETRA_SLG_MACRO_NAME(LO_MACRO_NAME "${LO}") - + TPETRA_PROCESS_ONE_LGN_TEMPLATE(OUT_FILE "${TEMPLATE_FILE}" "${CLASS_NAME}" "${CLASS_MACRO_NAME}" "${LO_MANGLED}" "${GO_MANGLED}" "${NT_MANGLED}" @@ -619,7 +619,7 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION) "${TpetraCore_ETI_NODES}" TRUE) LIST(APPEND SOURCES ${LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES}) - + # Generate ETI .cpp files for the RowMatrix -> CrsMatrix overload of # Tpetra::createDeepCopy. Do this only for non-integer Scalar # types, since we really only need this function for linear solvers. @@ -634,7 +634,7 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION) FALSE) LIST(APPEND SOURCES ${CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES}) ENDIF () - + # Generate ETI .cpp files for Tpetra::LocalCrsMatrixOperator. 
TPETRA_PROCESS_ALL_SN_TEMPLATES(LOCALCRSMATRIXOPERATOR_OUTPUT_FILES "Tpetra_ETI_SC_NT.tmpl" "LocalCrsMatrixOperator" @@ -777,5 +777,5 @@ SET_PROPERTY( # / from this directory, or to / from the 'impl' subdirectory. That ensures # that running "make" will also rerun CMake in order to regenerate Makefiles. # -# Here's another change, another, and another. +# Here's another change, another, and another and yet another. # diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 01f2b3a79916..a7bc7f167021 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -303,6 +303,9 @@ namespace Tpetra { using nonconst_global_inds_host_view_type = typename row_graph_type::nonconst_global_inds_host_view_type; + using offset_device_view_type = + typename row_ptrs_device_view_type::non_const_type; + //KDDKDD INROW using local_inds_host_view_type = //KDDKDD INROW typename local_inds_dualv_type::t_host::const_type; @@ -1387,6 +1390,10 @@ namespace Tpetra { void getLocalDiagOffsets (const Kokkos::View& offsets) const; + /// \brief Get offsets of the off-rank entries in the graph. + void + getLocalOffRankOffsets (offset_device_view_type& offsets) const; + /// \brief Backwards compatibility overload of the above method. /// /// This method takes a Teuchos::ArrayRCP instead of a @@ -2064,6 +2071,8 @@ namespace Tpetra { /// void computeGlobalConstants (); + bool haveLocalOffRankOffsets() const { return haveLocalOffRankOffsets_;} + protected: /// \brief Compute local constants, if they have not yet been computed. /// @@ -2410,6 +2419,13 @@ namespace Tpetra { /// This may also exist with 1-D storage, if storage is unpacked. num_row_entries_type k_numRowEntries_; + /// \brief The offsets for off-rank entries. + /// + /// When off-rank entries are sorted last, this rowPtr-like view + /// contains the offsets. 
It is computed on the first call to + /// getLocalOffRankOffsets(). + mutable offset_device_view_type k_offRankOffsets_; + //@} /// \brief Status of the graph's storage, when not in a @@ -2438,6 +2454,8 @@ bool haveLocalConstants_ = false; //! Whether all processes have computed global constants. bool haveGlobalConstants_ = false; + //! Whether the local off-rank offsets have been computed. + mutable bool haveLocalOffRankOffsets_ = false; typedef typename std::map > nonlocals_type; diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp index 6a3df5701921..96f29198b6bc 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp @@ -48,6 +48,7 @@ #include "Tpetra_Details_copyOffsets.hpp" #include "Tpetra_Details_gathervPrint.hpp" #include "Tpetra_Details_getGraphDiagOffsets.hpp" +#include "Tpetra_Details_getGraphOffRankOffsets.hpp" #include "Tpetra_Details_makeColMap.hpp" #include "Tpetra_Details_Profiling.hpp" #include "Tpetra_Details_getEntryOnHost.hpp" @@ -6698,6 +6699,60 @@ namespace Tpetra { } // debug_ } + template + void + CrsGraph:: + getLocalOffRankOffsets (offset_device_view_type& offsets) const + { + using std::endl; + const char tfecfFuncName[] = "getLocalOffRankOffsets: "; + const bool verbose = verbose_; + + std::unique_ptr prefix; + if (verbose) { + prefix = this->createPrefix("CrsGraph", "getLocalOffRankOffsets"); + std::ostringstream os; + os << *prefix << "offsets.extent(0)=" << offsets.extent(0) + << endl; + std::cerr << os.str(); + } + + TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC + (! hasColMap (), std::runtime_error, "The graph must have a column Map."); + // Instead of throwing, we could also copy the rowPtr to k_offRankOffsets_. 
+ + const size_t lclNumRows = this->getNodeNumRows (); + + if (haveLocalOffRankOffsets_ && k_offRankOffsets_.extent(0) == lclNumRows+1) { + offsets = k_offRankOffsets_; + return; + } + haveLocalOffRankOffsets_ = false; + k_offRankOffsets_ = offset_device_view_type(Kokkos::ViewAllocateWithoutInitializing("offRankOffset"), lclNumRows+1); + offsets = k_offRankOffsets_; + + const map_type& colMap = * (this->getColMap ()); + const map_type& domMap = * (this->getDomainMap ()); + + // mfh 12 Mar 2016: LocalMap works on (CUDA) device. It has just + // the subset of Map functionality that we need below. + auto lclColMap = colMap.getLocalMap (); + auto lclDomMap = domMap.getLocalMap (); + + // FIXME (mfh 16 Dec 2015) It's easy to thread-parallelize this + // setup, at least on the host. For CUDA, we have to use LocalMap + // (that comes from each of the two Maps). + + TEUCHOS_ASSERT(this->isSorted ()); + if (isFillComplete ()) { + auto lclGraph = this->getLocalGraph (); + ::Tpetra::Details::getGraphOffRankOffsets (k_offRankOffsets_, + lclColMap, lclDomMap, + lclGraph); + haveLocalOffRankOffsets_ = true; + } + } + namespace { // (anonymous) // mfh 21 Jan 2016: This is useful for getLocalDiagOffsets (see @@ -7548,6 +7603,7 @@ namespace Tpetra { std::swap(graph.rowPtrsUnpacked_dev_, this->rowPtrsUnpacked_dev_); std::swap(graph.rowPtrsUnpacked_host_, this->rowPtrsUnpacked_host_); + std::swap(graph.k_offRankOffsets_, this->k_offRankOffsets_); std::swap(graph.lclIndsUnpacked_wdv, this->lclIndsUnpacked_wdv); std::swap(graph.gblInds_wdv, this->gblInds_wdv); @@ -7563,6 +7619,7 @@ namespace Tpetra { std::swap(graph.noRedundancies_, this->noRedundancies_); std::swap(graph.haveLocalConstants_, this->haveLocalConstants_); std::swap(graph.haveGlobalConstants_, this->haveGlobalConstants_); + std::swap(graph.haveLocalOffRankOffsets_, this->haveLocalOffRankOffsets_); std::swap(graph.sortGhostsAssociatedWithEachProcessor_, this->sortGhostsAssociatedWithEachProcessor_); @@ -7625,6 +7682,7 @@ 
namespace Tpetra { output = this->noRedundancies_ == graph.noRedundancies_ ? output : false; output = this->haveLocalConstants_ == graph.haveLocalConstants_ ? output : false; output = this->haveGlobalConstants_ == graph.haveGlobalConstants_ ? output : false; + output = this->haveLocalOffRankOffsets_ == graph.haveLocalOffRankOffsets_ ? output : false; output = this->sortGhostsAssociatedWithEachProcessor_ == this->sortGhostsAssociatedWithEachProcessor_ ? output : false; // Compare nonlocals_ -- std::map > diff --git a/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets.cpp b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets.cpp new file mode 100644 index 000000000000..bdd62e8538ab --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets.cpp @@ -0,0 +1,67 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#include "TpetraCore_config.h" + +#if defined(HAVE_TPETRA_EXPLICIT_INSTANTIATION) + +// We protect the contents of this file with macros, to assist +// applications that circumvent Trilinos' build system. (We do NOT +// recommend this.) That way, they can still build this file, but as +// long as the macros have correct definitions, they won't build +// anything that's not enabled. 
+ +#include "KokkosCompat_ClassicNodeAPI_Wrapper.hpp" +#include "Tpetra_Details_getGraphOffRankOffsets_decl.hpp" +#include "Tpetra_Details_getGraphOffRankOffsets_def.hpp" +#include "TpetraCore_ETIHelperMacros.h" + +namespace Tpetra { + + TPETRA_ETI_MANGLING_TYPEDEFS() + + TPETRA_INSTANTIATE_LGN( TPETRA_DETAILS_IMPL_GETGRAPHOFFRANKOFFSETS_INSTANT ) + +} // namespace Tpetra + +#endif // Whether we should build this specialization diff --git a/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_decl.hpp new file mode 100644 index 000000000000..8f7d85271278 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_decl.hpp @@ -0,0 +1,146 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#ifndef TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DECL_HPP +#define TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DECL_HPP + +/// \file Tpetra_Details_getGraphOffRankOffsets_decl.hpp +/// \brief Declare and define the function +/// Tpetra::Details::getGraphOffRankOffsets, an implementation detail +/// of Tpetra::CrsGraph. + +#include "TpetraCore_config.h" +#include "Kokkos_Core.hpp" +#include "Kokkos_StaticCrsGraph.hpp" +#include "Tpetra_Details_LocalMap.hpp" +#include + +namespace Tpetra { +namespace Details { +namespace Impl { + +/// \brief Implementation detail of +/// Tpetra::Details::getGraphOffRankOffsets, which in turn is an +/// implementation detail of Tpetra::CrsGraph. +/// +/// FIXME (mfh 12 Mar 2016) There's currently no way to make a +/// MemoryUnmanaged Kokkos::StaticCrsGraph. Thus, we have to do this +/// separately for its column indices. We want the column indices to +/// be unmanaged because we need to take subviews in this kernel. 
+/// Taking a subview of a managed View updates the reference count, +/// which is a thread scalability bottleneck. +/// +/// mfh 12 Mar 2016: Tpetra::CrsGraph::getLocalOffRankOffsets returns +/// offsets as size_t. However, see Github Issue #213. +template +class GetGraphOffRankOffsets { +public: + typedef typename DeviceType::device_type device_type; + typedef OffsetType offset_type; + typedef ::Kokkos::View offsets_type; + typedef ::Kokkos::StaticCrsGraph local_graph_type; + typedef ::Tpetra::Details::LocalMap local_map_type; + typedef ::Kokkos::View row_offsets_type; + // This is unmanaged for performance, because we need to take + // subviews inside the functor. + typedef ::Kokkos::View lcl_col_inds_type; + + //! Constructor; also runs the functor. + GetGraphOffRankOffsets (const offsets_type& OffRankOffsets, + const local_map_type& lclColMap, + const local_map_type& lclDomMap, + const row_offsets_type& ptr, + const lcl_col_inds_type& ind); + + //! Kokkos::parallel_for loop body. + KOKKOS_FUNCTION void operator() (const LO& lclRowInd) const; + +private: + offsets_type OffRankOffsets_; + local_map_type lclColMap_; + local_map_type lclDomMap_; + row_offsets_type ptr_; + lcl_col_inds_type ind_; + LO lclNumRows_; +}; + +} // namespace Impl + +template +void +getGraphOffRankOffsets (const OffsetsType& OffRankOffsets, + const LclMapType& lclColMap, + const LclMapType& lclDomMap, + const LclGraphType& lclGraph) +{ + typedef typename OffsetsType::non_const_value_type offset_type; + typedef typename LclMapType::local_ordinal_type LO; + typedef typename LclMapType::global_ordinal_type GO; + typedef typename LclMapType::device_type DT; + + typedef Impl::GetGraphOffRankOffsets impl_type; + + // The functor's constructor runs the functor. 
+ impl_type impl (OffRankOffsets, lclColMap, lclDomMap, lclGraph.row_map, lclGraph.entries); +} + +} // namespace Details +} // namespace Tpetra + +#endif // TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DECL_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_def.hpp b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_def.hpp new file mode 100644 index 000000000000..9431c1af42f7 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_getGraphOffRankOffsets_def.hpp @@ -0,0 +1,134 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#ifndef TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DEF_HPP +#define TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DEF_HPP + +/// \file Tpetra_Details_getGraphOffRankOffsets_def.hpp +/// \brief Define the implementation of the function +/// Tpetra::Details::getGraphOffRankOffsets, an implementation detail +/// of Tpetra::CrsGraph. + +#include "Tpetra_Details_OrdinalTraits.hpp" +#include "Tpetra_Map.hpp" +#include "KokkosSparse_findRelOffset.hpp" + +namespace Tpetra { +namespace Details { +namespace Impl { + +/// \brief Implementation detail of +/// Tpetra::Details::getGraphOffRankOffsets, which in turn is an +/// implementation detail of Tpetra::CrsGraph. +/// +/// FIXME (mfh 12 Mar 2016) There's currently no way to make a +/// MemoryUnmanaged Kokkos::StaticCrsGraph. Thus, we have to do this +/// separately for its column indices. We want the column indices to +/// be unmanaged because we need to take subviews in this kernel. +/// Taking a subview of a managed View updates the reference count, +/// which is a thread scalability bottleneck. +/// +/// mfh 12 Mar 2016: Tpetra::CrsGraph::getLocalOffRankOffsets returns +/// offsets as size_t. However, see Github Issue #213. 
+template +GetGraphOffRankOffsets:: +GetGraphOffRankOffsets (const offsets_type& OffRankOffsets, + const local_map_type& lclColMap, + const local_map_type& lclDomMap, + const row_offsets_type& ptr, + const lcl_col_inds_type& ind) : + OffRankOffsets_ (OffRankOffsets), + lclColMap_ (lclColMap), + lclDomMap_ (lclDomMap), + ptr_ (ptr), + ind_ (ind) +{ + typedef typename device_type::execution_space execution_space; + typedef Kokkos::RangePolicy policy_type; + + lclNumRows_ = ptr.extent(0)-1; + policy_type range (0, ptr.extent(0)); + Kokkos::parallel_for (range, *this); +} + +template +KOKKOS_FUNCTION void +GetGraphOffRankOffsets:: +operator() (const LO& lclRowInd) const +{ + const LO INVALID = + Tpetra::Details::OrdinalTraits::invalid (); + + if (lclRowInd == lclNumRows_) + OffRankOffsets_[lclRowInd] = ptr_[lclRowInd]; + else { + // TODO: use parallel reduce + size_t offset = ptr_[lclRowInd+1]; + for (size_t j = ptr_[lclRowInd]; j < ptr_[lclRowInd+1]; j++) { + const LO lclColInd = ind_[j]; + const GO gblColInd = lclColMap_.getGlobalElement (lclColInd); + const LO lclDomInd = lclDomMap_.getLocalElement (gblColInd); + if ((lclDomInd == INVALID) && (j < offset)) + offset = j; + } + OffRankOffsets_[lclRowInd] = offset; + } +} + +} // namespace Impl +} // namespace Details +} // namespace Tpetra + +// Explicit template instantiation macro for +// Tpetra::Details::Impl::GetGraphOffRankOffsets. NOT FOR USERS!!! Must +// be used inside the Tpetra namespace. 
+#define TPETRA_DETAILS_IMPL_GETGRAPHOFFRANKOFFSETS_INSTANT( LO, GO, NODE ) \ + template class Details::Impl::GetGraphOffRankOffsets< LO, GO, NODE::device_type >; + +#endif // TPETRA_DETAILS_GETGRAPHOFFRANKOFFSETS_DEF_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp index b6f41847aa4b..beff28c317c3 100644 --- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp @@ -66,6 +66,7 @@ void localResidual(const CrsMatrix & A, const MultiVector & X_colmap, const MultiVector & B, MultiVector & R, + const Kokkos::View& offsets, const MultiVector * X_domainmap=nullptr) { using Tpetra::Details::ProfilingRegion; using Teuchos::NO_TRANS; @@ -214,19 +215,18 @@ void localResidual(const CrsMatrix & A, Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; const LO numRows = A_lcl.numRows (); - if (lclRow >= numRows) { return; } - - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); - residual_value_type A_x = KAT::zero (); - - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { - const auto A_val = A_row.value(iEntry); - const auto lclCol = A_row.colidx(iEntry); - if (lclCol < numRows) + const LO offRankOffset = offsets(lclRow); + const size_t start = A_lcl.graph.row_map(lclRow); + const size_t end = A_lcl.graph.row_map(lclRow+1); + residual_value_type A_x = KAT::zero (); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) { + const auto A_val = A_lcl.values(iEntry); + const auto lclCol = A_lcl.graph.entries(iEntry); + if (iEntry < offRankOffset) lsum += A_val * X_domainmap_lcl(lclCol,0); else lsum += A_val * X_colmap_lcl(lclCol,0); @@ -237,7 +237,7 @@ void 
localResidual(const CrsMatrix & A, }); });//end parallel_for TeamThreadRange });//end parallel_for "residual-vector" - + } } else { // MultiVector case @@ -280,27 +280,29 @@ void localResidual(const CrsMatrix & A, Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; const LO numRows = A_lcl.numRows (); - if (lclRow >= numRows) { return; } - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); + + const LO offRankOffset = offsets(lclRow); + const size_t start = A_lcl.graph.row_map(lclRow); + const size_t end = A_lcl.graph.row_map(lclRow+1); + for(LO v=0; v & Aop, using import_type = typename CrsMatrix::import_type; using export_type = typename CrsMatrix::export_type; using MV = MultiVector; + using graph_type = Tpetra::CrsGraph; + using local_graph_type = typename graph_type::local_graph_type; + using offset_type = typename graph_type::offset_device_view_type; // We treat the case of a replicated MV output specially. const bool R_is_replicated = @@ -403,6 +408,10 @@ void residual(const Operator & Aop, X_colMap = rcp_const_cast (X_colMapNonConst); } + offset_type offsets; + if (restrictedMode) + A.getCrsGraph()->getLocalOffRankOffsets(offsets); + // Get a vector for the R_rowMap output residual, handling the // non-constant stride and exporter cases. Since R gets clobbered // we don't need to worry about the data in it @@ -451,9 +460,9 @@ void residual(const Operator & Aop, if (! 
exporter.is_null ()) { if (restrictedMode && !importer.is_null ()) - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, &X_in); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in); else - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); { ProfilingRegion regionExport ("Tpetra::CrsMatrix::residual: R Export"); @@ -471,14 +480,14 @@ void residual(const Operator & Aop, // if (! R_in.isConstantStride () ) { // We need to be sure to do a copy out in this case. - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); Tpetra::deep_copy (R_in, *R_rowMap); } else { if (restrictedMode && !importer.is_null ()) - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, &X_in); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in); else - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); } } diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp index b668f308b039..34ff87cfdcac 100644 --- a/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp +++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_UnitTests1.cpp @@ -583,6 +583,62 @@ namespace { // (anonymous) TEST_EQUALITY_CONST(globalSuccess_int, 0); } + TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( CrsGraph, Offsets, LO, GO , Node ) + { + typedef Tpetra::CrsGraph GRAPH; + typedef Tpetra::Map map_type; + typedef typename GRAPH::device_type device_type; + + const GST INVALID = Teuchos::OrdinalTraits::invalid (); + // get a comm + RCP > comm = getDefaultComm(); + const int numProcs = comm->getSize(); + // test filtering + if (numProcs > 1) { + const size_t numLocal = 2; + RCP rmap = + rcp (new map_type (INVALID, numLocal, 0, comm)); + ArrayRCP cmap_ind(numLocal); + cmap_ind[0] = comm->getRank()*numLocal; + cmap_ind[1] = ((comm->getRank()+1)*numLocal) 
% (numProcs*numLocal); + RCP cmap = + rcp (new map_type (INVALID, cmap_ind(), 0, comm)); + ArrayRCP rowptr(numLocal+1); + ArrayRCP colind(numLocal); // one unknown per row + rowptr[0] = 0; rowptr[1] = 1; rowptr[2] = 2; + colind[0] = Teuchos::as(0); + colind[1] = Teuchos::as(1); + + RCP G = rcp(new GRAPH(rmap,cmap,0,StaticProfile) ); + TEST_NOTHROW( G->setAllIndices(rowptr,colind) ); + TEST_EQUALITY_CONST( G->hasColMap(), true ); + + TEST_NOTHROW( G->expertStaticFillComplete(rmap,rmap) ); + TEST_EQUALITY( G->getRowMap(), rmap ); + TEST_EQUALITY( G->getColMap(), cmap ); + + auto diagOffsets = Kokkos::View("diagOffsets", numLocal); + G->getLocalDiagOffsets(diagOffsets); + auto diagOffsets_h = Kokkos::create_mirror_view(diagOffsets); + Kokkos::deep_copy(diagOffsets_h, diagOffsets); + TEST_EQUALITY( diagOffsets_h(0), 0 ); + TEST_EQUALITY( diagOffsets_h(1), INVALID ); + + typename GRAPH::offset_device_view_type offRankOffsets; + G->getLocalOffRankOffsets(offRankOffsets); + auto offRankOffsets_h = Kokkos::create_mirror_view(offRankOffsets); + Kokkos::deep_copy(offRankOffsets_h, offRankOffsets); + TEST_EQUALITY( offRankOffsets_h(0), 1 ); + TEST_EQUALITY( offRankOffsets_h(1), 1 ); + + } + + // All procs fail if any node fails + int globalSuccess_int = -1; + Teuchos::reduceAll( *comm, Teuchos::REDUCE_SUM, success ? 
0 : 1, outArg(globalSuccess_int) ); + TEST_EQUALITY_CONST( globalSuccess_int, 0 ); + } + // // INSTANTIATIONS // @@ -599,7 +655,8 @@ namespace { // (anonymous) TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, SortingTests, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, TwoArraysESFC, LO, GO, NODE ) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, SetAllIndices, LO, GO, NODE ) \ - TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, StaticProfileMultiInsert, LO, GO, NODE ) + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, StaticProfileMultiInsert, LO, GO, NODE ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT( CrsGraph, Offsets, LO, GO, NODE ) TPETRA_ETI_MANGLING_TYPEDEFS() From 7fe526cb85f4923fa5c1f92cee4b001a09f705f3 Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Tue, 22 Jun 2021 15:47:48 -0600 Subject: [PATCH 6/8] Tpetra residual: Add options for skipping of local copy and comm/comp overlap --- .../core/src/Tpetra_Details_Behavior.cpp | 23 + .../core/src/Tpetra_Details_Behavior.hpp | 12 + .../core/src/Tpetra_Details_residual.hpp | 588 +++++++++++++----- 3 files changed, 465 insertions(+), 158 deletions(-) diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp index bc34f1972585..6a12004ad642 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.cpp @@ -549,6 +549,29 @@ bool Behavior::hierarchicalUnpack () defaultValue); } +bool Behavior::skipCopyAndPermuteIfPossible () +{ + constexpr char envVarName[] = "TPETRA_SKIP_COPY_AND_PERMUTE"; + constexpr bool defaultValue(false); + + static bool value_ = defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsBool + (value_, initialized_, envVarName, defaultValue); +} + +bool Behavior::overlapCommunicationAndComputation () +{ + constexpr char envVarName[] = "TPETRA_OVERLAP"; + constexpr bool defaultValue(false); + + static bool value_ = 
defaultValue; + static bool initialized_ = false; + return idempotentlyGetEnvironmentVariableAsBool + (value_, initialized_, envVarName, defaultValue); +} + + } // namespace Details } // namespace Tpetra diff --git a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp index a5094cb80c57..b2d5dd26c9c1 100644 --- a/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_Behavior.hpp @@ -263,6 +263,18 @@ class Behavior { /// environment variable. static bool profilingRegionUseKokkosProfiling(); + /// \brief Skip copyAndPermute if possible + /// + /// This is disabled by default. You may control this at run time via the + /// TPETRA_SKIP_COPY_AND_PERMUTE environment variable. + static bool skipCopyAndPermuteIfPossible(); + + /// \brief Overlap communication and computation. + /// + /// This is disabled by default. You may control this at run time via the + /// TPETRA_OVERLAP environment variable. + static bool overlapCommunicationAndComputation(); + }; diff --git a/packages/tpetra/core/src/Tpetra_Details_residual.hpp b/packages/tpetra/core/src/Tpetra_Details_residual.hpp index beff28c317c3..7c532a030d82 100644 --- a/packages/tpetra/core/src/Tpetra_Details_residual.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_residual.hpp @@ -61,6 +61,218 @@ namespace Tpetra { namespace Details { +/// \brief Functor for computing the residual +/// +/// The template parameters decide what is actually computed. +/// is_MV: whether the multivectors have actually more than 1 vector. 
+/// restrictedMode: when true, read on-rank RHS data out of X_domainmap_lcl instead of X_colmap_lcl +/// skipOffRank: when true, only compute the local part of the residual +template +struct LocalResidualFunctor { + + using execution_space = typename AMatrix::execution_space; + using LO = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using team_policy = typename Kokkos::TeamPolicy; + using team_member = typename team_policy::member_type; + using ATV = Kokkos::ArithTraits; + + AMatrix A_lcl; + ConstMV X_colmap_lcl; + ConstMV B_lcl; + MV R_lcl; + int rows_per_team; + Offsets offsets; + ConstMV X_domainmap_lcl; + + + LocalResidualFunctor (const AMatrix& A_lcl_, + const ConstMV& X_colmap_lcl_, + const ConstMV& B_lcl_, + const MV& R_lcl_, + const int rows_per_team_, + const Offsets& offsets_, + const ConstMV& X_domainmap_lcl_) : + A_lcl(A_lcl_), + X_colmap_lcl(X_colmap_lcl_), + B_lcl(B_lcl_), + R_lcl(R_lcl_), + rows_per_team(rows_per_team_), + offsets(offsets_), + X_domainmap_lcl(X_domainmap_lcl_) + { } + + KOKKOS_INLINE_FUNCTION + void operator() (const team_member& dev) const + { + + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (lclRow >= A_lcl.numRows ()) { + return; + } + + if (!is_MV) { // MultiVectors only have a single column + + value_type A_x = ATV::zero (); + + if (!restrictedMode) { + const auto A_row = A_lcl.rowConst(lclRow); + const LO row_length = static_cast (A_row.length); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) { + const auto A_val = A_row.value(iEntry); + lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0); + }, A_x); + + } + else { + + const LO offRankOffset = offsets(lclRow); + const size_t start = A_lcl.graph.row_map(lclRow); + const size_t end = A_lcl.graph.row_map(lclRow+1); + + 
Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) { + const auto A_val = A_lcl.values(iEntry); + const auto lclCol = A_lcl.graph.entries(iEntry); + if (iEntry < offRankOffset) + lsum += A_val * X_domainmap_lcl(lclCol,0); + else if (!skipOffRank) + lsum += A_val * X_colmap_lcl(lclCol,0); + }, A_x); + } + + Kokkos::single(Kokkos::PerThread(dev),[&] () { + R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; + }); + } + else { // MultiVectors have more than one column + + const LO numVectors = static_cast(X_colmap_lcl.extent(1)); + + for(LO v=0; v (A_row.length); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, value_type& lsum) { + const auto A_val = A_row.value(iEntry); + lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),v); + }, A_x); + } + else { + const LO offRankOffset = offsets(lclRow); + const size_t start = A_lcl.graph.row_map(lclRow); + const size_t end = A_lcl.graph.row_map(lclRow+1); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, value_type& lsum) { + const auto A_val = A_lcl.values(iEntry); + const auto lclCol = A_lcl.graph.entries(iEntry); + if (iEntry < offRankOffset) + lsum += A_val * X_domainmap_lcl(lclCol,v); + else if (!skipOffRank) + lsum += A_val * X_colmap_lcl(lclCol,v); + }, A_x); + } + + Kokkos::single(Kokkos::PerThread(dev),[&] () { + R_lcl(lclRow,v) = B_lcl(lclRow,v) - A_x; + }); + + }//end for numVectors + } + });//end parallel_for TeamThreadRange + } +}; + + +/// \brief Functor for computing R -= A_offRank*X_colmap +template +struct OffRankUpdateFunctor { + + using execution_space = typename AMatrix::execution_space; + using LO = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + using team_policy = typename Kokkos::TeamPolicy; + using team_member = typename team_policy::member_type; + using ATV = Kokkos::ArithTraits; + + AMatrix A_lcl; + ConstMV 
X_colmap_lcl; + MV R_lcl; + int rows_per_team; + Offsets offsets; + + + OffRankUpdateFunctor (const AMatrix& A_lcl_, + const ConstMV& X_colmap_lcl_, + const MV& R_lcl_, + const int rows_per_team_, + const Offsets& offsets_) : + A_lcl(A_lcl_), + X_colmap_lcl(X_colmap_lcl_), + R_lcl(R_lcl_), + rows_per_team(rows_per_team_), + offsets(offsets_) + { } + + KOKKOS_INLINE_FUNCTION + void operator() (const team_member& dev) const + { + + Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { + const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; + + if (lclRow >= A_lcl.numRows ()) { + return; + } + + const LO offRankOffset = offsets(lclRow); + const size_t end = A_lcl.graph.row_map(lclRow+1); + + if (!is_MV) { // MultiVectors only have a single column + + value_type A_x = ATV::zero (); + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, offRankOffset, end), [&] (const LO iEntry, value_type& lsum) { + const auto A_val = A_lcl.values(iEntry); + const auto lclCol = A_lcl.graph.entries(iEntry); + lsum += A_val * X_colmap_lcl(lclCol,0); + }, A_x); + + Kokkos::single(Kokkos::PerThread(dev),[&] () { + R_lcl(lclRow,0) -= A_x; + }); + } + else { // MultiVectors have more than one column + + const LO numVectors = static_cast(X_colmap_lcl.extent(1)); + + for(LO v=0; v void localResidual(const CrsMatrix & A, const MultiVector & X_colmap, @@ -72,10 +284,16 @@ void localResidual(const CrsMatrix & A, using Teuchos::NO_TRANS; ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidual"); - auto A_lcl = A.getLocalMatrixDevice (); - auto X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly); - auto B_lcl = B.getLocalViewDevice(Access::ReadOnly); - auto R_lcl = R.getLocalViewDevice(Access::OverwriteAll); + using local_matrix_device_type = typename CrsMatrix::local_matrix_device_type; + using local_view_device_type = typename MultiVector::dual_view_type::t_dev::non_const_type; + using 
const_local_view_device_type = typename MultiVector::dual_view_type::t_dev::const_type; + using offset_type = Kokkos::View; + + local_matrix_device_type A_lcl = A.getLocalMatrixDevice (); + const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly); + const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly); + local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll); + const_local_view_device_type X_domainmap_lcl; const bool debug = ::Tpetra::Details::Behavior::debug (); if (debug) { @@ -83,11 +301,6 @@ void localResidual(const CrsMatrix & A, (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error, "X.getNumVectors() = " << X_colmap.getNumVectors () << " != " "R.getNumVectors() = " << R.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION - (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error, - "X.getNumVectors() = " << X_colmap.getNumVectors () << " != " - "R.getNumVectors() = " << R.getNumVectors () << "."); - TEUCHOS_TEST_FOR_EXCEPTION (X_colmap.getLocalLength () != A.getColMap ()->getNodeNumElements (), std::runtime_error, @@ -133,7 +346,6 @@ void localResidual(const CrsMatrix & A, A.localApply(X_colmap,R,Teuchos::NO_TRANS, one, zero); R.update(one,B,negone); #else - using execution_space = typename CrsMatrix::execution_space; if (A_lcl.numRows() == 0) { return; @@ -160,15 +372,12 @@ void localResidual(const CrsMatrix & A, int team_size = -1; int vector_length = -1; int64_t rows_per_thread = -1; - - int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); - int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team; + using execution_space = typename CrsMatrix::execution_space; using policy_type = typename Kokkos::TeamPolicy; - using team_member = typename policy_type::member_type; - using residual_value_type = typename 
MultiVector::dual_view_type::t_dev::non_const_value_type; - using KAT = Kokkos::ArithTraits; + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); + int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team; policy_type policy (1, 1); if (team_size < 0) { @@ -183,135 +392,188 @@ void localResidual(const CrsMatrix & A, if(is_vector) { if (X_domainmap == nullptr) { - - // Vector case - // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp - Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; - - if (lclRow >= A_lcl.numRows ()) { - return; - } - - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); - residual_value_type A_x = KAT::zero (); - - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, row_length), [&] (const LO iEntry, residual_value_type& lsum) { - const auto A_val = A_row.value(iEntry); - lsum += A_val * X_colmap_lcl(A_row.colidx(iEntry),0); - }, A_x); - - Kokkos::single(Kokkos::PerThread(dev),[&] () { - R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; - }); - });//end parallel_for TeamThreadRange - });//end parallel_for "residual-vector" - } else { - auto X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly); - - Kokkos::parallel_for("residual-vector",policy,KOKKOS_LAMBDA(const team_member& dev) { - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; - const LO numRows = A_lcl.numRows (); - if (lclRow >= numRows) { - return; - } - const LO offRankOffset = offsets(lclRow); - const size_t start = A_lcl.graph.row_map(lclRow); - const size_t end = 
A_lcl.graph.row_map(lclRow+1); - residual_value_type A_x = KAT::zero (); - - Kokkos::parallel_reduce(Kokkos::ThreadVectorRange (dev, start, end), [&] (const LO iEntry, residual_value_type& lsum) { - const auto A_val = A_lcl.values(iEntry); - const auto lclCol = A_lcl.graph.entries(iEntry); - if (iEntry < offRankOffset) - lsum += A_val * X_domainmap_lcl(lclCol,0); - else - lsum += A_val * X_colmap_lcl(lclCol,0); - }, A_x); - - Kokkos::single(Kokkos::PerThread(dev),[&] () { - R_lcl(lclRow,0) = B_lcl(lclRow,0) - A_x; - }); - });//end parallel_for TeamThreadRange - });//end parallel_for "residual-vector" + + using functor_type = LocalResidualFunctor; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl); + Kokkos::parallel_for("residual-vector",policy,func); + + } + else { + + X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly); + using functor_type = LocalResidualFunctor; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl); + Kokkos::parallel_for("residual-vector",policy,func); } - } else { - // MultiVector case + } + else { + if (X_domainmap == nullptr) { - // Kernel interior shamelessly horked from Ifpack2_Details_ScaledDampedResidual_def.hpp - Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { - // NOTE: It looks like I should be able to get this data up above, but if I try to - // we get internal compiler errors. Who knew that gcc tried to "gimplify"? 
- const LO numVectors = static_cast(X_colmap_lcl.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; - - if (lclRow >= A_lcl.numRows ()) { - return; - } - const auto A_row = A_lcl.rowConst(lclRow); - const LO row_length = static_cast (A_row.length); - for(LO v=0; vgetLocalViewDevice(Access::ReadOnly); - - Kokkos::parallel_for("residual-multivector",policy,KOKKOS_LAMBDA(const team_member& dev) { - // NOTE: It looks like I should be able to get this data up above, but if I try to - // we get internal compiler errors. Who knew that gcc tried to "gimplify"? - const LO numVectors = static_cast(X_colmap_lcl.extent(1)); - Kokkos::parallel_for(Kokkos::TeamThreadRange (dev, 0, rows_per_team),[&] (const LO& loop) { - const LO lclRow = static_cast (dev.league_rank ()) * rows_per_team + loop; - const LO numRows = A_lcl.numRows (); - if (lclRow >= numRows) { - return; - } - - const LO offRankOffset = offsets(lclRow); - const size_t start = A_lcl.graph.row_map(lclRow); - const size_t end = A_lcl.graph.row_map(lclRow+1); - - for(LO v=0; v; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl); + Kokkos::parallel_for("residual-multivector",policy,func); + + } + else { + + X_domainmap_lcl = X_domainmap->getLocalViewDevice(Access::ReadOnly); + using functor_type = LocalResidualFunctor; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl); + Kokkos::parallel_for("residual-multivector",policy,func); + } - }// end else + } #endif } - + + +template +void localResidualWithCommCompOverlap(const CrsMatrix & A, + MultiVector & X_colmap, + const MultiVector & B, + MultiVector & R, + const Kokkos::View& offsets, + const MultiVector & X_domainmap) { + using Tpetra::Details::ProfilingRegion; + using Teuchos::NO_TRANS; + using Teuchos::RCP; + using import_type = typename 
CrsMatrix::import_type; + + ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localResidualWithCommCompOverlap"); + + using local_matrix_device_type = typename CrsMatrix::local_matrix_device_type; + using local_view_device_type = typename MultiVector::dual_view_type::t_dev::non_const_type; + using const_local_view_device_type = typename MultiVector::dual_view_type::t_dev::const_type; + using offset_type = Kokkos::View; + + local_matrix_device_type A_lcl = A.getLocalMatrixDevice (); + const_local_view_device_type X_colmap_lcl = X_colmap.getLocalViewDevice(Access::ReadOnly); + const_local_view_device_type B_lcl = B.getLocalViewDevice(Access::ReadOnly); + local_view_device_type R_lcl = R.getLocalViewDevice(Access::OverwriteAll); + const_local_view_device_type X_domainmap_lcl = X_domainmap.getLocalViewDevice(Access::ReadOnly);; + + const bool debug = ::Tpetra::Details::Behavior::debug (); + if (debug) { + TEUCHOS_TEST_FOR_EXCEPTION + (X_colmap.getNumVectors () != R.getNumVectors (), std::runtime_error, + "X.getNumVectors() = " << X_colmap.getNumVectors () << " != " + "R.getNumVectors() = " << R.getNumVectors () << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (X_colmap.getLocalLength () != + A.getColMap ()->getNodeNumElements (), std::runtime_error, + "X has the wrong number of local rows. " + "X.getLocalLength() = " << X_colmap.getLocalLength () << " != " + "A.getColMap()->getNodeNumElements() = " << + A.getColMap ()->getNodeNumElements () << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (R.getLocalLength () != + A.getRowMap ()->getNodeNumElements (), std::runtime_error, + "R has the wrong number of local rows. " + "R.getLocalLength() = " << R.getLocalLength () << " != " + "A.getRowMap()->getNodeNumElements() = " << + A.getRowMap ()->getNodeNumElements () << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (B.getLocalLength () != + A.getRowMap ()->getNodeNumElements (), std::runtime_error, + "B has the wrong number of local rows. 
" + "B.getLocalLength() = " << B.getLocalLength () << " != " + "A.getRowMap()->getNodeNumElements() = " << + A.getRowMap ()->getNodeNumElements () << "."); + + TEUCHOS_TEST_FOR_EXCEPTION + (! A.isFillComplete (), std::runtime_error, "The matrix A is not " + "fill complete. You must call fillComplete() (possibly with " + "domain and range Map arguments) without an intervening " + "resumeFill() call before you may call this method."); + TEUCHOS_TEST_FOR_EXCEPTION + (! X_colmap.isConstantStride () || ! R.isConstantStride () || ! B.isConstantStride (), + std::runtime_error, "X, Y and B must be constant stride."); + // If the two pointers are NULL, then they don't alias one + // another, even though they are equal. + TEUCHOS_TEST_FOR_EXCEPTION + ((X_colmap_lcl.data () == R_lcl.data () && X_colmap_lcl.data () != nullptr) || + (X_colmap_lcl.data () == B_lcl.data () && X_colmap_lcl.data () != nullptr), + std::runtime_error, "X, Y and R may not alias one another."); + } + + if (A_lcl.numRows() == 0) { + return; + } + + int64_t numLocalRows = A_lcl.numRows (); + int64_t myNnz = A_lcl.nnz(); + + // //Check for imbalanced rows. 
If the logic for SPMV to use merge path is triggered, + // //use it and follow the reference residual code + // LO maxRowImbalance = 0; + // if(numLocalRows != 0) + // maxRowImbalance = A.getNodeMaxNumRowEntries() - (myNnz / numLocalRows); + // if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold()) + // { + // //note: lclOp will be wrapped in shared_ptr + // auto lclOp = A.getLocalMultiplyOperator(); + // //Call local SPMV, requesting merge path, through A's LocalCrsMatrixOperator + // lclOp->applyImbalancedRows (X_colmap_lcl, R_lcl, Teuchos::NO_TRANS, one, zero); + // R.update(one,B,negone); + // return; + // } + + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; + + using execution_space = typename CrsMatrix::execution_space; + using policy_type = typename Kokkos::TeamPolicy; + + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters(numLocalRows, myNnz, rows_per_thread, team_size, vector_length); + int64_t worksets = (B_lcl.extent (0) + rows_per_team - 1) / rows_per_team; + + policy_type policy (1, 1); + if (team_size < 0) { + policy = policy_type (worksets, Kokkos::AUTO, vector_length); + } + else { + policy = policy_type (worksets, team_size, vector_length); + } + + bool is_vector = (X_colmap_lcl.extent(1) == 1); + + if(is_vector) { + + using functor_type = LocalResidualFunctor; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, X_domainmap_lcl); + Kokkos::parallel_for("residual-vector",policy,func); + + RCP importer = A.getGraph ()->getImporter (); + X_colmap.endImport (X_domainmap, *importer, INSERT, true); + + Kokkos::fence(); + + using functor_type2 = OffRankUpdateFunctor; + functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets); + Kokkos::parallel_for("residual-vector-offrank",policy,func2); + + } + else { + + using functor_type = LocalResidualFunctor; + functor_type func (A_lcl, X_colmap_lcl, B_lcl, R_lcl, rows_per_team, offsets, 
X_domainmap_lcl); + Kokkos::parallel_for("residual-multivector",policy,func); + + RCP importer = A.getGraph ()->getImporter (); + X_colmap.endImport (X_domainmap, *importer, INSERT, true); + + Kokkos::fence(); + + using functor_type2 = OffRankUpdateFunctor; + functor_type2 func2 (A_lcl, X_colmap_lcl, R_lcl, rows_per_team, offsets); + Kokkos::parallel_for("residual-vector-offrank",policy,func2); + + } +} + //! Computes R = B - A * X template @@ -326,6 +588,10 @@ void residual(const Operator & Aop, using Teuchos::rcpFromRef; const bool debug = ::Tpetra::Details::Behavior::debug (); + const bool skipCopyAndPermuteIfPossible = ::Tpetra::Details::Behavior::skipCopyAndPermuteIfPossible (); + const bool overlapCommunicationAndComputation = ::Tpetra::Details::Behavior::overlapCommunicationAndComputation (); + if (overlapCommunicationAndComputation) + TEUCHOS_ASSERT(skipCopyAndPermuteIfPossible); // Whether we are using restrictedMode in the import from domain to // column map. Restricted mode skips the copy and permutation of the @@ -348,7 +614,6 @@ void residual(const Operator & Aop, using export_type = typename CrsMatrix::export_type; using MV = MultiVector; using graph_type = Tpetra::CrsGraph; - using local_graph_type = typename graph_type::local_graph_type; using offset_type = typename graph_type::offset_device_view_type; // We treat the case of a replicated MV output specially. @@ -367,7 +632,7 @@ void residual(const Operator & Aop, // Temporary MV for Import operation. After the block of code // below, this will be an (Imported if necessary) column Map MV // ready to give to localApply(...). - RCP X_colMap; + RCP X_colMap; if (importer.is_null ()) { if (! X_in.isConstantStride ()) { // Not all sparse mat-vec kernels can handle an input MV with @@ -376,14 +641,14 @@ void residual(const Operator & Aop, // copy of X_in, we force creation of the column (== domain) // Map MV (if it hasn't already been created, else fetch the // cached copy). 
This avoids creating a new MV each time. - RCP X_colMapNonConst = A.getColumnMapMultiVector (X_in, true); - Tpetra::deep_copy (*X_colMapNonConst, X_in); - X_colMap = rcp_const_cast (X_colMapNonConst); + X_colMap = A.getColumnMapMultiVector (X_in, true); + Tpetra::deep_copy (*X_colMap, X_in); + // X_colMap = rcp_const_cast (X_colMapNonConst); } else { // The domain and column Maps are the same, so do the local // multiply using the domain Map input MV X_in. - X_colMap = rcpFromRef (X_in); + X_colMap = rcp_const_cast (rcpFromRef (X_in) ); } } else { // need to Import source (multi)vector @@ -392,10 +657,10 @@ void residual(const Operator & Aop, // elements of the domain Map MV X_in into a separate column Map // MV. Thus, we don't have to worry whether X_in is constant // stride. - RCP X_colMapNonConst = A.getColumnMapMultiVector (X_in); + X_colMap = A.getColumnMapMultiVector (X_in); // Do we want to use restrictedMode? - restrictedMode = importer->isLocallyFitted(); + restrictedMode = skipCopyAndPermuteIfPossible && importer->isLocallyFitted(); if (debug && restrictedMode) { TEUCHOS_TEST_FOR_EXCEPTION @@ -404,8 +669,7 @@ void residual(const Operator & Aop, } // Import from the domain Map MV to the column Map MV. - X_colMapNonConst->doImport (X_in, *importer, INSERT, restrictedMode); - X_colMap = rcp_const_cast (X_colMapNonConst); + X_colMap->beginImport (X_in, *importer, INSERT, restrictedMode); } offset_type offsets; @@ -458,7 +722,8 @@ void residual(const Operator & Aop, // constant-stride version of R_in in this case, because we had to // make a constant stride R_rowMap MV and do an Export anyway. if (! exporter.is_null ()) { - + if ( ! importer.is_null ()) + X_colMap->endImport (X_in, *importer, INSERT, restrictedMode); if (restrictedMode && !importer.is_null ()) localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in); else @@ -472,23 +737,30 @@ void residual(const Operator & Aop, } } else { // Don't do an Export: row Map and range Map are the same. 
+ + if (restrictedMode) + if (overlapCommunicationAndComputation) { + localResidualWithCommCompOverlap (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, X_in); + } else { + X_colMap->endImport (X_in, *importer, INSERT, restrictedMode); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in); + } + else { + if ( ! importer.is_null ()) + X_colMap->endImport (X_in, *importer, INSERT, restrictedMode); + localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); + } + // // If R_in does not have constant stride, - // then we can't let the kernel write directly to R_in. + // then we can't let the kernel write directly to R_in. // Instead, we have to use the cached row (== range) // Map MV as temporary storage. // if (! R_in.isConstantStride () ) { // We need to be sure to do a copy out in this case. - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); Tpetra::deep_copy (R_in, *R_rowMap); } - else { - if (restrictedMode && !importer.is_null ()) - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets, &X_in); - else - localResidual (A, *X_colMap, *B_rowMap, *R_rowMap, offsets); - } } // If the range Map is a locally replicated Map, sum up From 690f001c2616285d64ee72a4f4ee5ce1ab72a628 Mon Sep 17 00:00:00 2001 From: Peter Brian Ohm Date: Mon, 28 Jun 2021 11:41:44 -0600 Subject: [PATCH 7/8] MueLu: regionMG add missing Node templates --- .../research/regionMG/src/SetupRegionHierarchy_def.hpp | 8 ++++---- .../test/structured/Driver_Structured_Regions.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp b/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp index 024e97968f7b..56e145157a50 100644 --- a/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp +++ b/packages/muelu/research/regionMG/src/SetupRegionHierarchy_def.hpp @@ -715,7 +715,7 @@ void MakeInterfaceScalingFactors(const int numLevels, RCP regRowMap = regMat->getRowMap(); RCP > 
regRowImporters = level->Get > >("rowImport"); // initialize region vector with all ones. - RCP > regInterfaceScalings = VectorFactory::Build(regRowMap); + RCP > regInterfaceScalings = VectorFactory::Build(regRowMap); regInterfaceScalings->putScalar(SC_ONE); // transform to composite layout while adding interface values via the Export() combine mode @@ -730,7 +730,7 @@ void MakeInterfaceScalingFactors(const int numLevels, regInterfaceScalings, regRowMap, regRowImporters); - level->Set > >("regInterfaceScalings", regInterfaceScalings); + level->Set > >("regInterfaceScalings", regInterfaceScalings); } } // MakeInterfaceScalingFactors @@ -829,7 +829,7 @@ void createRegionHierarchy(const int numDimensions, RCP regMatrix = level->Get >("A", MueLu::NoFactory::get()); RCP regRowMap = regMatrix->getRowMap(); RCP > regRowImporter = level->Get > >("rowImport"); - RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); + RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); smootherParams[levelIdx]->set("smoother: level", levelIdx); smootherSetup(smootherParams[levelIdx], regRowMap, @@ -953,7 +953,7 @@ void vCycle(const int l, ///< ID of current level RCP regMatrix = level->Get >("A", MueLu::NoFactory::get()); RCP regRowMap = regMatrix->getRowMap(); RCP > regRowImporter = level->Get > >("rowImport"); - RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); + RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); int cycleCount = 1; if(cycleType == "W" && l > 0) // W cycle and not on finest level diff --git a/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp b/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp index 3585da484d65..a50a1c8d5e7e 100644 --- a/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp +++ b/packages/muelu/research/regionMG/test/structured/Driver_Structured_Regions.cpp @@ -830,7 +830,7 @@ int 
main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar regCorrect->putScalar(SC_ZERO); // Get Stuff out of Hierarchy RCP level = regHierarchy->GetLevel(0); - RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); + RCP > regInterfaceScalings = level->Get > >("regInterfaceScalings"); // check for convergence { //////////////////////////////////////////////////////////////////////// From da66451a06baba8dc90ab21be8908adcdb17eefc Mon Sep 17 00:00:00 2001 From: Christian Glusa Date: Mon, 28 Jun 2021 13:15:40 -0600 Subject: [PATCH 8/8] MueLu: Create cuBLAS and cuSPARSE handles up front --- packages/muelu/cmake/MueLu_config.hpp.in | 2 ++ .../muelu/test/unit_tests/MueLu_Test_ETI.hpp | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/packages/muelu/cmake/MueLu_config.hpp.in b/packages/muelu/cmake/MueLu_config.hpp.in index 96c398616cf2..201b154ca62c 100644 --- a/packages/muelu/cmake/MueLu_config.hpp.in +++ b/packages/muelu/cmake/MueLu_config.hpp.in @@ -59,6 +59,8 @@ #cmakedefine HAVE_MUELU_KOKKOSCORE +#cmakedefine HAVE_MUELU_KOKKOSKERNELS + #cmakedefine HAVE_MUELU_ML #cmakedefine HAVE_MUELU_PAMGEN diff --git a/packages/muelu/test/unit_tests/MueLu_Test_ETI.hpp b/packages/muelu/test/unit_tests/MueLu_Test_ETI.hpp index 73c9bbb7cd72..9bab38919842 100644 --- a/packages/muelu/test/unit_tests/MueLu_Test_ETI.hpp +++ b/packages/muelu/test/unit_tests/MueLu_Test_ETI.hpp @@ -63,6 +63,11 @@ #include #endif +#ifdef HAVE_MUELU_KOKKOSKERNELS +#include +#include +#endif + #ifndef MUELU_AUTOMATIC_TEST_ETI_NAME #error "The macro MUELU_AUTOMATIC_TEST_ETI_NAME was not defined" #endif @@ -95,6 +100,19 @@ bool Automatic_Test_ETI(int argc, char *argv[]) { Kokkos::initialize(argc, argv); #endif + // Create handles for cuBLAS and cuSPARSE. Otherwise they get + // created on the first call to these libraries, and that can mess + // up timings. 
+#ifdef HAVE_MUELU_KOKKOSKERNELS + KokkosKernels::Experimental::Controls controls; +# ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS + controls.getCublasHandle(); +# endif +# ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + controls.getCusparseHandle(); +# endif + Kokkos::fence(); +#endif bool success = true; bool verbose = true; try {