From 1805e900f9aa3cc2d8c84929953860eecf78a24d Mon Sep 17 00:00:00 2001 From: Jonathan Hu Date: Mon, 7 Aug 2023 14:54:13 -0700 Subject: [PATCH] Revert "Tpetra: move unpack and combine to device in TAFC" --- .../src/Utils/MueLu_UtilitiesBase_def.hpp | 20 +- .../tpetra/core/src/Tpetra_CrsMatrix_def.hpp | 126 +++++++----- ...Details_unpackCrsMatrixAndCombine_decl.hpp | 28 +-- ..._Details_unpackCrsMatrixAndCombine_def.hpp | 187 ++++++++---------- .../ImportExport2/ImportExport2_UnitTests.cpp | 102 ++++------ 5 files changed, 216 insertions(+), 247 deletions(-) diff --git a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp index 59d75dd1e796..1fe2f6df1b7f 100644 --- a/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp +++ b/packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp @@ -342,7 +342,6 @@ namespace MueLu { diag = Xpetra::VectorFactory::Build(rowMap,true); if(rowMap->lib() == Xpetra::UnderlyingLib::UseTpetra) { - Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetLumpedMatrixDiagonal (Kokkos implementation)"); // Implement using Kokkos using local_vector_type = typename Vector::dual_view_type::t_dev_um; using local_matrix_type = typename Matrix::local_matrix_type; @@ -367,8 +366,6 @@ namespace MueLu { Kokkos::View avgAbsDiagVal_dev("avgAbsDiagVal"); Kokkos::View numDiagsEqualToOne_dev("numDiagsEqualToOne"); - { - Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for (doReciprocal)"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -390,19 +387,15 @@ namespace MueLu { } }); - } - if (useAverageAbsDiagVal) { - Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal"); - typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); - Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); - int numDiagsEqualToOne; - Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); + typename Kokkos::View::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev); + Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev); + int numDiagsEqualToOne; + Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev); + if (useAverageAbsDiagVal) { tol = TST::magnitude(100 * Teuchos::ScalarTraits::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne); } - { - Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("ComputeLumpedDiagonalInverse: parallel_for (doReciprocal)"); Kokkos::parallel_for("ComputeLumpedDiagonalInverse", my_policy, KOKKOS_LAMBDA(const int rowIdx) { if (replaceSingleEntryRowWithZero && nnzPerRow(rowIdx) <= 1) { @@ -417,10 +410,8 @@ namespace MueLu { } } }); - } } else { - Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for"); Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy, KOKKOS_LAMBDA(const int rowIdx) { diag_dev(rowIdx, 0) = KAT_S::zero(); @@ -433,7 +424,6 @@ namespace MueLu { } } else { // Implement using Teuchos - Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase: GetLumpedMatrixDiagonal: (Teuchos implementation)"); ArrayRCP diagVals = diag->getDataNonConst(0); Teuchos::Array regSum(diag->getLocalLength()); Teuchos::ArrayView cols; diff --git a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp index 743a1bb5f6ca..114673a9f6c5 100644 --- a/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp @@ -4529,7 +4529,7 @@ CrsMatrix:: ); this->checkInternalState (); } - } //fillComplete(domainMap, rangeMap, params) + } template void @@ -7914,12 +7914,12 @@ CrsMatrix:: const size_t NumSameIDs = rowTransfer.getNumSameIDs(); ArrayView ExportLIDs = reverseMode ? rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs (); - auto RemoteLIDs = reverseMode ? - rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv(); - auto PermuteToLIDs = reverseMode ? - rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv(); - auto PermuteFromLIDs = reverseMode ? - rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv(); + ArrayView RemoteLIDs = reverseMode ? + rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs (); + ArrayView PermuteToLIDs = reverseMode ? + rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs (); + ArrayView PermuteFromLIDs = reverseMode ? + rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs (); Distributor& Distor = rowTransfer.getDistributor (); // Owning PIDs @@ -8114,14 +8114,14 @@ CrsMatrix:: #endif if (constantNumPackets == 0) { destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (), - RemoteLIDs.view_host().size ()); + RemoteLIDs.size ()); } else { // There are a constant number of packets per element. We // already know (from the number of "remote" (incoming) // elements) how many incoming elements we expect, so we can // resize the buffer accordingly. - const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets; + const size_t rbufLen = RemoteLIDs.size() * constantNumPackets; destMat->reallocImportsIfNeeded (rbufLen, false, nullptr); } } @@ -8445,48 +8445,52 @@ CrsMatrix:: } } + /*********************************************************************/ /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/ /*********************************************************************/ // Backwards compatibility measure. We'll use this again below. +#ifdef HAVE_TPETRA_MMM_TIMINGS + RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize")))); +#endif + destMat->numImportPacketsPerLID_.sync_host (); + Teuchos::ArrayView numImportPacketsPerLID = + getArrayViewFromDualView (destMat->numImportPacketsPerLID_); + destMat->imports_.sync_host (); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView (destMat->imports_); - // TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been) - // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits(). - // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device. - destMat->numImportPacketsPerLID_.modify_host(); //FIXME + if (verbose) { + std::ostringstream os; + os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount" + << std::endl; + std::cerr << os.str (); + } + size_t mynnz = + unpackAndCombineWithOwningPIDsCount (*this, + RemoteLIDs, + hostImports, + numImportPacketsPerLID, + constantNumPackets, + INSERT, + NumSameIDs, + PermuteToLIDs, + PermuteFromLIDs); + if (verbose) { + std::ostringstream os; + os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned " + << mynnz << std::endl; + std::cerr << os.str (); + } + size_t N = BaseRowMap->getLocalNumElements (); -# ifdef HAVE_TPETRA_MMM_TIMINGS - RCP tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data")))); -# endif - ArrayRCP CSR_rowptr; + // Allocations + ArrayRCP CSR_rowptr(N+1); ArrayRCP CSR_colind_GID; ArrayRCP CSR_colind_LID; ArrayRCP CSR_vals; - - destMat->imports_.sync_device (); - destMat->numImportPacketsPerLID_.sync_device (); - - size_t N = BaseRowMap->getLocalNumElements (); - - const Kokkos::View RemoteLIDs_d = RemoteLIDs.view_device(); - const Kokkos::View PermuteToLIDs_d = PermuteToLIDs.view_device(); - const Kokkos::View PermuteFromLIDs_d = PermuteFromLIDs.view_device(); - //auto PermuteToLIDs_d = PermuteToLIDs.view_device(); //FAILS - Details::unpackAndCombineIntoCrsArrays( - *this, - RemoteLIDs_d, - destMat->imports_.view_device(), //hostImports - destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID - NumSameIDs, - PermuteToLIDs_d, - PermuteFromLIDs_d, - N, - MyPID, - CSR_rowptr, - CSR_colind_GID, - CSR_vals, - SourcePids(), - TargetPids); + CSR_colind_GID.resize (mynnz); + CSR_vals.resize (mynnz); // If LO and GO are the same, we can reuse memory when // converting the column indices from global to local indices. @@ -8494,14 +8498,44 @@ CrsMatrix:: CSR_colind_LID = Teuchos::arcp_reinterpret_cast (CSR_colind_GID); } else { - CSR_colind_LID.resize (CSR_colind_GID.size()); + CSR_colind_LID.resize (mynnz); } - CSR_colind_LID.resize (CSR_colind_GID.size()); - size_t mynnz = CSR_vals.size(); +#ifdef HAVE_TPETRA_MMM_TIMINGS + tmCopySPRdata = Teuchos::null; + tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC copy same-perm-remote data")))); +#endif + + if (verbose) { + std::ostringstream os; + os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays" + << std::endl; + std::cerr << os.str (); + } + // FIXME (mfh 15 May 2014) Why can't we abstract this out as an + // unpackAndCombine method on a "CrsArrays" object? This passing + // in a huge list of arrays is icky. Can't we have a bit of an + // abstraction? Implementing a concrete DistObject subclass only + // takes five methods. + unpackAndCombineIntoCrsArrays (*this, + RemoteLIDs, + hostImports, + numImportPacketsPerLID, + constantNumPackets, + INSERT, + NumSameIDs, + PermuteToLIDs, + PermuteFromLIDs, + N, + mynnz, + MyPID, + CSR_rowptr (), + CSR_colind_GID (), + Teuchos::av_reinterpret_cast (CSR_vals ()), + SourcePids (), + TargetPids); // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally // owned entries. Convert them to the actual PID. - // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for. for(size_t i=0; i(TargetPids.size()); i++) { if(TargetPids[i] == -1) TargetPids[i] = MyPID; diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp index a112950e37be..349a1fa0ca86 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_decl.hpp @@ -213,6 +213,10 @@ unpackAndCombineWithOwningPIDsCount ( /// \brief unpackAndCombineIntoCrsArrays /// +/// \note You should call unpackAndCombineWithOwningPIDsCount first +/// and allocate all arrays accordingly, before calling this +/// function. +/// /// Note: The SourcePids vector (on input) should contain owning PIDs /// for each column in the (source) ColMap, as from /// Tpetra::Import_Util::getPids, with the "-1 for local" option being @@ -221,26 +225,24 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: The TargetPids vector (on output) will contain owning PIDs /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. -/// -/// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount, -/// namely, calculating the local number of nonzeros, and allocates CRS -/// arrays of the correct sizes. - template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Kokkos::View, - const Kokkos::View, - const Kokkos::View, + const Teuchos::ArrayView& importLIDs, + const Teuchos::ArrayView& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const size_t constantNumPackets, + const CombineMode combineMode, const size_t numSameIDs, - const Kokkos::View, - const Kokkos::View, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, size_t TargetNumRows, + size_t TargetNumNonzeros, const int MyTargetPID, - Teuchos::ArrayRCP& CRS_rowptr, - Teuchos::ArrayRCP& CRS_colind, - Teuchos::ArrayRCP& CRS_vals, + const Teuchos::ArrayView& CRS_rowptr, + const Teuchos::ArrayView& CRS_colind, + const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids); diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index 433fce8a2371..aafdde2d536d 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -44,7 +44,6 @@ #include "Teuchos_Array.hpp" #include "Teuchos_ArrayView.hpp" #include "Teuchos_OrdinalTraits.hpp" -#include "Teuchos_TimeMonitor.hpp" #include "Tpetra_Details_castAwayConstDualView.hpp" #include "Tpetra_Details_computeOffsets.hpp" #include "Tpetra_Details_createMirrorView.hpp" @@ -167,7 +166,7 @@ unpackRow(const typename PackTraits::output_array_type& gids_out, return 24; // error code } return 0; // no errors -} //unpackRow +} /// \brief Unpacks and combines a single row of the CrsMatrix. /// @@ -420,7 +419,7 @@ struct UnpackCrsMatrixAndCombineFunctor { return error_code_h(); } -}; //UnpackCrsMatrixAndCombineFunctor +}; struct MaxNumEntTag {}; struct TotNumEntTag {}; @@ -490,7 +489,7 @@ class NumEntriesFunctor { tot_num_ent += static_cast (num_ent_LO); } } -}; //NumEntriesFunctor +}; /// \brief Maximum number of entries in any row of the packed matrix. /// @@ -740,7 +739,7 @@ unpackAndCombineIntoCrsMatrix( std::runtime_error, prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code ); -} //unpackAndCombineIntoCrsMatrix (Kokkos version) +} template size_t @@ -798,7 +797,7 @@ unpackAndCombineWithOwningPIDsCount( } return count; -} //unpackAndCombineWithOwningPIDsCount (Kokkos version) +} /// \brief Setup row pointers for remotes template @@ -1368,7 +1367,7 @@ unpackAndCombineWithOwningPIDsCount ( using Kokkos::MemoryUnmanaged; using Kokkos::View; typedef typename Node::device_type DT; - typedef typename DT::execution_space execution_space; + typedef typename DistObject::buffer_device_type BDT; const char prefix[] = "unpackAndCombineWithOwningPIDsCount: "; TEUCHOS_TEST_FOR_EXCEPTION @@ -1393,12 +1392,12 @@ unpackAndCombineWithOwningPIDsCount ( permuteFromLIDs.size (), true, "permute_from_lids"); auto imports_d = - create_mirror_view_from_raw_host_array (DT (), + create_mirror_view_from_raw_host_array (BDT (), imports.getRawPtr (), imports.size (), true, "imports"); auto num_packets_per_lid_d = - create_mirror_view_from_raw_host_array (DT (), + create_mirror_view_from_raw_host_array (BDT (), numPacketsPerLID.getRawPtr (), numPacketsPerLID.size (), true, "num_packets_per_lid"); @@ -1406,7 +1405,7 @@ unpackAndCombineWithOwningPIDsCount ( return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( local_matrix, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs); -} //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version) +} /// \brief unpackAndCombineIntoCrsArrays /// @@ -1422,22 +1421,24 @@ unpackAndCombineWithOwningPIDsCount ( /// Note: The TargetPids vector (on output) will contain owning PIDs /// for each entry in the matrix, with the "-1 for local" for locally /// owned entries. - template void unpackAndCombineIntoCrsArrays ( const CrsMatrix & sourceMatrix, - const Kokkos::View import_lids_d, - const Kokkos::View imports_d, - const Kokkos::View num_packets_per_lid_d, + const Teuchos::ArrayView& importLIDs, + const Teuchos::ArrayView& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const size_t /* constantNumPackets */, + const CombineMode /* combineMode */, const size_t numSameIDs, - const Kokkos::View permute_to_lids_d, - const Kokkos::View permute_from_lids_d, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, size_t TargetNumRows, + size_t TargetNumNonzeros, const int MyTargetPID, - Teuchos::ArrayRCP& CRS_rowptr, - Teuchos::ArrayRCP& CRS_colind, - Teuchos::ArrayRCP& CRS_vals, + const Teuchos::ArrayView& CRS_rowptr, + const Teuchos::ArrayView& CRS_colind, + const Teuchos::ArrayView::impl_scalar_type>& CRS_vals, const Teuchos::ArrayView& SourcePids, Teuchos::Array& TargetPids) { @@ -1460,58 +1461,23 @@ unpackAndCombineIntoCrsArrays ( typedef typename matrix_type::impl_scalar_type ST; typedef typename ArrayView::size_type size_type; - const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: "; -# ifdef HAVE_TPETRA_MMM_TIMINGS - using Teuchos::TimeMonitor; - Teuchos::RCP tm; -# endif - - using Kokkos::MemoryUnmanaged; + const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: "; - TEUCHOS_TEST_FOR_EXCEPTION - (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, - prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != " - "permute_from_lids_d.size() = " << permute_from_lids_d.size() << "."); - // FIXME (mfh 26 Jan 2015) If there are no entries on the calling - // process, then the matrix is neither locally nor globally indexed. - const bool locallyIndexed = sourceMatrix.isLocallyIndexed (); - TEUCHOS_TEST_FOR_EXCEPTION - (! locallyIndexed, std::invalid_argument, prefix << "The input " - "CrsMatrix 'sourceMatrix' must be locally indexed."); - TEUCHOS_TEST_FOR_EXCEPTION - (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument, - prefix << "import_lids_d.size() = " << import_lids_d.size () << " != " - "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << "."); - - auto local_matrix = sourceMatrix.getLocalMatrixDevice (); + TEUCHOS_TEST_FOR_EXCEPTION( + TargetNumRows + 1 != static_cast (CRS_rowptr.size ()), + std::invalid_argument, prefix << "CRS_rowptr.size() = " << + CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << "."); - // TargetNumNonzeros is number of nonzeros in local matrix. -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount")))); -# endif - size_t TargetNumNonzeros = - UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( - local_matrix, permute_from_lids_d, imports_d, - num_packets_per_lid_d, numSameIDs); -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::null; -# endif - -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers")))); -# endif - CRS_rowptr.resize (TargetNumRows+1); - CRS_colind.resize(TargetNumNonzeros); - CRS_vals.resize(TargetNumNonzeros); - Teuchos::ArrayRCP const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast(CRS_vals); -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::null; -# endif + TEUCHOS_TEST_FOR_EXCEPTION( + permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () + << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << "."); + const size_type numImportLIDs = importLIDs.size (); TEUCHOS_TEST_FOR_EXCEPTION( - permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument, - prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size () - << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << "."); + numImportLIDs != numPacketsPerLID.size (), std::invalid_argument, + prefix << "importLIDs.size() = " << numImportLIDs << " != " + "numPacketsPerLID.size() = " << numPacketsPerLID.size() << "."); // Preseed TargetPids with -1 for local if (static_cast (TargetPids.size ()) != TargetNumNonzeros) { @@ -1520,13 +1486,30 @@ unpackAndCombineIntoCrsArrays ( TargetPids.assign (TargetNumNonzeros, -1); // Grab pointers for sourceMatrix + auto local_matrix = sourceMatrix.getLocalMatrixDevice(); auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs")))); -# endif - // Convert input arrays to Kokkos::Views + // Convert input arrays to Kokkos::View DT outputDevice; + auto import_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(), + importLIDs.size(), true, "import_lids"); + + auto imports_d = + create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(), + imports.size(), true, "imports"); + + auto num_packets_per_lid_d = + create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(), + numPacketsPerLID.size(), true, "num_packets_per_lid"); + + auto permute_from_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(), + permuteFromLIDs.size(), true, "permute_from_lids"); + + auto permute_to_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(), + permuteToLIDs.size(), true, "permute_to_lids"); auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(), @@ -1535,11 +1518,12 @@ unpackAndCombineIntoCrsArrays ( auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(), CRS_colind.size(), true, "crs_colidx"); + #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE static_assert (! std::is_same< typename std::remove_const< typename std::decay< - decltype (CRS_vals_impl_scalar_type) + decltype (CRS_vals) >::type::value_type >::type, std::complex >::value, @@ -1548,8 +1532,8 @@ unpackAndCombineIntoCrsArrays ( #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE auto crs_vals_d = - create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(), - CRS_vals_impl_scalar_type.size(), true, "crs_vals"); + create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(), + CRS_vals.size(), true, "crs_vals"); #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE static_assert (! std::is_same< @@ -1567,10 +1551,6 @@ unpackAndCombineIntoCrsArrays ( create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(), TargetPids.size(), true, "tgt_pids"); -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::null; -# endif - size_t bytes_per_value = 0; if (PackTraits::compileTimeSize) { // assume that ST is default constructible @@ -1607,23 +1587,14 @@ unpackAndCombineIntoCrsArrays ( "never happen, since std::complex does not work in Kokkos::View objects."); #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays")))); -# endif UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays( local_matrix, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, bytes_per_value); -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::null; -# endif // Copy outputs back to host -# ifdef HAVE_TPETRA_MMM_TIMINGS - tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host")))); -# endif typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( CRS_rowptr.getRawPtr(), CRS_rowptr.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR @@ -1635,7 +1606,7 @@ unpackAndCombineIntoCrsArrays ( deep_copy(execution_space(), crs_colind_h, crs_colind_d); typename decltype(crs_vals_d)::HostMirror crs_vals_h( - CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size()); + CRS_vals.getRawPtr(), CRS_vals.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), crs_vals_h, crs_vals_d); @@ -1643,8 +1614,7 @@ unpackAndCombineIntoCrsArrays ( TargetPids.getRawPtr(), TargetPids.size()); // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR deep_copy(execution_space(), tgt_pids_h, tgt_pids_d); - -} //unpackAndCombineIntoCrsArrays +} } // namespace Details } // namespace Tpetra @@ -1666,6 +1636,25 @@ unpackAndCombineIntoCrsArrays ( const Kokkos::DualView::buffer_device_type>&, \ const size_t, \ const CombineMode); \ + template void \ + Details::unpackAndCombineIntoCrsArrays ( \ + const CrsMatrix &, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + const size_t, \ + const CombineMode, \ + const size_t, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t, \ + size_t, \ + const int, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView::impl_scalar_type>&, \ + const Teuchos::ArrayView&, \ + Teuchos::Array&); \ template size_t \ Details::unpackAndCombineWithOwningPIDsCount ( \ const CrsMatrix &, \ @@ -1676,22 +1665,6 @@ unpackAndCombineIntoCrsArrays ( CombineMode, \ size_t, \ const Teuchos::ArrayView&, \ - const Teuchos::ArrayView&); \ - template void \ - Details::unpackAndCombineIntoCrsArrays ( \ - const CrsMatrix &, \ - const Kokkos::View, \ - const Kokkos::View, \ - const Kokkos::View, \ - const size_t, \ - const Kokkos::View, \ - const Kokkos::View, \ - size_t, \ - const int, \ - Teuchos::ArrayRCP&, \ - Teuchos::ArrayRCP&, \ - Teuchos::ArrayRCP&, \ - const Teuchos::ArrayView&, \ - Teuchos::Array&); + const Teuchos::ArrayView&); #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 1f3802a60e32..4c244dd30c08 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -39,8 +39,6 @@ // ************************************************************************ // @HEADER -#include - #include #include @@ -461,20 +459,6 @@ namespace { } src_mat->fillComplete (); - RCP fos = Teuchos::fancyOStream(Teuchos::rcpFromRef(std::cout)); - fos->setOutputToRootOnly(-1); - -#if 0 - fflush(stdout); - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "========\nsrc_mat\n========" << std::endl; - sleep(1); comm->barrier(); - src_mat->describe(*fos,Teuchos::VERB_EXTREME); - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "========\nend of src_mat\n========\n\n" << std::endl; - sleep(1); comm->barrier(); -#endif - // Create the importer Import importer (src_map, tgt_map, getImportParameterList ()); // Do the import, and fill-complete the target matrix. @@ -512,9 +496,6 @@ namespace { Teuchos::null, Teuchos::null, rcp(&dummy,false)); - //comm->barrier(); - //TEST_EQUALITY(1,1); - //return; // Make sure that A_tgt2's row Map is the same as tgt_map, and // is also the same as the Import's targetMap. They should have @@ -540,25 +521,6 @@ namespace { as (10) * ScalarTraits::eps (); typedef typename CrsMatrix::nonconst_local_inds_host_view_type lids_type; typedef typename CrsMatrix::nonconst_values_host_view_type vals_type; - -#if 0 - fflush(stdout); - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "tgt_mat\n========" << std::endl; - sleep(1); comm->barrier(); - A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "=======\nend of tgt_mat\n========\n\n" << std::endl; - sleep(1); comm->barrier(); - - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "A_tgt2\n========" << std::endl; - sleep(1); comm->barrier(); - A_tgt2->describe(*fos,Teuchos::VERB_EXTREME); - sleep(1); comm->barrier(); - if (comm->getRank() == 0) std::cout << "=======\nend of A_tgt2\n========" << std::endl; - sleep(1); comm->barrier(); - #endif lids_type tgtRowInds; vals_type tgtRowVals; @@ -2383,8 +2345,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, } Kokkos::View importsView(imports.data(), imports.size()); distor.doPostsAndWaits(exports.view_host(),numExportPackets(),importsView,numImportPackets()); - auto importsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(), importsView); - deep_copy(importsView_d,importsView); if (verbose) { std::ostringstream os; os << *prefix << "Done with 4-arg doPostsAndWaits" << std::endl; @@ -2393,13 +2353,33 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, ::Tpetra::Details::Behavior::enable_verbose_behavior (); + // Run the count... which should get the same NNZ as the traditional import + using Tpetra::Details::unpackAndCombineWithOwningPIDsCount; + size_t nnz2 = + unpackAndCombineWithOwningPIDsCount (*A, Importer->getRemoteLIDs (), + imports (), numImportPackets (), + constantNumPackets, + Tpetra::INSERT, + Importer->getNumSameIDs (), + Importer->getPermuteToLIDs (), + Importer->getPermuteFromLIDs ()); + if (verbose) { + std::ostringstream os; + os << *prefix << "Done with unpackAndCombineWithOwningPIDsCount; " + "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; + std::cerr << os.str (); + } + + if(nnz1!=nnz2) test_err++; + total_err+=test_err; + ///////////////////////////////////////////////////////// // Test #2: Actual combine test ///////////////////////////////////////////////////////// - Teuchos::ArrayRCP rowptr; - Teuchos::ArrayRCP colind; - Teuchos::ArrayRCP vals; - Teuchos::Array TargetPids; + Teuchos::Array rowptr (MapTarget->getLocalNumElements () + 1); + Teuchos::Array colind (nnz2); + Teuchos::Array vals (nnz2); + Teuchos::Array TargetPids; if (verbose) { std::ostringstream os; @@ -2407,39 +2387,29 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( Import_Util, UnpackAndCombineWithOwningPIDs, std::cerr << os.str (); } - auto numImportPacketsView_d = Kokkos::create_mirror_view(Node::device_type::memory_space(),numImportPacketsView); - deep_copy(numImportPacketsView_d,numImportPacketsView); - - - const Kokkos::View RemoteLIDs_d = Importer->getRemoteLIDs_dv().view_device(); - const Kokkos::View PermuteToLIDs_d = Importer->getPermuteToLIDs_dv().view_device(); - const Kokkos::View PermuteFromLIDs_d = Importer->getPermuteFromLIDs_dv().view_device(); - using Tpetra::Details::unpackAndCombineIntoCrsArrays; unpackAndCombineIntoCrsArrays ( *A, - RemoteLIDs_d, - importsView_d, - numImportPacketsView_d, + Importer->getRemoteLIDs (), + imports (), + numImportPackets (), + constantNumPackets, + Tpetra::INSERT, Importer->getNumSameIDs (), - PermuteToLIDs_d, - PermuteFromLIDs_d, + Importer->getPermuteToLIDs (), + Importer->getPermuteFromLIDs (), MapTarget->getLocalNumElements (), + nnz2, MyPID, - rowptr, - colind, - vals, + rowptr (), + colind (), + Teuchos::av_reinterpret_cast (vals ()), SourcePids (), TargetPids); - size_t nnz2 = vals.size(); - if(nnz1!=nnz2) test_err++; - total_err+=test_err; - if (verbose) { std::ostringstream os; - os << *prefix << "Done with unpackAndCombineIntoCrsArrays; " - "nnz1=" << nnz1 << ", nnz2=" << nnz2 << std::endl; + os << *prefix << "Done with unpackAndCombineIntoCrsArrays" << std::endl; std::cerr << os.str (); }