Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Revert "Tpetra: move unpack and combine to device in TAFC" #12115

Merged
merged 1 commit into from
Aug 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions packages/muelu/src/Utils/MueLu_UtilitiesBase_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,6 @@ namespace MueLu {
diag = Xpetra::VectorFactory<Scalar,LocalOrdinal,GlobalOrdinal,Node>::Build(rowMap,true);

if(rowMap->lib() == Xpetra::UnderlyingLib::UseTpetra) {
Teuchos::TimeMonitor MM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase::GetLumpedMatrixDiagonal (Kokkos implementation)");
// Implement using Kokkos
using local_vector_type = typename Vector::dual_view_type::t_dev_um;
using local_matrix_type = typename Matrix::local_matrix_type;
Expand All @@ -367,8 +366,6 @@ namespace MueLu {
Kokkos::View<mag_type, execution_space> avgAbsDiagVal_dev("avgAbsDiagVal");
Kokkos::View<int, execution_space> numDiagsEqualToOne_dev("numDiagsEqualToOne");

{
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for (doReciprocal)");
Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
diag_dev(rowIdx, 0) = KAT_S::zero();
Expand All @@ -390,19 +387,15 @@ namespace MueLu {
}
});

}
if (useAverageAbsDiagVal) {
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: useAverageAbsDiagVal");
typename Kokkos::View<mag_type, execution_space>::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev);
Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev);
int numDiagsEqualToOne;
Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev);
typename Kokkos::View<mag_type, execution_space>::HostMirror avgAbsDiagVal = Kokkos::create_mirror_view(avgAbsDiagVal_dev);
Kokkos::deep_copy(avgAbsDiagVal, avgAbsDiagVal_dev);
int numDiagsEqualToOne;
Kokkos::deep_copy(numDiagsEqualToOne, numDiagsEqualToOne_dev);

if (useAverageAbsDiagVal) {
tol = TST::magnitude(100 * Teuchos::ScalarTraits<Scalar>::eps()) * (avgAbsDiagVal()-numDiagsEqualToOne) / (rowMap->getLocalNumElements()-numDiagsEqualToOne);
}

{
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("ComputeLumpedDiagonalInverse: parallel_for (doReciprocal)");
Kokkos::parallel_for("ComputeLumpedDiagonalInverse", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
if (replaceSingleEntryRowWithZero && nnzPerRow(rowIdx) <= 1) {
Expand All @@ -417,10 +410,8 @@ namespace MueLu {
}
}
});
}

} else {
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("GetLumpedMatrixDiagonal: parallel_for");
Kokkos::parallel_for("GetLumpedMatrixDiagonal", my_policy,
KOKKOS_LAMBDA(const int rowIdx) {
diag_dev(rowIdx, 0) = KAT_S::zero();
Expand All @@ -433,7 +424,6 @@ namespace MueLu {
}
} else {
// Implement using Teuchos
Teuchos::TimeMonitor MMM = *Teuchos::TimeMonitor::getNewTimer("UtilitiesBase: GetLumpedMatrixDiagonal: (Teuchos implementation)");
ArrayRCP<Scalar> diagVals = diag->getDataNonConst(0);
Teuchos::Array<Scalar> regSum(diag->getLocalLength());
Teuchos::ArrayView<const LocalOrdinal> cols;
Expand Down
126 changes: 80 additions & 46 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4529,7 +4529,7 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
);
this->checkInternalState ();
}
} //fillComplete(domainMap, rangeMap, params)
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void
Expand Down Expand Up @@ -7914,12 +7914,12 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
const size_t NumSameIDs = rowTransfer.getNumSameIDs();
ArrayView<const LO> ExportLIDs = reverseMode ?
rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
auto RemoteLIDs = reverseMode ?
rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
auto PermuteToLIDs = reverseMode ?
rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
auto PermuteFromLIDs = reverseMode ?
rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
ArrayView<const LO> RemoteLIDs = reverseMode ?
rowTransfer.getExportLIDs () : rowTransfer.getRemoteLIDs ();
ArrayView<const LO> PermuteToLIDs = reverseMode ?
rowTransfer.getPermuteFromLIDs () : rowTransfer.getPermuteToLIDs ();
ArrayView<const LO> PermuteFromLIDs = reverseMode ?
rowTransfer.getPermuteToLIDs () : rowTransfer.getPermuteFromLIDs ();
Distributor& Distor = rowTransfer.getDistributor ();

// Owning PIDs
Expand Down Expand Up @@ -8114,14 +8114,14 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
#endif
if (constantNumPackets == 0) {
destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
RemoteLIDs.view_host().size ());
RemoteLIDs.size ());
}
else {
// There are a constant number of packets per element. We
// already know (from the number of "remote" (incoming)
// elements) how many incoming elements we expect, so we can
// resize the buffer accordingly.
const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
const size_t rbufLen = RemoteLIDs.size() * constantNumPackets;
destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
}
}
Expand Down Expand Up @@ -8445,63 +8445,97 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
}
}

/*********************************************************************/
/**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
/*********************************************************************/

// Backwards compatibility measure. We'll use this again below.
#ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize"))));
#endif
destMat->numImportPacketsPerLID_.sync_host ();
Teuchos::ArrayView<const size_t> numImportPacketsPerLID =
getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
destMat->imports_.sync_host ();
Teuchos::ArrayView<const char> hostImports =
getArrayViewFromDualView (destMat->imports_);

// TODO JHU Need to track down why numImportPacketsPerLID_ has not been corrently marked as modified on host (which it has been)
// TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
// TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
destMat->numImportPacketsPerLID_.modify_host(); //FIXME
if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "Calling unpackAndCombineWithOwningPIDsCount"
<< std::endl;
std::cerr << os.str ();
}
size_t mynnz =
unpackAndCombineWithOwningPIDsCount (*this,
RemoteLIDs,
hostImports,
numImportPacketsPerLID,
constantNumPackets,
INSERT,
NumSameIDs,
PermuteToLIDs,
PermuteFromLIDs);
if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "unpackAndCombineWithOwningPIDsCount returned "
<< mynnz << std::endl;
std::cerr << os.str ();
}
size_t N = BaseRowMap->getLocalNumElements ();

# ifdef HAVE_TPETRA_MMM_TIMINGS
RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
# endif
ArrayRCP<size_t> CSR_rowptr;
// Allocations
ArrayRCP<size_t> CSR_rowptr(N+1);
ArrayRCP<GO> CSR_colind_GID;
ArrayRCP<LO> CSR_colind_LID;
ArrayRCP<Scalar> CSR_vals;

destMat->imports_.sync_device ();
destMat->numImportPacketsPerLID_.sync_device ();

size_t N = BaseRowMap->getLocalNumElements ();

const Kokkos::View<LO const *, typename Node::device_type> RemoteLIDs_d = RemoteLIDs.view_device();
const Kokkos::View<LO const *, typename Node::device_type> PermuteToLIDs_d = PermuteToLIDs.view_device();
const Kokkos::View<LO const *, typename Node::device_type> PermuteFromLIDs_d = PermuteFromLIDs.view_device();
//auto PermuteToLIDs_d = PermuteToLIDs.view_device(); //FAILS
Details::unpackAndCombineIntoCrsArrays(
*this,
RemoteLIDs_d,
destMat->imports_.view_device(), //hostImports
destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
NumSameIDs,
PermuteToLIDs_d,
PermuteFromLIDs_d,
N,
MyPID,
CSR_rowptr,
CSR_colind_GID,
CSR_vals,
SourcePids(),
TargetPids);
CSR_colind_GID.resize (mynnz);
CSR_vals.resize (mynnz);

// If LO and GO are the same, we can reuse memory when
// converting the column indices from global to local indices.
if (typeid (LO) == typeid (GO)) {
CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
}
else {
CSR_colind_LID.resize (CSR_colind_GID.size());
CSR_colind_LID.resize (mynnz);
}
CSR_colind_LID.resize (CSR_colind_GID.size());
size_t mynnz = CSR_vals.size();
#ifdef HAVE_TPETRA_MMM_TIMINGS
tmCopySPRdata = Teuchos::null;
tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC copy same-perm-remote data"))));
#endif

if (verbose) {
std::ostringstream os;
os << *verbosePrefix << "Calling unpackAndCombineIntoCrsArrays"
<< std::endl;
std::cerr << os.str ();
}
// FIXME (mfh 15 May 2014) Why can't we abstract this out as an
// unpackAndCombine method on a "CrsArrays" object? This passing
// in a huge list of arrays is icky. Can't we have a bit of an
// abstraction? Implementing a concrete DistObject subclass only
// takes five methods.
unpackAndCombineIntoCrsArrays (*this,
RemoteLIDs,
hostImports,
numImportPacketsPerLID,
constantNumPackets,
INSERT,
NumSameIDs,
PermuteToLIDs,
PermuteFromLIDs,
N,
mynnz,
MyPID,
CSR_rowptr (),
CSR_colind_GID (),
Teuchos::av_reinterpret_cast<impl_scalar_type> (CSR_vals ()),
SourcePids (),
TargetPids);

// On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
// owned entries. Convert them to the actual PID.
// JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
{
if(TargetPids[i] == -1) TargetPids[i] = MyPID;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,10 @@ unpackAndCombineWithOwningPIDsCount (

/// \brief unpackAndCombineIntoCrsArrays
///
/// \note You should call unpackAndCombineWithOwningPIDsCount first
/// and allocate all arrays accordingly, before calling this
/// function.
///
/// Note: The SourcePids vector (on input) should contain owning PIDs
/// for each column in the (source) ColMap, as from
/// Tpetra::Import_Util::getPids, with the "-1 for local" option being
Expand All @@ -221,26 +225,24 @@ unpackAndCombineWithOwningPIDsCount (
/// Note: The TargetPids vector (on output) will contain owning PIDs
/// for each entry in the matrix, with the "-1 for local" for locally
/// owned entries.
///
/// Note: This method does the work previously done in unpackAndCombineWithOwningPIDsCount,
/// namely, calculating the local number of nonzeros, and allocates CRS
/// arrays of the correct sizes.

template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
void
unpackAndCombineIntoCrsArrays (
const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
const Kokkos::View<LocalOrdinal const *, typename Node::device_type>,
const Kokkos::View<const char*, typename Node::device_type>,
const Kokkos::View<const size_t*, typename Node::device_type>,
const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
const Teuchos::ArrayView<const char>& imports,
const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
const size_t constantNumPackets,
const CombineMode combineMode,
const size_t numSameIDs,
const Kokkos::View<LocalOrdinal const *, typename Node::device_type>,
const Kokkos::View<LocalOrdinal const *, typename Node::device_type>,
const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
size_t TargetNumRows,
size_t TargetNumNonzeros,
const int MyTargetPID,
Teuchos::ArrayRCP<size_t>& CRS_rowptr,
Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
Teuchos::ArrayRCP<Scalar>& CRS_vals,
const Teuchos::ArrayView<size_t>& CRS_rowptr,
const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
const Teuchos::ArrayView<const int>& SourcePids,
Teuchos::Array<int>& TargetPids);

Expand Down
Loading