Skip to content

Commit

Permalink
Merge pull request #11120 from seanofthemillers/tpetra_wrapped_dual_v…
Browse files Browse the repository at this point in the history
…iew_reduced_memcpys

Tpetra: Reduce device-to-host memcpy calls in non-uvm builds
  • Loading branch information
csiefer2 authored Nov 10, 2022
2 parents 7ad8eb3 + b9bf282 commit 4dea6cc
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 22 deletions.
10 changes: 10 additions & 0 deletions packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,16 @@ namespace Tpetra {
typename row_ptrs_device_view_type::host_mirror_space(),
dview);
}

// There are common cases where both packed and unpacked views are set to the same array.
// Doing this in a single call can reduce dataspace on host, and reduce runtime by
// removing a deep_copy from device to host.

void setRowPtrs(const row_ptrs_device_view_type &dview) {
setRowPtrsUnpacked(dview);
rowPtrsPacked_dev_ = rowPtrsUnpacked_dev_;
rowPtrsPacked_host_ = rowPtrsUnpacked_host_;
}

//TODO: Make private -- matrix shouldn't access directly the guts of graph

Expand Down
21 changes: 12 additions & 9 deletions packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -683,8 +683,7 @@ namespace Tpetra {

lclIndsPacked_wdv = local_inds_wdv_type(k_local_graph_.entries);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
this->setRowPtrsUnpacked(k_local_graph_.row_map);
this->setRowPtrsPacked(k_local_graph_.row_map);
this->setRowPtrs(k_local_graph_.row_map);

set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel

Expand Down Expand Up @@ -730,8 +729,7 @@ namespace Tpetra {

lclIndsPacked_wdv = local_inds_wdv_type(lclGraph.entries);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
setRowPtrsUnpacked(lclGraph.row_map);
setRowPtrsPacked(lclGraph.row_map);
setRowPtrs(lclGraph.row_map);

set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel

Expand Down Expand Up @@ -2664,6 +2662,8 @@ namespace Tpetra {
setAllIndices (const typename local_graph_device_type::row_map_type& rowPointers,
const typename local_graph_device_type::entries_type::non_const_type& columnIndices)
{
using ProfilingRegion=Details::ProfilingRegion;
ProfilingRegion region ("Tpetra::CrsGraph::setAllIndices");
const char tfecfFuncName[] = "setAllIndices: ";
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
! hasColMap () || getColMap ().is_null (), std::runtime_error,
Expand Down Expand Up @@ -2753,8 +2753,7 @@ namespace Tpetra {
noRedundancies_ = true;
lclIndsPacked_wdv= local_inds_wdv_type(columnIndices);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
setRowPtrsUnpacked(rowPointers);
setRowPtrsPacked(rowPointers);
setRowPtrs(rowPointers);

set_need_sync_host_uvm_access(); // columnIndices and rowPointers potentially still in a kernel

Expand Down Expand Up @@ -3651,11 +3650,16 @@ namespace Tpetra {
}
}
// Build the local graph.
setRowPtrsPacked(ptr_d_const);
if (requestOptimizedStorage)
setRowPtrs(ptr_d_const);
else
setRowPtrsPacked(ptr_d_const);
lclIndsPacked_wdv = local_inds_wdv_type(ind_d);
}
else { // We don't have to pack, so just set the pointers.
setRowPtrsPacked(rowPtrsUnpacked_dev_);
// Note: The setRowPtrsPacked call has an aditional memcpy to host that we want to avoid
rowPtrsPacked_dev_ = rowPtrsUnpacked_dev_;
rowPtrsPacked_host_ = rowPtrsUnpacked_host_;
lclIndsPacked_wdv = lclIndsUnpacked_wdv;

if (debug_) {
Expand Down Expand Up @@ -3707,7 +3711,6 @@ namespace Tpetra {
k_numRowEntries_ = row_entries_type ();

// Keep the new 1-D packed allocations.
setRowPtrsUnpacked(rowPtrsPacked_dev_);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;

storageStatus_ = Details::STORAGE_1D_PACKED;
Expand Down
13 changes: 10 additions & 3 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ namespace Tpetra {
using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
using values_type = typename local_matrix_device_type::values_type;
Details::ProfilingRegion regionFLGAM
("Tpetra::CrsGraph::fillLocalGraphAndMatrix");
("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");

const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
"fillComplete or expertStaticFillComplete): ";
Expand Down Expand Up @@ -1454,7 +1454,9 @@ namespace Tpetra {
}
else { // We don't have to pack, so just set the pointers.
// FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
myGraph_->setRowPtrsPacked(myGraph_->rowPtrsUnpacked_dev_);
// FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
valuesPacked_wdv = valuesUnpacked_wdv;

Expand Down Expand Up @@ -1547,7 +1549,9 @@ namespace Tpetra {

// Keep the new 1-D packed allocations.
// FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
myGraph_->setRowPtrsUnpacked(myGraph_->rowPtrsPacked_dev_);
// We directly set the memory spaces to avoid a memcpy from device to host
myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
valuesUnpacked_wdv = valuesPacked_wdv;

Expand Down Expand Up @@ -3438,6 +3442,8 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
const typename local_matrix_device_type::values_type& values)
{
using ProfilingRegion=Details::ProfilingRegion;
ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
const char tfecfFuncName[] = "setAllValues: ";
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(columnIndices.size () != values.size (), std::invalid_argument,
Expand All @@ -3454,6 +3460,7 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
(true, std::runtime_error, "myGraph_->setAllIndices() threw an "
"exception: " << e.what ());
}

// Make sure that myGraph_ now has a local graph. It may not be
// fillComplete yet, so it's important to check. We don't care
// whether setAllIndices() did a shallow copy or a deep copy, so a
Expand Down
5 changes: 4 additions & 1 deletion packages/tpetra/core/src/Tpetra_Details_WrappedDualView.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,14 @@ class WrappedDualView {
t_host hostView;
if(deviceView.use_count() != 0)
{
hostView = Kokkos::create_mirror_view_and_copy(
hostView = Kokkos::create_mirror_view(
Kokkos::WithoutInitializing,
typename t_host::memory_space(),
deviceView);
}
originalDualView = DualViewType(deviceView, hostView);
originalDualView.clear_sync_state();
originalDualView.modify_device();
dualView = originalDualView;
}

Expand Down
9 changes: 1 addition & 8 deletions packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,7 @@ namespace { // (anonymous)
<< ". Please report this bug to the Tpetra developers.");
}

dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
// Whether or not the user cares about the initial contents of the
// MultiVector, the device and host views are out of sync. We
// prefer to work in device memory. The way to ensure this
// happens is to mark the device view as modified.
dv.modify_device ();

return wrapped_dual_view_type(dv);
return wrapped_dual_view_type(d_view);
}

// Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
Expand Down
3 changes: 2 additions & 1 deletion packages/tpetra/core/test/Block/BlockMultiVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ namespace {
{
auto X_overlap =
X.getLocalBlockHost (meshMap.getLocalElement (meshMap.getMinGlobalIndex ()),
colToModify, Tpetra::Access::OverwriteAll);
colToModify, Tpetra::Access::ReadWrite);
TEST_ASSERT( X_overlap.data () != NULL );
TEST_EQUALITY_CONST( static_cast<size_t> (X_overlap.extent (0)),
static_cast<size_t> (blockSize) );
Expand Down Expand Up @@ -466,6 +466,7 @@ namespace {
localMeshRow < meshMap.getMaxLocalIndex (); ++localMeshRow) {
auto Y_cur = Y.getLocalBlockHost (localMeshRow, col,
Tpetra::Access::ReadOnly);

if (col != colToModify) {
TEST_ASSERT( equal (Y_cur, zeroLittleVector) &&
equal (zeroLittleVector, Y_cur) );
Expand Down

0 comments on commit 4dea6cc

Please sign in to comment.