Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tpetra: Reduce device-to-host memcpy calls in non-uvm builds #11120

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,16 @@ namespace Tpetra {
typename row_ptrs_device_view_type::host_mirror_space(),
dview);
}

// There are common cases where both packed and unpacked views are set to the same array.
// Doing this in a single call can reduce dataspace on host, and reduce runtime by
// removing a deep_copy from device to host.

void setRowPtrs(const row_ptrs_device_view_type &dview) {
setRowPtrsUnpacked(dview);
rowPtrsPacked_dev_ = rowPtrsUnpacked_dev_;
rowPtrsPacked_host_ = rowPtrsUnpacked_host_;
}

//TODO: Make private -- matrix shouldn't access directly the guts of graph

Expand Down
21 changes: 12 additions & 9 deletions packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -683,8 +683,7 @@ namespace Tpetra {

lclIndsPacked_wdv = local_inds_wdv_type(k_local_graph_.entries);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
this->setRowPtrsUnpacked(k_local_graph_.row_map);
this->setRowPtrsPacked(k_local_graph_.row_map);
this->setRowPtrs(k_local_graph_.row_map);

set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel

Expand Down Expand Up @@ -730,8 +729,7 @@ namespace Tpetra {

lclIndsPacked_wdv = local_inds_wdv_type(lclGraph.entries);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
setRowPtrsUnpacked(lclGraph.row_map);
setRowPtrsPacked(lclGraph.row_map);
setRowPtrs(lclGraph.row_map);

set_need_sync_host_uvm_access(); // lclGraph_ potentially still in a kernel

Expand Down Expand Up @@ -2664,6 +2662,8 @@ namespace Tpetra {
setAllIndices (const typename local_graph_device_type::row_map_type& rowPointers,
const typename local_graph_device_type::entries_type::non_const_type& columnIndices)
{
using ProfilingRegion=Details::ProfilingRegion;
ProfilingRegion region ("Tpetra::CrsGraph::setAllIndices");
const char tfecfFuncName[] = "setAllIndices: ";
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
! hasColMap () || getColMap ().is_null (), std::runtime_error,
Expand Down Expand Up @@ -2754,8 +2754,7 @@ namespace Tpetra {
noRedundancies_ = true;
lclIndsPacked_wdv= local_inds_wdv_type(columnIndices);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;
setRowPtrsUnpacked(rowPointers);
setRowPtrsPacked(rowPointers);
setRowPtrs(rowPointers);

set_need_sync_host_uvm_access(); // columnIndices and rowPointers potentially still in a kernel

Expand Down Expand Up @@ -3652,11 +3651,16 @@ namespace Tpetra {
}
}
// Build the local graph.
setRowPtrsPacked(ptr_d_const);
if (requestOptimizedStorage)
setRowPtrs(ptr_d_const);
else
setRowPtrsPacked(ptr_d_const);
lclIndsPacked_wdv = local_inds_wdv_type(ind_d);
}
else { // We don't have to pack, so just set the pointers.
setRowPtrsPacked(rowPtrsUnpacked_dev_);
// Note: The setRowPtrsPacked call has an aditional memcpy to host that we want to avoid
rowPtrsPacked_dev_ = rowPtrsUnpacked_dev_;
rowPtrsPacked_host_ = rowPtrsUnpacked_host_;
lclIndsPacked_wdv = lclIndsUnpacked_wdv;

if (debug_) {
Expand Down Expand Up @@ -3708,7 +3712,6 @@ namespace Tpetra {
k_numRowEntries_ = row_entries_type ();

// Keep the new 1-D packed allocations.
setRowPtrsUnpacked(rowPtrsPacked_dev_);
lclIndsUnpacked_wdv = lclIndsPacked_wdv;

storageStatus_ = Details::STORAGE_1D_PACKED;
Expand Down
13 changes: 10 additions & 3 deletions packages/tpetra/core/src/Tpetra_CrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,7 @@ namespace Tpetra {
using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
using values_type = typename local_matrix_device_type::values_type;
Details::ProfilingRegion regionFLGAM
("Tpetra::CrsGraph::fillLocalGraphAndMatrix");
("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");

const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
"fillComplete or expertStaticFillComplete): ";
Expand Down Expand Up @@ -1454,7 +1454,9 @@ namespace Tpetra {
}
else { // We don't have to pack, so just set the pointers.
// FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
myGraph_->setRowPtrsPacked(myGraph_->rowPtrsUnpacked_dev_);
// FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
valuesPacked_wdv = valuesUnpacked_wdv;

Expand Down Expand Up @@ -1547,7 +1549,9 @@ namespace Tpetra {

// Keep the new 1-D packed allocations.
// FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
myGraph_->setRowPtrsUnpacked(myGraph_->rowPtrsPacked_dev_);
// We directly set the memory spaces to avoid a memcpy from device to host
myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
valuesUnpacked_wdv = valuesPacked_wdv;

Expand Down Expand Up @@ -3438,6 +3442,8 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
const typename local_matrix_device_type::values_type& values)
{
using ProfilingRegion=Details::ProfilingRegion;
ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
const char tfecfFuncName[] = "setAllValues: ";
TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
(columnIndices.size () != values.size (), std::invalid_argument,
Expand All @@ -3454,6 +3460,7 @@ CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
(true, std::runtime_error, "myGraph_->setAllIndices() threw an "
"exception: " << e.what ());
}

// Make sure that myGraph_ now has a local graph. It may not be
// fillComplete yet, so it's important to check. We don't care
// whether setAllIndices() did a shallow copy or a deep copy, so a
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,14 @@ class WrappedDualView {
t_host hostView;
if(deviceView.use_count() != 0)
{
hostView = Kokkos::create_mirror_view_and_copy(
hostView = Kokkos::create_mirror_view(
Kokkos::WithoutInitializing,
typename t_host::memory_space(),
deviceView);
}
originalDualView = DualViewType(deviceView, hostView);
originalDualView.clear_sync_state();
originalDualView.modify_device();
dualView = originalDualView;
}

Expand Down
9 changes: 1 addition & 8 deletions packages/tpetra/core/src/Tpetra_MultiVector_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,14 +217,7 @@ namespace { // (anonymous)
<< ". Please report this bug to the Tpetra developers.");
}

dual_view_type dv (d_view, Kokkos::create_mirror_view (d_view));
// Whether or not the user cares about the initial contents of the
// MultiVector, the device and host views are out of sync. We
// prefer to work in device memory. The way to ensure this
// happens is to mark the device view as modified.
dv.modify_device ();

return wrapped_dual_view_type(dv);
return wrapped_dual_view_type(d_view);
}

// Convert 1-D Teuchos::ArrayView to an unmanaged 1-D host Kokkos::View.
Expand Down
3 changes: 2 additions & 1 deletion packages/tpetra/core/test/Block/BlockMultiVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ namespace {
{
auto X_overlap =
X.getLocalBlockHost (meshMap.getLocalElement (meshMap.getMinGlobalIndex ()),
colToModify, Tpetra::Access::OverwriteAll);
colToModify, Tpetra::Access::ReadWrite);
TEST_ASSERT( X_overlap.data () != NULL );
TEST_EQUALITY_CONST( static_cast<size_t> (X_overlap.extent (0)),
static_cast<size_t> (blockSize) );
Expand Down Expand Up @@ -466,6 +466,7 @@ namespace {
localMeshRow < meshMap.getMaxLocalIndex (); ++localMeshRow) {
auto Y_cur = Y.getLocalBlockHost (localMeshRow, col,
Tpetra::Access::ReadOnly);

if (col != colToModify) {
TEST_ASSERT( equal (Y_cur, zeroLittleVector) &&
equal (zeroLittleVector, Y_cur) );
Expand Down