Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trilinos Master Merge PR Generator: Auto PR created to promote from master_merge_20200805_000612 branch to master #7781

Merged
merged 11 commits into from
Aug 5, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ class MultiVectorLocalGatherScatter {
using Teuchos::ArrayRCP;
const size_t numRows = X_out.getLocalLength ();
const size_t numVecs = X_in.getNumVectors ();
Kokkos::fence();
for (size_t j = 0; j < numVecs; ++j) {
ArrayRCP<const InScalar> X_in_j = X_in.getData(j);
ArrayRCP<OutScalar> X_out_j = X_out.getDataNonConst(j);
Expand All @@ -109,6 +110,7 @@ class MultiVectorLocalGatherScatter {
X_out_j[i] = X_in_j[i_perm];
}
}
X_out.modify_host();
}

//Gather blocks (contiguous groups of blockSize rows)
Expand All @@ -124,6 +126,7 @@ class MultiVectorLocalGatherScatter {
using Teuchos::ArrayRCP;
const size_t numBlocks = X_out.getLocalLength() / blockSize;
const size_t numVecs = X_in.getNumVectors ();
Kokkos::fence();
for (size_t j = 0; j < numVecs; ++j) {
ArrayRCP<const InScalar> X_in_j = X_in.getData(j);
ArrayRCP<OutScalar> X_out_j = X_out.getDataNonConst(j);
Expand All @@ -134,6 +137,7 @@ class MultiVectorLocalGatherScatter {
}
}
}
X_out.modify_host();
}

void
Expand All @@ -144,6 +148,7 @@ class MultiVectorLocalGatherScatter {
using Teuchos::ArrayRCP;
const size_t numRows = X_out.getLocalLength();
const size_t numVecs = X_in.getNumVectors();
Kokkos::fence();
for (size_t j = 0; j < numVecs; ++j) {
ArrayRCP<InScalar> X_in_j = X_in.getDataNonConst(j);
ArrayRCP<const OutScalar> X_out_j = X_out.getData(j);
Expand All @@ -152,6 +157,7 @@ class MultiVectorLocalGatherScatter {
X_in_j[i_perm] = X_out_j[i];
}
}
X_out.modify_host();
}

void
Expand All @@ -164,6 +170,7 @@ class MultiVectorLocalGatherScatter {
using Teuchos::ArrayRCP;
const size_t numBlocks = X_out.getLocalLength() / blockSize;
const size_t numVecs = X_in.getNumVectors ();
Kokkos::fence();
for (size_t j = 0; j < numVecs; ++j) {
ArrayRCP<const InScalar> X_in_j = X_in.getData(j);
ArrayRCP<OutScalar> X_out_j = X_out.getDataNonConst(j);
Expand All @@ -174,6 +181,7 @@ class MultiVectorLocalGatherScatter {
}
}
}
X_out.modify_host();
}

/******************/
Expand All @@ -185,6 +193,7 @@ class MultiVectorLocalGatherScatter {
const Teuchos::ArrayView<const LO> perm) const
{
//note: j is col, i is row
Kokkos::fence(); // demonstrated via unit test failure
for(size_t j = 0; j < X_out.extent(1); ++j) {
for(size_t i = 0; i < X_out.extent(0); ++i) {
const LO i_perm = perm[i];
Expand All @@ -198,6 +207,7 @@ class MultiVectorLocalGatherScatter {
const OutView X_out,
const Teuchos::ArrayView<const LO> perm) const
{
Kokkos::fence();
for(size_t j = 0; j < X_out.extent(1); ++j) {
for(size_t i = 0; i < X_out.extent(0); ++i) {
const LO i_perm = perm[i];
Expand All @@ -213,6 +223,7 @@ class MultiVectorLocalGatherScatter {
LO blockSize) const
{
//note: j is col, i is row
Kokkos::fence();
size_t numBlocks = X_out.extent(0) / blockSize;
for(size_t j = 0; j < X_out.extent(1); ++j) {
for(size_t i = 0; i < numBlocks; ++i) {
Expand All @@ -231,6 +242,7 @@ class MultiVectorLocalGatherScatter {
LO blockSize) const
{
//note: j is col, i is row
Kokkos::fence();
size_t numBlocks = X_out.extent(0) / blockSize;
for(size_t j = 0; j < X_out.extent(1); ++j) {
for(size_t i = 0; i < numBlocks; ++i) {
Expand All @@ -251,6 +263,7 @@ class MultiVectorLocalGatherScatter {
const Teuchos::ArrayView<const LO> perm) const
{
//note: j is col, i is row
Kokkos::fence();
size_t numRows = X_out.getLocalLength();
for(size_t j = 0; j < X_out.getNumVectors(); ++j) {
Teuchos::ArrayRCP<OutScalar> X_out_j = X_out.getDataNonConst(j);
Expand All @@ -267,6 +280,7 @@ class MultiVectorLocalGatherScatter {
const Teuchos::ArrayView<const LO> perm) const
{
size_t numRows = X_out.getLocalLength();
Kokkos::fence();
for(size_t j = 0; j < X_in.extent(1); ++j) {
Teuchos::ArrayRCP<const OutScalar> X_out_j = X_out.getData(j);
for(size_t i = 0; i < numRows; ++i) {
Expand All @@ -284,6 +298,7 @@ class MultiVectorLocalGatherScatter {
{
//note: j is col, i is row
size_t numBlocks = X_out.getLocalLength() / blockSize;
Kokkos::fence();
for(size_t j = 0; j < X_out.getNumVectors(); ++j) {
Teuchos::ArrayRCP<OutScalar> X_out_j = X_out.getDataNonConst(j);
for(size_t i = 0; i < numBlocks; ++i) {
Expand All @@ -302,6 +317,7 @@ class MultiVectorLocalGatherScatter {
LO blockSize) const
{
size_t numBlocks = X_out.getLocalLength() / blockSize;
Kokkos::fence();
for(size_t j = 0; j < X_in.extent(1); ++j) {
Teuchos::ArrayRCP<const OutScalar> X_out_j = X_out.getData(j);
for(size_t i = 0; i < numBlocks; ++i) {
Expand Down
1 change: 1 addition & 0 deletions packages/ifpack2/src/Ifpack2_Experimental_RBILUK_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,7 @@ apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_t
D_block_->applyBlock(cBlock, rBlock);

// Solve U Y = R.
Kokkos::fence(); // UVM access
for (local_ordinal_type imv = 0; imv < numVectors; ++imv)
{
const local_ordinal_type numRows = D_block_->getNodeNumRows();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ struct BlockCrsMatrixMaker {
auto colsum = colsum_mv.getLocalViewHost();

// Get off-diag 1-norms.
Kokkos::fence(); // uvm access
for (LO r = 0; r < nrows; ++r) {
const auto rgid = row_map->getGlobalElement(r);
for (size_t j = rowptr(r); j < rowptr(r+1); ++j) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -707,6 +707,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestDiagonalBlockCrsMa

const Scalar exactSol = 0.2;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
Scalar* yb = ylcl.data();
Expand Down Expand Up @@ -1268,6 +1269,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestLowerTriangularBlo
exactSol[1] = -0.25;
exactSol[2] = 0.625;

yBlock.sync_host();
for (size_t k = 0; k < num_rows_per_proc; ++k) {
LO lcl_row = k;
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0);
Expand Down Expand Up @@ -1330,6 +1332,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2BlockRelaxation, TestUpperTriangularBlo
exactSol[1] = -0.25;
exactSol[2] = 0.5;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
auto yb = ylcl.data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, Test0, Scalar, LocalOrdinal,

Teuchos::ArrayRCP<Scalar> twos(num_rows_per_proc*2, 2);

y.sync_host();
TEST_COMPARE_FLOATING_ARRAYS(yview, twos(), Teuchos::ScalarTraits<Scalar>::eps());

prec.apply(x, y);
Expand All @@ -131,6 +132,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, Test0, Scalar, LocalOrdinal,

typename Teuchos::ScalarTraits<Scalar>::magnitudeType trial_tol = 1.e-13;
typename Teuchos::ScalarTraits<Scalar>::magnitudeType tol = std::max(trial_tol, Teuchos::ScalarTraits<Scalar>::eps());

y.sync_host();
TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol);

//If I now increase the degree of the polynomial to 4 the solve won't be
Expand All @@ -140,6 +143,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Chebyshev, Test0, Scalar, LocalOrdinal,
prec.apply(x, y);

tol = 1.e-4;

y.sync_host();
TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), tol);
}

Expand Down
6 changes: 6 additions & 0 deletions packages/ifpack2/test/unit_tests/Ifpack2_UnitTestRBILUK.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, LowerTriangularBlockCrsMatrix, Scalar,
exactSol[1] = -0.25;
exactSol[2] = 0.625;

yBlock.sync_host();
for (size_t k = 0; k < num_rows_per_proc; ++k) {
LO lcl_row = k;
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0);
Expand Down Expand Up @@ -260,6 +261,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, UpperTriangularBlockCrsMatrix, Scalar,
exactSol[1] = -0.25;
exactSol[2] = 0.5;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
Scalar* yb = ylcl.data();
Expand Down Expand Up @@ -314,6 +316,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, FullLocalBlockCrsMatrix, Scalar, Local
exactSol[1] = -4.0/21.0;
exactSol[2] = 2.0/7.0;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
Scalar* yb = ylcl.data();
Expand Down Expand Up @@ -381,6 +384,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, BandedBlockCrsMatrixWithDropping, Scal

prec_crs.apply(x, z);

y.sync_host();
z.sync_host();
Teuchos::ArrayRCP<const Scalar> zview = z.get1dView();
Teuchos::ArrayRCP<const Scalar> yview = y.get1dView();
for (int k = 0; k < num_rows_per_proc; ++k)
Expand Down Expand Up @@ -730,6 +735,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(RBILUK, DiagonalBlockCrsMatrix, Scalar, LocalO

const Scalar exactSol = 0.2;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
Scalar* yb = ylcl.data();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, Test0, Scalar, LocalOrdinal
prec.apply(x, y);
//y should be full of 0.5's now.
Teuchos::ArrayRCP<Scalar> halfs(num_rows_per_proc*2, 0.5);

y.sync_host();
TEST_COMPARE_FLOATING_ARRAYS(yview, halfs(), Teuchos::ScalarTraits<Scalar>::eps());
}

Expand Down Expand Up @@ -907,6 +909,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestDiagonalBlockCrsMatrix,
using mag_type = typename STS::magnitudeType;
const auto tol = mag_type(100.0) * STS::eps();

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
Scalar* yb = ylcl.data();
Expand Down Expand Up @@ -1023,6 +1026,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestLowerTriangularBlockCrs
exactSol[1] = -0.25;
exactSol[2] = 0.625;

yBlock.sync_host();
for (size_t k = 0; k < num_rows_per_proc; ++k) {
LO lcl_row = k;
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(lcl_row,0);
Expand Down Expand Up @@ -1075,6 +1079,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(Ifpack2Relaxation, TestUpperTriangularBlockCrs
exactSol[1] = -0.25;
exactSol[2] = 0.5;

yBlock.sync_host();
for (int k = 0; k < num_rows_per_proc; ++k) {
typename BMV::little_vec_type ylcl = yBlock.getLocalBlock(k,0);
auto yb = ylcl.data();
Expand Down
3 changes: 2 additions & 1 deletion packages/tpetra/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ TRIBITS_ADD_OPTION_AND_DEFINE(

ASSERT_DEFINED(Kokkos_ENABLE_CUDA)
ASSERT_DEFINED(Kokkos_ENABLE_CUDA_UVM)
IF (Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_UVM)
ASSERT_DEFINED(Tpetra_ENABLE_CUDA)
IF (Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_UVM AND Tpetra_ENABLE_CUDA)
MESSAGE (FATAL_ERROR "If CUDA is enabled in Kokkos, Tpetra requires that Kokkos' UVM support be enabled. You may do this by setting the CMake option Kokkos_ENABLE_CUDA_UVM:BOOL=ON (WARNING: IT IS CASE SENSITIVE!) and running CMake again.\n\nDetails for developers: UVM stands for \"Unified Virtual Memory\". It lets code running on the host processor access GPU memory. There is a difference between CUDA's support for UVM, and Kokkos' support for UVM. Versions of CUDA >= 6 have UVM support built in by default. Kokkos always supports this. In particular, Kokkos always has a memory space for UVM allocations, called Kokkos::CudaUVMSpace. \"Turning on UVM support in Kokkos\" means two things:\n\n1. Kokkos::Cuda::memory_space (the CUDA execution space's default memory space) is Kokkos::CudaUVMSpace, rather than Kokkos::CudaSpace.\n\n2. Kokkos::DualView<T, Kokkos::Cuda> only uses a single allocation, in the Kokkos::CudaUVMSpace memory space.")
ENDIF ()

Expand Down
2 changes: 2 additions & 0 deletions packages/tpetra/core/src/Tpetra_BlockCrsMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1111,6 +1111,8 @@ class GetLocalDiagCopy {
const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
const Scalar minus_omega = -omega;

Kokkos::fence();

if (numVecs == 1) {
for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
const LO actlRow = lclRow - 1;
Expand Down
16 changes: 16 additions & 0 deletions packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2332,6 +2332,22 @@ namespace Tpetra {
/// This comes from Tpetra::Details::Behavior::verbose("CrsGraph").
bool verbose_ = getVerbose();

private:
//! Track if we still might need to fence for the StaticGraph.
//! Set to true by set_need_sync_host_uvm_access() and reset to false by
//! execute_sync_host_uvm_access() once it has called Kokkos::fence().
//! Declared mutable so that the const execute method can reset it.
mutable bool need_sync_host_uvm_access = false;

//! Request fence before next host access.
void set_need_sync_host_uvm_access() {
need_sync_host_uvm_access = true;
}

//! Perform the pending fence, if one was requested, and clear the
//! request so that subsequent calls do not fence again.
void execute_sync_host_uvm_access() const {
  if(!need_sync_host_uvm_access) {
    return; // no fence has been requested since the last one
  }
  Kokkos::fence();
  // The flag is declared mutable, so it can be cleared from this const method.
  need_sync_host_uvm_access = false;
}
}; // class CrsGraph

/// \brief Nonmember function to create an empty CrsGraph given a
Expand Down
Loading