Skip to content

Commit

Permalink
Merge pull request #6617 from Tech-XCorp/fixCudaLaunchBlocking
Browse files Browse the repository at this point in the history
Tpetra: Resolve CUDA_LAUNCH_BLOCKING requirements for serial
  • Loading branch information
MicheldeMessieres authored Jan 27, 2020
2 parents 32c26ab + 6d8efe1 commit 94e8d84
Show file tree
Hide file tree
Showing 4 changed files with 26 additions and 0 deletions.
6 changes: 6 additions & 0 deletions packages/tpetra/core/ext/TpetraExt_MatrixMatrix_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2019,6 +2019,8 @@ void mult_A_B_reuse(
}
});

Kokkos::fence();

// Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
// Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
Expand Down Expand Up @@ -2323,6 +2325,8 @@ void jacobi_A_B_newmatrix(
}
});

Kokkos::fence();

// Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
// Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
Expand Down Expand Up @@ -2667,6 +2671,8 @@ void jacobi_A_B_reuse(
MM = Teuchos::null;
#endif

Kokkos::fence();

// Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
// Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
Expand Down
14 changes: 14 additions & 0 deletions packages/tpetra/core/src/Tpetra_transform_MultiVector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,16 @@ namespace Tpetra {
const LO lclNumRows = static_cast<LO> (input_lcl.extent (0));
using range_type = Kokkos::RangePolicy<ExecutionSpace, LO>;
range_type range (execSpace, 0, lclNumRows);

// Note that the Transform.cpp test does not currently pass
// with CUDA_LAUNCH_BLOCKING=0. This is because withLocalAccess does
// not take execSpace into account, so after requesting the transform
// with Host we don't get a sync and need_sync_host() is still true.
// That issue exists regardless of whether CUDA_LAUNCH_BLOCKING is set.
// Once withLocalAccess is fixed, we should have the proper fencing
// and the test will run correctly with CUDA_LAUNCH_BLOCKING=0 or =1.
// PR 6617 discusses these issues.

Kokkos::parallel_for (kernelLabel, range, g);
},
readOnly (input).on (memSpace),
Expand Down Expand Up @@ -357,6 +367,10 @@ namespace Tpetra {
const LO lclNumRows = static_cast<LO> (input_lcl.extent (0));
using range_type = Kokkos::RangePolicy<ExecutionSpace, LO>;
range_type range (execSpace, 0, lclNumRows);

// See the note above for why a pending fix to withLocalAccess is
// needed for the Transform.cpp test to pass with CUDA_LAUNCH_BLOCKING=0.

Kokkos::parallel_for (kernelLabel, range, g);
},
readOnly (input).on (memSpace),
Expand Down
4 changes: 4 additions & 0 deletions packages/tpetra/core/test/CrsMatrix/CrsMatrix_UnitTests4.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -846,6 +846,8 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
Tpetra::Details::inverseScaleBlockDiagonal(*diag2,false,*toScale2);
Tpetra::Details::inverseScaleBlockDiagonal(*diag4,false,*toScale4);

Kokkos::fence();

// Check norms
Array<Mag> norms2(1), norms4(1);
toScale2->norm1(norms2());
Expand Down Expand Up @@ -941,6 +943,8 @@ inline void tupleToArray(Array<T> &arr, const tuple &tup)
Tpetra::Details::inverseScaleBlockDiagonal(*diag2,true,*toScale2);
Tpetra::Details::inverseScaleBlockDiagonal(*diag4,true,*toScale4);

Kokkos::fence();

// Check norms
Array<Mag> norms2(1), norms4(1);
toScale2->norm1(norms2());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -701,6 +701,8 @@ mult_test_results multiply_reuse_test(
RCP<Matrix_t> diffMatrix =
Tpetra::createCrsMatrix<SC,LO,GO,NT>(C->getRowMap(),
computedC2->getGlobalMaxNumRowEntries());

Kokkos::fence();
Tpetra::MatrixMatrix::Add(*computedC1, false, -one, *computedC2, false, one, diffMatrix);
diffMatrix->fillComplete(C->getDomainMap(), C->getRangeMap());

Expand Down

0 comments on commit 94e8d84

Please sign in to comment.