From c39e4db16acd9116458ed81308f20184b3d82a0b Mon Sep 17 00:00:00 2001 From: Roger Pawlowski Date: Wed, 5 Dec 2018 07:13:41 -0700 Subject: [PATCH 1/3] Tpetra: fix race condition in transfers with AbsMax for CUDA_AWARE_MPI #3968 With M. Hoemmen --- ...r_Details_MultiVectorDistObjectKernels.hpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/packages/tpetra/core/src/kokkos_refactor/Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp b/packages/tpetra/core/src/kokkos_refactor/Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp index 78d6d25255ff..8427a8bde401 100644 --- a/packages/tpetra/core/src/kokkos_refactor/Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp +++ b/packages/tpetra/core/src/kokkos_refactor/Tpetra_KokkosRefactor_Details_MultiVectorDistObjectKernels.hpp @@ -51,6 +51,7 @@ #include "Kokkos_Core.hpp" #include "Kokkos_ArithTraits.hpp" +#include "impl/Kokkos_Atomic_Generic.hpp" #include #include @@ -736,23 +737,22 @@ outOfBounds (const IntegerType x, const IntegerType exclusiveUpperBound) }; #endif // KOKKOS_ENABLE_SERIAL + template + struct AbsMaxOper { + KOKKOS_FORCEINLINE_FUNCTION + static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) { + const auto val1_abs = Kokkos::Details::ArithTraits::abs(val1); + const auto val2_abs = Kokkos::Details::ArithTraits::abs(val2); + return val1_abs > val2_abs ? Scalar1(val1_abs) : Scalar1(val2_abs); + } + }; + template struct AbsMaxOp { - // ETP: Is this really what we want? This seems very odd if - // Scalar != SCT::mag_type (e.g., Scalar == std::complex) - // - // mfh: I didn't write this code, but note that we don't use T = - // Scalar here, we use T = ArithTraits::mag_type. That - // makes this code reasonable. - template - KOKKOS_INLINE_FUNCTION - T max(const T& a, const T& b) const { return a > b ? a : b; } - template KOKKOS_INLINE_FUNCTION void operator() (Scalar& dest, const Scalar& src) const { - typedef Kokkos::Details::ArithTraits SCT; - Kokkos::atomic_assign(&dest, Scalar(max(SCT::abs(dest),SCT::abs(src)))); + Kokkos::Impl::atomic_fetch_oper (AbsMaxOper(), &dest, src); } }; From 3ff933806c562c534b8687fbcc8f76b0b8529356 Mon Sep 17 00:00:00 2001 From: Roger Pawlowski Date: Wed, 5 Dec 2018 08:16:08 -0700 Subject: [PATCH 2/3] Panzer: cleanup for tpetra CUDA_AWARE_MPI While these are not necessary to fix #3968, this cleans up some old and probably not so safe code. Mainly switching to using dual view semantics instead of the get 1d and 2d arrays and adding fences. --- .../example/PoissonInterfaceExample/main.cpp | 53 ++++++++++--- .../dof-mgr/src/Panzer_DOFManager_impl.hpp | 79 ++++++++++++------- 2 files changed, 90 insertions(+), 42 deletions(-) diff --git a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/main.cpp b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/main.cpp index 1bf72a914026..86b90c913803 100644 --- a/packages/panzer/adapters-stk/example/PoissonInterfaceExample/main.cpp +++ b/packages/panzer/adapters-stk/example/PoissonInterfaceExample/main.cpp @@ -388,7 +388,8 @@ int main (int argc, char* argv[]) Kokkos::initialize(argc,argv); - { + int status = 0; + try { Teuchos::GlobalMPISession mpiSession(&argc, &argv); RCP > tComm = Teuchos::rcp(new Teuchos::MpiComm(MPI_COMM_WORLD)); @@ -604,11 +605,11 @@ int main (int argc, char* argv[]) if (has_interface_condition) checkInterfaceConnections(conn_manager, dofManager->getComm()); } - + // construct some linear algebra object, build object to pass to evaluators Teuchos::RCP > linObjFactory = Teuchos::rcp(new panzer::BlockedEpetraLinearObjFactory(tComm.getConst(),dofManager)); - + std::vector names; std::vector > eblocks; const int c_name_start = 3; @@ -642,7 +643,7 @@ int main (int argc, char* argv[]) errorResponseLibrary->addResponse(names[i] + " L2 Error", eblocks[i], builder); } } - + // setup closure model ///////////////////////////////////////////////////////////// @@ -671,7 +672,7 @@ int main (int argc, char* argv[]) } Teuchos::ParameterList user_data("User Data"); // user data can be empty here - + // setup field manager builder ///////////////////////////////////////////////////////////// @@ -697,7 +698,7 @@ int main (int argc, char* argv[]) user_data.set("Workset Size", workset_size); if (po.check_error) errorResponseLibrary->buildResponseEvaluators(physicsBlocks, cm_factory, closure_models, user_data); - + // assemble and solve ///////////////////////////////////////////////////////////// Teuchos::RCP ep_container; @@ -714,19 +715,20 @@ int main (int argc, char* argv[]) linObjFactory->initializeGhostedContainer(panzer::LinearObjContainer::X | panzer::LinearObjContainer::F | panzer::LinearObjContainer::Mat,*ghost_container); + linObjFactory->initializeContainer(panzer::LinearObjContainer::X | panzer::LinearObjContainer::F | panzer::LinearObjContainer::Mat,*container); ghost_container->initialize(); container->initialize(); - + panzer::AssemblyEngineInArgs input(ghost_container,container); input.alpha = 0; input.beta = 1; // evaluate physics: This does both the Jacobian and residual at once ae_tm.getAsObject()->evaluate(input); - + // solve linear system ///////////////////////////////////////////////////////////// // convert generic linear object container to epetra container @@ -745,7 +747,7 @@ int main (int argc, char* argv[]) // solve the linear system solver.Iterate(1000,1e-5); - + // we have now solved for the residual correction from // zero in the context of a Newton solve. // J*e = -r = -(f - J*0) where f = J*u @@ -755,13 +757,14 @@ int main (int argc, char* argv[]) } else { // Some analysis and an outer iteration if necessary. Teuchos::RCP J_fd; assembleAndSolve(ae_tm, linObjFactory, ep_container, ghost_container, po); + if (po.test_Jacobian) { const double nwre = testJacobian(ae_tm, linObjFactory, ep_container->get_x()); out << "TEST JACOBIAN " << nwre << "\n"; if (nwre < 0 || nwre > 1e-5) pass = false; } } - + // output data (optional) ///////////////////////////////////////////////////////////// @@ -771,7 +774,7 @@ int main (int argc, char* argv[]) EpetraExt::VectorToMatrixMarketFile("x_vec.mm",*ep_container->get_x()); EpetraExt::VectorToMatrixMarketFile("b_vec.mm",*ep_container->get_f()); } - + if (po.check_error) { std::vector > > rfs(names.size()); for (std::size_t i = po.nonlinear_Robin ? c_name_start : 0; i < names.size(); ++i) { @@ -815,13 +818,37 @@ int main (int argc, char* argv[]) panzer_stk::write_solution_data(*dofManager,*mesh,*ep_ghost_container->get_x()); mesh->writeToExodus("output.exo"); } - + // all done! ///////////////////////////////////////////////////////////// out << (pass ? "PASS" : "FAIL") << " BASICS\n"; } + catch (std::exception& e) { + std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl; + std::cout << e.what() << std::endl; + std::cout << "************ Caught Exception: End Error Report ************" << std::endl; + status = -1; + } + catch (std::string& msg) { + std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl; + std::cout << msg << std::endl; + std::cout << "************ Caught Exception: End Error Report ************" << std::endl; + status = -1; + } + catch (char* msg) { + std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl; + std::cout << *msg << std::endl; + std::cout << "************ Caught Exception: End Error Report ************" << std::endl; + status = -1; + } + catch (...) { + std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl; + std::cout << "Caught UNKOWN exception" << std::endl; + std::cout << "************ Caught Exception: End Error Report ************" << std::endl; + status = -1; + } - return 0; + return status; } void testInitialization(const Teuchos::RCP& ipb, std::vector& bcs, diff --git a/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp b/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp index 07e1a41c2590..fdd15c5c5cc9 100644 --- a/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp +++ b/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp @@ -124,10 +124,10 @@ class GreedyTieBreak : public Tpetra::Details::TieBreak > & pid_and_lid) const { // always choose index of pair with smallest pid - auto numLids = pid_and_lid.size(); - decltype(numLids) idx = 0; - auto minpid = pid_and_lid[0].first; - decltype(minpid) minidx = 0; + const std::size_t numLids = pid_and_lid.size(); + std::size_t idx = 0; + int minpid = pid_and_lid[0].first; + std::size_t minidx = 0; for (idx = 0; idx < numLids; ++idx) { if (pid_and_lid[idx].first < minpid) { minpid = pid_and_lid[idx].first; @@ -569,17 +569,22 @@ void DOFManager::buildGlobalUnknowns(const Teuchos::RCP nvals = non_overlap_mv->get1dView(); - Teuchos::ArrayRCP tagged_vals = tagged_non_overlap_mv->get1dView(); + non_overlap_mv->sync_host(); + tagged_non_overlap_mv->sync_host(); + PHX::Device::fence(); + auto nvals = non_overlap_mv->template getLocalView(); + auto tagged_vals = tagged_non_overlap_mv->template getLocalView(); TEUCHOS_ASSERT(nvals.size()==tagged_vals.size()); - for (int j = 0; j < nvals.size(); ++j) { - if (nvals[j] != -1) { - for(GO offset=0;offset::buildGlobalUnknowns_GUN(const Tpetra::MultiVector::buildGlobalUnknowns_GUN(const Tpetra::MultiVectortemplate getLocalView(); auto mv_size = values.extent(0); Kokkos::parallel_reduce(mv_size,panzer::dof_functors::SumRank2(values),localsum); + PHX::Device::fence(); } - + /* 11. Create a map using local sums to generate final GIDs. */ @@ -805,7 +810,7 @@ DOFManager::buildGlobalUnknowns_GUN(const Tpetra::MultiVector (*getComm(), Teuchos::REDUCE_SUM, static_cast (localsum), Teuchos::outArg (scanResult)); + Teuchos::scan (*getComm(), Teuchos::REDUCE_SUM, static_cast (localsum), Teuchos::outArg (scanResult)); myOffset = scanResult - localsum; } @@ -820,24 +825,25 @@ DOFManager::buildGlobalUnknowns_GUN(const Tpetra::MultiVector owned_ids = gid_map->getNodeElementList(); int which_id=0; - ArrayRCP > editnonoverlap = non_overlap_mv->get2dViewNonConst(); + auto editnonoverlap = non_overlap_mv->getLocalViewHost(); for(size_t i=0; igetLocalLength(); ++i){ for(int j=0; j(editnonoverlap[j][i]); - editnonoverlap[j][i]=myOffset+which_id; + int ndof = Teuchos::as(editnonoverlap(i,j)); + editnonoverlap(i,j)=myOffset+which_id; which_id+=ndof; } else{ - editnonoverlap[j][i]=-1; + editnonoverlap(i,j)=-1; } } } + non_overlap_mv->modify_host(); + non_overlap_mv->sync_device(); + PHX::Device::fence(); } // LINE 22: In the GUN paper. Were performed above, and the overlaped_mv is @@ -909,6 +915,7 @@ DOFManager::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess } RCP overlapmap = buildOverlapMapFromElements(ownedAccess); + PHX::Device::fence(); // LINE 22: In the GUN paper...the overlap_mv is reused for the tagged multivector. // This is a bit of a practical abuse of the algorithm presented in the paper. @@ -919,6 +926,7 @@ DOFManager::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess overlap_mv = Tpetra::createMultiVector(overlapmap,(size_t)numFields_); overlap_mv->putScalar(0); // if tpetra is not initialized with zeros + PHX::Device::fence(); } /* 5. Iterate through all local elements again, checking with the FP @@ -930,7 +938,10 @@ DOFManager::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess // temporary working vector to fill each row in tagged array std::vector working(overlap_mv->getNumVectors()); - ArrayRCP > edittwoview = overlap_mv->get2dViewNonConst(); + auto dual_view = overlap_mv->getDualView(); + dual_view.sync_host(); + PHX::Device::fence(); + auto edittwoview_host = dual_view.view_host(); for (size_t b = 0; b < blockOrder_.size(); ++b) { // there has to be a field pattern assocaited with the block if(fa_fps_[b]==Teuchos::null) @@ -956,13 +967,16 @@ DOFManager::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess offset++; } for(std::size_t i=0;i working[i]) ? current : working[i]; + auto current = edittwoview_host(lid,i); + edittwoview_host(lid,i) = (current > working[i]) ? current : working[i]; } } } } + overlap_mv->modify_host(); + overlap_mv->sync_device(); + PHX::Device::fence(); // // verbose output for inspecting overlap_mv // for(int i=0;igetLocalLength(); i++) { @@ -1284,6 +1298,10 @@ buildOverlapMapFromElements(const ElementBlockAccess & access) const } } + + + + Array overlapVector; for (typename std::set::const_iterator itr = overlapset.begin(); itr!=overlapset.end(); ++itr) { overlapVector.push_back(*itr); @@ -1304,7 +1322,10 @@ fillGIDsFromOverlappedMV(const ElementBlockAccess & access, using Teuchos::ArrayRCP; //To generate elementGIDs we need to go through all of the local elements. - ArrayRCP > twoview = overlap_mv.get2dView(); + auto dual_view = overlap_mv.getDualView(); + dual_view.sync_host(); + PHX::Device::fence(); + auto twoview_host = dual_view.view_host(); //And for each of the things in fa_fp.fieldIds we go to that column. To the the row, //we move from globalID to localID in the map and use our local value for something. @@ -1338,7 +1359,7 @@ fillGIDsFromOverlappedMV(const ElementBlockAccess & access, offset++; //Row will be lid. column will be whichField. //Shove onto local ordering - localOrdering.push_back(twoview[whichField][lid]+dofsPerField[whichField]); + localOrdering.push_back(twoview_host(lid,whichField)+dofsPerField[whichField]); dofsPerField[whichField]++; } From 5f91b56708f1e47dcdd277812a0d73c760f51f35 Mon Sep 17 00:00:00 2001 From: Roger Pawlowski Date: Wed, 5 Dec 2018 12:13:42 -0700 Subject: [PATCH 3/3] Panzer: dof manager cleanup --- .../dof-mgr/src/Panzer_DOFManager_impl.hpp | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp b/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp index fdd15c5c5cc9..a4afd03d00bb 100644 --- a/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp +++ b/packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp @@ -789,13 +789,9 @@ DOFManager::buildGlobalUnknowns_GUN(const Tpetra::MultiVector MV; - typedef typename MV::dual_view_type::t_dev KV; - typedef typename MV::dual_view_type::t_dev::memory_space DMS; - KV values = non_overlap_mv->template getLocalView(); + auto values = non_overlap_mv->getLocalViewDevice(); auto mv_size = values.extent(0); - Kokkos::parallel_reduce(mv_size,panzer::dof_functors::SumRank2(values),localsum); + Kokkos::parallel_reduce(mv_size,panzer::dof_functors::SumRank2(values),localsum); PHX::Device::fence(); } @@ -810,7 +806,7 @@ DOFManager::buildGlobalUnknowns_GUN(const Tpetra::MultiVector (*getComm(), Teuchos::REDUCE_SUM, static_cast (localsum), Teuchos::outArg (scanResult)); + Teuchos::scan (*getComm(), Teuchos::REDUCE_SUM, static_cast (localsum), Teuchos::outArg (scanResult)); myOffset = scanResult - localsum; } @@ -938,12 +934,11 @@ DOFManager::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess // temporary working vector to fill each row in tagged array std::vector working(overlap_mv->getNumVectors()); - auto dual_view = overlap_mv->getDualView(); - dual_view.sync_host(); + overlap_mv->sync_host(); PHX::Device::fence(); - auto edittwoview_host = dual_view.view_host(); + auto edittwoview_host = overlap_mv->getLocalViewHost(); for (size_t b = 0; b < blockOrder_.size(); ++b) { - // there has to be a field pattern assocaited with the block + // there has to be a field pattern associated with the block if(fa_fps_[b]==Teuchos::null) continue; @@ -1317,15 +1312,15 @@ void DOFManager:: fillGIDsFromOverlappedMV(const ElementBlockAccess & access, std::vector > & elementGIDs, const Tpetra::Map & overlapmap, - const Tpetra::MultiVector & overlap_mv) const + const Tpetra::MultiVector & const_overlap_mv) const { using Teuchos::ArrayRCP; //To generate elementGIDs we need to go through all of the local elements. - auto dual_view = overlap_mv.getDualView(); - dual_view.sync_host(); + auto overlap_mv = const_cast&>(const_overlap_mv); + overlap_mv.sync_host(); PHX::Device::fence(); - auto twoview_host = dual_view.view_host(); + const auto twoview_host = overlap_mv.getLocalViewHost(); //And for each of the things in fa_fp.fieldIds we go to that column. To the the row, //we move from globalID to localID in the map and use our local value for something.