Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tpetra: fix race condition in transfers with AbsMax for CUDA_AWARE_MPI #3999

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,8 @@ int main (int argc, char* argv[])

Kokkos::initialize(argc,argv);

{
int status = 0;
try {

Teuchos::GlobalMPISession mpiSession(&argc, &argv);
RCP<const Teuchos::MpiComm<int> > tComm = Teuchos::rcp(new Teuchos::MpiComm<int>(MPI_COMM_WORLD));
Expand Down Expand Up @@ -604,11 +605,11 @@ int main (int argc, char* argv[])
if (has_interface_condition)
checkInterfaceConnections(conn_manager, dofManager->getComm());
}

// construct some linear algebra object, build object to pass to evaluators
Teuchos::RCP<panzer::LinearObjFactory<panzer::Traits> > linObjFactory
= Teuchos::rcp(new panzer::BlockedEpetraLinearObjFactory<panzer::Traits,int>(tComm.getConst(),dofManager));

std::vector<std::string> names;
std::vector<std::vector<std::string> > eblocks;
const int c_name_start = 3;
Expand Down Expand Up @@ -642,7 +643,7 @@ int main (int argc, char* argv[])
errorResponseLibrary->addResponse(names[i] + " L2 Error", eblocks[i], builder);
}
}

// setup closure model
/////////////////////////////////////////////////////////////

Expand Down Expand Up @@ -671,7 +672,7 @@ int main (int argc, char* argv[])
}

Teuchos::ParameterList user_data("User Data"); // user data can be empty here

// setup field manager builder
/////////////////////////////////////////////////////////////

Expand All @@ -697,7 +698,7 @@ int main (int argc, char* argv[])
user_data.set<int>("Workset Size", workset_size);
if (po.check_error)
errorResponseLibrary->buildResponseEvaluators(physicsBlocks, cm_factory, closure_models, user_data);

// assemble and solve
/////////////////////////////////////////////////////////////
Teuchos::RCP<panzer::EpetraLinearObjContainer> ep_container;
Expand All @@ -714,19 +715,20 @@ int main (int argc, char* argv[])
linObjFactory->initializeGhostedContainer(panzer::LinearObjContainer::X |
panzer::LinearObjContainer::F |
panzer::LinearObjContainer::Mat,*ghost_container);

linObjFactory->initializeContainer(panzer::LinearObjContainer::X |
panzer::LinearObjContainer::F |
panzer::LinearObjContainer::Mat,*container);
ghost_container->initialize();
container->initialize();

panzer::AssemblyEngineInArgs input(ghost_container,container);
input.alpha = 0;
input.beta = 1;

// evaluate physics: This does both the Jacobian and residual at once
ae_tm.getAsObject<panzer::Traits::Jacobian>()->evaluate(input);

// solve linear system
/////////////////////////////////////////////////////////////
// convert generic linear object container to epetra container
Expand All @@ -745,7 +747,7 @@ int main (int argc, char* argv[])

// solve the linear system
solver.Iterate(1000,1e-5);

// we have now solved for the residual correction from
// zero in the context of a Newton solve.
// J*e = -r = -(f - J*0) where f = J*u
Expand All @@ -755,13 +757,14 @@ int main (int argc, char* argv[])
} else { // Some analysis and an outer iteration if necessary.
Teuchos::RCP<Epetra_CrsMatrix> J_fd;
assembleAndSolve(ae_tm, linObjFactory, ep_container, ghost_container, po);

if (po.test_Jacobian) {
const double nwre = testJacobian(ae_tm, linObjFactory, ep_container->get_x());
out << "TEST JACOBIAN " << nwre << "\n";
if (nwre < 0 || nwre > 1e-5) pass = false;
}
}

// output data (optional)
/////////////////////////////////////////////////////////////

Expand All @@ -771,7 +774,7 @@ int main (int argc, char* argv[])
EpetraExt::VectorToMatrixMarketFile("x_vec.mm",*ep_container->get_x());
EpetraExt::VectorToMatrixMarketFile("b_vec.mm",*ep_container->get_f());
}

if (po.check_error) {
std::vector<Teuchos::RCP<panzer::Response_Functional<panzer::Traits::Residual> > > rfs(names.size());
for (std::size_t i = po.nonlinear_Robin ? c_name_start : 0; i < names.size(); ++i) {
Expand Down Expand Up @@ -815,13 +818,37 @@ int main (int argc, char* argv[])
panzer_stk::write_solution_data(*dofManager,*mesh,*ep_ghost_container->get_x());
mesh->writeToExodus("output.exo");
}

// all done!
/////////////////////////////////////////////////////////////
out << (pass ? "PASS" : "FAIL") << " BASICS\n";
}
catch (std::exception& e) {
std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl;
std::cout << e.what() << std::endl;
std::cout << "************ Caught Exception: End Error Report ************" << std::endl;
status = -1;
}
catch (std::string& msg) {
std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl;
std::cout << msg << std::endl;
std::cout << "************ Caught Exception: End Error Report ************" << std::endl;
status = -1;
}
catch (char* msg) {
std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl;
std::cout << *msg << std::endl;
std::cout << "************ Caught Exception: End Error Report ************" << std::endl;
status = -1;
}
catch (...) {
std::cout << "*********** Caught Exception: Begin Error Report ***********" << std::endl;
std::cout << "Caught UNKOWN exception" << std::endl;
std::cout << "************ Caught Exception: End Error Report ************" << std::endl;
status = -1;
}

return 0;
return status;
}

void testInitialization(const Teuchos::RCP<Teuchos::ParameterList>& ipb, std::vector<panzer::BC>& bcs,
Expand Down
79 changes: 50 additions & 29 deletions packages/panzer/dof-mgr/src/Panzer_DOFManager_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -124,10 +124,10 @@ class GreedyTieBreak : public Tpetra::Details::TieBreak<LocalOrdinal,GlobalOrdin
const std::vector<std::pair<int,LocalOrdinal> > & pid_and_lid) const
{
// always choose index of pair with smallest pid
auto numLids = pid_and_lid.size();
decltype(numLids) idx = 0;
auto minpid = pid_and_lid[0].first;
decltype(minpid) minidx = 0;
const std::size_t numLids = pid_and_lid.size();
std::size_t idx = 0;
int minpid = pid_and_lid[0].first;
std::size_t minidx = 0;
for (idx = 0; idx < numLids; ++idx) {
if (pid_and_lid[idx].first < minpid) {
minpid = pid_and_lid[idx].first;
Expand Down Expand Up @@ -569,17 +569,22 @@ void DOFManager<LO,GO>::buildGlobalUnknowns(const Teuchos::RCP<const FieldPatter
HashTable isOwned, remainingOwned;

// owned_ is made up of owned_ids.: This doesn't work for high order
Teuchos::ArrayRCP<const GO> nvals = non_overlap_mv->get1dView();
Teuchos::ArrayRCP<const GO> tagged_vals = tagged_non_overlap_mv->get1dView();
non_overlap_mv->sync_host();
tagged_non_overlap_mv->sync_host();
PHX::Device::fence();
auto nvals = non_overlap_mv->template getLocalView<PHX::Device>();
auto tagged_vals = tagged_non_overlap_mv->template getLocalView<PHX::Device>();
TEUCHOS_ASSERT(nvals.size()==tagged_vals.size());
for (int j = 0; j < nvals.size(); ++j) {
if (nvals[j] != -1) {
for(GO offset=0;offset<tagged_vals[j];offset++)
isOwned.insert(nvals[j]+offset);
}
else {
// sanity check
TEUCHOS_ASSERT(tagged_vals[j]==0)
for (size_t i = 0; i < nvals.extent(1); ++i) {
for (size_t j = 0; j < nvals.extent(0); ++j) {
if (nvals(j,i) != -1) {
for(GO offset=0;offset<tagged_vals(j,i);++offset)
isOwned.insert(nvals(j,i)+offset);
}
else {
// sanity check
TEUCHOS_ASSERT(tagged_vals(j,i)==0);
}
}
}
remainingOwned = isOwned;
Expand Down Expand Up @@ -776,7 +781,6 @@ DOFManager<LO,GO>::buildGlobalUnknowns_GUN(const Tpetra::MultiVector<GO,LO,GO,pa
non_overlap_mv = rcp(new MultiVector(*tagged_non_overlap_mv,Teuchos::Copy));
}


/* 10. Compute the local sum using Kokkos.
*/

Expand All @@ -792,8 +796,9 @@ DOFManager<LO,GO>::buildGlobalUnknowns_GUN(const Tpetra::MultiVector<GO,LO,GO,pa
KV values = non_overlap_mv->template getLocalView<DMS>();
auto mv_size = values.extent(0);
Kokkos::parallel_reduce(mv_size,panzer::dof_functors::SumRank2<GO,KV>(values),localsum);
PHX::Device::fence();
}

/* 11. Create a map using local sums to generate final GIDs.
*/

Expand All @@ -805,7 +810,7 @@ DOFManager<LO,GO>::buildGlobalUnknowns_GUN(const Tpetra::MultiVector<GO,LO,GO,pa

// do a prefix sum
GO scanResult = 0;
Teuchos::scan<int, GO> (*getComm(), Teuchos::REDUCE_SUM, static_cast<size_t> (localsum), Teuchos::outArg (scanResult));
Teuchos::scan<int, GO> (*getComm(), Teuchos::REDUCE_SUM, static_cast<int> (localsum), Teuchos::outArg (scanResult));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is only helpful for removing warnings if GO is signed. You really want to cast localsum to GO since that's the type of the scan. It will get cast from int to GO here anyway.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks! will change

myOffset = scanResult - localsum;
}

Expand All @@ -820,24 +825,25 @@ DOFManager<LO,GO>::buildGlobalUnknowns_GUN(const Tpetra::MultiVector<GO,LO,GO,pa

{
PANZER_DOFMGR_FUNC_TIME_MONITOR("panzer::DOFManager::buildGlobalUnknowns_GUN::line_13-21 gid_assignment");

// ArrayView<const GO> owned_ids = gid_map->getNodeElementList();
int which_id=0;
ArrayRCP<ArrayRCP<GO> > editnonoverlap = non_overlap_mv->get2dViewNonConst();
auto editnonoverlap = non_overlap_mv->getLocalViewHost();
for(size_t i=0; i<non_overlap_mv->getLocalLength(); ++i){
for(int j=0; j<numFields_; ++j){
if(editnonoverlap[j][i]!=0){
if(editnonoverlap(i,j)!=0){
// editnonoverlap[j][i]=myOffset+which_id;
int ndof = Teuchos::as<int>(editnonoverlap[j][i]);
editnonoverlap[j][i]=myOffset+which_id;
int ndof = Teuchos::as<int>(editnonoverlap(i,j));
editnonoverlap(i,j)=myOffset+which_id;
which_id+=ndof;
}
else{
editnonoverlap[j][i]=-1;
editnonoverlap(i,j)=-1;
}

}
}
non_overlap_mv->modify_host();
non_overlap_mv->sync_device();
PHX::Device::fence();
}

// LINE 22: In the GUN paper. Were performed above, and the overlaped_mv is
Expand Down Expand Up @@ -909,6 +915,7 @@ DOFManager<LO,GO>::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess
}

RCP<const Map> overlapmap = buildOverlapMapFromElements(ownedAccess);
PHX::Device::fence();

// LINE 22: In the GUN paper...the overlap_mv is reused for the tagged multivector.
// This is a bit of a practical abuse of the algorithm presented in the paper.
Expand All @@ -919,6 +926,7 @@ DOFManager<LO,GO>::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess

overlap_mv = Tpetra::createMultiVector<GO>(overlapmap,(size_t)numFields_);
overlap_mv->putScalar(0); // if tpetra is not initialized with zeros
PHX::Device::fence();
}

/* 5. Iterate through all local elements again, checking with the FP
Expand All @@ -930,7 +938,10 @@ DOFManager<LO,GO>::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess

// temporary working vector to fill each row in tagged array
std::vector<int> working(overlap_mv->getNumVectors());
ArrayRCP<ArrayRCP<GO> > edittwoview = overlap_mv->get2dViewNonConst();
auto dual_view = overlap_mv->getDualView();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would prefer that you not call getDualView. Please call the functions that get the Kokkos::View of the appropriate memory space directly. This is fine for now, but I do plan on getting rid of getDualView at some point, since it exposes an implementation detail that constrains Tpetra to have allocations on both sides at all times (it can't do lazy allocation).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok. Will fix that as well.

dual_view.sync_host();
PHX::Device::fence();
auto edittwoview_host = dual_view.view_host();
for (size_t b = 0; b < blockOrder_.size(); ++b) {
// there has to be a field pattern assocaited with the block
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"associated"

if(fa_fps_[b]==Teuchos::null)
Expand All @@ -956,13 +967,16 @@ DOFManager<LO,GO>::buildTaggedMultiVector(const ElementBlockAccess & ownedAccess
offset++;
}
for(std::size_t i=0;i<working.size();i++) {
auto current = edittwoview[i][lid];
edittwoview[i][lid] = (current > working[i]) ? current : working[i];
auto current = edittwoview_host(lid,i);
edittwoview_host(lid,i) = (current > working[i]) ? current : working[i];
}

}
}
}
overlap_mv->modify_host();
overlap_mv->sync_device();
PHX::Device::fence();

// // verbose output for inspecting overlap_mv
// for(int i=0;i<overlap_mv->getLocalLength(); i++) {
Expand Down Expand Up @@ -1284,6 +1298,10 @@ buildOverlapMapFromElements(const ElementBlockAccess & access) const
}
}





Array<GO> overlapVector;
for (typename std::set<GO>::const_iterator itr = overlapset.begin(); itr!=overlapset.end(); ++itr) {
overlapVector.push_back(*itr);
Expand All @@ -1304,7 +1322,10 @@ fillGIDsFromOverlappedMV(const ElementBlockAccess & access,
using Teuchos::ArrayRCP;

//To generate elementGIDs we need to go through all of the local elements.
ArrayRCP<ArrayRCP<const GO> > twoview = overlap_mv.get2dView();
auto dual_view = overlap_mv.getDualView();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See above. Get Kokkos::View objects from the Tpetra::MultiVector directly; don't access the DualView.

dual_view.sync_host();
PHX::Device::fence();
auto twoview_host = dual_view.view_host();

//And for each of the things in fa_fp.fieldIds we go to that column. To the the row,
//we move from globalID to localID in the map and use our local value for something.
Expand Down Expand Up @@ -1338,7 +1359,7 @@ fillGIDsFromOverlappedMV(const ElementBlockAccess & access,
offset++;
//Row will be lid. column will be whichField.
//Shove onto local ordering
localOrdering.push_back(twoview[whichField][lid]+dofsPerField[whichField]);
localOrdering.push_back(twoview_host(lid,whichField)+dofsPerField[whichField]);

dofsPerField[whichField]++;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@

#include "Kokkos_Core.hpp"
#include "Kokkos_ArithTraits.hpp"
#include "impl/Kokkos_Atomic_Generic.hpp"
#include <sstream>
#include <stdexcept>

Expand Down Expand Up @@ -736,23 +737,22 @@ outOfBounds (const IntegerType x, const IntegerType exclusiveUpperBound)
};
#endif // KOKKOS_ENABLE_SERIAL

template<class Scalar1, class Scalar2>
struct AbsMaxOper {
KOKKOS_FORCEINLINE_FUNCTION
static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
const auto val1_abs = Kokkos::Details::ArithTraits<Scalar1>::abs(val1);
const auto val2_abs = Kokkos::Details::ArithTraits<Scalar2>::abs(val2);
return val1_abs > val2_abs ? Scalar1(val1_abs) : Scalar1(val2_abs);
}
};

template<class ExecutionSpace>
struct AbsMaxOp {
// ETP: Is this really what we want? This seems very odd if
// Scalar != SCT::mag_type (e.g., Scalar == std::complex<T>)
//
// mfh: I didn't write this code, but note that we don't use T =
// Scalar here, we use T = ArithTraits<Scalar>::mag_type. That
// makes this code reasonable.
template <typename T>
KOKKOS_INLINE_FUNCTION
T max(const T& a, const T& b) const { return a > b ? a : b; }

template <typename Scalar>
KOKKOS_INLINE_FUNCTION
void operator() (Scalar& dest, const Scalar& src) const {
typedef Kokkos::Details::ArithTraits<Scalar> SCT;
Kokkos::atomic_assign(&dest, Scalar(max(SCT::abs(dest),SCT::abs(src))));
Kokkos::Impl::atomic_fetch_oper (AbsMaxOper<Scalar,Scalar>(), &dest, src);
}
};

Expand Down