Skip to content

Commit

Permalink
Merge 'trilinos/Trilinos:develop' (5207574) into 'tcad-charon/Trilino…
Browse files Browse the repository at this point in the history
…s:develop' (ce7572e).

* trilinos-develop:
  MueLu: Create cuBLAS and cuSPARSE handles up front
  MueLu: regionMG add missing Node templates
  Tpetra residual: Add options for skipping of local copy and comm/comp overlap
  Tpetra CrsGraph: Add method "getLocalOffRankOffsets"
  Ifpack2 OverlappingRowMatrix unit test: use launch params frm KK SpMV
  Tpetra residual: Use same launch parameters as Kokkos SpMV
  Tpetra residual: Skip copyAndPermute for locally fitted domain and column map
  Tpetra Transfer: Add isLocallyFitted
  • Loading branch information
Charonops Jenkins Pipeline committed Jun 29, 2021
2 parents ce7572e + 5207574 commit 07e60f0
Show file tree
Hide file tree
Showing 18 changed files with 1,032 additions and 167 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@
#endif

#include "Tpetra_Details_residual.hpp"
#include "KokkosSparse_spmv_impl.hpp"

#include <Ifpack2_UnitTestHelpers.hpp>
#include <Ifpack2_OverlappingRowMatrix.hpp>
Expand Down Expand Up @@ -133,8 +134,7 @@ void localReducedMatvec(const MatrixClass & A_lcl,
int64_t numLocalRows = userNumRows;
int64_t myNnz = A_lcl.nnz();

int64_t rows_per_team =
Tpetra::Details::residual_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters<execution_space>(numLocalRows, myNnz, rows_per_thread, team_size, vector_length);
int64_t worksets = (X_lcl.extent (0) + rows_per_team - 1) / rows_per_team;

using policy_type = typename Kokkos::TeamPolicy<execution_space>;
Expand Down
2 changes: 2 additions & 0 deletions packages/muelu/cmake/MueLu_config.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@

#cmakedefine HAVE_MUELU_KOKKOSCORE

#cmakedefine HAVE_MUELU_KOKKOSKERNELS

#cmakedefine HAVE_MUELU_ML

#cmakedefine HAVE_MUELU_PAMGEN
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -715,7 +715,7 @@ void MakeInterfaceScalingFactors(const int numLevels,
RCP<const Map> regRowMap = regMat->getRowMap();
RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporters = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
// initialize region vector with all ones.
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = VectorFactory::Build(regRowMap);
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > regInterfaceScalings = VectorFactory::Build(regRowMap);
regInterfaceScalings->putScalar(SC_ONE);

// transform to composite layout while adding interface values via the Export() combine mode
Expand All @@ -730,7 +730,7 @@ void MakeInterfaceScalingFactors(const int numLevels,
regInterfaceScalings,
regRowMap, regRowImporters);

level->Set<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings", regInterfaceScalings);
level->Set<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > >("regInterfaceScalings", regInterfaceScalings);
}
} // MakeInterfaceScalingFactors

Expand Down Expand Up @@ -829,7 +829,7 @@ void createRegionHierarchy(const int numDimensions,
RCP<Matrix> regMatrix = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
RCP<const Map> regRowMap = regMatrix->getRowMap();
RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > >("regInterfaceScalings");

smootherParams[levelIdx]->set("smoother: level", levelIdx);
smootherSetup(smootherParams[levelIdx], regRowMap,
Expand Down Expand Up @@ -953,7 +953,7 @@ void vCycle(const int l, ///< ID of current level
RCP<Matrix> regMatrix = level->Get<RCP<Matrix> >("A", MueLu::NoFactory::get());
RCP<const Map> regRowMap = regMatrix->getRowMap();
RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > regRowImporter = level->Get<RCP<Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > >("rowImport");
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > >("regInterfaceScalings");

int cycleCount = 1;
if(cycleType == "W" && l > 0) // W cycle and not on finest level
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -830,7 +830,7 @@ int main_(Teuchos::CommandLineProcessor &clp, Xpetra::UnderlyingLib& lib, int ar
regCorrect->putScalar(SC_ZERO);
// Get Stuff out of Hierarchy
RCP<MueLu::Level> level = regHierarchy->GetLevel(0);
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal> > >("regInterfaceScalings");
RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > regInterfaceScalings = level->Get<RCP<Xpetra::Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> > >("regInterfaceScalings");
// check for convergence
{
////////////////////////////////////////////////////////////////////////
Expand Down
18 changes: 18 additions & 0 deletions packages/muelu/test/unit_tests/MueLu_Test_ETI.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,11 @@
#include <TpetraCore_config.h>
#endif

#ifdef HAVE_MUELU_KOKKOSKERNELS
#include <KokkosKernels_config.h>
#include <KokkosKernels_Controls.hpp>
#endif

#ifndef MUELU_AUTOMATIC_TEST_ETI_NAME
#error "The macro MUELU_AUTOMATIC_TEST_ETI_NAME was not defined"
#endif
Expand Down Expand Up @@ -95,6 +100,19 @@ bool Automatic_Test_ETI(int argc, char *argv[]) {
Kokkos::initialize(argc, argv);
#endif

// Create handles for cuBLAS and cuSPARSE. Otherwise they get
// created on the first call to these libraries, and that can mess
// up timings.
#ifdef HAVE_MUELU_KOKKOSKERNELS
KokkosKernels::Experimental::Controls controls;
# ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
controls.getCublasHandle();
# endif
# ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
controls.getCusparseHandle();
# endif
Kokkos::fence();
#endif
bool success = true;
bool verbose = true;
try {
Expand Down
8 changes: 4 additions & 4 deletions packages/tpetra/core/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ FUNCTION(TPETRA_PROCESS_ALL_LGN_TEMPLATES OUTPUT_FILES TEMPLATE_FILE
FOREACH(LO ${LOCALORDINAL_TYPES})
TPETRA_MANGLE_TEMPLATE_PARAMETER(LO_MANGLED "${LO}")
TPETRA_SLG_MACRO_NAME(LO_MACRO_NAME "${LO}")

TPETRA_PROCESS_ONE_LGN_TEMPLATE(OUT_FILE "${TEMPLATE_FILE}"
"${CLASS_NAME}" "${CLASS_MACRO_NAME}"
"${LO_MANGLED}" "${GO_MANGLED}" "${NT_MANGLED}"
Expand Down Expand Up @@ -619,7 +619,7 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION)
"${TpetraCore_ETI_NODES}"
TRUE)
LIST(APPEND SOURCES ${LOCALDEEPCOPYROWMATRIX_OUTPUT_FILES})

# Generate ETI .cpp files for the RowMatrix -> CrsMatrix overload of
# Tpetra::createDeepCopy. Do this only for non-integer Scalar
# types, since we really only need this function for linear solvers.
Expand All @@ -634,7 +634,7 @@ IF (${PACKAGE_NAME}_ENABLE_EXPLICIT_INSTANTIATION)
FALSE)
LIST(APPEND SOURCES ${CREATEDEEPCOPY_CRSMATRIX_OUTPUT_FILES})
ENDIF ()

# Generate ETI .cpp files for Tpetra::LocalCrsMatrixOperator.
TPETRA_PROCESS_ALL_SN_TEMPLATES(LOCALCRSMATRIXOPERATOR_OUTPUT_FILES
"Tpetra_ETI_SC_NT.tmpl" "LocalCrsMatrixOperator"
Expand Down Expand Up @@ -777,5 +777,5 @@ SET_PROPERTY(
# / from this directory, or to / from the 'impl' subdirectory. That ensures
# that running "make" will also rerun CMake in order to regenerate Makefiles.
#
# Here's another change, another, and another.
# Here's another change, another, and another and yet another.
#
18 changes: 18 additions & 0 deletions packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,9 @@ namespace Tpetra {
using nonconst_global_inds_host_view_type =
typename row_graph_type::nonconst_global_inds_host_view_type;

using offset_device_view_type =
typename row_ptrs_device_view_type::non_const_type;


//KDDKDD INROW using local_inds_host_view_type =
//KDDKDD INROW typename local_inds_dualv_type::t_host::const_type;
Expand Down Expand Up @@ -1387,6 +1390,10 @@ namespace Tpetra {
void
getLocalDiagOffsets (const Kokkos::View<size_t*, device_type, Kokkos::MemoryUnmanaged>& offsets) const;

/// \brief Get offsets of the off-rank entries in the graph.
void
getLocalOffRankOffsets (offset_device_view_type& offsets) const;

/// \brief Backwards compatibility overload of the above method.
///
/// This method takes a Teuchos::ArrayRCP instead of a
Expand Down Expand Up @@ -2064,6 +2071,8 @@ namespace Tpetra {
/// </ul>
void computeGlobalConstants ();

/// \brief Return the current value of the haveLocalOffRankOffsets_ flag.
bool haveLocalOffRankOffsets() const { return haveLocalOffRankOffsets_;}

protected:
/// \brief Compute local constants, if they have not yet been computed.
///
Expand Down Expand Up @@ -2410,6 +2419,13 @@ namespace Tpetra {
/// This may also exist with 1-D storage, if storage is unpacked.
num_row_entries_type k_numRowEntries_;

/// \brief The offsets for off-rank entries.
///
/// When off-rank entries are sorted last, this rowPtr-like view
/// contains the offsets. It is computed on the first call to
/// getLocalOffRankOffsets().
mutable offset_device_view_type k_offRankOffsets_;

//@}

/// \brief Status of the graph's storage, when not in a
Expand Down Expand Up @@ -2438,6 +2454,8 @@ namespace Tpetra {
bool haveLocalConstants_ = false;
//! Whether all processes have computed global constants.
bool haveGlobalConstants_ = false;
//! Whether k_offRankOffsets_ has been computed by getLocalOffRankOffsets().
mutable bool haveLocalOffRankOffsets_ = false;

typedef typename std::map<global_ordinal_type, std::vector<global_ordinal_type> > nonlocals_type;

Expand Down
58 changes: 58 additions & 0 deletions packages/tpetra/core/src/Tpetra_CrsGraph_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
#include "Tpetra_Details_copyOffsets.hpp"
#include "Tpetra_Details_gathervPrint.hpp"
#include "Tpetra_Details_getGraphDiagOffsets.hpp"
#include "Tpetra_Details_getGraphOffRankOffsets.hpp"
#include "Tpetra_Details_makeColMap.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "Tpetra_Details_getEntryOnHost.hpp"
Expand Down Expand Up @@ -6698,6 +6699,60 @@ namespace Tpetra {
} // debug_
}

/// \brief Get offsets of the off-rank entries in the graph.
///
/// The result is cached in k_offRankOffsets_. If the cache is flagged
/// as valid and has the expected extent (local number of rows plus
/// one), \c offsets is simply set to the cached view. Otherwise the
/// view is (re)allocated, filled by
/// ::Tpetra::Details::getGraphOffRankOffsets, and then handed out.
///
/// \param offsets [out] On successful return, refers to the cached
///   view k_offRankOffsets_. It is left untouched if this method
///   throws.
///
/// \throws std::runtime_error if the graph has no column Map.
/// \throws std::logic_error if the graph is not fill complete. In
///   that case there is nothing valid to return; handing out an
///   allocated-but-never-filled view would silently give the caller
///   uninitialized offsets.
template <class LocalOrdinal, class GlobalOrdinal, class Node>
void
CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::
getLocalOffRankOffsets (offset_device_view_type& offsets) const
{
  using std::endl;
  // Used by the TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC macro below.
  const char tfecfFuncName[] = "getLocalOffRankOffsets: ";
  const bool verbose = verbose_;

  std::unique_ptr<std::string> prefix;
  if (verbose) {
    prefix = this->createPrefix("CrsGraph", "getLocalOffRankOffsets");
    std::ostringstream os;
    os << *prefix << "offsets.extent(0)=" << offsets.extent(0)
       << endl;
    std::cerr << os.str();
  }

  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (! hasColMap (), std::runtime_error, "The graph must have a column Map.");
  // Instead of throwing, we could also copy the rowPtr to k_offRankOffsets_.

  const size_t lclNumRows = this->getNodeNumRows ();

  // Fast path: reuse the cached offsets if they are flagged valid and
  // still have the expected size.
  if (haveLocalOffRankOffsets_ && k_offRankOffsets_.extent(0) == lclNumRows+1) {
    offsets = k_offRankOffsets_;
    return;
  }
  haveLocalOffRankOffsets_ = false;

  // Validate *before* touching the cache or dereferencing the domain
  // Map, so that a failure leaves both k_offRankOffsets_ and the
  // caller's view unchanged instead of pointing at uninitialized
  // memory.
  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
    (! isFillComplete (), std::logic_error, "The graph must be fill "
     "complete in order to compute its off-rank offsets.");
  TEUCHOS_ASSERT(this->isSorted ());

  const map_type& colMap = * (this->getColMap ());
  const map_type& domMap = * (this->getDomainMap ());

  // mfh 12 Mar 2016: LocalMap works on (CUDA) device.  It has just
  // the subset of Map functionality that we need below.
  auto lclColMap = colMap.getLocalMap ();
  auto lclDomMap = domMap.getLocalMap ();

  // FIXME (mfh 16 Dec 2015) It's easy to thread-parallelize this
  // setup, at least on the host.  For CUDA, we have to use LocalMap
  // (that comes from each of the two Maps).

  // Allocate without initializing; the helper below is expected to
  // write every entry (TODO confirm against getGraphOffRankOffsets).
  k_offRankOffsets_ = offset_device_view_type(Kokkos::ViewAllocateWithoutInitializing("offRankOffset"), lclNumRows+1);
  auto lclGraph = this->getLocalGraph ();
  ::Tpetra::Details::getGraphOffRankOffsets (k_offRankOffsets_,
                                             lclColMap, lclDomMap,
                                             lclGraph);
  // Publish the result only once it has actually been computed.
  offsets = k_offRankOffsets_;
  haveLocalOffRankOffsets_ = true;
}

namespace { // (anonymous)

// mfh 21 Jan 2016: This is useful for getLocalDiagOffsets (see
Expand Down Expand Up @@ -7548,6 +7603,7 @@ namespace Tpetra {

std::swap(graph.rowPtrsUnpacked_dev_, this->rowPtrsUnpacked_dev_);
std::swap(graph.rowPtrsUnpacked_host_, this->rowPtrsUnpacked_host_);
std::swap(graph.k_offRankOffsets_, this->k_offRankOffsets_);

std::swap(graph.lclIndsUnpacked_wdv, this->lclIndsUnpacked_wdv);
std::swap(graph.gblInds_wdv, this->gblInds_wdv);
Expand All @@ -7563,6 +7619,7 @@ namespace Tpetra {
std::swap(graph.noRedundancies_, this->noRedundancies_);
std::swap(graph.haveLocalConstants_, this->haveLocalConstants_);
std::swap(graph.haveGlobalConstants_, this->haveGlobalConstants_);
std::swap(graph.haveLocalOffRankOffsets_, this->haveLocalOffRankOffsets_);

std::swap(graph.sortGhostsAssociatedWithEachProcessor_, this->sortGhostsAssociatedWithEachProcessor_);

Expand Down Expand Up @@ -7625,6 +7682,7 @@ namespace Tpetra {
output = this->noRedundancies_ == graph.noRedundancies_ ? output : false;
output = this->haveLocalConstants_ == graph.haveLocalConstants_ ? output : false;
output = this->haveGlobalConstants_ == graph.haveGlobalConstants_ ? output : false;
output = this->haveLocalOffRankOffsets_ == graph.haveLocalOffRankOffsets_ ? output : false;
// Compare against the other graph's flag (comparing the member with itself is always true).
output = this->sortGhostsAssociatedWithEachProcessor_ == graph.sortGhostsAssociatedWithEachProcessor_ ? output : false;

// Compare nonlocals_ -- std::map<GlobalOrdinal, std::vector<GlobalOrdinal> >
Expand Down
23 changes: 23 additions & 0 deletions packages/tpetra/core/src/Tpetra_Details_Behavior.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,29 @@ bool Behavior::hierarchicalUnpack ()
defaultValue);
}

// Report whether copyAndPermute should be skipped when possible.
// The setting comes from the TPETRA_SKIP_COPY_AND_PERMUTE environment
// variable and falls back to "false" as the default value.
bool Behavior::skipCopyAndPermuteIfPossible ()
{
  constexpr bool fallback = false;
  constexpr char variableName[] = "TPETRA_SKIP_COPY_AND_PERMUTE";

  // Function-local state handed to the helper; presumably it uses
  // these to remember the parsed value across calls (confirm against
  // idempotentlyGetEnvironmentVariableAsBool).
  static bool cachedSetting = fallback;
  static bool cacheFilled = false;

  const bool setting =
    idempotentlyGetEnvironmentVariableAsBool (cachedSetting, cacheFilled,
                                              variableName, fallback);
  return setting;
}

// Report whether communication and computation should be overlapped.
// The setting comes from the TPETRA_OVERLAP environment variable and
// falls back to "false" as the default value.
bool Behavior::overlapCommunicationAndComputation ()
{
  constexpr bool fallback = false;
  constexpr char variableName[] = "TPETRA_OVERLAP";

  // Function-local state handed to the helper; presumably it uses
  // these to remember the parsed value across calls (confirm against
  // idempotentlyGetEnvironmentVariableAsBool).
  static bool cachedSetting = fallback;
  static bool cacheFilled = false;

  const bool setting =
    idempotentlyGetEnvironmentVariableAsBool (cachedSetting, cacheFilled,
                                              variableName, fallback);
  return setting;
}


} // namespace Details
} // namespace Tpetra

12 changes: 12 additions & 0 deletions packages/tpetra/core/src/Tpetra_Details_Behavior.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,18 @@ class Behavior {
/// environment variable.
static bool profilingRegionUseKokkosProfiling();

/// \brief Skip copyAndPermute if possible
///
/// This is disabled by default. You may control this at run time via the
/// <tt>TPETRA_SKIP_COPY_AND_PERMUTE</tt> environment variable.
static bool skipCopyAndPermuteIfPossible();

/// \brief Overlap communication and computation.
///
/// This is disabled by default. You may control this at run time via the
/// <tt>TPETRA_OVERLAP</tt> environment variable.
static bool overlapCommunicationAndComputation();


};

Expand Down
8 changes: 8 additions & 0 deletions packages/tpetra/core/src/Tpetra_Details_Transfer_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,14 @@ class Transfer : public Teuchos::Describable {
void expertSetExportLIDsContiguous<LO,GO,NT>(Transfer<LO, GO, NT> transfer, bool contig);
#endif // DOXYGEN_SHOULD_SKIP_THIS

/// \brief Are source and target map locally fitted?
///
/// Returns whether source and target map are locally fitted on the
/// calling rank. This can be more efficient than calling
/// isLocallyFitted() on the maps directly, since no indices need to
/// be compared.
bool isLocallyFitted () const;

/// \brief Describe this object in a human-readable way to the given
/// output stream.
///
Expand Down
8 changes: 8 additions & 0 deletions packages/tpetra/core/src/Tpetra_Details_Transfer_def.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,14 @@ isLocallyComplete () const {
return TransferData_->isLocallyComplete_;
}

// Returns true when the number of "same" IDs equals the smaller of
// the source Map's and target Map's local element counts; no
// individual indices are compared here.
template <class LO, class GO, class NT>
bool
Transfer<LO, GO, NT>::
isLocallyFitted () const {
  const auto numSourceLocal = getSourceMap()->getNodeNumElements();
  const auto numTargetLocal = getTargetMap()->getNodeNumElements();
  const auto smallerLocalCount = std::min(numSourceLocal, numTargetLocal);
  return getNumSameIDs() == smallerLocalCount;
}

template <class LO, class GO, class NT>
void
Transfer<LO, GO, NT>::
Expand Down
Loading

0 comments on commit 07e60f0

Please sign in to comment.