From ce2780fc1432dc1f1ef67186820d6f5dc49f8dea Mon Sep 17 00:00:00 2001 From: Tim Fuller Date: Tue, 20 Mar 2018 12:08:58 -0600 Subject: [PATCH] Tpetra: Implementation of CrsGraph::transferAndFillComplete (and friends) CrsGraph::transferAndFillComplete is new, independent code. However, there were some function name clashes in packCrsMatrix and packCrsGraph. So, I moved implementation details of each to a new namespace so that they could have consistent naming. The same goes for unpackAndCombineCrsMatrix and unpackAndCombineCrsGraph. Addresses: #2267 --- packages/tpetra/core/src/CMakeLists.txt | 2 - .../tpetra/core/src/Tpetra_CrsGraph_decl.hpp | 436 ++++++ .../tpetra/core/src/Tpetra_CrsGraph_def.hpp | 781 +++++++++- .../core/src/Tpetra_Details_packCrsGraph.cpp | 67 + .../src/Tpetra_Details_packCrsGraph_decl.hpp | 220 +++ .../src/Tpetra_Details_packCrsGraph_def.hpp | 999 +++++++++++++ .../src/Tpetra_Details_packCrsMatrix_def.hpp | 60 +- ...petra_Details_unpackCrsGraphAndCombine.cpp | 67 + ..._Details_unpackCrsGraphAndCombine_decl.hpp | 262 ++++ ...a_Details_unpackCrsGraphAndCombine_def.hpp | 1292 +++++++++++++++++ ..._Details_unpackCrsMatrixAndCombine_def.hpp | 97 +- .../tpetra/core/test/CrsGraph/CMakeLists.txt | 20 + .../test/CrsGraph/CrsGraph_PackUnpack.cpp | 419 ++++++ .../ImportExport2/ImportExport2_UnitTests.cpp | 199 ++- 14 files changed, 4833 insertions(+), 88 deletions(-) create mode 100644 packages/tpetra/core/src/Tpetra_Details_packCrsGraph.cpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_packCrsGraph_decl.hpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine.cpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp create mode 100644 packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp create mode 100644 packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp diff --git 
a/packages/tpetra/core/src/CMakeLists.txt b/packages/tpetra/core/src/CMakeLists.txt index 9c3519ca50ed..7236771c4441 100644 --- a/packages/tpetra/core/src/CMakeLists.txt +++ b/packages/tpetra/core/src/CMakeLists.txt @@ -386,5 +386,3 @@ SET_PROPERTY( # subdirectory. That ensures that running "make" will also rerun # CMake in order to regenerate Makefiles. # -# Here is another such change. -# diff --git a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp index 15eb902e77d3..b1d320c5a2c5 100644 --- a/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp +++ b/packages/tpetra/core/src/Tpetra_CrsGraph_decl.hpp @@ -1368,6 +1368,196 @@ namespace Tpetra { } }; + private: + // Friend declaration for nonmember function. + template + friend Teuchos::RCP + importAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Import& importer, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params); + + // Friend declaration for nonmember function. + template + friend Teuchos::RCP + importAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Import& rowImporter, + const Import& domainImporter, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params); + + + // Friend declaration for nonmember function. + template + friend Teuchos::RCP + exportAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Export& exporter, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params); + + // Friend declaration for nonmember function. 
+ template + friend Teuchos::RCP + exportAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Export& rowExporter, + const Export& domainExporter, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params); + + public: + /// \brief Import from this to the given destination + /// graph, and make the result fill complete. + /// + /// If destGraph.is_null(), this creates a new graph as the + /// destination. (This is why destGraph is passed in by nonconst + /// reference to RCP.) Otherwise it checks for "pristine" status + /// and throws if that is not the case. "Pristine" means that the + /// graph has no entries and is not fill complete. + /// + /// Use of the "non-member constructor" version of this method, + /// importAndFillCompleteCrsGraph, is preferred for user + /// applications. + /// + /// \warning This method is intended for expert developer use + /// only, and should never be called by user code. + void + importAndFillComplete (Teuchos::RCP >& destGraph, + const import_type& importer, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params = Teuchos::null) const; + + /// \brief Import from this to the given destination + /// graph, and make the result fill complete. + /// + /// If destGraph.is_null(), this creates a new graph as the + /// destination. (This is why destGraph is passed in by nonconst + /// reference to RCP.) Otherwise it checks for "pristine" status + /// and throws if that is not the case. "Pristine" means that the + /// graph has no entries and is not fill complete. + /// + /// Use of the "non-member constructor" version of this method, + /// importAndFillCompleteCrsGraph, is preferred for user + /// applications. + /// + /// \warning This method is intended for expert developer use + /// only, and should never be called by user code. 
+ void + importAndFillComplete (Teuchos::RCP >& destGraph, + const import_type& rowImporter, + const import_type& domainImporter, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const; + + + /// \brief Export from this to the given destination + /// graph, and make the result fill complete. + /// + /// If destGraph.is_null(), this creates a new graph as the + /// destination. (This is why destGraph is passed in by nonconst + /// reference to RCP.) Otherwise it checks for "pristine" status + /// and throws if that is not the case. "Pristine" means that the + /// graph has no entries and is not fill complete. + /// + /// Use of the "non-member constructor" version of this method, + /// exportAndFillCompleteCrsGraph, is preferred for user + /// applications. + /// + /// \warning This method is intended for expert developer use + /// only, and should never be called by user code. + void + exportAndFillComplete (Teuchos::RCP >& destGraph, + const export_type& exporter, + const Teuchos::RCP& domainMap = Teuchos::null, + const Teuchos::RCP& rangeMap = Teuchos::null, + const Teuchos::RCP& params = Teuchos::null) const; + + /// \brief Export from this to the given destination + /// graph, and make the result fill complete. + /// + /// If destGraph.is_null(), this creates a new graph as the + /// destination. (This is why destGraph is passed in by nonconst + /// reference to RCP.) Otherwise it checks for "pristine" status + /// and throws if that is not the case. "Pristine" means that the + /// graph has no entries and is not fill complete. + /// + /// Use of the "non-member constructor" version of this method, + /// exportAndFillCompleteCrsGraph, is preferred for user + /// applications. + /// + /// \warning This method is intended for expert developer use + /// only, and should never be called by user code. 
+ void + exportAndFillComplete (Teuchos::RCP >& destGraph, + const export_type& rowExporter, + const export_type& domainExporter, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const; + + + private: + /// \brief Transfer (e.g. Import/Export) from this to the + /// given destination graph, and make the result fill complete. + /// + /// If destGraph.is_null(), this creates a new graph, otherwise it + /// checks for "pristine" status and throws if that is not the + /// case. This method implements importAndFillComplete and + /// exportAndFillComplete, which in turn implement the nonmember + /// "constructors" importAndFillCompleteCrsGraph and + /// exportAndFillCompleteCrsGraph. It's convenient to put those + /// nonmember constructors' implementations inside the CrsGraph + /// class, so that we don't have to put much code in the _decl + /// header file. + /// + /// The point of this method is to fuse three tasks: + /// + /// 1. Create a destination graph (CrsGraph constructor) + /// 2. Import or Export this graph to the destination graph + /// 3. Call fillComplete on the destination graph + /// + /// Fusing these tasks can avoid some communication and work. + void + transferAndFillComplete (Teuchos::RCP >& destGraph, + const ::Tpetra::Details::Transfer& rowTransfer, + const Teuchos::RCP > & domainTransfer, + const Teuchos::RCP& domainMap = Teuchos::null, + const Teuchos::RCP& rangeMap = Teuchos::null, + const Teuchos::RCP& params = Teuchos::null) const; + protected: // these structs are conveniences, to cut down on the number of // arguments to some of the methods below. @@ -2028,6 +2218,252 @@ namespace Tpetra { return rcp (new graph_type (map, maxNumEntriesPerRow, DynamicProfile, params)); } + /// \brief Nonmember CrsGraph constructor that fuses Import and fillComplete(). + /// \relatesalso CrsGraph + /// \tparam CrsGraphType A specialization of CrsGraph. 
+ /// + /// A common use case is to create an empty destination CrsGraph, + /// redistribute from a source CrsGraph (by an Import or Export + /// operation), then call fillComplete() on the destination + /// CrsGraph. This constructor fuses these three cases, for an + /// Import redistribution. + /// + /// Fusing redistribution and fillComplete() exposes potential + /// optimizations. For example, it may make constructing the column + /// Map faster, and it may avoid intermediate unoptimized storage in + /// the destination CrsGraph. + /// + /// The resulting graph is fill complete (in the sense of + /// isFillComplete()) and has optimized storage (in the sense of + /// isStorageOptimized()). By default, its domain Map is the domain + /// Map of the source graph, and its range Map is the range Map of + /// the source graph. + /// + /// \warning If the target Map of the Import is a subset of the + /// source Map of the Import, then you cannot use the default + /// range Map. You should instead construct a nonoverlapping + /// version of the target Map and supply that as the nondefault + /// value of the range Map. + /// + /// \param sourceGraph [in] The source graph from which to + /// import. The source of an Import must have a nonoverlapping + /// distribution. + /// + /// \param importer [in] The Import instance containing a + /// precomputed redistribution plan. The source Map of the + /// Import must be the same as the rowMap of sourceGraph unless + /// the "Reverse Mode" option is set in the params list, in which case + /// the targetMap of Import must match the rowMap of the sourceGraph. + /// + /// \param domainMap [in] Domain Map of the returned graph. If + /// null, we use the default, which is the domain Map of the + /// source graph. + /// + /// \param rangeMap [in] Range Map of the returned graph. If + /// null, we use the default, which is the range Map of the + /// source graph. + /// + /// \param params [in/out] Optional list of parameters. 
If not + /// null, any missing parameters will be filled in with their + /// default values. + template + Teuchos::RCP + importAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Import& importer, + const Teuchos::RCP >& domainMap = Teuchos::null, + const Teuchos::RCP >& rangeMap = Teuchos::null, + const Teuchos::RCP& params = Teuchos::null) + { + Teuchos::RCP destGraph; + sourceGraph->importAndFillComplete (destGraph,importer,domainMap, rangeMap, params); + return destGraph; + } + + /// \brief Nonmember CrsGraph constructor that fuses Import and fillComplete(). + /// \relatesalso CrsGraph + /// \tparam CrsGraphType A specialization of CrsGraph. + /// + /// A common use case is to create an empty destination CrsGraph, + /// redistribute from a source CrsGraph (by an Import or Export + /// operation), then call fillComplete() on the destination + /// CrsGraph. This constructor fuses these three cases, for an + /// Import redistribution. + /// + /// Fusing redistribution and fillComplete() exposes potential + /// optimizations. For example, it may make constructing the column + /// Map faster, and it may avoid intermediate unoptimized storage in + /// the destination CrsGraph. + /// + /// The resulting graph is fill complete (in the sense of + /// isFillComplete()) and has optimized storage (in the sense of + /// isStorageOptimized()). By default, its domain Map is the domain + /// Map of the source graph, and its range Map is the range Map of + /// the source graph. + /// + /// \warning If the target Map of the Import is a subset of the + /// source Map of the Import, then you cannot use the default + /// range Map. You should instead construct a nonoverlapping + /// version of the target Map and supply that as the nondefault + /// value of the range Map. + /// + /// \param sourceGraph [in] The source graph from which to + /// import. The source of an Import must have a nonoverlapping + /// distribution. 
+ /// + /// \param rowImporter [in] The Import instance containing a + /// precomputed redistribution plan. The source Map of the + /// Import must be the same as the rowMap of sourceGraph unless + /// the "Reverse Mode" option is set in the params list, in which case + /// the targetMap of Import must match the rowMap of the sourceGraph. + /// + /// \param domainImporter [in] The Import instance containing a + /// precomputed redistribution plan. The source Map of the + /// Import must be the same as the domainMap of sourceGraph unless + /// the "Reverse Mode" option is set in the params list, in which case + /// the targetMap of Import must match the domainMap of the sourceGraph. + /// + /// \param domainMap [in] Domain Map of the returned graph. + /// + /// \param rangeMap [in] Range Map of the returned graph. + /// + /// \param params [in/out] Optional list of parameters. If not + /// null, any missing parameters will be filled in with their + /// default values. + template + Teuchos::RCP + importAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Import& rowImporter, + const Import& domainImporter, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params) + { + Teuchos::RCP destGraph; + sourceGraph->importAndFillComplete (destGraph,rowImporter,domainImporter, domainMap, rangeMap, params); + return destGraph; + } + + /// \brief Nonmember CrsGraph constructor that fuses Export and fillComplete(). + /// \relatesalso CrsGraph + /// \tparam CrsGraphType A specialization of CrsGraph. + /// + /// For justification, see the documentation of + /// importAndFillCompleteCrsGraph() (which is the Import analog of + /// this function). + /// + /// The resulting graph is fill complete (in the sense of + /// isFillComplete()) and has optimized storage (in the sense of + /// isStorageOptimized()). By default, its domain Map is the domain + /// Map of the source graph, and its range Map is the range Map of + /// the source graph. 
+ /// + /// \param sourceGraph [in] The source graph from which to + /// export. Its row Map may be overlapping, since the source of + /// an Export may be overlapping. + /// + /// \param exporter [in] The Export instance containing a + /// precomputed redistribution plan. The source Map of the + /// Export must be the same as the row Map of sourceGraph. + /// + /// \param domainMap [in] Domain Map of the returned graph. If + /// null, we use the default, which is the domain Map of the + /// source graph. + /// + /// \param rangeMap [in] Range Map of the returned graph. If + /// null, we use the default, which is the range Map of the + /// source graph. + /// + /// \param params [in/out] Optional list of parameters. If not + /// null, any missing parameters will be filled in with their + /// default values. + template + Teuchos::RCP + exportAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Export& exporter, + const Teuchos::RCP >& domainMap = Teuchos::null, + const Teuchos::RCP >& rangeMap = Teuchos::null, + const Teuchos::RCP& params = Teuchos::null) + { + Teuchos::RCP destGraph; + sourceGraph->exportAndFillComplete (destGraph,exporter,domainMap, rangeMap, params); + return destGraph; + } + + /// \brief Nonmember CrsGraph constructor that fuses Export and fillComplete(). + /// \relatesalso CrsGraph + /// \tparam CrsGraphType A specialization of CrsGraph. + /// + /// For justification, see the documentation of + /// importAndFillCompleteCrsGraph() (which is the Import analog of + /// this function). + /// + /// The resulting graph is fill complete (in the sense of + /// isFillComplete()) and has optimized storage (in the sense of + /// isStorageOptimized()). By default, its domain Map is the domain + /// Map of the source graph, and its range Map is the range Map of + /// the source graph. + /// + /// \param sourceGraph [in] The source graph from which to + /// export. 
Its row Map may be overlapping, since the source of + /// an Export may be overlapping. + /// + /// \param rowExporter [in] The Export instance containing a + /// precomputed redistribution plan. The source Map of the + /// Export must be the same as the row Map of sourceGraph. + /// + /// \param domainExporter [in] The Export instance containing a + /// precomputed redistribution plan. The source Map of the + /// Export must be the same as the domain Map of sourceGraph. + /// + /// \param domainMap [in] Domain Map of the returned graph. + /// + /// \param rangeMap [in] Range Map of the returned graph. + /// + /// \param params [in/out] Optional list of parameters. If not + /// null, any missing parameters will be filled in with their + /// default values. + template + Teuchos::RCP + exportAndFillCompleteCrsGraph (const Teuchos::RCP& sourceGraph, + const Export& rowExporter, + const Export& domainExporter, + const Teuchos::RCP >& domainMap, + const Teuchos::RCP >& rangeMap, + const Teuchos::RCP& params) + { + Teuchos::RCP destGraph; + sourceGraph->exportAndFillComplete (destGraph,rowExporter,domainExporter,domainMap, rangeMap, params); + return destGraph; + } + namespace Details { template #include #include @@ -2947,7 +2952,7 @@ namespace Tpetra { } if (! allInColMap) { std::ostringstream os; - os << "Tpetra::CrsMatrix::insertLocalIndices: You attempted to insert " + os << "Tpetra::CrsGraph::insertLocalIndices: You attempted to insert " "entries in owned row " << localRow << ", at the following column " "indices: " << toString (indices) << "." 
<< endl; os << "Of those, the following indices are not in the column Map on " @@ -6226,6 +6231,703 @@ namespace Tpetra { return true; } + template + void + CrsGraph:: + transferAndFillComplete (Teuchos::RCP >& destGraph, + const ::Tpetra::Details::Transfer& rowTransfer, + const Teuchos::RCP > & domainTransfer, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const + { + using Tpetra::Details::getArrayViewFromDualView; + using Tpetra::Details::packCrsGraphWithOwningPIDs; + using Tpetra::Details::unpackAndCombineWithOwningPIDsCount; + using Tpetra::Details::unpackAndCombineIntoCrsArrays; + using Teuchos::ArrayRCP; + using Teuchos::ArrayView; + using Teuchos::Comm; + using Teuchos::ParameterList; + using Teuchos::rcp; + using Teuchos::RCP; +#ifdef HAVE_TPETRA_MMM_TIMINGS + using std::string; + using Teuchos::TimeMonitor; +#endif + + using LO = LocalOrdinal; + using GO = GlobalOrdinal; + using NT = node_type; + using this_type = CrsGraph; + using ivector_type = Vector; + using packet_type = typename this_type::packet_type; + + const char* prefix = "Tpetra::CrsGraph::transferAndFillComplete: "; + +#ifdef HAVE_TPETRA_MMM_TIMINGS + string label; + if(!params.is_null()) label = params->get("Timer Label", label); + string prefix2 = string("Tpetra ")+ label + std::string(": CrsGraph TAFC "); + RCP MM = + rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("Pack-1")))); +#endif + + // Make sure that the input argument rowTransfer is either an + // Import or an Export. Import and Export are the only two + // subclasses of Transfer that we defined, but users might + // (unwisely, for now at least) decide to implement their own + // subclasses. Exclude this possibility. 
+ const import_type* xferAsImport = dynamic_cast(&rowTransfer); + const export_type* xferAsExport = dynamic_cast(&rowTransfer); + TEUCHOS_TEST_FOR_EXCEPTION( + xferAsImport == NULL && xferAsExport == NULL, std::invalid_argument, + prefix << "The 'rowTransfer' input argument must be either an Import or " + "an Export, and its template parameters must match the corresponding " + "template parameters of the CrsGraph."); + + // Make sure that the input argument domainTransfer is either an + // Import or an Export. Import and Export are the only two + // subclasses of Transfer that we defined, but users might + // (unwisely, for now at least) decide to implement their own + // subclasses. Exclude this possibility. + Teuchos::RCP xferDomainAsImport = + Teuchos::rcp_dynamic_cast(domainTransfer); + Teuchos::RCP xferDomainAsExport = + Teuchos::rcp_dynamic_cast(domainTransfer); + + if(! domainTransfer.is_null()) { + + TEUCHOS_TEST_FOR_EXCEPTION( + (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument, + prefix << "The 'domainTransfer' input argument must be either an " + "Import or an Export, and its template parameters must match the " + "corresponding template parameters of the CrsGraph."); + + TEUCHOS_TEST_FOR_EXCEPTION( + ( xferAsImport != NULL || ! xferDomainAsImport.is_null() ) && + (( xferAsImport != NULL && xferDomainAsImport.is_null() ) || + ( xferAsImport == NULL && ! xferDomainAsImport.is_null() )), std::invalid_argument, + prefix << "The 'rowTransfer' and 'domainTransfer' input arguments " + "must be of the same type (either Import or Export)."); + + TEUCHOS_TEST_FOR_EXCEPTION( + ( xferAsExport != NULL || ! xferDomainAsExport.is_null() ) && + (( xferAsExport != NULL && xferDomainAsExport.is_null() ) || + ( xferAsExport == NULL && ! 
xferDomainAsExport.is_null() )), std::invalid_argument, + prefix << "The 'rowTransfer' and 'domainTransfer' input arguments " + "must be of the same type (either Import or Export)."); + + } // domainTransfer != null + + + // FIXME (mfh 15 May 2014) Wouldn't communication still be needed, + // if the source Map is not distributed but the target Map is? + const bool communication_needed = rowTransfer.getSourceMap()->isDistributed(); + + // + // Get the caller's parameters + // + + bool reverseMode = false; // Are we in reverse mode? + bool restrictComm = false; // Do we need to restrict the communicator? + RCP graphparams; // parameters for the destination graph + if (! params.is_null()) { + reverseMode = params->get("Reverse Mode", reverseMode); + restrictComm = params->get("Restrict Communicator", restrictComm); + graphparams = sublist(params, "CrsGraph"); + } + + // Get the new domain and range Maps. We need some of them for error + // checking, now that we have the reverseMode parameter. + RCP MyRowMap = reverseMode ? + rowTransfer.getSourceMap() : rowTransfer.getTargetMap(); + RCP MyColMap; // create this below + RCP MyDomainMap = ! domainMap.is_null() ? domainMap : getDomainMap(); + RCP MyRangeMap = ! rangeMap.is_null() ? rangeMap : getRangeMap(); + RCP BaseRowMap = MyRowMap; + RCP BaseDomainMap = MyDomainMap; + + // If the user gave us a nonnull destGraph, then check whether it's + // "pristine." That means that it has no entries. + // + // FIXME (mfh 15 May 2014) If this is not true on all processes, + // then this exception test may hang. It would be better to + // forward an error flag to the next communication phase. + if (! destGraph.is_null()) { + // FIXME (mfh 15 May 2014): The Epetra idiom for checking + // whether a graph or matrix has no entries on the calling + // process, is that it is neither locally nor globally indexed. 
+ // This may change eventually with the Kokkos refactor version + // of Tpetra, so it would be better just to check the quantity + // of interest directly. Note that with the Kokkos refactor + // version of Tpetra, asking for the total number of entries in + // a graph or matrix that is not fill complete might require + // computation (kernel launch), since it is not thread scalable + // to update a count every time an entry is inserted. + const bool NewFlag = + ! destGraph->isLocallyIndexed() && ! destGraph->isGloballyIndexed(); + TEUCHOS_TEST_FOR_EXCEPTION(! NewFlag, std::invalid_argument, + prefix << "The input argument 'destGraph' is only allowed to be nonnull, " + "if its graph is empty (neither locally nor globally indexed)."); + + // FIXME (mfh 15 May 2014) At some point, we want to change + // graphs and matrices so that their DistObject Map + // (this->getMap()) may differ from their row Map. This will + // make redistribution for 2-D distributions more efficient. I + // hesitate to change this check, because I'm not sure how much + // the code here depends on getMap() and getRowMap() being the + // same. + TEUCHOS_TEST_FOR_EXCEPTION( + ! destGraph->getRowMap()->isSameAs(*MyRowMap), std::invalid_argument, + prefix << "The (row) Map of the input argument 'destGraph' is not the " + "same as the (row) Map specified by the input argument 'rowTransfer'."); + + TEUCHOS_TEST_FOR_EXCEPTION( + ! destGraph->checkSizes(*this), std::invalid_argument, + prefix << "You provided a nonnull destination graph, but checkSizes() " + "indicates that it is not a legal legal target for redistribution from " + "the source graph (*this). This may mean that they do not have the " + "same dimensions."); + } + + // If forward mode (the default), then *this's (row) Map must be + // the same as the source Map of the Transfer. If reverse mode, + // then *this's (row) Map must be the same as the target Map of + // the Transfer. 
+ // + // FIXME (mfh 15 May 2014) At some point, we want to change graphs + // and matrices so that their DistObject Map (this->getMap()) may + // differ from their row Map. This will make redistribution for + // 2-D distributions more efficient. I hesitate to change this + // check, because I'm not sure how much the code here depends on + // getMap() and getRowMap() being the same. + TEUCHOS_TEST_FOR_EXCEPTION( + ! (reverseMode || getRowMap()->isSameAs(*rowTransfer.getSourceMap())), + std::invalid_argument, prefix << + "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode."); + + TEUCHOS_TEST_FOR_EXCEPTION( + ! (! reverseMode || getRowMap()->isSameAs(*rowTransfer.getTargetMap())), + std::invalid_argument, prefix << + "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode."); + + // checks for domainTransfer + TEUCHOS_TEST_FOR_EXCEPTION( + ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap), + std::invalid_argument, + prefix << "The target map of the 'domainTransfer' input argument must be " + "the same as the rebalanced domain map 'domainMap'"); + + TEUCHOS_TEST_FOR_EXCEPTION( + ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap), + std::invalid_argument, + prefix << "The source map of the 'domainTransfer' input argument must be " + "the same as the rebalanced domain map 'domainMap'"); + + // The basic algorithm here is: + // + // 1. Call the moral equivalent of "distor.do" to handle the import. + // 2. Copy all the Imported and Copy/Permuted data into the raw + // CrsGraph pointers, still using GIDs. + // 3. Call an optimized version of MakeColMap that avoids the + // Directory lookups (since the importer knows who owns all the + // GIDs) AND reindexes to LIDs. + // 4. Call expertStaticFillComplete() + + // Get information from the Importer + const size_t NumSameIDs = rowTransfer.getNumSameIDs(); + ArrayView ExportLIDs = reverseMode ? 
+ rowTransfer.getRemoteLIDs() : rowTransfer.getExportLIDs(); + ArrayView RemoteLIDs = reverseMode ? + rowTransfer.getExportLIDs() : rowTransfer.getRemoteLIDs(); + ArrayView PermuteToLIDs = reverseMode ? + rowTransfer.getPermuteFromLIDs() : rowTransfer.getPermuteToLIDs(); + ArrayView PermuteFromLIDs = reverseMode ? + rowTransfer.getPermuteToLIDs() : rowTransfer.getPermuteFromLIDs(); + Distributor& Distor = rowTransfer.getDistributor(); + + // Owning PIDs + Teuchos::Array SourcePids; + Teuchos::Array TargetPids; + int MyPID = getComm()->getRank(); + + // Temp variables for sub-communicators + RCP ReducedRowMap, ReducedColMap, + ReducedDomainMap, ReducedRangeMap; + RCP > ReducedComm; + + // If the user gave us a null destGraph, then construct the new + // destination graph. We will replace its column Map later. + if (destGraph.is_null()) { + destGraph = rcp(new this_type(MyRowMap, 0, StaticProfile, graphparams)); + } + + /***************************************************/ + /***** 1) First communicator restriction phase ****/ + /***************************************************/ + if (restrictComm) { + ReducedRowMap = MyRowMap->removeEmptyProcesses(); + ReducedComm = ReducedRowMap.is_null() ? + Teuchos::null : + ReducedRowMap->getComm(); + destGraph->removeEmptyProcessesInPlace(ReducedRowMap); + + ReducedDomainMap = MyRowMap.getRawPtr() == MyDomainMap.getRawPtr() ? + ReducedRowMap : + MyDomainMap->replaceCommWithSubset(ReducedComm); + ReducedRangeMap = MyRowMap.getRawPtr() == MyRangeMap.getRawPtr() ? + ReducedRowMap : + MyRangeMap->replaceCommWithSubset(ReducedComm); + + // Reset the "my" maps + MyRowMap = ReducedRowMap; + MyDomainMap = ReducedDomainMap; + MyRangeMap = ReducedRangeMap; + + // Update my PID, if we've restricted the communicator + if (! 
ReducedComm.is_null()) { + MyPID = ReducedComm->getRank(); + } + else { + MyPID = -2; // For debugging + } + } + else { + ReducedComm = MyRowMap->getComm(); + } + + /***************************************************/ + /***** 2) From Tpera::DistObject::doTransfer() ****/ + /***************************************************/ +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("ImportSetup")))); +#endif + // Get the owning PIDs + RCP MyImporter = getImporter(); + + // check whether domain maps of source graph and base domain map is the same + bool bSameDomainMap = BaseDomainMap->isSameAs(*getDomainMap()); + + if (! restrictComm && ! MyImporter.is_null() && bSameDomainMap ) { + // Same domain map as source graph + // + // NOTE: This won't work for restrictComm (because the Import + // doesn't know the restricted PIDs), though writing an + // optimized version for that case would be easy (Import an + // IntVector of the new PIDs). Might want to add this later. + Import_Util::getPids(*MyImporter, SourcePids, false); + } + else if (restrictComm && ! MyImporter.is_null() && bSameDomainMap) { + // Same domain map as source graph (restricted communicator) + // We need one import from the domain to the column map + ivector_type SourceDomain_pids(getDomainMap(),true); + ivector_type SourceCol_pids(getColMap()); + // SourceDomain_pids contains the restricted pids + SourceDomain_pids.putScalar(MyPID); + + SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT); + SourcePids.resize(getColMap()->getNodeNumElements()); + SourceCol_pids.get1dCopy(SourcePids()); + } + else if (MyImporter.is_null() && bSameDomainMap) { + // Graph has no off-process entries + SourcePids.resize(getColMap()->getNodeNumElements()); + SourcePids.assign(getColMap()->getNodeNumElements(), MyPID); + } + else if ( ! MyImporter.is_null() && + ! 
domainTransfer.is_null() ) { + // general implementation for rectangular matrices with + // domain map different than SourceGraph domain map. + // User has to provide a DomainTransfer object. We need + // to communications (import/export) + + // TargetDomain_pids lives on the rebalanced new domain map + ivector_type TargetDomain_pids(domainMap); + TargetDomain_pids.putScalar(MyPID); + + // SourceDomain_pids lives on the non-rebalanced old domain map + ivector_type SourceDomain_pids(getDomainMap()); + + // SourceCol_pids lives on the non-rebalanced old column map + ivector_type SourceCol_pids(getColMap()); + + if (! reverseMode && ! xferDomainAsImport.is_null() ) { + SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsImport, INSERT); + } + else if (reverseMode && ! xferDomainAsExport.is_null() ) { + SourceDomain_pids.doExport(TargetDomain_pids, *xferDomainAsExport, INSERT); + } + else if (! reverseMode && ! xferDomainAsExport.is_null() ) { + SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsExport, INSERT); + } + else if (reverseMode && ! xferDomainAsImport.is_null() ) { + SourceDomain_pids.doImport(TargetDomain_pids, *xferDomainAsImport, INSERT); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::logic_error, + prefix << "Should never get here! Please report this bug to a Tpetra developer."); + } + SourceCol_pids.doImport(SourceDomain_pids, *MyImporter, INSERT); + SourcePids.resize(getColMap()->getNodeNumElements()); + SourceCol_pids.get1dCopy(SourcePids()); + } + else if (BaseDomainMap->isSameAs(*BaseRowMap) && + getDomainMap()->isSameAs(*getRowMap())) { + // We can use the rowTransfer + SourceGraph's Import to find out who owns what. + ivector_type TargetRow_pids(domainMap); + ivector_type SourceRow_pids(getRowMap()); + ivector_type SourceCol_pids(getColMap()); + + TargetRow_pids.putScalar(MyPID); + if (! 
reverseMode && xferAsImport != NULL) { + SourceRow_pids.doExport(TargetRow_pids, *xferAsImport, INSERT); + } + else if (reverseMode && xferAsExport != NULL) { + SourceRow_pids.doExport(TargetRow_pids, *xferAsExport, INSERT); + } + else if (! reverseMode && xferAsExport != NULL) { + SourceRow_pids.doImport(TargetRow_pids, *xferAsExport, INSERT); + } + else if (reverseMode && xferAsImport != NULL) { + SourceRow_pids.doImport(TargetRow_pids, *xferAsImport, INSERT); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::logic_error, + prefix << "Should never get here! Please report this bug to a Tpetra developer."); + } + SourceCol_pids.doImport(SourceRow_pids, *MyImporter, INSERT); + SourcePids.resize(getColMap()->getNodeNumElements()); + SourceCol_pids.get1dCopy(SourcePids()); + } + else { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::invalid_argument, + prefix << "This method only allows either domainMap == getDomainMap(), " + "or (domainMap == rowTransfer.getTargetMap() and getDomainMap() == getRowMap())."); + } + + // Tpetra-specific stuff + size_t constantNumPackets = destGraph->constantNumberOfPackets(); + if (constantNumPackets == 0) { + destGraph->reallocArraysForNumPacketsPerLid(ExportLIDs.size(), + RemoteLIDs.size()); + } + else { + // There are a constant number of packets per element. We + // already know (from the number of "remote" (incoming) + // elements) how many incoming elements we expect, so we can + // resize the buffer accordingly. + const size_t rbufLen = RemoteLIDs.size() * constantNumPackets; + destGraph->reallocImportsIfNeeded(rbufLen); + } + + // packAndPrepare* methods modify numExportPacketsPerLID_. 
+ destGraph->numExportPacketsPerLID_.template modify(); + Teuchos::ArrayView numExportPacketsPerLID = + getArrayViewFromDualView(destGraph->numExportPacketsPerLID_); + + // Pack & Prepare w/ owning PIDs + packCrsGraphWithOwningPIDs(*this, destGraph->exports_, + numExportPacketsPerLID, ExportLIDs, + SourcePids, constantNumPackets, Distor); + + // Do the exchange of remote data. +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("Transfer")))); +#endif + + if (communication_needed) { + if (reverseMode) { + if (constantNumPackets == 0) { // variable number of packets per LID + // Make sure that host has the latest version, since we're + // using the version on host. If host has the latest + // version, syncing to host does nothing. + destGraph->numExportPacketsPerLID_.template sync(); + Teuchos::ArrayView numExportPacketsPerLID = + getArrayViewFromDualView(destGraph->numExportPacketsPerLID_); + destGraph->numImportPacketsPerLID_.template sync(); + Teuchos::ArrayView numImportPacketsPerLID = + getArrayViewFromDualView(destGraph->numImportPacketsPerLID_); + Distor.doReversePostsAndWaits(numExportPacketsPerLID, 1, + numImportPacketsPerLID); + size_t totalImportPackets = 0; + for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) { + totalImportPackets += numImportPacketsPerLID[i]; + } + + // Reallocation MUST go before setting the modified flag, + // because it may clear out the flags. + destGraph->reallocImportsIfNeeded(totalImportPackets); + destGraph->imports_.template modify(); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView(destGraph->imports_); + // This is a legacy host pack/unpack path, so use the host + // version of exports_. 
+ destGraph->exports_.template sync(); + Teuchos::ArrayView hostExports = + getArrayViewFromDualView(destGraph->exports_); + Distor.doReversePostsAndWaits(hostExports, + numExportPacketsPerLID, + hostImports, + numImportPacketsPerLID); + } + else { // constant number of packets per LI + destGraph->imports_.template modify(); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView(destGraph->imports_); + // This is a legacy host pack/unpack path, so use the host + // version of exports_. + destGraph->exports_.template sync(); + Teuchos::ArrayView hostExports = + getArrayViewFromDualView(destGraph->exports_); + Distor.doReversePostsAndWaits(hostExports, + constantNumPackets, + hostImports); + } + } + else { // forward mode (the default) + if (constantNumPackets == 0) { // variable number of packets per LID + // Make sure that host has the latest version, since we're + // using the version on host. If host has the latest + // version, syncing to host does nothing. + destGraph->numExportPacketsPerLID_.template sync(); + Teuchos::ArrayView numExportPacketsPerLID = + getArrayViewFromDualView(destGraph->numExportPacketsPerLID_); + destGraph->numImportPacketsPerLID_.template sync(); + Teuchos::ArrayView numImportPacketsPerLID = + getArrayViewFromDualView(destGraph->numImportPacketsPerLID_); + Distor.doPostsAndWaits(numExportPacketsPerLID, 1, + numImportPacketsPerLID); + size_t totalImportPackets = 0; + for (Array_size_type i = 0; i < numImportPacketsPerLID.size(); ++i) { + totalImportPackets += numImportPacketsPerLID[i]; + } + + // Reallocation MUST go before setting the modified flag, + // because it may clear out the flags. + destGraph->reallocImportsIfNeeded(totalImportPackets); + destGraph->imports_.template modify(); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView(destGraph->imports_); + // This is a legacy host pack/unpack path, so use the host + // version of exports_. 
+ destGraph->exports_.template sync(); + Teuchos::ArrayView hostExports = + getArrayViewFromDualView(destGraph->exports_); + Distor.doPostsAndWaits(hostExports, + numExportPacketsPerLID, + hostImports, + numImportPacketsPerLID); + } + else { // constant number of packets per LID + destGraph->imports_.template modify(); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView(destGraph->imports_); + // This is a legacy host pack/unpack path, so use the host + // version of exports_. + destGraph->exports_.template sync(); + Teuchos::ArrayView hostExports = + getArrayViewFromDualView(destGraph->exports_); + Distor.doPostsAndWaits(hostExports, + constantNumPackets, + hostImports); + } + } + } + + /*********************************************************************/ + /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/ + /*********************************************************************/ + +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("Unpack-1")))); +#endif + + // Backwards compatibility measure. We'll use this again below. + destGraph->numImportPacketsPerLID_.template sync(); + Teuchos::ArrayView numImportPacketsPerLID = + getArrayViewFromDualView(destGraph->numImportPacketsPerLID_); + destGraph->imports_.template sync(); + Teuchos::ArrayView hostImports = + getArrayViewFromDualView(destGraph->imports_); + size_t mynnz = + unpackAndCombineWithOwningPIDsCount(*this, RemoteLIDs, hostImports, + numImportPacketsPerLID, + constantNumPackets, Distor, INSERT, + NumSameIDs, PermuteToLIDs, PermuteFromLIDs); + size_t N = BaseRowMap->getNodeNumElements(); + + // Allocations + ArrayRCP CSR_rowptr(N+1); + ArrayRCP CSR_colind_GID; + ArrayRCP CSR_colind_LID; + CSR_colind_GID.resize(mynnz); + + // If LO and GO are the same, we can reuse memory when + // converting the column indices from global to local indices. 
+ if (typeid(LO) == typeid(GO)) { + CSR_colind_LID = Teuchos::arcp_reinterpret_cast(CSR_colind_GID); + } + else { + CSR_colind_LID.resize(mynnz); + } + + // FIXME (mfh 15 May 2014) Why can't we abstract this out as an + // unpackAndCombine method on a "CrsArrays" object? This passing + // in a huge list of arrays is icky. Can't we have a bit of an + // abstraction? Implementing a concrete DistObject subclass only + // takes five methods. + unpackAndCombineIntoCrsArrays(*this, RemoteLIDs, hostImports, + numImportPacketsPerLID, constantNumPackets, + Distor, INSERT, NumSameIDs, PermuteToLIDs, + PermuteFromLIDs, N, mynnz, MyPID, + CSR_rowptr(), CSR_colind_GID(), + SourcePids(), TargetPids); + + /**************************************************************/ + /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/ + /**************************************************************/ +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("Unpack-2")))); +#endif + // Call an optimized version of makeColMap that avoids the + // Directory lookups (since the Import object knows who owns all + // the GIDs). + Teuchos::Array RemotePids; + Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr(), + CSR_colind_LID(), + CSR_colind_GID(), + BaseDomainMap, + TargetPids, RemotePids, + MyColMap); + + /*******************************************************/ + /**** 4) Second communicator restriction phase ****/ + /*******************************************************/ + if (restrictComm) { + ReducedColMap = (MyRowMap.getRawPtr() == MyColMap.getRawPtr()) ? 
+ ReducedRowMap : + MyColMap->replaceCommWithSubset(ReducedComm); + MyColMap = ReducedColMap; // Reset the "my" maps + } + + // Replace the col map + destGraph->replaceColMap(MyColMap); + + // Short circuit if the processor is no longer in the communicator + // + // NOTE: Epetra replaces modifies all "removed" processes so they + // have a dummy (serial) Map that doesn't touch the original + // communicator. Duplicating that here might be a good idea. + if (ReducedComm.is_null()) { + return; + } + + /***************************************************/ + /**** 5) Sort ****/ + /***************************************************/ + if ((! reverseMode && xferAsImport != NULL) || + (reverseMode && xferAsExport != NULL)) { + Import_Util::sortCrsEntries(CSR_rowptr(), + CSR_colind_LID()); + } + else if ((! reverseMode && xferAsExport != NULL) || + (reverseMode && xferAsImport != NULL)) { + Import_Util::sortAndMergeCrsEntries(CSR_rowptr(), + CSR_colind_LID()); + if (CSR_rowptr[N] != mynnz) { + CSR_colind_LID.resize(CSR_rowptr[N]); + } + } + else { + TEUCHOS_TEST_FOR_EXCEPTION( + true, std::logic_error, + prefix << "Should never get here! 
Please report this bug to a Tpetra developer."); + } + /***************************************************/ + /**** 6) Reset the colmap and the arrays ****/ + /***************************************************/ + + // Call constructor for the new graph (restricted as needed) + // + destGraph->setAllIndices(CSR_rowptr, CSR_colind_LID); + + /***************************************************/ + /**** 7) Build Importer & Call ESFC ****/ + /***************************************************/ + // Pre-build the importer using the existing PIDs + Teuchos::ParameterList esfc_params; +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("CreateImporter")))); +#endif + RCP MyImport = rcp(new import_type(MyDomainMap, MyColMap, RemotePids)); +#ifdef HAVE_TPETRA_MMM_TIMINGS + MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix2+string("ESFC")))); + + esfc_params.set("Timer Label",prefix + std::string("TAFC")); +#endif + if(!params.is_null()) + esfc_params.set("compute global constants",params->get("compute global constants",true)); + + destGraph->expertStaticFillComplete(MyDomainMap, MyRangeMap, + MyImport, Teuchos::null, rcp(&esfc_params,false)); + + } + + template + void + CrsGraph:: + importAndFillComplete(Teuchos::RCP >& destGraph, + const import_type& importer, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const + { + transferAndFillComplete(destGraph, importer, Teuchos::null, domainMap, rangeMap, params); + } + + template + void + CrsGraph:: + importAndFillComplete(Teuchos::RCP >& destGraph, + const import_type& rowImporter, + const import_type& domainImporter, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const + { + transferAndFillComplete(destGraph, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params); + } + + template + void + CrsGraph:: + exportAndFillComplete(Teuchos::RCP >& destGraph, + 
const export_type& exporter, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const + { + transferAndFillComplete(destGraph, exporter, Teuchos::null, domainMap, rangeMap, params); + } + + template + void + CrsGraph:: + exportAndFillComplete(Teuchos::RCP >& destGraph, + const export_type& rowExporter, + const export_type& domainExporter, + const Teuchos::RCP& domainMap, + const Teuchos::RCP& rangeMap, + const Teuchos::RCP& params) const + { + transferAndFillComplete(destGraph, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params); + } + + } // namespace Tpetra // @@ -6235,6 +6937,74 @@ namespace Tpetra { // #define TPETRA_CRSGRAPH_GRAPH_INSTANT(LO,GO,NODE) template class CrsGraph< LO , GO , NODE >; +#define TPETRA_CRSGRAPH_IMPORT_AND_FILL_COMPLETE_INSTANT(LO,GO,NODE) \ + template<> \ + Teuchos::RCP > \ + importAndFillCompleteCrsGraph(const Teuchos::RCP >& sourceGraph, \ + const Import::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& importer, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& domainMap, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& rangeMap, \ + const Teuchos::RCP& params); + +#define TPETRA_CRSGRAPH_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(LO,GO,NODE) \ + template<> \ + Teuchos::RCP > \ + importAndFillCompleteCrsGraph(const Teuchos::RCP >& sourceGraph, \ + const Import::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& rowImporter, \ + const Import::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& domainImporter, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& domainMap, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& rangeMap, \ + const Teuchos::RCP& 
params); + + +#define TPETRA_CRSGRAPH_EXPORT_AND_FILL_COMPLETE_INSTANT(LO,GO,NODE) \ + template<> \ + Teuchos::RCP > \ + exportAndFillCompleteCrsGraph(const Teuchos::RCP >& sourceGraph, \ + const Export::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& exporter, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& domainMap, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& rangeMap, \ + const Teuchos::RCP& params); + +#define TPETRA_CRSGRAPH_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(LO,GO,NODE) \ + template<> \ + Teuchos::RCP > \ + exportAndFillCompleteCrsGraph(const Teuchos::RCP >& sourceGraph, \ + const Export::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& rowExporter, \ + const Export::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type>& domainExporter, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& domainMap, \ + const Teuchos::RCP::local_ordinal_type, \ + CrsGraph::global_ordinal_type, \ + CrsGraph::node_type> >& rangeMap, \ + const Teuchos::RCP& params); + + // WARNING: These macros exist only for backwards compatibility. // We will remove them at some point. 
#define TPETRA_CRSGRAPH_SORTROWINDICESANDVALUES_INSTANT(S,LO,GO,NODE) @@ -6246,6 +7016,11 @@ namespace Tpetra { TPETRA_CRSGRAPH_SORTROWINDICESANDVALUES_INSTANT(S,LO,GO,NODE) \ TPETRA_CRSGRAPH_MERGEROWINDICESANDVALUES_INSTANT(S,LO,GO,NODE) \ TPETRA_CRSGRAPH_ALLOCATEVALUES1D_INSTANT(S,LO,GO,NODE) \ - TPETRA_CRSGRAPH_ALLOCATEVALUES2D_INSTANT(S,LO,GO,NODE) + TPETRA_CRSGRAPH_ALLOCATEVALUES2D_INSTANT(S,LO,GO,NODE) \ + TPETRA_CRSGRAPH_IMPORT_AND_FILL_COMPLETE_INSTANT(LO,GO,NODE) \ + TPETRA_CRSGRAPH_EXPORT_AND_FILL_COMPLETE_INSTANT(LO,GO,NODE) \ + TPETRA_CRSGRAPH_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(LO,GO,NODE) \ + TPETRA_CRSGRAPH_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(LO,GO,NODE) + #endif // TPETRA_CRSGRAPH_DEF_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsGraph.cpp b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph.cpp new file mode 100644 index 000000000000..ce753abdc229 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph.cpp @@ -0,0 +1,67 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#include "TpetraCore_config.h" + +#if defined(HAVE_TPETRA_EXPLICIT_INSTANTIATION) + +// We protect the contents of this file with macros, to assist +// applications that circumvent Trilinos' build system. (We do NOT +// recommend this.) That way, they can still build this file, but as +// long as the macros have correct definitions, they won't build +// anything that's not enabled. 
+ +#include "KokkosCompat_ClassicNodeAPI_Wrapper.hpp" +#include "Tpetra_Details_packCrsGraph_decl.hpp" +#include "Tpetra_Details_packCrsGraph_def.hpp" +#include "TpetraCore_ETIHelperMacros.h" + +namespace Tpetra { + + TPETRA_ETI_MANGLING_TYPEDEFS() + + TPETRA_INSTANTIATE_LGN( TPETRA_DETAILS_PACKCRSGRAPH_INSTANT ) + +} // namespace Tpetra + +#endif // Whether we should build this specialization diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_decl.hpp new file mode 100644 index 000000000000..1baf2441f941 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_decl.hpp @@ -0,0 +1,220 @@ +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#ifndef TPETRA_DETAILS_PACKCRSGRAPH_DECL_HPP +#define TPETRA_DETAILS_PACKCRSGRAPH_DECL_HPP + +#include "TpetraCore_config.h" +#include "Kokkos_DualView.hpp" +#include "Tpetra_DistObject_decl.hpp" + +/// \file Tpetra_Details_packCrsGraph.hpp +/// \brief Functions for packing the entries of a Tpetra::CrsGraph +/// for communication, in the case where it is valid to go to the +/// KokkosSparse::CrsGraph (local sparse graph data structure) +/// directly. +/// \warning This file, and its contents, are implementation details +/// of Tpetra. The file itself or its contents may disappear or +/// change at any time. +/// +/// Data (bytes) describing the row of the CRS graph are "packed" +/// (concatenated) in to a (view of) packet_type* object in the following order: +/// +/// 1. number of entries (LocalOrdinal) +/// 2. global column indices (GlobalOrdinal) +/// 3. process IDs (optional, int) +/// +/// The functions in this file are companions to +/// Tpetra_Details_unpackCrsGraphAndCombine.hpp, i.e., +/// Tpetra_Details_unpackCrsGraphAndCombine.hpp implements the reverse of the +/// packing order described above to ensure proper unpacking. 
+ +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace Teuchos { +// Forward declaration of Array +template class Array; +// Forward declaration of ArrayView +template class ArrayView; +} // namespace Teuchos +#endif // DOXYGEN_SHOULD_SKIP_THIS + +namespace Tpetra { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// Forward declaration of Distributor +class Distributor; +// Forward declaration of CrsGraph +template +class CrsGraph; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +// +// Users must never rely on anything in the Details namespace. +// +namespace Details { + +/// \brief Pack specified entries of the given local sparse graph for +/// communication. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam NT The Kokkos Node type. See the documentation of Map +/// for requirements. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param exports [in/out] Output pack buffer; resized if needed. +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row exportLIDs[k] of the local graph. +/// +/// \param exportLIDs [in] Local indices of the rows to pack. +/// +/// \param constantNumPackets [out] Setting this to zero tells the caller +/// to expect a possibly different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// +/// \param distor [in] The distributor (not used) +/// +/// This is the public interface to the pack machinery; it +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back into the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. 
+template +void +packCrsGraph (const CrsGraph& sourceGraph, + Teuchos::Array::packet_type>& exports, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor); + +/// \brief Pack specified entries of the given local sparse graph for +/// communication, for "new" DistObject interface. +/// +/// \tparam LO The type of local indices. This must be the same as +/// the LocalOrdinal template parameter of Tpetra::CrsGraph. +/// \tparam GO The type of global indices. This must be the same as +/// the GlobalOrdinal template parameter of Tpetra::CrsGraph. +/// \tparam NT The Node type. This must be the same as the Node +/// template parameter of Tpetra::CrsGraph. +/// +/// \param sourceGraph [in] The "source" graph to pack. +/// +/// \param exports [in/out] Output pack buffer; resized if needed. +/// +/// \param numPacketsPerLID [out] On output, +/// numPacketsPerLID.d_view[k] is the number of bytes packed for row +/// exportLIDs.d_view[k] of the local graph. +/// +/// \param exportLIDs [in] Local indices of the rows to pack. +/// +/// \param constantNumPackets [out] Same as the constantNumPackets +/// output argument of Tpetra::DistObject::packAndPrepareNew (which +/// see). +/// +/// \param distor [in] (Not used.) +/// +/// This method implements CrsGraph::packNew, and thus +/// CrsGraph::packAndPrepareNew, for the case where the graph to +/// pack has a valid KokkosSparse::CrsGraph. +template +void +packCrsGraphNew (const CrsGraph& sourceGraph, + Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& + exports, + const Kokkos::DualView::buffer_device_type>& + numPacketsPerLID, + const Kokkos::DualView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor); + +/// \brief Pack specified entries of the given local sparse graph for +/// communication. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. 
+/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam NT The Kokkos Node type. See the documentation of Map +/// for requirements. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param exports_dv [in/out] Output pack buffer; resized if needed. +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row exportLIDs[k] of the local graph. +/// +/// \param exportLIDs [in] Local indices of the rows to pack. +/// +/// \param constantNumPackets [out] Setting this to zero tells the caller +/// to expect a possibly different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// +/// \param distor [in] The distributor (not used) +/// +/// This is the public interface to the pack machinery; it +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back into the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. 
+template +void +packCrsGraphWithOwningPIDs (const CrsGraph& sourceGraph, + Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& + exports_dv, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& exportLIDs, + const Teuchos::ArrayView& sourcePIDs, + size_t& constantNumPackets, + Distributor& distor); + +} // namespace Details +} // namespace Tpetra + +#endif // TPETRA_DETAILS_PACKCRSGRAPH_DECL_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp new file mode 100644 index 000000000000..327c85b90e29 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_packCrsGraph_def.hpp @@ -0,0 +1,999 @@ +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP +#define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP + +#include "TpetraCore_config.h" +#include "Teuchos_Array.hpp" +#include "Teuchos_ArrayView.hpp" +#include "Tpetra_Details_castAwayConstDualView.hpp" +#include "Tpetra_Details_createMirrorView.hpp" +#include "Tpetra_Details_getEntryOnHost.hpp" +#include "Tpetra_Details_OrdinalTraits.hpp" +#include "Tpetra_Details_PackTraits.hpp" +#include "Tpetra_CrsGraph_decl.hpp" +#include +#include + +/// \file Tpetra_Details_packCrsGraph.hpp +/// \brief Functions for packing the entries of a Tpetra::CrsGraph +/// for communication, in the case where it is valid to go to the +/// KokkosSparse::CrsGraph (local sparse graph data structure) +/// directly. +/// \warning This file, and its contents, are implementation details +/// of Tpetra. The file itself or its contents may disappear or +/// change at any time. 
+/// +/// Data (bytes) describing the row of the CRS graph are "packed" +/// (concatenated) in to a (view of) GO* object in the following order: +/// +/// 1. number of entries (LocalOrdinal) +/// 2. global column indices (GlobalOrdinal) +/// 3. process IDs (optional, int) +/// +/// The functions in this file are companions to +/// Tpetra_Details_unpackCrsGraphAndCombine.hpp, i.e., +/// Tpetra_Details_unpackCrsGraphAndCombine.hpp implements the +/// reverse of the packing order described above to ensure proper +/// unpacking. + +namespace Tpetra { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// Forward declaration of Distributor +class Distributor; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +// +// Users must never rely on anything in the Details namespace. +// +namespace Details { + +namespace PackCrsGraphImpl { +/// \brief Compute the number of packets and offsets for the pack procedure +/// +/// \tparam OutputOffsetsViewType the type of the output offsets view +/// \tparam CountsViewType the type of the counts view +/// \tparam InputOffsetsViewType the type of the input offsets view +/// \tparam InputLocalRowIndicesViewType the type of the local row indices view +/// \tparam InputLocalRowPidsViewType the type of the local process IDs view +template +class NumPacketsAndOffsetsFunctor{ +public: + typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type; + typedef typename CountsViewType::non_const_value_type count_type; + typedef typename InputOffsetsViewType::non_const_value_type input_offset_type; + typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type; + typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type; + // output Views drive where execution happens. 
+ typedef typename OutputOffsetsViewType::device_type device_type; + static_assert (std::is_same::value, + "OutputOffsetsViewType and CountsViewType must have the same execution space."); + static_assert (Kokkos::Impl::is_view::value, + "OutputOffsetsViewType must be a Kokkos::View."); + static_assert (std::is_same::value, + "OutputOffsetsViewType must be a nonconst Kokkos::View."); + static_assert (std::is_integral::value, + "The type of each entry of OutputOffsetsViewType must be a built-in integer type."); + static_assert (Kokkos::Impl::is_view::value, + "CountsViewType must be a Kokkos::View."); + static_assert (std::is_same::value, + "CountsViewType must be a nonconst Kokkos::View."); + static_assert (std::is_integral::value, + "The type of each entry of CountsViewType must be a built-in integer type."); + static_assert (Kokkos::Impl::is_view::value, + "InputOffsetsViewType must be a Kokkos::View."); + static_assert (std::is_integral::value, + "The type of each entry of InputOffsetsViewType must be a built-in integer type."); + static_assert (Kokkos::Impl::is_view::value, + "InputLocalRowIndicesViewType must be a Kokkos::View."); + static_assert (std::is_integral::value, + "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type."); + + NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets, + const CountsViewType& counts, + const InputOffsetsViewType& rowOffsets, + const InputLocalRowIndicesViewType& lclRowInds, + const InputLocalRowPidsViewType& lclRowPids) : + outputOffsets_ (outputOffsets), + counts_ (counts), + rowOffsets_ (rowOffsets), + lclRowInds_ (lclRowInds), + lclRowPids_ (lclRowPids), + error_ ("error") // don't forget this, or you'll get segfaults! 
+ { + if (debug) { + const size_t numRowsToPack = static_cast (lclRowInds_.dimension_0 ()); + + if (numRowsToPack != static_cast (counts_.dimension_0 ())) { + std::ostringstream os; + os << "lclRowInds.dimension_0() = " << numRowsToPack + << " != counts.dimension_0() = " << counts_.dimension_0 () + << "."; + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ()); + } + if (static_cast (numRowsToPack + 1) != + static_cast (outputOffsets_.dimension_0 ())) { + std::ostringstream os; + os << "lclRowInds.dimension_0() + 1 = " << (numRowsToPack + 1) + << " != outputOffsets.dimension_0() = " << outputOffsets_.dimension_0 () + << "."; + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ()); + } + } + } + + KOKKOS_INLINE_FUNCTION void + operator() (const local_row_index_type& curInd, + output_offset_type& update, + const bool final) const + { + if (debug) { + if (curInd < static_cast (0)) { + error_ () = 1; + return; + } + } + + if (final) { + if (debug) { + if (curInd >= static_cast (outputOffsets_.dimension_0 ())) { + error_ () = 2; + return; + } + } + outputOffsets_(curInd) = update; + } + + if (curInd < static_cast (counts_.dimension_0 ())) { + const auto lclRow = lclRowInds_(curInd); + if (static_cast (lclRow + 1) >= static_cast (rowOffsets_.dimension_0 ()) || + static_cast (lclRow) < static_cast (0)) { + error_ () = 3; + return; + } + // count_type could differ from the type of each row offset. + // For example, row offsets might each be 64 bits, but if their + // difference always fits in 32 bits, we may then safely use a + // 32-bit count_type. + const count_type count = + static_cast (rowOffsets_(lclRow+1) - rowOffsets_(lclRow)); + + // We pack first the global column indices and then pids (if any), + // However, if the number of entries in the row is zero, we pack nothing. + const count_type numEntToPack = (count == 0) + ? static_cast(0) + : count * (1 + (lclRowPids_.size() > 0 ? 
1 : 0)); + + if (final) { + counts_(curInd) = numEntToPack; + } + update += numEntToPack; + } + } + + // mfh 31 May 2017: Don't need init or join. If you have join, MUST + // have join both with and without volatile! Otherwise intrawarp + // joins are really slow on GPUs. + + //! Host function for getting the error. + int getError () const { + auto error_h = Kokkos::create_mirror_view (error_); + Kokkos::deep_copy (error_h, error_); + return error_h (); + } + +private: + OutputOffsetsViewType outputOffsets_; + CountsViewType counts_; + typename InputOffsetsViewType::const_type rowOffsets_; + typename InputLocalRowIndicesViewType::const_type lclRowInds_; + typename InputLocalRowPidsViewType::const_type lclRowPids_; + Kokkos::View error_; +}; + +/// \brief Compute the number of packets and offsets for the pack procedure +/// +/// \tparam OutputOffsetsViewType the type of the output offsets view +/// \tparam CountsViewType the type of the counts view +/// \tparam InputOffsetsViewType the type of the input offsets view +/// \tparam InputLocalRowIndicesViewType the type of the local row indices view +/// \tparam InputLocalRowPidsViewType the type of the local process IDs view +/// +/// This is the high level interface to the NumPacketsAndOffsetsFunctor functor +template +typename CountsViewType::non_const_value_type +computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets, + const CountsViewType& counts, + const InputOffsetsViewType& rowOffsets, + const InputLocalRowIndicesViewType& lclRowInds, + const InputLocalRowPidsViewType& lclRowPids) +{ + typedef NumPacketsAndOffsetsFunctor functor_type; + typedef typename CountsViewType::non_const_value_type count_type; + typedef typename OutputOffsetsViewType::size_type size_type; + typedef typename OutputOffsetsViewType::execution_space execution_space; + typedef typename functor_type::local_row_index_type LO; + typedef Kokkos::RangePolicy range_type; + const char prefix[] = "computeNumPacketsAndOffsets: "; + + 
count_type count = 0; + const count_type numRowsToPack = lclRowInds.dimension_0 (); + + if (numRowsToPack == 0) { + return count; + } + else { + TEUCHOS_TEST_FOR_EXCEPTION + (rowOffsets.dimension_0 () <= static_cast (1), + std::invalid_argument, prefix << "There is at least one row to pack, " + "but the graph has no rows. lclRowInds.dimension_0() = " << + numRowsToPack << ", but rowOffsets.dimension_0() = " << + rowOffsets.dimension_0 () << " <= 1."); + TEUCHOS_TEST_FOR_EXCEPTION + (outputOffsets.dimension_0 () != + static_cast (numRowsToPack + 1), std::invalid_argument, + prefix << "Output dimension does not match number of rows to pack. " + << "outputOffsets.dimension_0() = " << outputOffsets.dimension_0 () + << " != lclRowInds.dimension_0() + 1 = " + << static_cast (numRowsToPack + 1) << "."); + TEUCHOS_TEST_FOR_EXCEPTION + (counts.dimension_0 () != numRowsToPack, std::invalid_argument, + prefix << "counts.dimension_0() = " << counts.dimension_0 () + << " != numRowsToPack = " << numRowsToPack << "."); + + functor_type f (outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids); + Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f); + + // At least in debug mode, this functor checks for errors. + const int errCode = f.getError (); + TEUCHOS_TEST_FOR_EXCEPTION + (errCode != 0, std::runtime_error, prefix << "parallel_scan error code " + << errCode << " != 0."); + +#if 0 + size_t total = 0; + for (LO k = 0; k < numRowsToPack; ++k) { + total += counts[k]; + } + if (outputOffsets(numRowsToPack) != total) { + if (errStr.get () == NULL) { + errStr = std::unique_ptr (new std::ostringstream ()); + } + std::ostringstream& os = *errStr; + os << prefix + << "outputOffsets(numRowsToPack=" << numRowsToPack << ") " + << outputOffsets(numRowsToPack) << " != sum of counts = " + << total << "." << std::endl; + if (numRowsToPack != 0) { + // Only print the array if it's not too long. 
+ if (numRowsToPack < static_cast (10)) { + os << "outputOffsets: ["; + for (LO i = 0; i <= numRowsToPack; ++i) { + os << outputOffsets(i); + if (static_cast (i + 1) <= numRowsToPack) { + os << ","; + } + } + os << "]" << std::endl; + os << "counts: ["; + for (LO i = 0; i < numRowsToPack; ++i) { + os << counts(i); + if (static_cast (i + 1) < numRowsToPack) { + os << ","; + } + } + os << "]" << std::endl; + } + else { + os << "outputOffsets(" << (numRowsToPack-1) << ") = " + << outputOffsets(numRowsToPack-1) << "." << std::endl; + } + } + count = outputOffsets(numRowsToPack); + return {false, errStr}; + } +#endif // 0 (disabled consistency check of offsets against counts) + + // Get last entry of outputOffsets, which is the sum of the entries + // of counts. Don't assume UVM. + using Tpetra::Details::getEntryOnHost; + return static_cast (getEntryOnHost (outputOffsets, + numRowsToPack)); + } +} + +/// \brief Packs a single row of the CrsGraph. +/// +/// \tparam ColumnMap the type of the local column map +/// +/// Data describing the row of the CRS graph are "packed" +/// (concatenated) into a single (view of) GO* in the following order: +/// +/// 1. GO column indices +/// 2. int process IDs (only if pack_pids is true) +/// +/// \param col_map [in] Local column map; converts each local column index to a global index. +/// \param exports [out] Pack buffer; written starting at position offset. +/// \param lids_in [in] Local column indices of the row. +/// \param pids_in [in] Process IDs, indexed by local column index. +/// \param offset [in] First position in exports that belongs to this row. +/// \param num_ent [in] Number of entries in the row. +/// \param pack_pids [in] Whether to pack process IDs after the column indices. +/// +/// \return Number of packets written: zero for an empty row, otherwise +/// num_ent, or twice num_ent if pack_pids is true. +/// +template +KOKKOS_FUNCTION +size_t +packRow(const ColumnMap& col_map, + const Kokkos::View& exports, + const typename PackTraits::input_array_type& lids_in, + const typename PackTraits::input_array_type& pids_in, + const size_t offset, + const size_t num_ent, + const bool pack_pids) +{ + using Kokkos::subview; + typedef typename ColumnMap::local_ordinal_type LO; + typedef typename ColumnMap::global_ordinal_type GO; + + if (num_ent == 0) { + // Empty rows always take zero bytes, to ensure sparsity. + return static_cast(0); + } + + size_t num_ent_packed = num_ent; + if (pack_pids) num_ent_packed += num_ent; + { + // Copy column indices one at a time, so that we don't need + // temporary storage.
+ for (size_t k = 0; k < num_ent; ++k) { + const LO lid = lids_in[k]; + const GO gid = col_map.getGlobalElement (lid); + exports(offset+k) = gid; + } + // Copy PIDs one at a time, so that we don't need temporary storage. + if (pack_pids) { + for (size_t k = 0; k < num_ent; ++k) { + const LO lid = lids_in[k]; + const int pid = pids_in[lid]; + exports(offset+num_ent+k) = static_cast(pid); + } + } + } + return num_ent_packed; +} + +template +struct PackCrsGraphFunctor { + typedef LocalGraph local_graph_type; + typedef LocalMap local_map_type; + typedef typename local_map_type::local_ordinal_type LO; + typedef typename local_map_type::global_ordinal_type GO; + typedef typename local_graph_type::device_type device_type; + + typedef Kokkos::View + num_packets_per_lid_view_type; + typedef Kokkos::View offsets_view_type; + typedef Kokkos::View exports_view_type; + typedef typename PackTraits::input_array_type + export_lids_view_type; + typedef typename PackTraits::input_array_type + source_pids_view_type; + + typedef typename num_packets_per_lid_view_type::non_const_value_type + count_type; + typedef typename offsets_view_type::non_const_value_type + offset_type; + typedef Kokkos::pair value_type; + + static_assert (std::is_same::value, + "local_map_type::local_ordinal_type and " + "local_graph_type::data_type must be the same."); + + local_graph_type local_graph; + local_map_type local_col_map; + exports_view_type exports; + num_packets_per_lid_view_type num_packets_per_lid; + export_lids_view_type export_lids; + source_pids_view_type source_pids; + offsets_view_type offsets; + bool pack_pids; + + PackCrsGraphFunctor(const local_graph_type& local_graph_in, + const local_map_type& local_col_map_in, + const exports_view_type& exports_in, + const num_packets_per_lid_view_type& num_packets_per_lid_in, + const export_lids_view_type& export_lids_in, + const source_pids_view_type& source_pids_in, + const offsets_view_type& offsets_in, + const bool pack_pids_in) : + local_graph 
(local_graph_in), + local_col_map (local_col_map_in), + exports (exports_in), + num_packets_per_lid (num_packets_per_lid_in), + export_lids (export_lids_in), + source_pids (source_pids_in), + offsets (offsets_in), + pack_pids (pack_pids_in) + { + const LO numRows = local_graph_in.numRows (); + const LO rowMapDim = + static_cast (local_graph.row_map.dimension_0 ()); + TEUCHOS_TEST_FOR_EXCEPTION + (numRows != 0 && rowMapDim != numRows + static_cast (1), + std::logic_error, "local_graph.row_map.dimension_0() = " + << rowMapDim << " != numRows (= " << numRows << " ) + 1."); + } + + KOKKOS_INLINE_FUNCTION void init (value_type& dst) const + { + using ::Tpetra::Details::OrdinalTraits; + dst = Kokkos::make_pair (0, OrdinalTraits::invalid ()); + } + + KOKKOS_INLINE_FUNCTION void + join (volatile value_type& dst, const volatile value_type& src) const + { + // `dst` should reflect the first (least) bad index and all other + // associated error codes and data, so prefer keeping it. + if (src.first != 0 && dst.first == 0) { + dst = src; + } + } + + KOKKOS_INLINE_FUNCTION + void operator() (const LO i, value_type& dst) const + { + const size_t offset = offsets[i]; + const LO export_lid = export_lids[i]; + const size_t buf_size = exports.size(); + const size_t num_packets_this_lid = num_packets_per_lid(i); + const size_t num_ent = + static_cast (local_graph.row_map[export_lid+1] + - local_graph.row_map[export_lid]); + + // Only pack this row's data if it has a nonzero number of + // entries. We can do this because receiving processes get the + // number of packets, and will know that zero packets means zero + // entries. 
+ if (num_ent == 0) { + return; + } + + if (export_lid >= static_cast(local_graph.numRows())) { + if (dst.first != 0) { // keep only the first error + dst = Kokkos::make_pair (1, i); // invalid row + } + return; + } + else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) { + if (dst.first != 0) { // keep only the first error + dst = Kokkos::make_pair (2, i); // out of bounds + } + return; + } + + // We can now pack this row + + // Since the graph is locally indexed on the calling process, we + // have to use its column Map (which it _must_ have in this case) + // to convert to global indices. + const auto row_beg = local_graph.row_map[export_lid]; + const auto row_end = local_graph.row_map[export_lid + 1]; + auto lids_in = subview (local_graph.entries, + Kokkos::make_pair (row_beg, row_end)); + typedef local_map_type LMT; + typedef Packet PT; + typedef BufferDeviceType BDT; + size_t num_ent_packed_this_row = + packRow(local_col_map, exports, lids_in, + source_pids, offset, num_ent, pack_pids); + if (num_ent_packed_this_row != num_packets_this_lid) { + if (dst.first != 0) { // keep only the first error + dst = Kokkos::make_pair (3, i); + } + } + } +}; + +/// \brief Perform the pack operation for the graph +/// +/// \tparam LocalGraph the specialization of the KokkosSparse::CrsGraph +/// local graph +/// \tparam LocalMap the type of the local column map +/// +/// This is a higher level interface to the PackCrsGraphFunctor +template +void +do_pack(const LocalGraph& local_graph, + const LocalMap& local_map, + const Kokkos::View& exports, + const typename PackTraits< + size_t, + BufferDeviceType + >::input_array_type& num_packets_per_lid, + const typename PackTraits< + typename LocalMap::local_ordinal_type, + typename LocalGraph::device_type + >::input_array_type& export_lids, + const typename PackTraits< + int, + typename LocalGraph::device_type + >::input_array_type& source_pids, + const Kokkos::View& offsets, + const bool pack_pids) +{ + typedef 
typename LocalMap::local_ordinal_type LO; + typedef typename LocalGraph::device_type device_type; + typedef Kokkos::RangePolicy range_type; + const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: "; + + if (export_lids.dimension_0 () != 0) { + TEUCHOS_TEST_FOR_EXCEPTION + (static_cast (offsets.dimension_0 ()) != + static_cast (export_lids.dimension_0 () + 1), + std::invalid_argument, prefix << "offsets.dimension_0() = " + << offsets.dimension_0 () << " != export_lids.dimension_0() (= " + << export_lids.dimension_0 () << ") + 1."); + TEUCHOS_TEST_FOR_EXCEPTION + (export_lids.dimension_0 () != num_packets_per_lid.dimension_0 (), + std::invalid_argument, prefix << "export_lids.dimension_0() = " << + export_lids.dimension_0 () << " != num_packets_per_lid.dimension_0() = " + << num_packets_per_lid.dimension_0 () << "."); + // If exports has nonzero length at this point, then the graph + // has at least one entry to pack. Thus, if packing process + // ranks, we had better have at least one process rank to pack. 
+ TEUCHOS_TEST_FOR_EXCEPTION + (pack_pids && exports.dimension_0 () != 0 && + source_pids.dimension_0 () == 0, std::invalid_argument, prefix << + "pack_pids is true, and exports.dimension_0() = " << + exports.dimension_0 () << " != 0, meaning that we need to pack at " + "least one graph entry, but source_pids.dimension_0() = 0."); + } + + typedef PackCrsGraphFunctor pack_functor_type; + pack_functor_type f (local_graph, local_map, exports, + num_packets_per_lid, export_lids, + source_pids, offsets, pack_pids); + + typename pack_functor_type::value_type result; + range_type range (0, num_packets_per_lid.dimension_0 ()); + Kokkos::parallel_reduce (range, f, result); + + if (result.first != 0) { + std::ostringstream os; + + if (result.first == 1) { // invalid local row index + auto export_lids_h = Kokkos::create_mirror_view (export_lids); + Kokkos::deep_copy (export_lids_h, export_lids); + const auto firstBadLid = export_lids_h(result.second); + os << "First bad export LID: export_lids(i=" << result.second << ") = " + << firstBadLid; + } + else if (result.first == 2) { // invalid offset + auto offsets_h = Kokkos::create_mirror_view (offsets); + Kokkos::deep_copy (offsets_h, offsets); + const auto firstBadOffset = offsets_h(result.second); + + auto num_packets_per_lid_h = + Kokkos::create_mirror_view (num_packets_per_lid); + Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid); + os << "First bad offset: offsets(i=" << result.second << ") = " + << firstBadOffset << ", num_packets_per_lid(i) = " + << num_packets_per_lid_h(result.second) << ", buf_size = " + << exports.size (); + } + + TEUCHOS_TEST_FOR_EXCEPTION + (true, std::runtime_error, prefix << "PackCrsGraphFunctor reported " + "error code " << result.first << " for the first bad row " + << result.second << ". " << os.str ()); + } +} + +/// \brief Pack specified entries of the given local sparse graph for +/// communication. +/// +/// \tparam LO The type of local indices. 
See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam NT The Kokkos Node type. See the documentation of Map +/// for requirements. +/// +/// \warning This is an implementation detail of Tpetra::CrsGraph. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param exports [in/out] Output pack buffer; resized if needed. +/// +/// \param num_packets_per_lid [out] Entry k gives the number of packets +/// packed for row export_lids[k] of the local graph. +/// +/// \param export_lids [in] Local indices of the rows to pack. +/// +/// \param export_pids [in] Process ranks for the column indices in each packed row. +/// +/// \param constant_num_packets [out] Always set to zero, which tells the caller +/// to expect a possibly different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// +/// \param pack_pids [in] Whether to pack the process ranks from export_pids +/// in addition to the column indices. +/// +/// The Distributor argument is unused. +template +void +packCrsGraph(const CrsGraph& sourceGraph, + Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& exports, + const Kokkos::View::buffer_device_type>& num_packets_per_lid, + const Kokkos::View& export_lids, + const Kokkos::View& export_pids, + size_t& constant_num_packets, + const bool pack_pids, + Distributor& /* dist */) +{ + using Kokkos::View; + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::buffer_device_type buffer_device_type; + typedef typename buffer_device_type::execution_space execution_space; + typedef Kokkos::DualView exports_view_type; + const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::packCrsGraph: "; + constexpr bool debug = false; + + auto local_graph = sourceGraph.getLocalGraph (); + auto local_col_map = sourceGraph.getColMap ()->getLocalMap (); + + // Setting this to zero tells the caller to expect a possibly + // different ("nonconstant") number of packets per local index + // (i.e., a possibly different number of entries per row).
+ constant_num_packets = 0; + + const size_t num_export_lids = + static_cast (export_lids.dimension_0 ()); + TEUCHOS_TEST_FOR_EXCEPTION + (num_export_lids != + static_cast (num_packets_per_lid.dimension_0 ()), + std::invalid_argument, prefix << "export_lids.dimension_0() = " + << num_export_lids << " != num_packets_per_lid.dimension_0() = " + << num_packets_per_lid.dimension_0 () << "."); + if (num_export_lids != 0) { + TEUCHOS_TEST_FOR_EXCEPTION + (num_packets_per_lid.ptr_on_device () == nullptr, std::invalid_argument, + prefix << "num_export_lids = "<< num_export_lids << " != 0, but " + "num_packets_per_lid.ptr_on_device() = " + << num_packets_per_lid.ptr_on_device () << " == NULL."); + } + + if (num_export_lids == 0) { + // FIXME (26 Apr 2016) Fences around (UVM) allocations only + // temporarily needed for #227 debugging. Should be able to + // remove them after that's fixed. + execution_space::fence (); + exports = exports_view_type ("exports", 0); + execution_space::fence (); + return; + } + + // Array of offsets into the pack buffer. + Kokkos::View offsets ("offsets", num_export_lids + 1); + + // Compute number of packets per LID (row to send), as well as + // corresponding offsets (the prefix sum of the packet counts). + const size_t count = + computeNumPacketsAndOffsets(offsets, num_packets_per_lid, + local_graph.row_map, export_lids, export_pids); + + // Resize the output pack buffer if needed. + if (count > static_cast (exports.dimension_0 ())) { + // FIXME (26 Apr 2016) Fences around (UVM) allocations only + // temporarily needed for #227 debugging. Should be able to + // remove them after that's fixed.
+ execution_space::fence (); + exports = exports_view_type ("exports", count); + if (debug) { + std::ostringstream os; + os << "*** exports resized to " << count << std::endl; + std::cerr << os.str (); + } + execution_space::fence (); + } + if (debug) { + std::ostringstream os; + os << "*** count: " << count << ", exports.dimension_0(): " + << exports.dimension_0 () << std::endl; + std::cerr << os.str (); + } + + // If exports has nonzero length at this point, then the graph has + // at least one entry to pack. Thus, if packing process ranks, we + // had better have at least one process rank to pack. + TEUCHOS_TEST_FOR_EXCEPTION + (pack_pids && exports.dimension_0 () != 0 && + export_pids.dimension_0 () == 0, std::invalid_argument, prefix << + "pack_pids is true, and exports.dimension_0() = " << + exports.dimension_0 () << " != 0, meaning that we need to pack at least " + "one graph entry, but export_pids.dimension_0() = 0."); + + typedef typename std::decay::type + local_graph_type; + typedef typename std::decay::type + local_map_type; + typedef typename exports_view_type::t_dev dev_exports_view_type; + typedef typename dev_exports_view_type::memory_space buf_mem_space; + exports.template modify (); + auto exports_d = exports.template view (); + do_pack + (local_graph, local_col_map, exports_d, num_packets_per_lid, + export_lids, export_pids, offsets, pack_pids); + // If we got this far, we succeeded. 
+} + +} // namespace PackCrsGraphImpl + +template +void +packCrsGraph(const CrsGraph& sourceGraph, + Teuchos::Array::packet_type>& exports, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor) +{ + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename local_graph_type::device_type device_type; + typedef typename Kokkos::View::HostMirror::execution_space host_exec_space; + typedef Kokkos::Device host_dev_type; + + // mfh 23 Aug 2017: Fix for #1088 requires pack / unpack buffers to + // have a possibly different memory space (CudaSpace) than the + // default CUDA memory space (currently CudaUVMSpace). + typedef typename device_type::execution_space buffer_exec_space; +#ifdef KOKKOS_HAVE_CUDA + typedef typename std::conditional< + std::is_same< + buffer_exec_space, Kokkos::Cuda + >::value, + Kokkos::CudaSpace, + typename device_type::memory_space + >::type buffer_memory_space; +#else + typedef typename device_type::memory_space buffer_memory_space; +#endif // KOKKOS_HAVE_CUDA + // @MFH: why not use CrsGraph::buffer_device_type??? + typedef Kokkos::Device buffer_device_type; + + // Convert all Teuchos::Array to Kokkos::View + + // This is an output array, so we don't have to copy to device here. + // However, we'll have to remember to copy back to host when done. + typename local_graph_type::device_type outputDevice; + auto num_packets_per_lid_d = + create_mirror_view_from_raw_host_array (outputDevice, + numPacketsPerLID.getRawPtr (), + numPacketsPerLID.size (), false, + "num_packets_per_lid"); + // This is an input array, so we have to copy to device here. + // However, we never need to copy it back to host. 
+ auto export_lids_d = + create_mirror_view_from_raw_host_array (outputDevice, + exportLIDs.getRawPtr (), + exportLIDs.size (), true, + "export_lids"); + // Create an empty array of PIDs + Kokkos::View export_pids_d ("export_pids", 0); + + Kokkos::DualView exports_dv ("exports", 0); + constexpr bool pack_pids = false; + PackCrsGraphImpl::packCrsGraph( + sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d, + export_pids_d, constantNumPackets, pack_pids, distor); + // The counts are an output of packCrsGraph, so we have to copy + // them back to host. + Kokkos::View num_packets_per_lid_h + (numPacketsPerLID.getRawPtr (), + numPacketsPerLID.size ()); + Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d); + + // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for + // exports_dv above, then we have two host copies for exports_h. + + // The exports are an output of packCrsGraph, so we have to + // copy them back to host. + if (static_cast (exports.size ()) != + static_cast (exports_dv.dimension_0 ())) { + exports.resize (exports_dv.dimension_0 ()); + } + Kokkos::View exports_h (exports.getRawPtr (), + exports.size ()); + Kokkos::deep_copy (exports_h, exports_dv.d_view); +} + +template +void +packCrsGraphNew(const CrsGraph& sourceGraph, + Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& exports, + const Kokkos::DualView::buffer_device_type>& numPacketsPerLID, + const Kokkos::DualView& exportLIDs, + size_t& constantNumPackets, + Distributor& distor) +{ + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename local_graph_type::device_type device_type; + + // mfh 23 Aug 2017: Fix for #1088 requires pack / unpack buffers to + // have a possibly different memory space (CudaSpace) than the + // default CUDA memory space (currently CudaUVMSpace). 
+ typedef typename device_type::execution_space buffer_exec_space; +#ifdef KOKKOS_HAVE_CUDA + typedef typename std::conditional< + std::is_same< + buffer_exec_space, Kokkos::Cuda + >::value, + Kokkos::CudaSpace, + typename device_type::memory_space + >::type buffer_memory_space; +#else + typedef typename device_type::memory_space buffer_memory_space; +#endif // KOKKOS_HAVE_CUDA + // @MFH: why not use CrsGraph::buffer_device_type??? + typedef Kokkos::Device buffer_device_type; + + // Create an empty array of PIDs, since the interface needs it. + Kokkos::View exportPIDs_d ("exportPIDs", 0); + constexpr bool pack_pids = false; + + // Write-only device access + auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV + numPacketsPerLID_nc.modified_host() = 0; + numPacketsPerLID_nc.modified_device() = 1; + auto numPacketsPerLID_d = numPacketsPerLID.template view (); + + // Read-only device access + auto exportLIDs_nc = Tpetra::Details::castAwayConstDualView (exportLIDs); + exportLIDs_nc.template sync (); + auto exportLIDs_d = exportLIDs.template view (); + + PackCrsGraphImpl::packCrsGraph( + sourceGraph, exports, numPacketsPerLID_d, exportLIDs_d, + exportPIDs_d, constantNumPackets, pack_pids, distor); +} + +template +void +packCrsGraphWithOwningPIDs(const CrsGraph& sourceGraph, + Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& + exports_dv, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& exportLIDs, + const Teuchos::ArrayView& sourcePIDs, + size_t& constantNumPackets, + Distributor& distor) +{ + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::buffer_device_type buffer_device_type; + typedef typename Kokkos::DualView::t_host::execution_space host_exec_space; + typedef Kokkos::Device host_dev_type; + + typename local_graph_type::device_type outputDevice; + + // Convert all Teuchos::Array to Kokkos::View + + // This is 
an output array, so we don't have to copy to device here. + // However, we'll have to remember to copy back to host when done. + auto num_packets_per_lid_d = + create_mirror_view_from_raw_host_array (buffer_device_type (), + numPacketsPerLID.getRawPtr (), + numPacketsPerLID.size (), false, + "num_packets_per_lid"); + + // This is an input array, so we have to copy to device here. + // However, we never need to copy it back to host. + auto export_lids_d = + create_mirror_view_from_raw_host_array (outputDevice, + exportLIDs.getRawPtr (), + exportLIDs.size (), true, + "export_lids"); + // This is an input array, so we have to copy to device here. + // However, we never need to copy it back to host. + auto export_pids_d = + create_mirror_view_from_raw_host_array (outputDevice, + sourcePIDs.getRawPtr (), + sourcePIDs.size (), true, + "export_pids"); + constexpr bool pack_pids = true; + PackCrsGraphImpl::packCrsGraph( + sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d, + export_pids_d, constantNumPackets, pack_pids, distor); + + // The counts are an output of packCrsGraph, so we + // have to copy them back to host. 
+ Kokkos::View num_packets_per_lid_h + (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ()); + Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d); +} + +} // namespace Details +} // namespace Tpetra + +#define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT( LO, GO, NT ) \ + template void \ + Details::packCrsGraph ( \ + const CrsGraph&, \ + Teuchos::Array::packet_type>&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t&, \ + Distributor&); \ + template void \ + Details::packCrsGraphNew ( \ + const CrsGraph&, \ + Kokkos::DualView::packet_type*, CrsGraph::buffer_device_type>&, \ + const Kokkos::DualView::buffer_device_type>&, \ + const Kokkos::DualView&, \ + size_t&, \ + Distributor&); \ + template void \ + Details::packCrsGraphWithOwningPIDs ( \ + const CrsGraph&, \ + Kokkos::DualView::packet_type*, CrsGraph::buffer_device_type>&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t&, \ + Distributor&); + +#endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp index 2144d78b9ed3..da4f16a0435f 100644 --- a/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_packCrsMatrix_def.hpp @@ -89,6 +89,7 @@ class Distributor; // namespace Details { +namespace PackCrsMatrixImpl { /// \brief Compute the number of packets and offsets for the pack procedure /// /// \tparam OutputOffsetsViewType the type of the output offsets view @@ -755,20 +756,20 @@ do_pack (const LocalMatrix& local_matrix, /// (i.e., a possibly different number of entries per row). 
template void -packCrsMatrixImpl (const CrsMatrix& sourceMatrix, - Kokkos::DualView& exports, - const Kokkos::View& num_packets_per_lid, - const Kokkos::View& export_lids, - const Kokkos::View& export_pids, - size_t& constant_num_packets, - const bool pack_pids, - Distributor& /* dist */) +packCrsMatrix (const CrsMatrix& sourceMatrix, + Kokkos::DualView& exports, + const Kokkos::View& num_packets_per_lid, + const Kokkos::View& export_lids, + const Kokkos::View& export_pids, + size_t& constant_num_packets, + const bool pack_pids, + Distributor& /* dist */) { using Kokkos::View; typedef BufferDeviceType DT; typedef typename DT::execution_space execution_space; typedef Kokkos::DualView exports_view_type; - const char prefix[] = "Tpetra::Details::packCrsMatrixImpl: "; + const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: "; constexpr bool debug = false; auto local_matrix = sourceMatrix.getLocalMatrix (); @@ -894,6 +895,8 @@ packCrsMatrixImpl (const CrsMatrix& sourceMatrix, // If we got this far, we succeeded. } +} // namespace PackCrsMatrixImpl + template void packCrsMatrix (const CrsMatrix& sourceMatrix, @@ -948,15 +951,12 @@ packCrsMatrix (const CrsMatrix& sourceMatrix, Kokkos::DualView exports_dv ("exports", 0); constexpr bool pack_pids = false; - packCrsMatrixImpl (sourceMatrix, - exports_dv, - num_packets_per_lid_d, - export_lids_d, - export_pids_d, - constantNumPackets, - pack_pids, distor); - // The counts are an output of packCrsMatrixImpl, so we have to copy - // them back to host. + PackCrsMatrixImpl::packCrsMatrix ( + sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d, + export_pids_d, constantNumPackets, pack_pids, distor); + + // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to + // copy them back to host. 
Kokkos::View num_packets_per_lid_h (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ()); @@ -965,8 +965,8 @@ packCrsMatrix (const CrsMatrix& sourceMatrix, // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for // exports_dv above, then we have two host copies for exports_h. - // The exports are an output of packCrsMatrixImpl, so we have to - // copy them back to host. + // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have + // to copy them back to host. if (static_cast (exports.size ()) != static_cast (exports_dv.dimension_0 ())) { exports.resize (exports_dv.dimension_0 ()); @@ -1021,13 +1021,9 @@ packCrsMatrixNew (const CrsMatrix& sourceMatrix, exportLIDs_nc.template sync (); auto exportLIDs_d = exportLIDs.template view (); - packCrsMatrixImpl (sourceMatrix, - exports, - numPacketsPerLID_d, - exportLIDs_d, - exportPIDs_d, - constantNumPackets, - pack_pids, distor); + PackCrsMatrixImpl::packCrsMatrix ( + sourceMatrix, exports, numPacketsPerLID_d, exportLIDs_d, + exportPIDs_d, constantNumPackets, pack_pids, distor); } template @@ -1072,12 +1068,12 @@ packCrsMatrixWithOwningPIDs (const CrsMatrix& sourceMatrix, sourcePIDs.size (), true, "export_pids"); constexpr bool pack_pids = true; - packCrsMatrixImpl (sourceMatrix, exports_dv, - num_packets_per_lid_d, export_lids_d, - export_pids_d, constantNumPackets, pack_pids, distor); + PackCrsMatrixImpl::packCrsMatrix( + sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d, + export_pids_d, constantNumPackets, pack_pids, distor); - // The counts are an output of packCrsMatrixImpl, so we - // have to copy them back to host. + // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to + // copy them back to host. 
Kokkos::View num_packets_per_lid_h (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ()); Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d); diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine.cpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine.cpp new file mode 100644 index 000000000000..a6a49b762060 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine.cpp @@ -0,0 +1,67 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#include "TpetraCore_config.h" + +#if defined(HAVE_TPETRA_EXPLICIT_INSTANTIATION) + +// We protect the contents of this file with macros, to assist +// applications that circumvent Trilinos' build system. (We do NOT +// recommend this.) That way, they can still build this file, but as +// long as the macros have correct definitions, they won't build +// anything that's not enabled. 
+ +#include "KokkosCompat_ClassicNodeAPI_Wrapper.hpp" +#include "Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp" +#include "Tpetra_Details_unpackCrsGraphAndCombine_def.hpp" +#include "TpetraCore_ETIHelperMacros.h" + +namespace Tpetra { + + TPETRA_ETI_MANGLING_TYPEDEFS() + + TPETRA_INSTANTIATE_LGN( TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT ) + +} // namespace Tpetra + +#endif // Whether we should build this specialization diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp new file mode 100644 index 000000000000..c3a42941e9b1 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp @@ -0,0 +1,262 @@ +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DECL_HPP +#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DECL_HPP + +#include "TpetraCore_config.h" +#include "Tpetra_CombineMode.hpp" +#include "Kokkos_DualView.hpp" +#include "Tpetra_DistObject_decl.hpp" + +/// \file Tpetra_Details_unpackCrsGraphAndCombine_decl.hpp +/// \brief Declaration of functions for unpacking the entries of a +/// Tpetra::CrsGraph for communication, in the case where it is +/// valid to go to the KokkosSparse::CrsGraph (local sparse graph +/// data structure) directly. +/// \warning This file, and its contents, are implementation details +/// of Tpetra. The file itself or its contents may disappear or +/// change at any time. +/// +/// Data (bytes) describing the row of the CRS graph are "packed" +/// (concatenated) in to a (view of) packet_type* object in the following order: +/// +/// 1. global column indices (GO) +/// 2. 
process IDs (optional, int) +/// +/// The functions in this file are companions to +/// Tpetra_Details_packCrsGraph.hpp, i.e., Tpetra_Details_packCrsGraph.hpp +/// implements the packing order described above to ensure proper unpacking. + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace Teuchos { +// Forward declaration of Array +template class Array; +// Forward declaration of ArrayView +template class ArrayView; +} // namespace Teuchos +#endif // DOXYGEN_SHOULD_SKIP_THIS + +namespace Tpetra { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// Forward declaration of Distributor +class Distributor; +// Forward declaration of CrsGraph +template +class CrsGraph; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +// +// Users must never rely on anything in the Details namespace. +// +namespace Details { + +/// \brief Unpack the imported column indices and combine +/// into graph. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam NT The Node type. See the documentation of Map +/// for requirements. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param imports [in] Input pack buffer +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row importLIDs[k] of the local graph. +/// +/// \param importLIDs [in] Local indices of the rows to unpack. +/// +/// \param constantNumPackets [in] Zero tells the callee +/// to expect a possibly different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// +/// \param distor [in] The distributor (not used) +/// +/// \param combineMode [in] the mode to use for combining +/// +/// \param atomic [in] whether or not to do atomic adds/replaces into the graph +/// +/// \warning The allowed \c combineMode are: +/// ADD, REPLACE, and ABSMAX. INSERT is not allowed.
+/// +/// This is the public interface to the unpack and combine machinery and +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back in to the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. +template +void +unpackCrsGraphAndCombine( + const CrsGraph& sourceGraph, + const Teuchos::ArrayView::packet_type>& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& importLIDs, + size_t constantNumPackets, + Distributor & distor, + CombineMode combineMode, + const bool atomic); + +template +void +unpackCrsGraphAndCombineNew( + const CrsGraph& sourceGraph, + const Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& + imports, + const Kokkos::DualView::buffer_device_type>& + numPacketsPerLID, + const Kokkos::DualView& importLIDs, + const size_t constantNumPackets, + Distributor & distor, + const CombineMode combineMode, + const bool atomic); + +/// \brief Special version of Tpetra::Details::unpackCrsGraphAndCombine +/// that also unpacks owning process ranks. +/// +/// Perform the count for unpacking the imported column indices and pids, +/// and combining them into graph. Return (a ceiling on) +/// the number of local stored entries ("nonzeros") in the graph. If +/// there are no shared rows in the sourceGraph this count is exact. +/// +/// Note: This routine also counts the copyAndPermute nonzeros in +/// addition to those that come in via import. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam NT The Kokkos Node type. See the documentation of Map +/// for requirements. 
+/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param imports [in] Input pack buffer +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row importLIDs[k] of the local graph. +/// +/// \param importLIDs [in] Local indices of the rows to unpack. +/// +/// \param constantNumPackets [in] Zero tells the callee +/// to expect a possibly different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// +/// \param distor [in] The distributor (not used) +/// +/// \param combineMode [in] the mode to use for combining +/// +/// \param numSameIDs [in] +/// +/// \param permuteToLIDs [in] +/// +/// \param permuteFromLIDs [in] +/// +/// \warning The allowed \c combineMode are: +/// ADD, REPLACE, and ABSMAX. INSERT is not allowed. +/// +/// \warning This method is intended for expert developer use +/// only, and should never be called by user code. +/// +/// Note: This is the public interface to the unpack and combine machinery and +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back in to the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. +template +size_t +unpackAndCombineWithOwningPIDsCount( + const CrsGraph & sourceGraph, + const Teuchos::ArrayView &importLIDs, + const Teuchos::ArrayView::packet_type> &imports, + const Teuchos::ArrayView& numPacketsPerLID, + size_t constantNumPackets, + Distributor &distor, + CombineMode combineMode, + size_t numSameIDs, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs); + +/// \brief unpackAndCombineIntoCrsArrays +/// +/// \note You should call unpackAndCombineWithOwningPIDsCount first +/// and allocate all arrays accordingly, before calling this +/// function.
+/// +/// Note: The SourcePids vector (on input) should contain owning PIDs +/// for each column in the (source) ColMap, as from +/// Tpetra::Import_Util::getPids, with the "-1 for local" option being +/// used. +/// +/// Note: The TargetPids vector (on output) will contain owning PIDs +/// for each entry in the graph, with the "-1 for local" for locally +/// owned entries. +template +void +unpackAndCombineIntoCrsArrays( + const CrsGraph & sourceGraph, + const Teuchos::ArrayView& importLIDs, + const Teuchos::ArrayView::packet_type>& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const size_t constantNumPackets, + Distributor& distor, + const CombineMode combineMode, + const size_t numSameIDs, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, + size_t TargetNumRows, + size_t TargetNumNonzeros, + const int MyTargetPID, + const Teuchos::ArrayView& CRS_rowptr, + const Teuchos::ArrayView& CRS_colind, + const Teuchos::ArrayView& SourcePids, + Teuchos::Array& TargetPids); + +} // namespace Details +} // namespace Tpetra + +#endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DECL_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp new file mode 100644 index 000000000000..acec53cbe346 --- /dev/null +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsGraphAndCombine_def.hpp @@ -0,0 +1,1292 @@ +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. 
Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER + +#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP +#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP + +#include "TpetraCore_config.h" +#include "Teuchos_Array.hpp" +#include "Teuchos_ArrayView.hpp" +#include "Tpetra_Details_castAwayConstDualView.hpp" +#include "Tpetra_Details_computeOffsets.hpp" +#include "Tpetra_Details_createMirrorView.hpp" +#include "Tpetra_Details_OrdinalTraits.hpp" +#include "Tpetra_Details_Behavior.hpp" +#include "Tpetra_CrsGraph_decl.hpp" +#include "Tpetra_Details_getEntryOnHost.hpp" +#include "Kokkos_Core.hpp" +#include +#include + +/// \file Tpetra_Details_unpackCrsGraphAndCombine_def.hpp +/// \brief Definition of functions for unpacking the entries of a +/// Tpetra::CrsGraph for communication, in the case where it is +/// valid to go to the KokkosSparse::CrsGraph (local sparse graph +/// data structure) directly. +/// \warning This file, and its contents, are implementation details +/// of Tpetra. The file itself or its contents may disappear or +/// change at any time. +/// +/// Data (bytes) describing the row of the CRS graph are "packed" +/// (concatenated) in to a (view of) Packet* object in the following order: +/// +/// 1. global column indices (GlobalOrdinal) +/// 2. process IDs (optional, int) +/// +/// The functions in this file are companions to +/// Tpetra_Details_packCrsGraph.hpp, i.e., Tpetra_Details_packCrsGraph.hpp +/// implements the packing order described above to ensure proper unpacking. + +namespace Tpetra { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +// Forward declaration of Distributor +class Distributor; +#endif // DOXYGEN_SHOULD_SKIP_THIS + +// +// Users must never rely on anything in the Details namespace. +// +namespace Details { + +namespace UnpackAndCombineCrsGraphImpl { + +/// \brief Unpack a single row of a CrsGraph +/// +/// \tparam LO The type of local indices.
See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam Device The Kokkos device type. See the documentation of Map +/// for requirements. +/// \tparam BufferDevice The "buffer device type." +template +KOKKOS_FUNCTION int +unpackRow(typename Kokkos::View& gids_out, + typename Kokkos::View& pids_out, + const Kokkos::View& imports, + const size_t offset, + const size_t num_ent) +{ + typedef typename Kokkos::View::size_type size_type; + + if (num_ent == 0) { + // Empty rows always take zero bytes, to ensure sparsity. + return 0; + } + + // Unpack GIDs + for (size_type k=0; k 0) { + for (size_type k=0; k(imports(offset+num_ent+k)); + } + + return 0; +} + +/// \brief Unpacks and combines a single row of the CrsGraph. +/// +/// \tparam LocalGraph KokkosSparse::CrsGraph specialization. +/// \tparam LocalMap Type of the "local" column map +/// \tparam BufferDevice Type of the "buffer device type." +/// See Trilinos GitHub Issue #1088 for details. +/// +/// Data (bytes) describing the row of the CRS graph are "unpacked" +/// from a single (concatenated) (view of) Packet* directly into the +/// row of the graph. 
+template +class UnpackAndCombineFunctor { + + typedef Packet packet_type; + typedef LocalMap local_map_type; + typedef LocalGraph local_graph_type; + typedef BufferDevice buffer_device_type; + + typedef typename local_map_type::local_ordinal_type LO; + typedef typename local_map_type::global_ordinal_type GO; + // Kokkos::parallel_reduce fails to compile if named device_type and typedef + // is public + typedef typename local_map_type::device_type device_type; + typedef typename device_type::execution_space execution_space; + + typedef Kokkos::View num_packets_per_lid_type; + typedef Kokkos::View offsets_type; + typedef Kokkos::View input_buffer_type; + typedef Kokkos::View import_lids_type; + + typedef Kokkos::View lids_scratch_type; + typedef Kokkos::View gids_scratch_type; + typedef Kokkos::View pids_scratch_type; + + static_assert(std::is_same::value, + "LocalMap::local_ordinal_type and " + "LocalGraph::data_type must be the same."); + + local_graph_type local_graph; + local_map_type local_col_map; + input_buffer_type imports; + num_packets_per_lid_type num_packets_per_lid; + import_lids_type import_lids; + offsets_type offsets; + Tpetra::CombineMode combine_mode; + size_t max_num_ent; + bool unpack_pids; + bool atomic; + Kokkos::Experimental::UniqueToken tokens; + lids_scratch_type lids_scratch; + gids_scratch_type gids_scratch; + pids_scratch_type pids_scratch; + + public: + typedef Kokkos::pair value_type; + + UnpackAndCombineFunctor( + const local_graph_type& local_graph_in, + const local_map_type& local_col_map_in, + const input_buffer_type& imports_in, + const num_packets_per_lid_type& num_packets_per_lid_in, + const import_lids_type& import_lids_in, + const offsets_type& offsets_in, + const Tpetra::CombineMode combine_mode_in, + const size_t max_num_ent_in, + const bool unpack_pids_in, + const bool atomic_in) : + local_graph(local_graph_in), + local_col_map(local_col_map_in), + imports(imports_in), + num_packets_per_lid(num_packets_per_lid_in), + 
import_lids(import_lids_in), + offsets(offsets_in), + combine_mode(combine_mode_in), + max_num_ent(max_num_ent_in), + unpack_pids(unpack_pids_in), + atomic(atomic_in), + tokens(execution_space()), + lids_scratch("pids_scratch", tokens.size() * max_num_ent), + gids_scratch("gids_scratch", tokens.size() * max_num_ent), + pids_scratch("lids_scratch", tokens.size() * max_num_ent) + {} + + KOKKOS_INLINE_FUNCTION void init(value_type& dst) const + { + using Tpetra::Details::OrdinalTraits; + dst = Kokkos::make_pair(0, OrdinalTraits::invalid()); + } + + KOKKOS_INLINE_FUNCTION void + join(volatile value_type& dst, const volatile value_type& src) const + { + // `dst` should reflect the first (least) bad index and + // all other associated error codes and data. Thus, we need only + // check if the `src` object shows an error and if its associated + // bad index is less than `dst`'s bad index. + using Tpetra::Details::OrdinalTraits; + if (src.second != OrdinalTraits::invalid()) { + // An error in the src; check if + // 1. `dst` shows errors + // 2. If `dst` does show errors, if src's bad index is less than + // *this' bad index + if (dst.second == OrdinalTraits::invalid() || + src.second < dst.second) { + dst = src; + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const LO i, value_type& dst) const + { + using Kokkos::View; + using Kokkos::subview; + using Kokkos::MemoryUnmanaged; + typedef typename execution_space::size_type size_type; + typedef typename Kokkos::pair slice; + + typedef View lids_out_type; + typedef View pids_out_type; + typedef View gids_out_type; + + const size_t num_packets_this_lid = num_packets_per_lid(i); + const size_t num_ent = (unpack_pids) ? 
num_packets_this_lid/2 + : num_packets_this_lid; + if (unpack_pids && num_packets_this_lid%2 != 0) { + // Attempting to unpack PIDs, but num_packets_this_lid is not even; this + // should never + dst = Kokkos::make_pair(1, i); + return; + } + + // Only unpack data if there is a nonzero number to unpack + if (num_ent == 0) { + return; + } + + // there is actually something in the row + const size_t buf_size = imports.size(); + const size_t offset = offsets(i); + + if (offset > buf_size || offset + num_packets_this_lid > buf_size) { + dst = Kokkos::make_pair(2, i); // out of bounds + return; + } + + // Get subviews in to the scratch arrays. The token returned from acquire + // is an integer in [0, tokens.size()). It is used to grab a unique (to + // this thread) subview of the scratch arrays. + const size_type token = tokens.acquire(); + const size_t a = static_cast(token) * max_num_ent; + const size_t b = a + num_ent; + lids_out_type lids_out = subview(lids_scratch, slice(a, b)); + gids_out_type gids_out = subview(gids_scratch, slice(a, b)); + pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a))); + + // Unpack this row! 
+ int err = unpackRow( + gids_out, pids_out, imports, offset, num_ent); + + if (err != 0) { + dst = Kokkos::make_pair(3, i); + return; + } + + // Column indices come in as global indices, in case the + // source object's column Map differs from the target object's + // (this's) column Map, and must be converted local index values + for (size_t k = 0; k < num_ent; ++k) { + lids_out(k) = local_col_map.getLocalElement(gids_out(k)); + } + + tokens.release(token); + } +}; + +/// \brief Perform the unpack operation for the graph +/// +/// \tparam LocalGraph the specialization of the KokkosSparse::CrsGraph +/// local graph +/// \tparam LocalMap the type of the local column map +/// +/// This is a higher level interface to the UnpackAndCombineFunctor +template +void +unpackAndCombine( + const LocalGraph& local_graph, + const LocalMap& local_map, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, + const Kokkos::View& import_lids, + const Tpetra::CombineMode combine_mode, + const bool unpack_pids, + const bool atomic) +{ + + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, + "unpackAndCombine[New] should not (yet) be called, the method is " + "incomplete. To complete, indices need to be inserted (unpacked) in to " + "the destination graph. 
The local graph, a Kokkos::StaticCrsGraph, does " + "not support insertion of indices"); + + typedef typename LocalMap::local_ordinal_type LO; + typedef typename LocalMap::device_type device_type; + typedef typename device_type::execution_space execution_space; + typedef Kokkos::RangePolicy > range_policy; + typedef UnpackAndCombineFunctor unpack_functor_type; + + const char prefix[] = + "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: "; + + const size_t num_import_lids = static_cast(import_lids.dimension_0()); + if (num_import_lids == 0) { + // Nothing to unpack + return; + } + + { + TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT, + std::invalid_argument, + prefix << "INSERT combine mode is not allowed if the graph has a static graph " + "(i.e., was constructed with the CrsGraph constructor that takes a " + "const CrsGraph pointer)."); + + // Unknown combine mode! + TEUCHOS_TEST_FOR_EXCEPTION(combine_mode != REPLACE, + std::invalid_argument, + prefix << "Invalid combine mode; should never get " + "here! Please report this bug to the Tpetra developers."); + + // Check that sizes of input objects are consistent. + bool bad_num_import_lids = + num_import_lids != static_cast(num_packets_per_lid.dimension_0()); + TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids, + std::invalid_argument, + prefix << "importLIDs.size() (" << num_import_lids << ") != " + "numPacketsPerLID.size() (" << num_packets_per_lid.dimension_0() << ")."); + } // end QA error checking + + // Get the offsets + Kokkos::View offsets("offsets", num_import_lids+1); + computeOffsetsFromCounts(offsets, num_packets_per_lid); + + // Determine the maximum number of entries in any row in the graph. The + // maximum number of entries is needed to allocate unpack buffers on the + // device. 
+ size_t max_num_ent; + Kokkos::parallel_reduce("MaxReduce", + num_packets_per_lid.size(), + KOKKOS_LAMBDA(const int& i, size_t& running_max_num_ent) { + size_t num_packets_this_lid = num_packets_per_lid(i); + size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 + : num_packets_this_lid; + if (num_ent > running_max_num_ent) running_max_num_ent = num_ent; + }, Kokkos::Experimental::Max(max_num_ent)); + + // Now do the actual unpack! + unpack_functor_type f(local_graph, local_map, + imports, num_packets_per_lid, import_lids, offsets, combine_mode, + max_num_ent, unpack_pids, atomic); + + typename unpack_functor_type::value_type x; + Kokkos::parallel_reduce(range_policy(0, static_cast(num_import_lids)), f, x); + auto x_h = x.to_std_pair(); + TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error, + prefix << "UnpackAndCombineFunctor reported error code " + << x_h.first << " for the first bad row " << x_h.second); + + return; +} + +template +size_t +unpackAndCombineWithOwningPIDsCount( + const LocalGraph& local_graph, + const Kokkos::View permute_from_lids, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, + const size_t num_same_ids) +{ + using Kokkos::parallel_reduce; + typedef LocalGraph local_graph_type; + typedef typename local_graph_type::data_type LO; + typedef typename local_graph_type::device_type device_type; + typedef typename device_type::execution_space execution_space; + typedef Kokkos::RangePolicy > range_policy; + + size_t count = 0; + LO num_items; + + // Number of graph entries to unpack (returned by this function). + num_items = static_cast(num_same_ids); + if (num_items) { + size_t kcnt = 0; + parallel_reduce( + range_policy(0, num_items), + KOKKOS_LAMBDA(const LO lid, size_t& update) { + update += static_cast(local_graph.row_map[lid+1] + -local_graph.row_map[lid]); + }, kcnt); + count += kcnt; + } + + // Count entries copied directly from the source graph with permuting. 
+ num_items = static_cast(permute_from_lids.dimension_0()); + if (num_items) { + size_t kcnt = 0; + parallel_reduce( + range_policy(0, num_items), + KOKKOS_LAMBDA(const LO i, size_t& update) { + const LO lid = permute_from_lids(i); + update += static_cast(local_graph.row_map[lid+1] + - local_graph.row_map[lid]); + }, kcnt); + count += kcnt; + } + + { + // Count entries received from other MPI processes. + size_t tot_num_ent = 0; + Kokkos::parallel_reduce("SumReduce", + num_packets_per_lid.size(), + KOKKOS_LAMBDA(const int& i, size_t& lsum) { + lsum += num_packets_per_lid(i) / 2; + }, Kokkos::Experimental::Sum(tot_num_ent)); + count += tot_num_ent; + } + + return count; +} + +/// \brief Setup row pointers for remotes +template +void +setupRowPointersForRemotes( + const Kokkos::View& tgt_rowptr, + const Kokkos::View& import_lids, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid) +{ + using Kokkos::parallel_reduce; + typedef Device device_type; + typedef typename device_type::execution_space execution_space; + typedef typename Kokkos::View::size_type size_type; + typedef Kokkos::RangePolicy > range_policy; + + const size_type N = num_packets_per_lid.dimension_0(); + parallel_for("Setup row pointers for remotes", + range_policy(0, N), + KOKKOS_LAMBDA(const size_t i){ + typedef typename std::remove_reference::type atomic_incr_type; + const size_t num_packets_this_lid = num_packets_per_lid(i); + const size_t num_ent = num_packets_this_lid / 2; + Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent)); + }); +} + +// Convert array of row lengths to a CRS pointer array +template +void +makeCrsRowPtrFromLengths( + const Kokkos::View& tgt_rowptr, + const Kokkos::View& new_start_row) +{ + using Kokkos::parallel_scan; + typedef Device device_type; + typedef typename device_type::execution_space execution_space; + typedef typename Kokkos::View::size_type size_type; + typedef Kokkos::RangePolicy > range_policy; + const size_type N 
= new_start_row.dimension_0(); + parallel_scan( + range_policy(0, N), + KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) { + auto cur_val = tgt_rowptr(i); + if (final) { + tgt_rowptr(i) = update; + new_start_row(i) = tgt_rowptr(i); + } + update += cur_val; + } + ); +} + +template +void +copyDataFromSameIDs( + const Kokkos::View& tgt_colind, + const Kokkos::View& tgt_pids, + const Kokkos::View& new_start_row, + const Kokkos::View& tgt_rowptr, + const Kokkos::View& src_pids, + const LocalGraph& local_graph, + const LocalMap& local_col_map, + const size_t num_same_ids, + const int my_pid) +{ + using Kokkos::parallel_for; + typedef typename LocalMap::device_type device_type; + typedef typename LocalMap::local_ordinal_type LO; + typedef typename device_type::execution_space execution_space; + typedef Kokkos::RangePolicy > range_policy; + + parallel_for( + range_policy(0, num_same_ids), + KOKKOS_LAMBDA(const size_t i) { + typedef typename std::remove_reference::type atomic_incr_type; + + const LO src_lid = static_cast(i); + size_t src_row = local_graph.row_map(src_lid); + + const LO tgt_lid = static_cast(i); + const size_t tgt_row = tgt_rowptr(tgt_lid); + + const size_t nsr = local_graph.row_map(src_lid+1) + - local_graph.row_map(src_lid); + Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr)); + + for (size_t j=local_graph.row_map(src_lid); + j +void +copyDataFromPermuteIDs( + const Kokkos::View& tgt_colind, + const Kokkos::View& tgt_pids, + const Kokkos::View& new_start_row, + const Kokkos::View& tgt_rowptr, + const Kokkos::View& src_pids, + const Kokkos::View& permute_to_lids, + const Kokkos::View& permute_from_lids, + const LocalGraph& local_graph, + const LocalMap& local_col_map, + const int my_pid) +{ + using Kokkos::parallel_for; + typedef typename LocalMap::device_type device_type; + typedef typename LocalMap::local_ordinal_type LO; + typedef typename device_type::execution_space execution_space; + typedef typename 
Kokkos::View::size_type size_type; + typedef Kokkos::RangePolicy > range_policy; + + const size_type num_permute_to_lids = permute_to_lids.dimension_0(); + + parallel_for( + range_policy(0, num_permute_to_lids), + KOKKOS_LAMBDA(const size_t i) { + typedef typename std::remove_reference::type atomic_incr_type; + + const LO src_lid = permute_from_lids(i); + const size_t src_row = local_graph.row_map(src_lid); + + const LO tgt_lid = permute_to_lids(i); + const size_t tgt_row = tgt_rowptr(tgt_lid); + + size_t nsr = local_graph.row_map(src_lid+1) + - local_graph.row_map(src_lid); + Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr)); + + for (size_t j=local_graph.row_map(src_lid); + j +void +unpackAndCombineIntoCrsArrays2( + const Kokkos::View& tgt_colind, + const Kokkos::View& tgt_pids, + const Kokkos::View& new_start_row, + const Kokkos::View& offsets, + const Kokkos::View& import_lids, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, + const LocalGraph& local_graph, + const LocalMap /*& local_col_map*/, + const int my_pid) +{ + using Kokkos::View; + using Kokkos::subview; + using Kokkos::MemoryUnmanaged; + using Kokkos::parallel_reduce; + using Kokkos::atomic_fetch_add; + + typedef Packet packet_type; + typedef BufferDevice buffer_device_type; + typedef typename LocalMap::device_type device_type; + typedef typename LocalMap::local_ordinal_type LO; + typedef typename LocalMap::global_ordinal_type GO; + typedef typename device_type::execution_space execution_space; + typedef typename Kokkos::View::size_type size_type; + typedef typename Kokkos::pair slice; + typedef Kokkos::RangePolicy > range_policy; + + typedef View pids_out_type; + typedef View gids_out_type; + + const size_type num_import_lids = import_lids.size(); + const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: "; + + // RemoteIDs: Loop structure following UnpackAndCombine + int gbl_err_count; + parallel_reduce("Unpack and combine 
into CRS", + range_policy(0, num_import_lids), + KOKKOS_LAMBDA(const size_t i, int& err) { + typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type; + const size_t num_packets_this_lid = num_packets_per_lid(i); + const size_t num_ent = num_packets_this_lid / 2; + const size_t offset = offsets(i); + const LO lcl_row = import_lids(i); + const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent)); + const size_t end_row = start_row + num_ent; + + gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row)); + pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row)); + + err += unpackRow( + gids_out, pids_out, imports, offset, num_ent); + + // Correct target PIDs. + for (size_t j = 0; j < static_cast(num_ent); ++j) { + const int pid = pids_out(j); + pids_out(j) = (pid != my_pid) ? pid : -1; + } + }, gbl_err_count); + + TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0, + std::invalid_argument, prefix << + "Attempting to unpack PIDs, but num_ent is not even; this should never " + "happen! 
Please report this bug to the Tpetra developers."); + + return; +} + +template +void +unpackAndCombineIntoCrsArrays( + const LocalGraph & local_graph, + const LocalMap & local_col_map, + const Kokkos::View& import_lids, + const Kokkos::View& imports, + const Kokkos::View& num_packets_per_lid, + const Kokkos::View& permute_to_lids, + const Kokkos::View& permute_from_lids, + const Kokkos::View& tgt_rowptr, + const Kokkos::View& tgt_colind, + const Kokkos::View& src_pids, + const Kokkos::View& tgt_pids, + const size_t num_same_ids, + const size_t tgt_num_rows, + const size_t tgt_num_nonzeros, + const int my_tgt_pid) +{ + using Kokkos::View; + using Kokkos::subview; + using Kokkos::parallel_for; + using Kokkos::MemoryUnmanaged; + typedef Packet packet_type; + typedef LocalMap local_map_type; + typedef LocalGraph local_graph_type; + typedef BufferDevice buffer_device_type; + typedef typename LocalMap::device_type device_type; + typedef typename LocalMap::local_ordinal_type LO; + typedef typename device_type::execution_space execution_space; + typedef typename Kokkos::View::size_type size_type; + typedef Kokkos::RangePolicy > range_policy; + + const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: "; + + const size_t N = tgt_num_rows; + const size_t mynnz = tgt_num_nonzeros; + + // In the case of reduced communicators, the sourceGraph won't have + // the right "my_pid", so thus we have to supply it. 
+ const int my_pid = my_tgt_pid; + + // Zero the rowptr + parallel_for( + range_policy(0, N+1), + KOKKOS_LAMBDA(const size_t i) { + tgt_rowptr(i) = 0; + } + ); + + // same IDs: Always first, always in the same place + parallel_for( + range_policy(0, num_same_ids), + KOKKOS_LAMBDA(const size_t i) { + const LO tgt_lid = static_cast(i); + const LO src_lid = static_cast(i); + tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1) + - local_graph.row_map(src_lid); + } + ); + + // Permute IDs: Still local, but reordered + const size_type num_permute_to_lids = permute_to_lids.dimension_0(); + parallel_for( + range_policy(0, num_permute_to_lids), + KOKKOS_LAMBDA(const size_t i) { + const LO tgt_lid = permute_to_lids(i); + const LO src_lid = permute_from_lids(i); + tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1) + - local_graph.row_map(src_lid); + } + ); + + // Get the offsets from the number of packets per LID + const size_type num_import_lids = import_lids.dimension_0(); + View offsets("offsets", num_import_lids+1); + computeOffsetsFromCounts(offsets, num_packets_per_lid); + +#ifdef HAVE_TPETRA_DEBUG + { + auto nth_offset_h = getEntryOnHost(offsets, num_import_lids); + const bool condition = + nth_offset_h != static_cast(imports.dimension_0()); + TEUCHOS_TEST_FOR_EXCEPTION + (condition, std::logic_error, prefix + << "The final offset in bytes " << nth_offset_h + << " != imports.size() = " << imports.dimension_0() + << ". Please report this bug to the Tpetra developers."); + } +#endif // HAVE_TPETRA_DEBUG + + // Setup row pointers for remotes + setupRowPointersForRemotes( + tgt_rowptr, import_lids, imports, num_packets_per_lid); + + // If multiple processes contribute to the same row, we may need to + // update row offsets. This tracks that. 
+ View new_start_row("new_start_row", N+1); + + // Turn row length into a real CRS row pointer + makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row); + { + auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N); + bool condition = nth_tgt_rowptr_h != mynnz; + TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument, + prefix << "CRS_rowptr[last] = " << + nth_tgt_rowptr_h << "!= mynnz = " << mynnz << "."); + } + + // SameIDs: Copy the data over + copyDataFromSameIDs(tgt_colind, tgt_pids, new_start_row, + tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid); + + copyDataFromPermuteIDs(tgt_colind, tgt_pids, new_start_row, + tgt_rowptr, src_pids, permute_to_lids, permute_from_lids, + local_graph, local_col_map, my_pid); + + if (imports.dimension_0() <= 0) { + return; + } + + unpackAndCombineIntoCrsArrays2< + packet_type,local_graph_type,local_map_type,buffer_device_type>( + tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports, + num_packets_per_lid, local_graph, local_col_map, my_pid); + + return; +} + +} // namespace UnpackAndCombineCrsGraphImpl + +/// \brief Unpack the imported column indices and combine into graph. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam Node The Kokkos Node type. See the documentation of Map +/// for requirements. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param imports [in] Input pack buffer +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row exportLIDs[k] of the local graph. +/// +/// \param importLIDs [in] Local indices of the rows to pack. +/// +/// \param constantNumPackets [out] Setting this to zero tells the caller +/// to expect a possibly /// different ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). 
+/// +/// \param distor [in] The distributor (not used) +/// +/// \param combineMode [in] the mode to use for combining indices +/// +/// \param atomic [in] whether or not do atomic adds/replaces in to the graph +/// +/// \warning The allowed \c combineMode are: +/// REPLACE. INSERT is not allowed. +/// +/// This is the public interface to the unpack and combine machinery and +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back in to the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. +template +void +unpackCrsGraphAndCombine( + const CrsGraph& sourceGraph, + const Teuchos::ArrayView::packet_type>& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const Teuchos::ArrayView& importLIDs, + size_t constantNumPackets, + Distributor & distor, + CombineMode combineMode, + const bool atomic) +{ + using Kokkos::View; + typedef typename Node::device_type device_type; + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename CrsGraph::buffer_device_type buffer_device_type; + static_assert(std::is_same::value, + "Node::device_type and LocalGraph::device_type must be the same."); + + typedef typename device_type::execution_space execution_space; + typename execution_space::device_type outputDevice; + + typedef typename buffer_device_type::execution_space buffer_execution_space; + typename buffer_execution_space::device_type bufferOutputDevice; + + // Convert all Teuchos::Array to Kokkos::View. + + // numPacketsPerLID, importLIDs, and imports are input, so we have to copy + // them to device. Since unpacking is done directly in to the local graph + // (lclGraph), no copying needs to be performed after unpacking. 
+ auto imports_d = + create_mirror_view_from_raw_host_array(bufferOutputDevice, + imports.getRawPtr(), imports.size(), + true, "imports"); + + auto num_packets_per_lid_d = + create_mirror_view_from_raw_host_array(bufferOutputDevice, + numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(), + true, "num_packets_per_lid"); + + auto import_lids_d = + create_mirror_view_from_raw_host_array(outputDevice, + importLIDs.getRawPtr(), importLIDs.size(), + true, "import_lids"); + + auto local_graph = sourceGraph.getLocalGraph(); + auto local_col_map = sourceGraph.getColMap()->getLocalMap(); + + // Now do the actual unpack! + typedef decltype(local_col_map) local_map_type; + UnpackAndCombineCrsGraphImpl::unpackAndCombine< + packet_type,local_graph_type,local_map_type,buffer_device_type>( + local_graph, local_col_map, imports_d, num_packets_per_lid_d, + import_lids_d, combineMode, false, atomic); + + return; +} + +template +void +unpackCrsGraphAndCombineNew( + const CrsGraph& sourceGraph, + const Kokkos::DualView::packet_type*, + typename CrsGraph::buffer_device_type>& imports, + const Kokkos::DualView::buffer_device_type>& numPacketsPerLID, + const Kokkos::DualView& importLIDs, + const size_t constantNumPackets, + Distributor& distor, + const CombineMode combineMode, + const bool atomic) +{ + using Tpetra::Details::castAwayConstDualView; + using Kokkos::View; + typedef typename Node::device_type device_type; + typedef CrsGraph crs_graph_type; + typedef typename crs_graph_type::packet_type packet_type; + typedef typename crs_graph_type::local_graph_type local_graph_type; + typedef typename crs_graph_type::buffer_device_type buffer_device_type; + typedef typename buffer_device_type::memory_space buffer_memory_space; + typedef typename device_type::memory_space memory_space; + + static_assert(std::is_same::value, + "Node::device_type and LocalGraph::device_type must be " + "the same."); + + { + auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID); + 
numPacketsPerLID_nc.template sync(); + } + auto num_packets_per_lid_d = numPacketsPerLID.template view(); + + { + auto importLIDs_nc = castAwayConstDualView(importLIDs); + importLIDs_nc.template sync(); + } + auto import_lids_d = importLIDs.template view(); + + { + auto imports_nc = castAwayConstDualView(imports); + imports_nc.template sync(); + } + auto imports_d = imports.template view(); + + auto local_graph = sourceGraph.getLocalGraph(); + auto local_col_map = sourceGraph.getColMap()->getLocalMap(); + typedef decltype(local_col_map) local_map_type; + + // Now do the actual unpack! + UnpackAndCombineCrsGraphImpl::unpackAndCombine< + packet_type,local_graph_type,local_map_type,buffer_device_type>( + local_graph, local_col_map, imports_d, num_packets_per_lid_d, + import_lids_d, combineMode, false, atomic); +} + +/// \brief Special version of Tpetra::Details::unpackCrsGraphAndCombine +/// that also unpacks owning process ranks. +/// +/// Perform the count for unpacking the imported column indices and pids, +/// and combining them into graph. Return (a ceiling on) +/// the number of local stored entries ("nonzeros") in the graph. If +/// there are no shared rows in the sourceGraph this count is exact. +/// +/// Note: This routine also counts the copyAndPermute nonzeros in +/// addition to those that come in via import. +/// +/// \tparam LO The type of local indices. See the +/// documentation of Map for requirements. +/// \tparam GO The type of global indices. See the +/// documentation of Map for requirements. +/// \tparam Node The Kokkos Node type. See the documentation of Map +/// for requirements. +/// +/// \param sourceGraph [in] the CrsGraph source +/// +/// \param imports [in] Input pack buffer +/// +/// \param numPacketsPerLID [out] Entry k gives the number of bytes +/// packed for row exportLIDs[k] of the local graph. +/// +/// \param importLIDs [in] Local indices of the rows to pack. 
+/// +/// \param constantNumPackets [in] Zero means to expect a possibly different +/// ("nonconstant") number of packets per local index +/// (i.e., a possibly different number of entries per row). +/// Passed by value and not used by the count. +/// +/// \param distor [in] The distributor (not used) +/// +/// \param combineMode [in] the mode to use for combining (not used by the count) +/// +/// \param numSameIDs [in] Number of leading source rows (local indices +/// 0 to numSameIDs-1) whose entries are counted directly +/// +/// \param permuteToLIDs [in] Target local row indices of the permuted rows; +/// only its size is checked against permuteFromLIDs +/// +/// \param permuteFromLIDs [in] Source local row indices of the permuted rows, +/// whose entries are counted +/// +/// \warning The allowed \c combineMode values are: +/// ADD, REPLACE, and ABSMAX. INSERT is not allowed. +/// +/// \warning This method is intended for expert developer use +/// only, and should never be called by user code. +/// +/// Note: This is the public interface to the unpack and combine machinery and +/// converts passed Teuchos::ArrayView objects to Kokkos::View objects (and +/// copies back into the Teuchos::ArrayView objects, if needed). When +/// CrsGraph migrates fully to adopting Kokkos::DualView objects for its storage +/// of data, this procedure could be bypassed. 
+template +size_t +unpackAndCombineWithOwningPIDsCount( + const CrsGraph & sourceGraph, + const Teuchos::ArrayView &importLIDs, + const Teuchos::ArrayView::packet_type> &imports, + const Teuchos::ArrayView& numPacketsPerLID, + size_t constantNumPackets, + Distributor &distor, + CombineMode combineMode, + size_t numSameIDs, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs) +{ + /* constantNumPackets, distor and combineMode are not referenced in this body. */ using Kokkos::MemoryUnmanaged; + using Kokkos::View; + typedef typename Node::device_type device_type; + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename CrsGraph::buffer_device_type buffer_device_type; + const char prefix[] = "unpackAndCombineWithOwningPIDsCount: "; + /* The permute-to and permute-from LID lists must have equal length. */ + TEUCHOS_TEST_FOR_EXCEPTION + (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != " + "permuteFromLIDs.size() = " << permuteFromLIDs.size() << "."); + // FIXME (mfh 26 Jan 2015) If there are no entries on the calling + // process, then the graph is neither locally nor globally indexed. + const bool locallyIndexed = sourceGraph.isLocallyIndexed(); + TEUCHOS_TEST_FOR_EXCEPTION + (! 
locallyIndexed, std::invalid_argument, prefix << "The input " + "CrsGraph 'sourceGraph' must be locally indexed."); + /* importLIDs and numPacketsPerLID must have equal length. */ TEUCHOS_TEST_FOR_EXCEPTION + (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument, + prefix << "importLIDs.size() = " << importLIDs.size() << " != " + "numPacketsPerLID.size() = " << numPacketsPerLID.size() << "."); + /* Build Kokkos views from the raw host arrays for device_type and buffer_device_type. */ + auto local_graph = sourceGraph.getLocalGraph(); + auto permute_from_lids_d = + create_mirror_view_from_raw_host_array(device_type(), + permuteFromLIDs.getRawPtr(), + permuteFromLIDs.size(), true, + "permute_from_lids"); + auto imports_d = + create_mirror_view_from_raw_host_array(buffer_device_type(), + imports.getRawPtr(), + imports.size(), true, + "imports"); + auto num_packets_per_lid_d = + create_mirror_view_from_raw_host_array(buffer_device_type(), + numPacketsPerLID.getRawPtr(), + numPacketsPerLID.size(), true, + "num_packets_per_lid"); + /* Forward to the UnpackAndCombineCrsGraphImpl overload; permuteToLIDs and importLIDs are only size-checked above and are not passed on. */ + return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount< + packet_type,local_graph_type,buffer_device_type>( + local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs); +} + +/// \brief Fill the target CRS arrays (row pointers, column indices, owning PIDs) +/// from the source graph's same/permuted rows and the received rows. +/// +/// \note You should call unpackAndCombineWithOwningPIDsCount first +/// and allocate all arrays accordingly, before calling this +/// function. +/// +/// Note: The SourcePids vector (on input) should contain owning PIDs +/// for each column in the (source) ColMap, as from +/// Tpetra::Import_Util::getPids, with the "-1 for local" option being +/// used. +/// +/// Note: The TargetPids vector (on output) will contain owning PIDs +/// for each entry in the graph, with -1 marking locally +/// owned entries. 
+template +void +unpackAndCombineIntoCrsArrays( + const CrsGraph & sourceGraph, + const Teuchos::ArrayView& importLIDs, + const Teuchos::ArrayView::packet_type>& imports, + const Teuchos::ArrayView& numPacketsPerLID, + const size_t constantNumPackets, + Distributor& distor, + const CombineMode combineMode, + const size_t numSameIDs, + const Teuchos::ArrayView& permuteToLIDs, + const Teuchos::ArrayView& permuteFromLIDs, + size_t TargetNumRows, + size_t TargetNumNonzeros, + const int MyTargetPID, + const Teuchos::ArrayView& CRS_rowptr, + const Teuchos::ArrayView& CRS_colind, + const Teuchos::ArrayView& SourcePids, + Teuchos::Array& TargetPids) +{ + using Kokkos::View; + using Kokkos::deep_copy; + using Teuchos::ArrayView; + using Teuchos::outArg; + using Teuchos::REDUCE_MAX; + using Teuchos::reduceAll; + typedef LocalOrdinal LO; + typedef typename CrsGraph::packet_type packet_type; + typedef typename CrsGraph::local_graph_type local_graph_type; + typedef typename CrsGraph::buffer_device_type buffer_device_type; + typedef typename Node::device_type device_type; + typedef typename device_type::execution_space execution_space; + typedef typename buffer_device_type::execution_space buffer_execution_space; + typedef typename ArrayView::size_type size_type; + + const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: "; + + TEUCHOS_TEST_FOR_EXCEPTION( + TargetNumRows + 1 != static_cast(CRS_rowptr.size()), + std::invalid_argument, prefix << "CRS_rowptr.size() = " << + CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << "."); + + TEUCHOS_TEST_FOR_EXCEPTION( + permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument, + prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() + << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << "."); + const size_type numImportLIDs = importLIDs.size(); + + TEUCHOS_TEST_FOR_EXCEPTION( + numImportLIDs != numPacketsPerLID.size(), std::invalid_argument, + prefix << "importLIDs.size() = " << 
numImportLIDs << " != " + "numPacketsPerLID.size() = " << numPacketsPerLID.size() << "."); + + // Preseed TargetPids with -1 for local + if (static_cast(TargetPids.size()) != TargetNumNonzeros) { + TargetPids.resize(TargetNumNonzeros); + } + TargetPids.assign(TargetNumNonzeros, -1); + + // Grab pointers for sourceGraph + auto local_graph = sourceGraph.getLocalGraph(); + auto local_col_map = sourceGraph.getColMap()->getLocalMap(); + + // Convert input arrays to Kokkos::View + typename execution_space::device_type outputDevice; + typename buffer_execution_space::device_type bufferOutputDevice; + + auto import_lids_d = create_mirror_view_from_raw_host_array(outputDevice, + importLIDs.getRawPtr(), importLIDs.size(), + true, "import_lids"); + + auto imports_d = create_mirror_view_from_raw_host_array(bufferOutputDevice, + imports.getRawPtr(), imports.size(), + true, "imports"); + + auto num_packets_per_lid_d = create_mirror_view_from_raw_host_array(bufferOutputDevice, + numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(), + true, "num_packets_per_lid"); + + auto permute_from_lids_d = create_mirror_view_from_raw_host_array(outputDevice, + permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(), + true, "permute_from_lids"); + + auto permute_to_lids_d = create_mirror_view_from_raw_host_array(outputDevice, + permuteToLIDs.getRawPtr(), permuteToLIDs.size(), + true, "permute_to_lids"); + + auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice, + CRS_rowptr.getRawPtr(), CRS_rowptr.size(), + true, "crs_rowptr"); + + auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice, + CRS_colind.getRawPtr(), CRS_colind.size(), + true, "crs_colidx"); + + auto src_pids_d = create_mirror_view_from_raw_host_array(outputDevice, + SourcePids.getRawPtr(), SourcePids.size(), + true, "src_pids"); + + auto tgt_pids_d = create_mirror_view_from_raw_host_array(outputDevice, + TargetPids.getRawPtr(), TargetPids.size(), + true, "tgt_pids"); + + typedef 
decltype(local_col_map) local_map_type; + UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays< + packet_type,local_graph_type,local_map_type,buffer_device_type>( + local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d, + permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d, + tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID); + + // Copy outputs back to host + typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h( + CRS_rowptr.getRawPtr(), CRS_rowptr.size()); + deep_copy(crs_rowptr_h, crs_rowptr_d); + + typename decltype(crs_colind_d)::HostMirror crs_colind_h( + CRS_colind.getRawPtr(), CRS_colind.size()); + deep_copy(crs_colind_h, crs_colind_d); + + typename decltype(tgt_pids_d)::HostMirror tgt_pids_h( + TargetPids.getRawPtr(), TargetPids.size()); + deep_copy(tgt_pids_h, tgt_pids_d); + +} + +} // namespace Details +} // namespace Tpetra + +#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \ + template void \ + Details::unpackCrsGraphAndCombine( \ + const CrsGraph&, \ + const Teuchos::ArrayView::packet_type>&, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t, \ + Distributor&, \ + CombineMode, \ + const bool); \ + template void \ + Details::unpackCrsGraphAndCombineNew( \ + const CrsGraph&, \ + const Kokkos::DualView::packet_type*, \ + typename CrsGraph::buffer_device_type>&, \ + const Kokkos::DualView::buffer_device_type>&, \ + const Kokkos::DualView&, \ + const size_t, \ + Distributor&, \ + const CombineMode, \ + const bool); \ + template void \ + Details::unpackAndCombineIntoCrsArrays( \ + const CrsGraph &, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView::packet_type>&, \ + const Teuchos::ArrayView&, \ + const size_t, \ + Distributor&, \ + const CombineMode, \ + const size_t, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + size_t, \ + size_t, \ + const int, \ + const Teuchos::ArrayView&, \ + const 
Teuchos::ArrayView&, \ + const Teuchos::ArrayView&, \ + Teuchos::Array&); \ + template size_t \ + Details::unpackAndCombineWithOwningPIDsCount( \ + const CrsGraph &, \ + const Teuchos::ArrayView &, \ + const Teuchos::ArrayView::packet_type> &, \ + const Teuchos::ArrayView&, \ + size_t, \ + Distributor &, \ + CombineMode, \ + size_t, \ + const Teuchos::ArrayView&, \ + const Teuchos::ArrayView&); + +#endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP diff --git a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp index d3a4f6ad409b..3f8240fa55f3 100644 --- a/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp +++ b/packages/tpetra/core/src/Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp @@ -89,6 +89,8 @@ class Distributor; // namespace Details { +namespace UnpackAndCombineCrsMatrixImpl { + /// \brief Unpack a single row of a CrsMatrix /// /// \tparam ST The type of the numerical entries of the matrix. @@ -103,14 +105,14 @@ namespace Details { /// \tparam BDT The "buffer device type." template KOKKOS_FUNCTION int -unpack_crs_matrix_row (typename PackTraits::output_array_type& gids_out, - typename PackTraits::output_array_type& pids_out, - typename PackTraits::output_array_type& vals_out, - const Kokkos::View& imports, - const size_t offset, - const size_t num_bytes, - const size_t num_ent, - const size_t num_bytes_per_value) +unpackRow(typename PackTraits::output_array_type& gids_out, + typename PackTraits::output_array_type& pids_out, + typename PackTraits::output_array_type& vals_out, + const Kokkos::View& imports, + const size_t offset, + const size_t num_bytes, + const size_t num_ent, + const size_t num_bytes_per_value) { if (num_ent == 0) { // Empty rows always take zero bytes, to ensure sparsity. @@ -353,9 +355,9 @@ struct UnpackCrsMatrixAndCombineFunctor { // Unpack this row! 
int unpack_err = - unpack_crs_matrix_row (gids_out, pids_out, vals_out, - imports, offset, num_bytes, - num_ent, num_bytes_per_value); + unpackRow(gids_out, pids_out, vals_out, + imports, offset, num_bytes, + num_ent, num_bytes_per_value); if (unpack_err != 0) { dst = Kokkos::make_pair (unpack_err, i); // unpack error tokens.release (token); @@ -530,7 +532,7 @@ compute_total_num_entries ( /// This is a higher level interface to the UnpackCrsMatrixAndCombineFunctor template void -do_unpack_and_combine_into_crs_matrix( +unpackAndCombineIntoCrsMatrix( const LocalMatrix& local_matrix, const LocalMap& local_map, const Kokkos::View& imports, @@ -547,7 +549,8 @@ do_unpack_and_combine_into_crs_matrix( typedef Kokkos::RangePolicy > range_policy; typedef UnpackCrsMatrixAndCombineFunctor unpack_functor_type; - const char prefix[] = "Tpetra::Details::do_unpack_and_combine_into_crs_matrix: "; + const char prefix[] = + "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix: "; const size_t num_import_lids = static_cast(import_lids.dimension_0()); if (num_import_lids == 0) { @@ -615,7 +618,7 @@ do_unpack_and_combine_into_crs_matrix( template size_t -unpackAndCombineWithOwningPIDsCountImpl ( +unpackAndCombineWithOwningPIDsCount( const LocalMatrix& local_matrix, const typename PackTraits::input_array_type permute_from_lids, const Kokkos::View& imports, @@ -674,9 +677,9 @@ unpackAndCombineWithOwningPIDsCountImpl ( template KOKKOS_INLINE_FUNCTION size_t -unpack_crs_matrix_row_count (const Kokkos::View& imports, - const size_t offset, - const size_t num_bytes) +unpackRowCount(const Kokkos::View& imports, + const size_t offset, + const size_t num_bytes) { LO num_ent_LO = 0; if (num_bytes > 0) { @@ -693,7 +696,7 @@ unpack_crs_matrix_row_count (const Kokkos::View& imports, /// \brief Setup row pointers for remotes template int -setup_row_pointers_for_remotes( +setupRowPointersForRemotes( const typename PackTraits::output_array_type& tgt_rowptr, const typename 
PackTraits::input_array_type& import_lids, const Kokkos::View& imports, @@ -715,7 +718,7 @@ setup_row_pointers_for_remotes( typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type; const size_t num_bytes = num_packets_per_lid(i); const size_t offset = offsets(i); - const size_t num_ent = unpack_crs_matrix_row_count (imports, offset, num_bytes); + const size_t num_ent = unpackRowCount (imports, offset, num_bytes); if (num_ent == InvalidNum) { k_error += 1; } @@ -727,7 +730,7 @@ setup_row_pointers_for_remotes( // Convert array of row lengths to a CRS pointer array template void -make_crs_row_pointer_from_lengths( +makeCrsRowPtrFromLengths( const typename PackTraits::output_array_type& tgt_rowptr, const Kokkos::View& new_start_row) { @@ -750,7 +753,7 @@ make_crs_row_pointer_from_lengths( template void -copy_data_from_same_ids( +copyDataFromSameIDs( const typename PackTraits::output_array_type& tgt_colind, const typename PackTraits::output_array_type& tgt_pids, const typename PackTraits::output_array_type& tgt_vals, @@ -795,7 +798,7 @@ copy_data_from_same_ids( template void -copy_data_from_permute_ids( +copyDataFromPermuteIDs( const typename PackTraits::output_array_type& tgt_colind, const typename PackTraits::output_array_type& tgt_pids, const typename PackTraits::output_array_type& tgt_vals, @@ -844,7 +847,7 @@ copy_data_from_permute_ids( template int -do_unpack_and_combine_into_crs_arrays( +unpackAndCombineIntoCrsArrays2( const typename PackTraits::output_array_type& tgt_colind, const typename PackTraits::output_array_type& tgt_pids, const typename PackTraits::output_array_type& tgt_vals, @@ -894,7 +897,7 @@ do_unpack_and_combine_into_crs_arrays( // Empty buffer means that the row is empty. 
return; } - size_t num_ent = unpack_crs_matrix_row_count(imports, offset, num_bytes); + size_t num_ent = unpackRowCount(imports, offset, num_bytes); if (num_ent == InvalidNum) { k_error += 1; return; @@ -907,9 +910,9 @@ do_unpack_and_combine_into_crs_arrays( vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row)); pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row)); - k_error += unpack_crs_matrix_row (gids_out, pids_out, vals_out, - imports, offset, num_bytes, - num_ent, num_bytes_per_value); + k_error += unpackRow(gids_out, pids_out, vals_out, + imports, offset, num_bytes, + num_ent, num_bytes_per_value); // Correct target PIDs. for (size_t j = 0; j < static_cast(num_ent); ++j) { @@ -923,7 +926,7 @@ do_unpack_and_combine_into_crs_arrays( template void -unpackAndCombineIntoCrsArraysImpl( +unpackAndCombineIntoCrsArrays( const LocalMatrix & local_matrix, const LocalMap & local_col_map, const typename PackTraits::input_array_type& import_lids, @@ -954,7 +957,7 @@ unpackAndCombineIntoCrsArraysImpl( typedef Kokkos::RangePolicy > range_policy; typedef BufferDeviceType BDT; - const char prefix[] = "unpackAndCombineIntoCrsArraysImpl: "; + const char prefix[] = "unpackAndCombineIntoCrsArrays: "; const size_t N = tgt_num_rows; const size_t mynnz = tgt_num_nonzeros; @@ -1011,7 +1014,7 @@ unpackAndCombineIntoCrsArraysImpl( // Setup row pointers for remotes int k_error = - setup_row_pointers_for_remotes (tgt_rowptr, + setupRowPointersForRemotes(tgt_rowptr, import_lids, imports, num_packets_per_lid, offsets); TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix << " Error transferring data to target row pointers. 
" @@ -1022,7 +1025,7 @@ unpackAndCombineIntoCrsArraysImpl( View new_start_row ("new_start_row", N+1); // Turn row length into a real CRS row pointer - make_crs_row_pointer_from_lengths (tgt_rowptr, new_start_row); + makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row); { auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N); bool condition = nth_tgt_rowptr_h != mynnz; @@ -1032,10 +1035,10 @@ unpackAndCombineIntoCrsArraysImpl( } // SameIDs: Copy the data over - copy_data_from_same_ids(tgt_colind, tgt_pids, tgt_vals, new_start_row, + copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row, tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid); - copy_data_from_permute_ids(tgt_colind, tgt_pids, tgt_vals, new_start_row, + copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row, tgt_rowptr, src_pids, permute_to_lids, permute_from_lids, local_matrix, local_col_map, my_pid); @@ -1043,7 +1046,7 @@ unpackAndCombineIntoCrsArraysImpl( return; } - int unpack_err = do_unpack_and_combine_into_crs_arrays(tgt_colind, tgt_pids, + int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids, tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid, local_matrix, local_col_map, my_pid, num_bytes_per_value); TEUCHOS_TEST_FOR_EXCEPTION( @@ -1053,6 +1056,8 @@ unpackAndCombineIntoCrsArraysImpl( return; } +} // namespace UnpackAndCombineCrsMatrixImpl + /// \brief Unpack the imported column indices and values, and combine into matrix. /// /// \tparam ST The type of the numerical entries of the matrix. @@ -1135,8 +1140,9 @@ unpackCrsMatrixAndCombine( auto local_col_map = sourceMatrix.getColMap()->getLocalMap(); // Now do the actual unpack! 
- do_unpack_and_combine_into_crs_matrix(local_matrix, local_col_map, imports_d, - num_packets_per_lid_d, import_lids_d, combineMode, false, atomic); + UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix( + local_matrix, local_col_map, imports_d, num_packets_per_lid_d, + import_lids_d, combineMode, false, atomic); return; } @@ -1190,7 +1196,7 @@ unpackCrsMatrixAndCombineNew (const CrsMatrix& sourceMatrix, typedef decltype (local_col_map) local_map_type; // Now do the actual unpack! - do_unpack_and_combine_into_crs_matrix< + UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix< local_matrix_type, local_map_type, buffer_device_type @@ -1305,11 +1311,9 @@ unpackAndCombineWithOwningPIDsCount ( numPacketsPerLID.size (), true, "num_packets_per_lid"); - return unpackAndCombineWithOwningPIDsCountImpl (local_matrix, - permute_from_lids_d, - imports_d, - num_packets_per_lid_d, - numSameIDs); + return UnpackAndCombineCrsMatrixImpl::unpackAndCombineWithOwningPIDsCount( + local_matrix, permute_from_lids_d, imports_d, + num_packets_per_lid_d, numSameIDs); } /// \brief unpackAndCombineIntoCrsArrays @@ -1493,10 +1497,11 @@ unpackAndCombineIntoCrsArrays ( "never happen, since std::complex does not work in Kokkos::View objects."); #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE - unpackAndCombineIntoCrsArraysImpl(local_matrix, local_col_map, - import_lids_d, imports_d, num_packets_per_lid_d, permute_to_lids_d, - permute_from_lids_d, crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, - tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, + UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsArrays( + local_matrix, local_col_map, import_lids_d, imports_d, + num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d, + crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d, + numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID, num_bytes_per_value); // Copy outputs back to host diff --git 
a/packages/tpetra/core/test/CrsGraph/CMakeLists.txt b/packages/tpetra/core/test/CrsGraph/CMakeLists.txt index 14d822a4e7f6..8f12d9147b25 100644 --- a/packages/tpetra/core/test/CrsGraph/CMakeLists.txt +++ b/packages/tpetra/core/test/CrsGraph/CMakeLists.txt @@ -61,3 +61,23 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( NUM_MPI_PROCS 2 STANDARD_PASS_OUTPUT ) + +# I want this test to _build_ in either MPI or non-MPI ("serial"), +# but I only want it to _run_ in an MPI build with exactly 1 MPI +# process. +TRIBITS_ADD_EXECUTABLE( + CrsGraph_PackUnpack + SOURCES + CrsGraph_PackUnpack.cpp + ${TEUCHOS_STD_UNIT_TEST_MAIN} + COMM serial mpi +) + +TRIBITS_ADD_TEST( + CrsGraph_PackUnpack + NAME CrsGraph_PackUnpack_MPI_1 + ARGS "" + COMM mpi + NUM_MPI_PROCS 1 + STANDARD_PASS_OUTPUT +) diff --git a/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp b/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp new file mode 100644 index 000000000000..7c92298ea28c --- /dev/null +++ b/packages/tpetra/core/test/CrsGraph/CrsGraph_PackUnpack.cpp @@ -0,0 +1,419 @@ +/* +// @HEADER +// *********************************************************************** +// +// Tpetra: Templated Linear Algebra Services Package +// Copyright (2008) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Michael A. 
Heroux (maherou@sandia.gov) +// +// ************************************************************************ +// @HEADER +*/ + +#include "Tpetra_TestingUtilities.hpp" +#include "TpetraCore_ETIHelperMacros.h" +#include "Tpetra_CrsGraph.hpp" +#include "Tpetra_DefaultPlatform.hpp" +#include "Tpetra_Distributor.hpp" +#include "Tpetra_Map.hpp" +#include "Tpetra_Details_gathervPrint.hpp" +#include "Tpetra_Details_packCrsGraph.hpp" +#include "Tpetra_Details_unpackCrsGraphAndCombine.hpp" +#include "Teuchos_CommHelpers.hpp" +#include "Kokkos_ArithTraits.hpp" +#include +#include + +namespace { // anonymous + +#define NUM_ROW_PER_PROC 4 +#define NUM_NZ_COLS 100 + +using Tpetra::TestingUtilities::getDefaultComm; +using Teuchos::rcp; +using Teuchos::RCP; +using Teuchos::Array; +using Teuchos::ArrayView; +using Teuchos::Comm; +using Teuchos::outArg; +using Tpetra::Details::gathervPrint; +using Tpetra::Details::packCrsGraph; +using Tpetra::Details::unpackCrsGraphAndCombine; +using std::endl; + +template +bool +essentially_equal(T a, T b) { + typedef Kokkos::ArithTraits KAT; + const auto eps = KAT::eps(); + return KAT::abs(a - b) <= ( (KAT::abs(a) > KAT::abs(b) ? KAT::abs(b) : KAT::abs(a)) * eps); +} + +template +Teuchos::RCP +generate_test_graph(const Teuchos::RCP >& comm) +{ + typedef CrsGraphType crs_graph_type; + typedef typename crs_graph_type::local_ordinal_type LO; + typedef typename crs_graph_type::global_ordinal_type GO; + typedef typename crs_graph_type::node_type NT; + typedef Tpetra::Map MapType; + + const int world_rank = comm->getRank(); + + const LO num_row_per_proc = NUM_ROW_PER_PROC; // 4; + Array row_gids(num_row_per_proc); + + GO start = static_cast(num_row_per_proc * world_rank); + for (LO i=0; i(start+i); + } + + // Create random, unique column GIDs. 
+ const LO max_num_ent_per_row = NUM_NZ_COLS; //100; + std::random_device rand_dev; + std::mt19937 generator(rand_dev()); + std::uniform_int_distribution distr(1, 2000); + std::set col_gids_set; + typedef typename std::set::size_type SGO; + SGO num_gids_in_set = static_cast(max_num_ent_per_row); + col_gids_set.insert(0); + int num_passes = 0; + while (col_gids_set.size() < num_gids_in_set && num_passes <= 2000) { + col_gids_set.insert(distr(generator)); + num_passes += 1; + } + TEUCHOS_TEST_FOR_EXCEPTION(col_gids_set.size() != num_gids_in_set, + std::runtime_error, "Random column IDs not generated"); + + Array col_gids(col_gids_set.begin(), col_gids_set.end()); + + const GO INVALID = Teuchos::OrdinalTraits::invalid(); + RCP row_map = rcp(new MapType(INVALID, row_gids(), 0, comm)); + RCP col_map = rcp(new MapType(INVALID, col_gids(), 0, comm)); + + auto A = rcp(new crs_graph_type(row_map, col_map, max_num_ent_per_row)); + + Array columns(max_num_ent_per_row); + for (LO j=0; j(start + i); // unused + A->insertLocalIndices(i, columns()); + } + A->fillComplete(col_map, row_map); + + return A; +} + +TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT) +{ + typedef Tpetra::CrsGraph crs_graph_type; + typedef typename crs_graph_type::packet_type packet_type; + typedef typename NT::device_type device_type; + typedef typename device_type::execution_space execution_space; + + int lclSuccess = 1; // to be revised below + int gblSuccess = 0; // output argument + + RCP > comm = getDefaultComm(); + const int world_rank = comm->getRank(); + + out << "Creating graph" << endl; + + auto A = generate_test_graph (comm); + auto col_map = A->getColMap(); + auto row_map = A->getRowMap(); + + out << "Preparing arguments for packCrsGraph" << endl; + + LO num_loc_rows = static_cast(A->getNodeNumRows()); + Array exportLIDs (num_loc_rows); // input argument + for (LO i=0; i < num_loc_rows; ++i) { + exportLIDs[i] = static_cast(i); // pack all the rows + } + Array 
exports; // output argument; to be realloc'd + Array numPacketsPerLID (num_loc_rows, 0); // output argument + size_t constantNumPackets; // output argument + Tpetra::Distributor distor (comm); // argument required, but not used + + out << "Calling packCrsGraph" << endl; + + { + int local_op_ok; + std::ostringstream msg; + try { + packCrsGraph(*A, exports, numPacketsPerLID(), exportLIDs(), + constantNumPackets, distor); + local_op_ok = 1; + } catch (std::exception& e) { + local_op_ok = 0; + msg << e.what(); + } + TEST_ASSERT(local_op_ok == 1); + lclSuccess = success ? 1 : 0; + Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY_CONST( gblSuccess, 1 ); + if (gblSuccess != 1) { + if (world_rank == 0) { + out << "packCrsGraph reported an error!" << endl; + } + gathervPrint (out, msg.str(), *comm); + out << endl << "Abandoning test; no point in continuing." << endl; + return; + } + } + + // Now make sure that the pack is correct by creating an empty graph and + // unpacking in to it. The graph should end up being the same as the above graph. + out << "Building second graph" << endl; + RCP B = rcp(new crs_graph_type(row_map, col_map, A->getNodeNumEntries())); + +#ifdef KOKKOS_HAVE_SERIAL + const bool atomic_updates = ! std::is_same::value; +#else + const bool atomic_updates = true; +#endif // KOKKOS_HAVE_SERIAL + + out << "Calling unpackCrsGraphAndCombine with " + << "CombineMode=Tpetra::REPLACE" << endl; + + { + int local_op_ok; + std::ostringstream msg; + try { + unpackCrsGraphAndCombine(*B, exports, numPacketsPerLID(), + exportLIDs(), constantNumPackets, distor, Tpetra::REPLACE, atomic_updates); + local_op_ok = 0; + } catch (std::exception& e) { + // This method should throw because it is not finished! + local_op_ok = 1; + } + + TEST_ASSERT(local_op_ok == 1); + lclSuccess = success ? 
1 : 0; + Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY_CONST( gblSuccess, 1 ); + if (gblSuccess != 1) { + if (world_rank == 0) { + out << "unpackCrsGraphAndCombine reported an error!" << endl; + } + gathervPrint(out, msg.str(), *comm); + return; // no point in continuing + } + } + + // The test below uses the host Tpetra::CrsGraph interface to + // compare graph values. Thus, we need to do a fence before + // comparing graph values, in order to ensure that changes made on + // device are visible on host. + execution_space::fence (); + + int lclNumErrors = 0; + + out << "Comparing graphs after unpackCrsGraphAndCombine " + "with CombineMode=REPLACE" << endl; + { + std::ostringstream errStrm; + for (LO lclRow=0; lclRow A_indices; + A->getLocalRowView(lclRow, A_indices); + + ArrayView B_indices; + B->getLocalRowView(lclRow, B_indices); + + continue; + /* + * Test to be uncommented when unpackCrsGraphAndCombine is finished. + * + TEST_EQUALITY( A_indices.size (), B_indices.size () ); + + int curNumErrors = 0; + LO num_indices = static_cast(A_indices.size()); + for (LO i=0; i (*comm, Teuchos::REDUCE_SUM, lclNumErrors, outArg (gblNumErrors)); + TEST_EQUALITY_CONST( gblNumErrors, 0 ); + if (gblNumErrors != 0) { + if (world_rank == 0) { + out << "unpackCrsGraphAndCombine comparison found " << gblNumErrors + << " error" << (gblNumErrors != 1 ? "s" : "") << "!" << endl; + } + gathervPrint (out, errStrm.str (), *comm); + return; // no point in continuing + } + + lclSuccess = success ? 1 : 0; + Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY_CONST( gblSuccess, 1 ); + if (gblSuccess != 1) { + if (world_rank == 0) { + out << "unpackCrsGraphAndCombine comparison claims zero errors, " + "but success is false on at least one process!" 
<< endl; + } + gathervPrint (out, errStrm.str (), *comm); + return; // no point in continuing + } + } + +} + +// PackWithError sends intentionally bad inputs to pack/unpack to make sure +// that CrsGraph will detect the bad inputs and return the correct +// error diagnostics. +TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(CrsGraph, PackWithError, LO, GO, NT) +{ + + typedef Tpetra::CrsGraph crs_graph_type; + typedef typename crs_graph_type::packet_type packet_type; + + RCP > comm = getDefaultComm(); + const int world_rank = comm->getRank(); + + out << "Creating graph" << endl; + + auto A = generate_test_graph(comm); + auto col_map = A->getColMap(); + auto row_map = A->getRowMap(); + + out << "Calling packCrsGraph" << endl; + + // Prepare arguments for pack. This test is similar to the + // PackThenUnpackAndCombine test, + // but incorrect exportLIDs are sent in to induce a packing error. + int lclSuccess = success ? 1 : 0; + int gblSuccess = 0; // output argument + std::ostringstream errStrm; // for error string local to each process + + { + LO num_loc_rows = static_cast(A->getNodeNumRows()); + Array exportLIDs(num_loc_rows); + // exportLIDs[i] should equal i, but we set it to i+2 + for (LO i=0; i exports; + Array numPacketsPerLID(num_loc_rows, 0); + size_t constantNumPackets; + Tpetra::Distributor distor(comm); + { + int local_op_ok; + std::ostringstream msg; + try { + packCrsGraph(*A, exports, numPacketsPerLID(), exportLIDs(), + constantNumPackets, distor); + local_op_ok = 1; + } catch (std::exception& e) { + local_op_ok = 0; + msg << e.what(); + } + if (local_op_ok == 1) { + // Local pack should not be OK! We requested bad local IDs be exported! + errStrm << "Proc " << world_rank + << ": packCrsGraph returned OK, but bad local IDs were requested!" 
+ << endl; + lclSuccess = 0; + } + } + } + + Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (gblSuccess != 1) { + out << "packCrsGraph failed to notice bad export IDs on some process!" << endl; + gathervPrint (out, errStrm.str (), *comm); + } + + { + // Let's try this again, but send in the wrong number of exportLIDs + LO num_loc_rows = static_cast(A->getNodeNumRows()); + // Note the -1! + out << "Allocating ids... "; + Array exportLIDs(num_loc_rows-1); + for (LO i=0; i < num_loc_rows-1; ++i) { + exportLIDs[i] = i; + } + out << "done" << endl; + + Array exports; + Array numPacketsPerLID(num_loc_rows, 0); + size_t constantNumPackets; + Tpetra::Distributor distor(comm); + out << "Calling packCrsGraph" << endl; + { + int local_op_ok; + std::ostringstream msg; + try { + packCrsGraph(*A, exports, numPacketsPerLID(), exportLIDs(), + constantNumPackets, distor); + local_op_ok = 1; + } catch (std::exception& e) { + local_op_ok = 0; + msg << e.what(); + } + if (local_op_ok == 1) { + // Local pack should not be OK! We requested too few local IDs be exported! + errStrm << "Proc " << world_rank + << ": packCrsGraph returned OK, but too few IDs " + << "were requested to be exported!" 
+ << endl; + lclSuccess = 0; + } + } + Teuchos::reduceAll (*comm, Teuchos::REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + } +} + +#define UNIT_TEST_GROUP( LO, GO, NT ) \ + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(CrsGraph, PackThenUnpackAndCombine, LO, GO, NT) \ + TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(CrsGraph, PackWithError, LO, GO, NT) + +TPETRA_ETI_MANGLING_TYPEDEFS() + +TPETRA_INSTANTIATE_LGN(UNIT_TEST_GROUP) + +} // namespace (anonymous) diff --git a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp index 334016ee0db9..2f0cb436fa26 100644 --- a/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp +++ b/packages/tpetra/core/test/ImportExport2/ImportExport2_UnitTests.cpp @@ -697,6 +697,76 @@ namespace { } +template +bool graphs_are_same(const RCP& G1, const RCP& G2) +{ + typedef typename Graph::local_ordinal_type LO; + + int my_rank = G1->getRowMap()->getComm()->getRank(); + + // Make sure each graph is fill complete before checking other properties + if (! G1->isFillComplete()) { + if (my_rank == 0) + cerr << "***Error: Graph 1 is not fill complete!" << endl; + return false; + } + if (! G2->isFillComplete()) { + if (my_rank == 0) + cerr << "***Error: Graph 2 is not fill complete!" << endl; + return false; + } + + int errors = 0; + + if (! G1->getRowMap()->isSameAs(*G2->getRowMap())) { + if (my_rank == 0) + cerr << "***Error: Graph 1's row map is different than Graph 2's" << endl; + errors++; + } + if (! G1->getDomainMap()->isSameAs(*G2->getDomainMap())) { + if (my_rank == 0) + cerr << "***Error: Graph 1's domain map is different than Graph 2's" << endl; + errors++; + } + if (! 
G1->getRangeMap()->isSameAs(*G2->getRangeMap())) { + if (my_rank == 0) + cerr << "***Error: Graph 1's range map is different than Graph 2's" << endl; + errors++; + } + if (G1->getNodeNumEntries() != G2->getNodeNumEntries()) { + cerr << "***Error: Graph 1 does not have the same number of entries as Graph 2 on Process " + << my_rank << endl; + errors++; + } + + if (errors != 0) return false; + + for (LO i=0; i(G1->getNodeNumRows()); i++) { + ArrayView V1, V2; + G1->getLocalRowView(i, V1); + G2->getLocalRowView(i, V2); + if (V1.size() != V2.size()) { + cerr << "***Error: Graph 1 and Graph 2 have different number of entries in local row " + << i << " on Process " << my_rank << endl; + errors++; + continue; + } + int jerr = 0; + for (LO j=0; static_cast(j& Import, TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) { + typedef Tpetra::CrsGraph Graph; typedef Tpetra::CrsMatrix CrsMatrixType; typedef Tpetra::Map MapType; typedef Tpetra::Import ImportType; @@ -1172,6 +1243,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) RCP > Comm = getDefaultComm(); RCP A, B, C; + RCP Bg; RCP Map1, Map2; RCP Map3; @@ -1194,6 +1266,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) << e.what () << endl; lclErr = 1; } + auto Ag = A->getCrsGraph(); reduceAll (*Comm, REDUCE_MAX, lclErr, outArg (gblErr)); // The test fails if any (MPI) process had trouble. @@ -1224,6 +1297,13 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #1 FAILED." 
<< endl; + total_err--; + } // Execute fused export Export1 = rcp(new ExportType(A->getRowMap(),Map1)); @@ -1237,6 +1317,14 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #1 FAILED." << endl; + total_err--; + } + Comm->barrier (); } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; @@ -1275,6 +1363,14 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #2 FAILED." << endl; + total_err--; + } + // Execute fused export Export1 = rcp(new ExportType(A->getRowMap(),Map1)); B = Tpetra::exportAndFillCompleteCrsMatrix(A,*Export1); @@ -1286,6 +1382,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #2 FAILED." 
<< endl; + total_err--; + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what (); lclErr = 1; @@ -1339,6 +1444,14 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #4 FAILED." << endl; + total_err--; + } + // Execute fused export Export1 = rcp(new ExportType(A->getRowMap(),Map1)); B = Tpetra::exportAndFillCompleteCrsMatrix(A,*Export1); @@ -1350,6 +1463,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #4 FAILED." << endl; + total_err--; + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; lclErr = 1; @@ -1381,6 +1503,13 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Ag->getRowMap(),Map3)); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1,Map3,Map3); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #5 FAILED." 
<< endl; + total_err--; + } // Execute fused export Export1 = rcp(new ExportType(A->getRowMap(),Map3)); @@ -1393,6 +1522,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map3)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1,Map3,Map3); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #5 FAILED." << endl; + total_err--; + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; lclErr = 1; @@ -1420,7 +1558,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) // Execute fused import constructor Import1 = rcp(new ImportType(A->getRowMap(),Map3)); B = Tpetra::importAndFillCompleteCrsMatrix(A,*Import1,Map3,Map3,rcp(&params,false)); - diff=test_with_matvec_reduced_maps(*A,*B,*Map3); if(diff > diff_tol){ if(MyPID==0) { @@ -1430,10 +1567,19 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Ag->getRowMap(),Map3)); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1,Map3,Map3,rcp(&params,false)); + if (Map3->getNodeNumElements() > 0) { + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #6 FAILED."
<< endl; + total_err--; + } + } + // Execute fused export constructor Export1 = rcp(new ExportType(A->getRowMap(),Map3)); B = Tpetra::exportAndFillCompleteCrsMatrix(A,*Export1,Map3,Map3,rcp(&params,false)); - diff=test_with_matvec_reduced_maps(*A,*B,*Map3); if(diff > diff_tol){ if(MyPID==0) { @@ -1442,6 +1588,17 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map3)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1,Map3,Map3,rcp(&params,false)); + if (Map3->getNodeNumElements() > 0) { + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #6 FAILED." << endl; + total_err--; + } + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; lclErr = 1; @@ -1472,7 +1629,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) // Execute fused import constructor Import1 = rcp(new ImportType(Map1,A->getRowMap())); B = Tpetra::importAndFillCompleteCrsMatrix(A,*Import1,Map1,Map1,rcp(&params,false)); - diff=test_with_matvec(*A,*B); if(diff > diff_tol){ if(MyPID==0) { @@ -1482,10 +1638,17 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Map1,Ag->getRowMap())); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1,Map1,Map1,rcp(&params,false)); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #7 FAILED."
<< endl; + total_err--; + } + // Execute fused export constructor Export1 = rcp(new ExportType(Map1,A->getRowMap())); B = Tpetra::exportAndFillCompleteCrsMatrix(A,*Export1,Map1,Map1,rcp(&params,false)); - diff=test_with_matvec(*A,*B); if(diff > diff_tol){ if(MyPID==0) { @@ -1494,6 +1657,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Map1,Ag->getRowMap())); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1,Map1,Map1,rcp(&params,false)); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #1 FAILED." << endl; + total_err--; + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; lclErr = 1; @@ -1512,6 +1684,7 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) try { OSTab tab2 (out); build_test_matrix_with_row_overlap(Comm,A); + Ag = A->getCrsGraph(); Teuchos::ArrayRCP rowptr; Teuchos::ArrayRCP colind; @@ -1524,7 +1697,6 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) params.set("Reverse Mode",true); Import1 = rcp(new ImportType(Map1,A->getRowMap())); B = Tpetra::importAndFillCompleteCrsMatrix(A,*Import1,Map1,Map1,rcp(&params,false)); - diff=test_with_matvec(*B,*A); if(diff > diff_tol){ if(MyPID==0) { @@ -1534,6 +1706,14 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) total_err--; } + // Test the graph version + Import1 = rcp(new ImportType(Map1,Ag->getRowMap())); + Bg = Tpetra::importAndFillCompleteCrsGraph(Ag,*Import1,Map1,Map1,rcp(&params,false)); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedImport: CrsGraph test #8 FAILED." 
<< endl; + total_err--; + } + // Execute fused export constructor Export1 = rcp(new ExportType(A->getRowMap(),Map1)); B = Tpetra::exportAndFillCompleteCrsMatrix(A,*Export1,Map1,Map1); @@ -1545,6 +1725,15 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL( FusedImportExport, doImport, LO, GO, Scalar ) } total_err--; } + + // Test the graph version + Export1 = rcp(new ExportType(Ag->getRowMap(),Map1)); + Bg = Tpetra::exportAndFillCompleteCrsGraph(Ag,*Export1,Map1,Map1); + if (!graphs_are_same(Bg, B->getCrsGraph())) { + if (MyPID == 0) cerr << "FusedExport: CrsGraph test #1 FAILED." << endl; + total_err--; + } + } catch (std::exception& e) { err << "Process " << MyPID << " threw an exception: " << e.what () << endl; lclErr = 1;